Beispiel #1
0
def train_SVC(filePath):
    """Fit a polynomial-kernel SVC on a tab-separated file and print metrics.

    The file at ``filePath`` is read with pandas using a tab separator. The
    ``' class'`` column (note the leading space in the column name) supplies
    the labels, and the feature matrix comes from
    ``model_utils.apply_aspdep_weight(train_df, 0.7)``.

    Side effects: prints the first feature row, the set of labels predicted
    on the training data, the set of true labels, and the accuracy/report
    pair returned by ``model_utils.get_cv_metrics``; writes the fitted model
    to ``model_dumps/SVC_model.pkl``.

    Args:
        filePath: Path to the tab-separated training file.

    Returns:
        None.
    """
    train_df = pandas.read_csv(filePath, sep='\t')
    # Neutral-class oversampling (model_utils.oversample_neutral_class) is
    # left disabled for this model.
    # `.values` replaces `as_matrix()`, which was removed in pandas 1.0.
    train_class = train_df[' class'].values
    train_data = model_utils.apply_aspdep_weight(train_df, 0.7)
    # Parenthesised single-argument print behaves the same on Python 2 and 3.
    print(train_data[0])

    svc = SVC(C=0.2,
              cache_size=200,
              class_weight=None,
              coef0=0.0,
              decision_function_shape='ovr',
              degree=3,
              gamma=0.5,
              kernel='poly',
              max_iter=-1,
              probability=False,
              random_state=None,
              shrinking=True,
              tol=0.001,
              verbose=False)
    text_clf = svc.fit(train_data, train_class)

    # Quick sanity check: compare which labels the model emits on its own
    # training data with the labels actually present.
    print(set(text_clf.predict(train_data)))
    print(set(train_class))

    # NOTE(review): the 'model_dumps' directory is presumably expected to
    # exist already; nothing here creates it.
    joblib.dump(text_clf, 'model_dumps/SVC_model.pkl')

    # Performance evaluation. Presumably k_split=10 means 10-fold
    # cross-validation -- confirm against model_utils.get_cv_metrics.
    accuracy, clf_report = model_utils.get_cv_metrics(text_clf,
                                                      train_data,
                                                      train_class,
                                                      k_split=10)
    print("Accuracy: ", accuracy)
    print(clf_report)
Beispiel #2
0
def train_StackedGeneralizer(filePath):
    """Fit a StackedGeneralizer over previously dumped base models.

    The file at ``filePath`` is read with pandas using a tab separator. The
    ``' class'`` column supplies the labels, and the feature matrix comes
    from ``model_utils.apply_aspdep_weight(train_df, 1.7)``.

    The base models are loaded with joblib from ``Multinomial_nb_model.pkl``,
    ``Bernoulli_nb_model.pkl``, ``SGD_model.pkl`` and ``RF_model.pkl`` in the
    current working directory, so those files must exist before this runs.
    A ``LogisticRegression(random_state=607)`` is used as the blending model.

    Side effects: prints the accuracy returned by
    ``model_utils.get_cv_metrics``. The fitted stack is not persisted.

    Args:
        filePath: Path to the tab-separated training file.

    Returns:
        None.
    """
    train_df = pandas.read_csv(filePath, sep='\t')
    # Neutral-class oversampling (model_utils.oversample_neutral_class) is
    # left disabled for this model.
    # `.values` replaces `as_matrix()`, which was removed in pandas 1.0.
    train_class = train_df[' class'].values
    train_data_1 = model_utils.apply_aspdep_weight(train_df, 1.7)

    # Base estimators come from pickles rather than being constructed here.
    # NOTE(review): joblib.load unpickles arbitrary objects; only load model
    # files from a trusted location.
    base_model_files = (
        'Multinomial_nb_model.pkl',
        'Bernoulli_nb_model.pkl',
        'SGD_model.pkl',
        'RF_model.pkl',
    )
    base_models = [joblib.load(model_file) for model_file in base_model_files]

    # Blending (second-stage) model.
    blending_model = LogisticRegression(random_state=607)

    # Multi-stage model combining the base models with the blender.
    sg = StackedGeneralizer(base_models,
                            blending_model,
                            n_folds=10,
                            verbose=False)
    sg.fit(train_data_1, train_class)
    # Persisting the stack is currently disabled:
    #     joblib.dump(sg, 'Stacked_model.pkl')

    # Performance evaluation. Only the accuracy is reported; the second
    # returned value is intentionally unused.
    accuracy, _clf_report = model_utils.get_cv_metrics(sg,
                                                       train_data_1,
                                                       train_class,
                                                       k_split=10)
    # Recorded result: Accuracy:  0.7497418660799471 Weights:  1.7
    print("Accuracy: ", accuracy)
Beispiel #3
0
def train_SGD(filePath):
    """Fit an SGDClassifier on a tab-separated file, dump it, print metrics.

    The file at ``filePath`` is read with pandas using a tab separator. The
    ``' class'`` column supplies the labels, and the feature matrix comes
    from ``model_utils.apply_aspdep_weight(train_df, 0.5)``.

    Side effects: writes the fitted model to ``SGD_model.pkl`` in the current
    working directory and prints the accuracy/report pair returned by
    ``model_utils.get_cv_metrics``.

    Args:
        filePath: Path to the tab-separated training file.

    Returns:
        None.
    """
    train_df = pandas.read_csv(filePath, sep='\t')
    # Neutral-class oversampling (model_utils.oversample_neutral_class) is
    # left disabled for this model.
    # `.values` replaces `as_matrix()`, which was removed in pandas 1.0.
    train_class = train_df[' class'].values

    train_data_1 = model_utils.apply_aspdep_weight(train_df, 0.5)

    # NOTE(review): newer scikit-learn releases renamed 'squared_loss' to
    # 'squared_error'; the value is kept as-is for the version this file
    # targets -- confirm before upgrading scikit-learn.
    sgd = linear_model.SGDClassifier(loss='squared_loss',
                                     penalty='l2',
                                     alpha=1e-3,
                                     random_state=607,
                                     max_iter=1000000,
                                     tol=1e-2)
    text_clf = sgd.fit(train_data_1, train_class)
    # Recorded results:
    #   Accuracy:  0.7797260574839455 @ 2.1 weight
    #   Accuracy:  0.7428003692958715 Weights:  0.5

    joblib.dump(text_clf, 'SGD_model.pkl')

    # Performance evaluation.
    accuracy, clf_report = model_utils.get_cv_metrics(text_clf,
                                                      train_data_1,
                                                      train_class,
                                                      k_split=10)
    print("Accuracy: ", accuracy)
    print(clf_report)
Beispiel #4
0
def train_polarity_clf(filePath):
    """Fit a LogisticRegression on the ``opin_polarity`` column, print metrics.

    The file at ``filePath`` is read with pandas using a tab separator and
    passed through ``model_utils.oversample_neutral_class``. The ``' class'``
    column supplies the labels and the ``'opin_polarity'`` column supplies
    the training data.

    Side effects: prints the training data and the accuracy/report pair
    returned by ``model_utils.get_cv_metrics``. The model is not persisted.

    Args:
        filePath: Path to the tab-separated training file.

    Returns:
        None.
    """
    train_df = pandas.read_csv(filePath, sep='\t')
    train_df = model_utils.oversample_neutral_class(train_df)
    # `.values` replaces `as_matrix()`, which was removed in pandas 1.0.
    train_class = train_df[' class'].values
    train_data = train_df['opin_polarity'].values
    # Parenthesised single-argument print behaves the same on Python 2 and 3.
    print(train_data)

    # Alternative tried previously:
    #   BernoulliNB(alpha=1.0, fit_prior=True, class_prior=None)
    # fit() needs the target vector as well; it was called with the features
    # only, which raises a TypeError.
    # NOTE(review): train_data is a single column, so it is 1-D here, while
    # scikit-learn estimators expect a 2-D feature matrix -- confirm whether
    # a reshape(-1, 1) is needed for the 'opin_polarity' values.
    text_clf = LogisticRegression(random_state=0).fit(train_data, train_class)

    # Performance evaluation.
    accuracy, clf_report = model_utils.get_cv_metrics(text_clf,
                                                      train_data,
                                                      train_class,
                                                      k_split=10)
    print("Accuracy: ", accuracy)
    print(clf_report)
Beispiel #5
0
def train_ET(filePath):
    """Fit an ExtraTreesClassifier on a tab-separated file and print accuracy.

    The file at ``filePath`` is read with pandas using a tab separator. The
    ``' class'`` column supplies the labels, and the feature matrix comes
    from ``model_utils.apply_aspdep_weight(train_df, 0.3)``.

    Side effects: prints the accuracy returned by
    ``model_utils.get_cv_metrics``. The model is not persisted.

    Args:
        filePath: Path to the tab-separated training file.

    Returns:
        None.
    """
    train_df = pandas.read_csv(filePath, sep='\t')
    # Neutral-class oversampling (model_utils.oversample_neutral_class) is
    # left disabled for this model.
    # `.values` replaces `as_matrix()`, which was removed in pandas 1.0.
    train_class = train_df[' class'].values
    train_data_1 = model_utils.apply_aspdep_weight(train_df, 0.3)

    extra_trees = ExtraTreesClassifier(n_estimators=10,
                                       max_depth=2,
                                       random_state=0,
                                       n_jobs=-1)
    text_clf = extra_trees.fit(train_data_1, train_class)

    # Performance evaluation. Only the accuracy is reported; the second
    # returned value is intentionally unused.
    accuracy, _clf_report = model_utils.get_cv_metrics(text_clf,
                                                       train_data_1,
                                                       train_class,
                                                       k_split=10)
    print("Accuracy: ", accuracy)
Beispiel #6
0
def train_BernoulliNB(filePath):
    """Fit a BernoulliNB on a tab-separated file, dump it, print metrics.

    The file at ``filePath`` is read with pandas using a tab separator. The
    ``' class'`` column supplies the labels, and the feature matrix comes
    from ``model_utils.apply_aspdep_weight(train_df, 0.0)``.

    Side effects: writes the fitted model to ``Bernoulli_nb_model.pkl`` in
    the current working directory and prints the accuracy/report pair
    returned by ``model_utils.get_cv_metrics``.

    Args:
        filePath: Path to the tab-separated training file.

    Returns:
        None.
    """
    train_df = pandas.read_csv(filePath, sep='\t')
    # Neutral-class oversampling (model_utils.oversample_neutral_class) is
    # left disabled for this model.
    # `.values` replaces `as_matrix()`, which was removed in pandas 1.0.
    train_class = train_df[' class'].values

    train_data_1 = model_utils.apply_aspdep_weight(train_df, 0.0)
    bernoulli_nb = BernoulliNB(alpha=0.6, fit_prior=True, class_prior=None)
    text_clf = bernoulli_nb.fit(train_data_1, train_class)
    # Recorded result: 1.2 Accuracy:  0.7047700662655685 Weights:  0.0 Alpha 0.6

    joblib.dump(text_clf, 'Bernoulli_nb_model.pkl')

    # Performance evaluation.
    accuracy, clf_report = model_utils.get_cv_metrics(text_clf,
                                                      train_data_1,
                                                      train_class,
                                                      k_split=10)
    print("Accuracy: ", accuracy)
    print(clf_report)
Beispiel #7
0
def train_MultinomialNB(filePath):
    """Fit a MultinomialNB on a tab-separated file, dump it, print metrics.

    The file at ``filePath`` is read with pandas using a tab separator. The
    ``' class'`` column supplies the labels, and the feature matrix comes
    from ``model_utils.apply_aspdep_weight(train_df, 0.9)``.

    Side effects: writes the fitted model to ``Multinomial_nb_model.pkl`` in
    the current working directory and prints the accuracy/report pair
    returned by ``model_utils.get_cv_metrics``.

    Args:
        filePath: Path to the tab-separated training file.

    Returns:
        None.
    """
    train_df = pandas.read_csv(filePath, sep='\t')
    # Neutral-class oversampling (model_utils.oversample_neutral_class) is
    # left disabled for this model.
    # `.values` replaces `as_matrix()`, which was removed in pandas 1.0.
    train_class = train_df[' class'].values

    train_data_1 = model_utils.apply_aspdep_weight(train_df, 0.9)
    multinomial_nb = MultinomialNB(alpha=0.3, fit_prior=True, class_prior=None)
    text_clf = multinomial_nb.fit(train_data_1, train_class)
    # Recorded result: 1.2, 0.01   Accuracy:  0.7407742366772097 Weights:  1.2

    joblib.dump(text_clf, 'Multinomial_nb_model.pkl')

    # Performance evaluation.
    accuracy, clf_report = model_utils.get_cv_metrics(text_clf,
                                                      train_data_1,
                                                      train_class,
                                                      k_split=10)
    print("Accuracy: ", accuracy)
    print(clf_report)
Beispiel #8
0
def train_RF(filePath):
    """Fit a RandomForestClassifier on a tab-separated file, dump it, print metrics.

    The file at ``filePath`` is read with pandas using a tab separator. The
    ``' class'`` column supplies the labels, and the feature matrix comes
    from ``model_utils.apply_aspdep_weight(train_df, 1.1)``.

    Side effects: writes the fitted model to ``RF_model.pkl`` in the current
    working directory and prints the accuracy/report pair returned by
    ``model_utils.get_cv_metrics``.

    Args:
        filePath: Path to the tab-separated training file.

    Returns:
        None.
    """
    train_df = pandas.read_csv(filePath, sep='\t')
    # Neutral-class oversampling (model_utils.oversample_neutral_class) is
    # left disabled for this model.
    # `.values` replaces `as_matrix()`, which was removed in pandas 1.0.
    train_class = train_df[' class'].values

    train_data_1 = model_utils.apply_aspdep_weight(train_df, 1.1)
    random_forest = RandomForestClassifier(n_estimators=300,
                                           max_depth=180,
                                           random_state=607,
                                           n_jobs=-1)
    text_clf = random_forest.fit(train_data_1, train_class)
    joblib.dump(text_clf, 'RF_model.pkl')

    # Performance evaluation.
    accuracy, clf_report = model_utils.get_cv_metrics(text_clf,
                                                      train_data_1,
                                                      train_class,
                                                      k_split=10)
    print("Accuracy: ", accuracy)

    print(clf_report)