Example #1
def makePipelineImpMultinomialNB(X_train, Y_train, X_test, Y_test):
    pipe = make_pipeline_imb(TfidfVectorizer(), RandomUnderSampler(),
                             MultinomialNB())
    pipe.fit(X_train, Y_train)
    y_pred = pipe.predict(X_test)

    print(accuracy_score(Y_test, y_pred))
    print(classification_report_imbalanced(Y_test, y_pred))
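The snippet leaves its imports implicit. A minimal driver sketch, assuming the scikit-learn 20 newsgroups corpus as stand-in data (the data source and the driver lines are assumptions, not part of the original):

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from imblearn.metrics import classification_report_imbalanced
from imblearn.pipeline import make_pipeline as make_pipeline_imb
from imblearn.under_sampling import RandomUnderSampler

# hypothetical driver: any labelled text corpus would do
train = fetch_20newsgroups(subset='train')
test = fetch_20newsgroups(subset='test')
makePipelineImpMultinomialNB(train.data, train.target, test.data, test.target)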
Example #2
    def perform(emotion, train_tweets, y_train, task_name):
        #Select the scoring metric, depending upon the task name
        scoring = Dictionaries.scoring.get(task_name)
        if task_name == 'c':
            estimator_dict = Dictionaries.classifier_dict
        elif task_name == 'r':
            estimator_dict = Dictionaries.regressor_dict

        # Perform the preprocessing and feature engineering tasks
        preprocess_train_df = Preprocessor.perform(train_tweets, emotion,
                                                   'train', task_name)
        X_train = Feature_Transformer.perform(preprocess_train_df, emotion,
                                              'train', task_name)

        # Iterate through all the estimators
        for estimator_name, estimator in estimator_dict.items():
            print(estimator_name)
            # Default pipeline contains Feature selector + estimator
            pipeline = make_pipeline(
                MinMaxScaler(feature_range=(0, 1), copy=True), estimator)

            scores = cross_validate(pipeline,
                                    X_train,
                                    y_train,
                                    scoring=scoring,
                                    cv=5,
                                    return_train_score=False)
            print(scores)

            # Classification task
            if (task_name == 'c'):
                Writer.write_class_feat_anal_results_in_file(
                    emotion, 'original', estimator_name, 14, scores)
                # Pipeline with resampler -SMOTE, TomekLinks, SMOTETomek
                for resampler_name, resampler in Dictionaries.resampler_dict.items():
                    # Pipeline used for resampling
                    pipeline = make_pipeline_imb(
                        MinMaxScaler(feature_range=(0, 1), copy=True),
                        resampler, estimator)

                    scores = cross_validate(pipeline,
                                            X_train,
                                            y_train,
                                            scoring=scoring,
                                            cv=5,
                                            return_train_score=False)
                    print(scores)

                    Writer.write_class_feat_anal_results_in_file(
                        emotion, resampler_name, estimator_name, 14, scores)
                    gc.collect()
            # Regression task
            elif (task_name == 'r'):
                Writer.write_reg_feat_anal_results_in_file(
                    emotion, estimator_name, 14, scores)
                gc.collect()
Example #3
def makePipelineImpBernoulliNB(X_train, Y_train, X_test, Y_test, binarize):
    pipe = make_pipeline_imb(TfidfVectorizer(), RandomUnderSampler(),
                             BernoulliNB(binarize=binarize))

    pipe.fit(X_train, Y_train)
    y_pred = pipe.predict(X_test)

    print('binarize', binarize, accuracy_score(Y_test, y_pred))
    print(classification_report_imbalanced(Y_test, y_pred))
Example #4
    def objective_function(params):
        classifier_type = params['type']
        del params['type']
        if classifier_type == 'rf':
            clf = RandomForestClassifier(**params)
        elif classifier_type == 'svm':
            clf = SVC(**params)
        else:
            # unknown classifier type: report a failed trial to hyperopt
            return {'loss': 0, 'status': STATUS_FAIL}

        pl = make_pipeline_imb(resampling, clf)

        score = cross_val_score(pl, X_train, y_train, n_jobs=args.cpus,
                                cv=3).mean()
        return {'loss': -score, 'status': STATUS_OK}
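This objective is meant to be driven by hyperopt's fmin; a sketch of a matching search space, assuming hyperopt is installed and that resampling, X_train, y_train and args are defined as in the snippet (the space itself is illustrative):

from hyperopt import STATUS_FAIL, STATUS_OK, Trials, fmin, hp, tpe

# illustrative search space; the 'type' key is what objective_function pops off
space = hp.choice('classifier', [
    {'type': 'rf', 'n_estimators': hp.choice('n_estimators', [100, 300, 500])},
    {'type': 'svm', 'C': hp.lognormal('svm_C', 0, 1)},
])

trials = Trials()
best = fmin(fn=objective_function, space=space, algo=tpe.suggest,
            max_evals=25, trials=trials)
print(best)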
Example #5
def logistic_with_smote():
    print("Start of logistic with smote")
    X_train, X_test, y_train, y_test = data_processor()
    clf = LogisticRegression()  # fitted inside the SMOTE pipeline below
    # build model with SMOTE imblearn
    smote_pipeline = make_pipeline_imb(SMOTE(random_state=42), clf)
    smote_model = smote_pipeline.fit(X_train, y_train)
    smote_prediction = smote_model.predict(X_test)
    smote_prediction_proba = smote_model.predict_proba(X_test)[:, 1]
    print(classification_report_imbalanced(y_test, smote_prediction))
    print('SMOTE Pipeline Score {}'.format(smote_pipeline.score(X_test, y_test)))
    print("SMOTE AUC score: ", roc_auc_score(y_test, smote_prediction_proba))
    print("SMOTE F1 Score: ", f1_score(y_test, smote_prediction))
    print("End of logistic smote")
Example #6
def evaluatemodel(classifier, name):
    # build model with SMOTE imblearn
    smote_pipeline = make_pipeline_imb(SMOTE(random_state=6), classifier)

    smote_model = smote_pipeline.fit(X_train, y_train)
    smote_prediction = smote_model.predict(X_test)

    print("normal data distribution: {}".format(Counter(y)))
    X_smote, y_smote = SMOTE().fit_resample(X, y)
    print("SMOTE data distribution: {}".format(Counter(y_smote)))
    
    print("Confusion Matrix: ")
    #print(confusion_matrix(y_test, smote_prediction))
    plot_confusion_matrix(confusion_matrix(y_test, smote_prediction))

    print('\nSMOTE Pipeline Score {}'.format(smote_pipeline.score(X_test, y_test)))

    print_results("\nSMOTE + " + name + " classification", y_test, smote_prediction)
Example #7
def trainModel(modelName, X_train, y_train):
    if (modelName == "Decision Tree"):
        from sklearn.tree import DecisionTreeClassifier
        model = DecisionTreeClassifier(criterion='entropy',
                                       max_depth=10,
                                       random_state=0)
        return model.fit(X_train, y_train)
    if (modelName == "Neural Network"):
        from sklearn.neural_network import MLPClassifier
        mlp = MLPClassifier(hidden_layer_sizes=(24, 24, 24))
        return mlp.fit(X_train, y_train.values.ravel())
    if (modelName == "LDA"):
        from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
        model = LinearDiscriminantAnalysis(n_components=2)
        return model.fit(X_train, y_train.values.ravel())
    if (modelName == "Support Vector Machine"):
        from sklearn import svm
        from imblearn.pipeline import make_pipeline as make_pipeline_imb
        from imblearn.over_sampling import SMOTE
        clf = svm.SVC(C=0.5, kernel='rbf', decision_function_shape='ovr')
        # no separate clf.fit needed: the pipeline fit below refits clf
        smote = SMOTE(sampling_strategy='all', random_state=42)
        smote_pipeline = make_pipeline_imb(smote, clf)
        return smote_pipeline.fit(X_train, y_train)
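A short usage sketch for trainModel; X_train, y_train and X_test are assumed to come from an earlier train_test_split. Note that the SVM branch returns the fitted SMOTE pipeline, so predict runs on the pipeline (samplers are skipped at predict time):

model = trainModel("Support Vector Machine", X_train, y_train)
y_pred = model.predict(X_test)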
Example #8
def ROC(classifier, name):
    # build model with SMOTE imblearn
    smote_pipeline = make_pipeline_imb(SMOTE(random_state=6), classifier)

    smote_model = smote_pipeline.fit(X_train, y_train)
    smote_prediction = smote_model.predict(X_test)
    X_smote, y_smote = SMOTE().fit_resample(X, y)
    print_results("\nSMOTE + " + name + " classification", y_test, smote_prediction)

    # Compute predicted probabilities: y_pred_prob
    y_pred_prob = smote_pipeline.predict_proba(X_test)[:,1]

    # Generate ROC curve values: fpr, tpr, thresholds
    fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)
    print('AUC:', auc(fpr, tpr))

    # Plot ROC curve
    plt.plot([0, 1], [0, 1], 'k--')
    plt.plot(fpr, tpr)
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    plt.show()
Example #9
 def __init__(self):
     self.clf = make_pipeline_imb(
         Imputer(strategy='median'),
         RandomOverSampler(),
         RandomForestClassifier(n_estimators=126, verbose=True, min_impurity_decrease=10e-5, criterion="entropy")
         )
 def train(self, X_train, y_train):
     self.clf.fit(X_train, y_train)
     # build model with SMOTE imblearn
     self.smote_pipeline = make_pipeline_imb(SMOTE(random_state=4),
                                             self.clf)
     self.smote_model = self.smote_pipeline.fit(X_train, y_train)
    print("recall: {}".format(recall_score(true_value, pred)))
    print("f1: {}".format(f1_score(true_value, pred)))


# our classifier to use
classifier = RandomForestClassifier

X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=42)

# build normal model
pipeline = make_pipeline(classifier(random_state=42))
model = pipeline.fit(X_train, y_train)
prediction = model.predict(X_val)

# build model with SMOTE imblearn
smote_pipeline = make_pipeline_imb(SMOTE(random_state=4),
                                   classifier(random_state=42))
smote_model = smote_pipeline.fit(X_train, y_train)
smote_prediction = smote_model.predict(X_val)

# build model with undersampling
# NearMiss is deterministic; recent imblearn versions removed its random_state
nearmiss_pipeline = make_pipeline_imb(NearMiss(),
                                      classifier(random_state=42))
nearmiss_model = nearmiss_pipeline.fit(X_train, y_train)
nearmiss_prediction = nearmiss_model.predict(X_val)

# classification report
print(classification_report(y_val, prediction))
print(classification_report_imbalanced(y_val, smote_prediction))
print(classification_report_imbalanced(y_val, nearmiss_prediction))

print()
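To see what each sampler actually does to the class balance before comparing the reports above, one can call fit_resample directly; a sketch on the same X_train/y_train (Counter is from the standard library):

from collections import Counter

print("original :", Counter(y_train))
print("SMOTE    :", Counter(SMOTE(random_state=4).fit_resample(X_train, y_train)[1]))
print("NearMiss :", Counter(NearMiss().fit_resample(X_train, y_train)[1]))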
Example #12
def predict():

    df_train = pd.read_csv('train-dataset.csv')
    df_test = pd.read_csv('hold-out.csv')
    df_test = df_test[~(df_test['comment'].isnull())]
    X_train = df_train['comment']
    X_test = df_test['comment'][:10000]

    y_train = df_train['offensive']
    y_test = df_test['offensive'][:10000]

    tokenized_train = [nltk.word_tokenize(t) for t in X_train]
    tokenized_test = [nltk.word_tokenize(t) for t in X_test]
    num_features = 256

    w2v_model = gensim.models.Word2Vec(tokenized_train,
                                       size=num_features,
                                       window=150,
                                       min_count=10,
                                       sample=1e-3,
                                       workers=16)
    w2v_model.save('w2v')
    w2v_model = gensim.models.Word2Vec.load('w2v')

    def averaged_word2vec_vectorizer(corpus, model, num_features):
        vocabulary = set(model.wv.index2word)

        def average_word_vectors(words, model, vocabulary, num_features):
            feature_vector = np.zeros((num_features), dtype='float64')
            nwords = 0

            for word in words:
                if word in vocabulary:
                    nwords += 1
                    feature_vector = np.add(feature_vector, model.wv[word])
            if nwords:
                feature_vector = np.divide(feature_vector, nwords)
            return feature_vector

        features = [
            average_word_vectors(tokenized_sentence, model, vocabulary,
                                 num_features) for tokenized_sentence in corpus
        ]

        return np.array(features)

    avg_wv_train_features = averaged_word2vec_vectorizer(
        corpus=tokenized_train, model=w2v_model, num_features=num_features)
    avg_wv_test_features = averaged_word2vec_vectorizer(
        corpus=tokenized_test, model=w2v_model, num_features=num_features)

    lr = LogisticRegression(penalty='l2', max_iter=500, C=1, solver='lbfgs')

    # SMOTE no longer accepts ``kind``; use imblearn.over_sampling.BorderlineSMOTE
    smote_w2v_pipeline = make_pipeline_imb(
        BorderlineSMOTE(sampling_strategy=.95, k_neighbors=40,
                        kind='borderline-2'), lr)

    smote_w2v_model = smote_w2v_pipeline.fit(avg_wv_train_features, y_train)

    smote_w2v_predict = smote_w2v_model.predict(avg_wv_test_features)

    print('hold-out recall:', metrics.recall_score(y_test, smote_w2v_predict))

    my_prediction = None  # avoid a NameError when the request is not a POST
    if request.method == 'POST':
        message = request.form['message']
        data = [message]
        tokenized = [nltk.word_tokenize(t) for t in data]
        avg_wv_features = averaged_word2vec_vectorizer(
            corpus=tokenized, model=w2v_model, num_features=num_features)
        my_prediction = smote_w2v_model.predict(avg_wv_features)
    return render_template('result.html', prediction=my_prediction)
Example #13
# In[91]:

from sklearn.ensemble import RandomForestClassifier

# In[92]:

# scale the features, then split into training and test set
scaler = preprocessing.MinMaxScaler()
X = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2)

# In[93]:

# build RandomForestClassifier model with SMOTE imblearn
rfc_pipeline = make_pipeline_imb(SMOTE(random_state=4),
                                 RandomForestClassifier(n_estimators=50))
smote_model = rfc_pipeline.fit(X_train, y_train)
smote_prediction = smote_model.predict(X_test)
filename = 'rfc_model.pckl'
with open(filename, 'wb') as f:
    pickle.dump(rfc_pipeline, f)

print()
print_results("RandomForest classification", y_test, smote_prediction)
print()
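Since the fitted pipeline is pickled above, reloading it later is symmetric; a minimal sketch using the same file name (samplers are only applied during fit, so the reloaded pipeline predicts on raw features):

with open('rfc_model.pckl', 'rb') as f:
    loaded_pipeline = pickle.load(f)
print_results("Reloaded RandomForest classification", y_test,
              loaded_pipeline.predict(X_test))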

# # Logistic Regression

# In[94]:

from sklearn.linear_model import LogisticRegression
Example #14
    y_train = y_train_shift

    print("X_train shape:")
    print(X_train.shape)
    print("y_train length:")
    print(len(y_train))

    # X_train_ = X_train
    # y_train_ = y_train

    # y_train_shift_ = y_train_shift
    print("len(X_train), len(y_train):")
    print(len(X_train), len(y_train))

    pipe = make_pipeline_imb(MinMaxScaler(), RandomUnderSampler(),
                             RandomForestClassifier())
    pipe.fit(X_train, y_train)

    # prediction ##################################----------------------------

    index = KPI_ID_test.index(KPI_ID_name)

    test_manual_feature = get_manual_feature(KPI_LIST_test[index])
    test_manual_feature = fit_window(df=test_manual_feature, window=window)

    single_predict = get_single_feature(raw_df=KPI_LIST_test[index],
                                        KPI_ID_name=KPI_ID_name)
    single_predict = fit_window(single_predict, window)

    ts_KPI_ID_test = KPI_LIST_test[index].pop('KPI ID')
    ts_timestamp = KPI_LIST_test[index]['timestamp']
Example #15
 def __init__(self):
     self.clf = make_pipeline_imb(
         Imputer(strategy='median'), RandomOverSampler(),
         LogisticRegression(C=0.010826367338740546, penalty="l2"))
Example #16
# To improve the prediction of class \#3, it could be interesting to apply
# balancing before training the naive Bayes classifier. Therefore, we will
# use a ``RandomUnderSampler`` to equalize the number of samples in all the
# classes before the training.
#
# Currently, imbalanced-learn does not handle sparse matrices (support is
# being worked on), so an additional transformer that converts the sparse
# matrices to dense ones is required in the pipeline.
#
# It is also important to note that we are using the ``make_pipeline`` function
# implemented in imbalanced-learn to properly handle the samplers.


def densify(X):
    """Function to densify an array."""
    return X.toarray()


pipe = make_pipeline_imb(TfidfVectorizer(),
                         FunctionTransformer(func=densify, accept_sparse=True),
                         RandomUnderSampler(), MultinomialNB())

pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)

###############################################################################
# Although the results are almost identical, it can be seen that the
# resampling corrected the poor recall of class \#3 at the cost of reducing
# the metrics for the other classes. However, the overall results are
# slightly better.

print(classification_report_imbalanced(y_test, y_pred))
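Recent imbalanced-learn releases accept sparse input in the samplers, so the densify step above is only needed on the old versions this snippet targets; a sketch of the equivalent modern pipeline, same data assumed:

pipe = make_pipeline_imb(TfidfVectorizer(), RandomUnderSampler(),
                         MultinomialNB())
pipe.fit(X_train, y_train)
print(classification_report_imbalanced(y_test, pipe.predict(X_test)))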
Example #17
    def perform(emotion, train_tweets, y_train, task_name, estimator_dict):
        #Select the scoring metric, depending upon the task name
        scoring = Dictionaries.scoring.get(task_name)

        # Perform the preprocessing and feature engineering tasks
        preprocess_train_df = Preprocessor.perform(train_tweets, emotion,
                                                   'train', task_name)
        X_train = Feature_Transformer.perform(preprocess_train_df, emotion,
                                              'train', task_name)

        #Iterate through all the estimators
        for estimator_name, estimator in estimator_dict.items():
            #pipeline for original data
            pipeline = make_pipeline(
                MinMaxScaler(feature_range=(0, 1), copy=True),
                RFECV(estimator, step=1, cv=5, scoring=scoring, n_jobs=-1))

            scores = cross_validate(pipeline,
                                    X_train,
                                    y_train,
                                    scoring=scoring,
                                    cv=5,
                                    return_train_score=False)
            print(scores)
            pipeline.fit(X_train, y_train)

            print(pipeline.steps)
            #Get number of features selected, the features selected and its ranking
            selected_features = pipeline.steps[1][1].n_features_
            feature_mask = pipeline.steps[1][1].support_
            feature_rank = pipeline.steps[1][1].ranking_

            # Classification task
            if (task_name == 'c'):
                #Get F1 scores
                cv_feature_scores = pipeline.steps[1][1].grid_scores_  # f1

                Writer.write_class_feat_rank_anal_results_in_file(
                    emotion, 'original', estimator_name, selected_features,
                    feature_mask, feature_rank, cv_feature_scores)
                # Pipeline with resamplers - SMOTE, TomekLinks, SMOTETomek
                for resampler_name, resampler in Dictionaries.resampler_dict.items():
                    #pipeline for resampling
                    pipeline = make_pipeline_imb(
                        MinMaxScaler(feature_range=(0, 1), copy=True),
                        resampler,
                        RFECV(estimator,
                              step=1,
                              cv=5,
                              scoring=scoring,
                              n_jobs=-1))

                    # Fit the pipeline with data
                    pipeline.fit(X_train, y_train)

                    print(pipeline.steps)
                    selected_features = pipeline.steps[2][1].n_features_
                    feature_mask = pipeline.steps[2][1].support_
                    feature_rank = pipeline.steps[2][1].ranking_
                    cv_feature_scores = pipeline.steps[2][1].grid_scores_  # f1

                    Writer.write_class_feat_rank_anal_results_in_file(
                        emotion, resampler_name, estimator_name,
                        selected_features, feature_mask, feature_rank,
                        cv_feature_scores)
                    gc.collect()
            # Regression task
            elif (task_name == 'r'):
                #Get rmse scores
                cv_feature_scores = np.sqrt(
                    -pipeline.steps[1][1].grid_scores_)  # sqrt(-neg_mean_squared_error)

                Writer.write_reg_feat_rank_anal_results_in_file(
                    emotion, estimator_name, selected_features, feature_mask,
                    feature_rank, cv_feature_scores)
                gc.collect()
Example #18
pipeline = make_pipeline_imb(  # Optimal
    FeatureUnion(
        transformer_list=[
            ('vect1',
             CountVectorizer(max_df=0.80,
                             min_df=8,
                             ngram_range=(1, 1),
                             stop_words=stopwords_complete_lemmatized,
                             strip_accents='unicode',
                             tokenizer=LemmaTokenizer())),  # 1-Gram Vectorizer
            ('vect2',
             CountVectorizer(max_df=0.95,
                             min_df=10,
                             ngram_range=(2, 2),
                             stop_words=None,
                             strip_accents='unicode',
                             tokenizer=LemmaTokenizer())),
        ],  # 2-Gram Vectorizer
        transformer_weights={
            'vect1': 1.0,
            'vect2': 1.0,
        },
    ),
    TfidfTransformer(use_idf=True),
    RandomUnderSampler(sampling_strategy={
        1: 19000,
        2: 27200,
        3: 20000
    }, random_state=22),
    SelectFromModel(
        estimator=LinearSVC(),
        threshold='1.2*mean'),  # Dimensionality Reduction               
    #MLPClassifier(verbose=True, hidden_layer_sizes=(200,), max_iter=200, solver='sgd', learning_rate='adaptive', learning_rate_init=0.60, momentum=0.50, alpha=1e-01),)
    MLPClassifier(verbose=True,
                  random_state=22,
                  hidden_layer_sizes=(100, ),
                  max_iter=200,
                  solver='sgd',
                  learning_rate='constant',
                  learning_rate_init=0.07,
                  momentum=0.90,
                  alpha=1e-01),
)
Example #19
 def __init__(self):
     self.clf = make_pipeline_imb(Imputer(strategy='median'),
                                  RandomUnderSampler(),
                                  LogisticRegression(C=1e-3, penalty="l2"))
Example #20
 def __init__(self):
     self.clf = make_pipeline_imb(
         Imputer(strategy='median'), RandomUnderSampler(),
         RandomForestClassifier(10,
                                verbose=True,
                                min_impurity_decrease=10e-5))
Example #21
print(classification_report_imbalanced(y_test, y_pred))

###############################################################################
# Balancing the class before classification
###############################################################################

###############################################################################
# To improve the prediction of class \#3, it could be interesting to apply
# balancing before training the naive Bayes classifier. Therefore, we will
# use a ``RandomUnderSampler`` to equalize the number of samples in all the
# classes before the training.
#
# It is also important to note that we are using the ``make_pipeline`` function
# implemented in imbalanced-learn to properly handle the samplers.

pipe = make_pipeline_imb(TfidfVectorizer(),
                         RandomUnderSampler(),
                         MultinomialNB())

pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)

###############################################################################
# Although the results are almost identical, it can be seen that the
# resampling corrected the poor recall of class \#3 at the cost of reducing
# the metrics for the other classes. However, the overall results are
# slightly better.

print(classification_report_imbalanced(y_test, y_pred))
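The remark about imbalanced-learn's make_pipeline is worth making concrete: scikit-learn's Pipeline requires every intermediate step to be a transformer, while samplers expose fit_resample instead of transform, so only the imblearn pipeline resamples during fit and skips the sampler at predict time. A sketch of the failure mode, under current library behaviour:

from sklearn.pipeline import make_pipeline

bad_pipe = make_pipeline(TfidfVectorizer(), RandomUnderSampler(),
                         MultinomialNB())
try:
    # scikit-learn validates steps at fit time and rejects the sampler,
    # which has no transform method
    bad_pipe.fit(X_train, y_train)
except TypeError as err:
    print(err)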
Example #22
x = data_train2.values

x = StandardScaler().fit_transform(x)

from sklearn.decomposition import PCA
pca = PCA(n_components=254)
principalComponents = pca.fit_transform(x)
var= pca.explained_variance_ratio_

#Cumulative Variance explains
var1=np.cumsum(np.round(pca.explained_variance_ratio_, decimals=4)*100)

plt.plot(var1)

# keep the top 150 components, as they explain more than 80% of the variance
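Instead of reading the component count off the cumulative-variance curve, scikit-learn can pick it from a variance target directly; a sketch on the same x:

# keep as many components as needed to explain 80% of the variance
pca80 = PCA(n_components=0.80)
x_reduced = pca80.fit_transform(x)
print(pca80.n_components_, 'components retained')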
# -----------------------------------------------------

from imblearn.over_sampling import SMOTE
from sklearn.pipeline import make_pipeline
from imblearn.pipeline import make_pipeline as make_pipeline_imb
classifier = RandomForestClassifier

smote_pipeline = make_pipeline_imb(SMOTE(random_state=4), classifier(min_samples_split=25,n_estimators=700,random_state=42))
smote_model = smote_pipeline.fit(train_features, train_labels)
smote_prediction = smote_model.predict(test_features)
confusion_matrix(test_labels,smote_prediction)



#oversampler=SMOTE(random_state=0)
#os_features,os_labels=oversampler.fit_sample(features_train,labels_train)
Example #23
    def perform(emotion, train_tweets, y_train, task_name, k, estimator_dict,
                vectorizer_dict):
        parent_dir = Path.cwd().parent
        pipelines_dir = parent_dir.joinpath('new_results',
                                            'pipelines_' + emotion)
        Writer.check_for_directory(pipelines_dir)

        #Select the scoring metric, depending upon the task name
        scoring = Dictionaries.scoring.get(task_name)

        # Perform the preprocessing and feature engineering tasks
        preprocess_train_df = Preprocessor.perform(train_tweets, emotion,
                                                   'train', task_name)
        trans_feat_train_df = Feature_Transformer.perform(
            preprocess_train_df, emotion, 'train', task_name)

        #Iterate through all the vectorizers
        for vect_name, vectorizer in vectorizer_dict.items():
            # Convert the preprocessed text into feature vectors using vectorizer
            train_vect = vectorizer.fit_transform(
                preprocess_train_df['preprocessed_text'].values)
            train_vect_df = pd.DataFrame(
                train_vect.toarray(), columns=vectorizer.get_feature_names())
            print('TRAIN_VECTORIZED')
            print(train_vect_df.shape)

            # Final training data: Merge Feature vector columns with transformed features columns -> X_train, X_test
            X_train = pd.concat([train_vect_df, trans_feat_train_df], axis=1)
            print(
                'X_train, y_train with vector features + features transformed')
            print(X_train.shape, y_train.shape)

            #Iterate through all the estimators
            for estimator_name, estimator in estimator_dict.items():
                ########################### CLASSIFICATION ##################################
                if (task_name == 'c'):

                    # Default pipeline contains feature selector + estimator; if k = 0 (all_in), the pipeline does not contain the feature selector
                    pipeline = make_pipeline(
                        MinMaxScaler(feature_range=(0, 1), copy=True),
                        SelectKBest(chi2, k=k), estimator)
                    if k == 0:
                        pipeline = make_pipeline(
                            MinMaxScaler(feature_range=(0, 1), copy=True),
                            estimator)

                    scores = cross_validate(pipeline,
                                            X_train,
                                            y_train,
                                            scoring=scoring,
                                            cv=5,
                                            return_train_score=False)
                    print(scores)

                    # Fit the pipeline on the full training data so it can be pickled below
                    pipeline.fit(X_train, y_train)
                    # Store the pipeline as pickle files
                    with open(
                            pipelines_dir.joinpath('class_model_anal_' +
                                                   emotion + '_original_' +
                                                   estimator_name + '_' +
                                                   vect_name + '_' + str(k) +
                                                   '.pkl'), 'wb') as outfile:
                        pick.dump(pipeline, outfile, pick.HIGHEST_PROTOCOL)

                    Writer.write_class_model_anal_results_in_file(
                        emotion, 'original', estimator_name, vect_name, k,
                        scores)

                    ##################################### CLASSIFICATION RESAMPLING ###################################################################

                    # Pipeline with resampler -SMOTE, TomekLinks, SMOTETomek
                    for resampler_name, resampler in Dictionaries.resampler_dict.items():
                        print(estimator_name, vect_name, resampler_name)

                        pipeline = make_pipeline_imb(
                            MinMaxScaler(feature_range=(0, 1), copy=True),
                            SelectKBest(chi2, k=k), resampler, estimator)
                        if k == 0:
                            pipeline = make_pipeline_imb(
                                MinMaxScaler(feature_range=(0, 1), copy=True),
                                resampler, estimator)

                        scores = cross_validate(pipeline,
                                                X_train,
                                                y_train,
                                                scoring=scoring,
                                                cv=5,
                                                return_train_score=False)
                        print(scores)

                        # Fit the pipeline on the full training data so it can be pickled below
                        pipeline.fit(X_train, y_train)
                        # Store the pipeline as pickle files
                        with open(
                                pipelines_dir.joinpath('class_model_anal_' +
                                                       emotion + '_' +
                                                       resampler_name + '_' +
                                                       estimator_name + '_' +
                                                       vect_name + '_' +
                                                       str(k) + '.pkl'),
                                'wb') as outfile:
                            pick.dump(pipeline, outfile, pick.HIGHEST_PROTOCOL)

                        Writer.write_class_model_anal_results_in_file(
                            emotion, resampler_name, estimator_name, vect_name,
                            k, scores)
                        gc.collect()

                ######################## REGRESSION #############################################
                elif (task_name == 'r'):
                    # Default pipeline contains feature selector + estimator; if k = 0, the pipeline does not contain the feature selector
                    pipeline = make_pipeline(
                        MinMaxScaler(feature_range=(0, 1), copy=True),
                        SelectKBest(f_regression, k=k), estimator)
                    if k == 0:
                        pipeline = make_pipeline(
                            MinMaxScaler(feature_range=(0, 1), copy=True),
                            estimator)

                    scores = cross_validate(pipeline,
                                            X_train,
                                            y_train,
                                            scoring=scoring,
                                            cv=5,
                                            return_train_score=False)
                    print(scores)

                    # Fit the pipeline on the full training data so it can be pickled below
                    pipeline.fit(X_train, y_train)
                    # Store the pipeline as pickle files
                    with open(
                            pipelines_dir.joinpath('reg_model_anal_' +
                                                   emotion + '_original_' +
                                                   estimator_name + '_' +
                                                   vect_name + '_' + str(k) +
                                                   '.pkl'), 'wb') as outfile:
                        pick.dump(pipeline, outfile, pick.HIGHEST_PROTOCOL)

                    Writer.write_reg_model_anal_results_in_file(
                        emotion, estimator_name, vect_name, k, scores)
                    gc.collect()
Example #24
def train_the_best_models_again(model_properties):
    """
    A method to train the classification best models using original and resampled dataset again
    """
    parent_dir = Path.cwd().parent
    pickle_dir = parent_dir.joinpath('default_results',
                                     'pickle_files_feat_eng')
    results_dir = parent_dir.joinpath('default_results', 'score_files')
    best_scores = {}
    scores_dict = {}
    for i, emotion in Dictionaries.emo_dict.items():
        best_model_original = model_properties[emotion + '_class_original']
        best_model_resampled = model_properties[emotion + '_class_resampled']

        #Fit transform the vectorizer with the corresponding preprocessed training data
        if os.path.exists(
                pickle_dir.joinpath(emotion + '_c_train_preprocess_df.pkl')):
            preprocess_train_df = pd.read_pickle(
                pickle_dir.joinpath(emotion + '_c_train_preprocess_df.pkl'))
            trans_feat_train_df = pd.read_pickle(
                pickle_dir.joinpath(emotion +
                                    '_c_train_feat_transform_df.pkl'))

            #Use the corresponding vectorizer from the model properties to vectorize
            train_vect_original = Dictionaries.vectorizer_dict[
                best_model_original[2]].fit_transform(
                    preprocess_train_df['preprocessed_text'].values)
            train_vect_df_original = pd.DataFrame(
                train_vect_original.toarray(),
                columns=Dictionaries.vectorizer_dict[
                    best_model_original[2]].get_feature_names())
            train_vect_resampled = Dictionaries.vectorizer_dict[
                best_model_resampled[2]].fit_transform(
                    preprocess_train_df['preprocessed_text'].values)
            train_vect_df_resampled = pd.DataFrame(
                train_vect_resampled.toarray(),
                columns=Dictionaries.vectorizer_dict[
                    best_model_resampled[2]].get_feature_names())

            #merge vectorized features and transformed features
            X_train_original = pd.DataFrame(
                pd.concat([train_vect_df_original, trans_feat_train_df],
                          axis=1))
            X_train_resampled = pd.DataFrame(
                pd.concat([train_vect_df_resampled, trans_feat_train_df],
                          axis=1))
            y_train = preprocess_train_df['Affect Dimension'].astype(
                'category').cat.rename_categories({
                    emotion: 1,
                    'other': 0
                })

            # pipeline for original dataset
            pipeline = make_pipeline(
                MinMaxScaler(feature_range=(0, 1), copy=True),
                SelectKBest(chi2, k=int(best_model_original[3])),
                Dictionaries.classifier_dict[best_model_original[1]])
            if best_model_original[3] == 0:
                pipeline = make_pipeline(
                    MinMaxScaler(feature_range=(0, 1), copy=True),
                    Dictionaries.classifier_dict[best_model_original[1]])

            y_pred_original = cross_val_predict(pipeline,
                                                X_train_original,
                                                y_train,
                                                cv=5)

            # pipeline for resampled dataset
            pipeline = make_pipeline_imb(
                MinMaxScaler(feature_range=(0, 1), copy=True),
                SelectKBest(chi2, k=int(best_model_resampled[3])),
                Dictionaries.resampler_dict[best_model_resampled[0]],
                Dictionaries.classifier_dict[best_model_resampled[1]])
            if best_model_resampled[3] == 0:
                pipeline = make_pipeline_imb(
                    MinMaxScaler(feature_range=(0, 1), copy=True),
                    Dictionaries.resampler_dict[best_model_resampled[0]],
                    Dictionaries.classifier_dict[best_model_resampled[1]])

            y_pred_resampled = cross_val_predict(pipeline,
                                                 X_train_resampled,
                                                 y_train,
                                                 cv=5)

            scores_original = classification_report(y_train,
                                                    y_pred_original,
                                                    labels=[1, 0],
                                                    output_dict=True)
            accuracy_original = accuracy_score(y_train, y_pred_original)
            scores_resampled = classification_report(y_train,
                                                     y_pred_resampled,
                                                     labels=[1, 0],
                                                     output_dict=True)
            accuracy_resampled = accuracy_score(y_train, y_pred_resampled)

            print(scores_original, scores_resampled)
            scores_dict[emotion +
                        'original'] = [scores_original, accuracy_original]
            scores_dict[emotion +
                        'resampled'] = [scores_resampled, accuracy_resampled]
            emo_f1_original = scores_original['1']['f1-score']
            avg_f1_original = scores_original['macro avg']['f1-score']
            emo_f1_resampled = scores_resampled['1']['f1-score']
            avg_f1_resampled = scores_resampled['macro avg']['f1-score']

            #Add the results needed for analysis to the dict
            best_scores[emotion] = [
                avg_f1_original, emo_f1_original, accuracy_original,
                avg_f1_resampled, emo_f1_resampled, accuracy_resampled
            ]
        else:
            #If the files don't exist, exit the program with instructions
            print(
                '\nRequired files do not exist.\n\n Please train the models first by running Modelling.py and add the files created to the \'default_results\' folder'
            )
            sys.exit(1)

    # store the classification report and accuracy of both the models
    with open(results_dir.joinpath('best_class_both_model_scores.pkl'),
              'wb') as outfile:
        pickle.dump(scores_dict, outfile)

    return best_scores
Example #25
#from sklearn.linear_model import LogisticRegression
#from sklearn.ensemble import RandomForestClassifier

#from sklearn.metrics import precision_score, recall_score, fbeta_score, confusion_matrix, precision_recall_curve, accuracy_score

classifier = RandomForestClassifier

# build model with SMOTE imblearn
smote_pipeline = make_pipeline_imb(SMOTE(random_state=4),
                                   classifier(random_state=42))

smote_model = smote_pipeline.fit(X_train, y_train)
smote_prediction = smote_model.predict(X_test)

#Showing the difference before and after the transformation used
print("normal data distribution: {}".format(Counter(y)))
X_smote, y_smote = SMOTE().fit_resample(X, y)
print("SMOTE data distribution: {}".format(Counter(y_smote)))

print("Confusion Matrix: ")
print(confusion_matrix(y_test, smote_prediction))

print('\nSMOTE Pipeline Score {}'.format(smote_pipeline.score(X_test, y_test)))

Example #26
import sklearn.metrics as met
####################### imbalance learn ####################################
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from imblearn.combine import SMOTEENN
from imblearn.ensemble import BalancedRandomForestClassifier, BalancedBaggingClassifier
from imblearn.pipeline import make_pipeline as make_pipeline_imb
from imblearn.metrics import classification_report_imbalanced
imb_run = False
if imb_run:
    print('****************** imbalance learn ****************')

    clf = RandomForestClassifier()

    print('************** RandomUnderSampler ***********')
    pipe = make_pipeline_imb(vect, RandomUnderSampler(random_state=777), clf)
    pipe.fit(X_train, y_train)
    preds = pipe.predict(X_val)
    preds_train = pipe.predict(X_train)
    print(classification_report_imbalanced(y_val, preds))

    print('************** RandomOverSampler ***********')
    pipe = make_pipeline_imb(vect, RandomOverSampler(random_state=777), clf)
    pipe.fit(X_train, y_train)
    preds = pipe.predict(X_val)
    preds_train = pipe.predict(X_train)
    print(classification_report_imbalanced(y_val, preds))

    print('************** SMOTEENN(combine) ***********')
    pipe = make_pipeline_imb(vect, SMOTEENN(random_state=42), clf)
    pipe.fit(X_train, y_train)
Example #27
# Neural Network
evaluatemodel(MLPClassifier(random_state=2), "MLP")

# Random Forest
def print_results(headline, true_value, pred):
    print(headline)
    print("accuracy: {}".format(accuracy_score(true_value, pred)))
    print("precision: {}".format(precision_score(true_value, pred)))
    print("recall: {}".format(recall_score(true_value, pred)))
    print("f2: {}".format(fbeta_score(true_value, pred, beta=2)))

# splitting data into training and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=4, test_size=0.20)

# build model with SMOTE imblearn
smote_pipeline = make_pipeline_imb(SMOTE(random_state=6), RandomForestClassifier(random_state=2))

smote_model = smote_pipeline.fit(X_train, y_train)
smote_prediction = smote_model.predict(X_test)

print("normal data distribution: {}".format(Counter(y)))
X_smote, y_smote = SMOTE().fit_resample(X, y)
print("SMOTE data distribution: {}".format(Counter(y_smote)))
    
print("Confusion Matrix: ")
print(confusion_matrix(y_test, smote_prediction))
#plot_confusion_matrix(confusion_matrix(y_test, smote_prediction))

print('\nSMOTE Pipeline Score {}'.format(smote_pipeline.score(X_test, y_test)))

print_results("\nSMOTE + RandomForest classification", y_test, smote_prediction)
Example #28
# Balancing the class before classification
# -----------------------------------------
#
# To improve the prediction of class \#3, it could be interesting to apply
# balancing before training the naive Bayes classifier. Therefore, we will
# use a :class:`~imblearn.under_sampling.RandomUnderSampler` to equalize the
# number of samples in all the classes before the training.
#
# It is also important to note that we are using the
# :class:`~imblearn.pipeline.make_pipeline` function implemented in
# imbalanced-learn to properly handle the samplers.

# %%
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import make_pipeline as make_pipeline_imb

model = make_pipeline_imb(TfidfVectorizer(), RandomUnderSampler(),
                          MultinomialNB())

model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# %% [markdown]
# Although the results are almost identical, it can be seen that the resampling
# corrected the poor recall of class \#3 at the cost of reducing
# the metrics for the other classes. However, the overall results are
# slightly better.

# %%
print(classification_report_imbalanced(y_test, y_pred))
Example #29
# # Intuition - Sensing
X_train_NS, X_test_NS, y_train_NS, y_test_NS = train_test_split(df_working['posts'].values,
                                                   df_working['N-S'].values,
                                                   test_size=0.30, random_state=42)
# # Thinking - Feeling
X_train_TF, X_test_TF, y_train_TF, y_test_TF = train_test_split(df_working['posts'].values,
                                                   df_working['T-F'].values,
                                                   test_size=0.30, random_state=42)
# # Judging - Perceiving
X_train_JP, X_test_JP, y_train_JP, y_test_JP = train_test_split(df_working['posts'].values,
                                                   df_working['J-P'].values,
                                                   test_size=0.30, random_state=42)

# setting up model 


pipe = make_pipeline_imb(TfidfVectorizer(ngram_range=(1, 2), norm='l1', max_features=100),
                         RandomUnderSampler(random_state=420),
                         RandomForestClassifier(min_samples_leaf=1, min_samples_split=6, n_estimators=120,
                             criterion='gini', bootstrap=False, n_jobs=-1))

# training model 


pipe.fit(X_train_JP, y_train_JP)
y_pred = pipe.predict(X_test_JP)
probability = pipe.predict_proba(X_test_JP)
# Model Accuracy
print("Random forest Accuracy:", accuracy_score(y_test_JP, y_pred))
print(classification_report_imbalanced(y_test_JP, y_pred))

# pickle_out = open("model_f.pickle","wb")
# pickle.dump(pipe, pickle_out)
Example #30
print('Total time - Without Undersampling: ', end - start, ' seconds\n')
print(metrics.classification_report(y_validation, validation_result))
print()
print('Without Undersampling -  Pipeline Score {}'.format(multiC.fit(X_train, y_train).score(X_validation, y_validation)))
print()
print_results("Without Undersampling - Validation set: ", true_validation, validation_result)

print('===============================Without Undersampling Ends===============================\n')

print('================================With Undersampling Starts===============================\n')

start = time.time()

# build model with undersampling
nearmiss_pipeline = make_pipeline_imb(NearMiss(), multiC)  # NearMiss no longer takes random_state
nearmiss_model = nearmiss_pipeline.fit(X_train, y_train)
nearmiss_prediction = nearmiss_model.predict(X_validation)

# Print the distribution of labels about both models
print()
print("Without Undersampling - data distribution: {}".format(Counter(y_train)))
X_nearmiss, y_nearmiss = NearMiss().fit_resample(X_train, y_train)
print("With Undersampling - data distribution: {}".format(Counter(y_nearmiss)))
print()

end = time.time()

# Here comes the result with Undersampling
print('Total time - With Undersampling: ', end - start, ' seconds\n')
print(classification_report_imbalanced(y_validation, nearmiss_prediction))
Example #31
def cross_validate(model_name, X, y):

    # y = y.reset_index()
    y = y.to_numpy()  # pandas removed .as_matrix()

    kf = KFold(n_splits=5, shuffle=True, random_state=42)  # random_state requires shuffle=True
    accuracy = []
    precision = []
    recall = []
    f1 = []
    auc = []

    if debug:
        print("in cross_validate: 1, size of X, y: ", len(X), len(y))
        print("y type is: ", type(y))

    if 'svc' in model_name.lower():
        classifier = SVC(kernel='linear', probability=True)

    elif 'random forest' in model_name.lower() or 'rf' in model_name.lower():
        classifier = RandomForestClassifier(n_estimators=200)

    elif 'logistic regression' in model_name.lower() or 'lr' in model_name.lower():
        classifier = LogisticRegression()
    else:
        raise ValueError("unrecognised model_name: " + model_name)

    try:

        for train_indices, test_indices in kf.split(X):

            X_train, X_test = X[train_indices], X[test_indices]
            y_train, y_test = y[train_indices], y[test_indices]
            # y_train, y_test = y[1293:6460], y[0:1292]

            if 'smote' in model_name.lower():
                pipeline = make_pipeline_imb(SMOTE(), classifier)
            else:
                pipeline = make_pipeline_imb(RandomOverSampler(random_state=0),
                                             classifier)

            if debug:
                print("in cross_validate: 2, size of X_train, y_train: ",
                      len(X_train), len(y_train))
            #     print("train size, test size: ", len(
            #         train_indices), len(test_indices))
            model = pipeline.fit(X_train, y_train)
            # if debug:
            #     print("pipeline returns: ", pipeline.transform(X_train))
            prediction = model.predict(X_test)

            accuracy.append(pipeline.score(X_test, y_test))
            precision.append(precision_score(y_test, prediction, average=None))
            recall.append(recall_score(y_test, prediction, average=None))
            f1.append(f1_score(y_test, prediction, average=None))

    except Exception as e:
        print("error in k-fold validate: ", e)
        print("X[train] is: ", X_train)
        print("Y[train] is: ", y_train)

    print(f"k-fold accuracy: {accuracy}")
    print(f"k-fold recall: {recall}")
    print(f"k-fold precision: {precision}")
    print(f"k-fold f1: {f1}")
    return accuracy, precision, recall, f1
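A usage sketch for this helper, assuming X is a NumPy array (it is indexed with KFold index arrays) and y is a pandas Series (converted via to_numpy above); the model name string selects both the sampler and the classifier:

accuracy, precision, recall, f1 = cross_validate("SMOTE + Random Forest", X, y)
print("mean accuracy:", sum(accuracy) / len(accuracy))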