def makePipelineImpMultinomialNB(X_train, Y_train, X_test, Y_test):
    pipe = make_pipeline_imb(TfidfVectorizer(),
                             RandomUnderSampler(),
                             MultinomialNB())
    pipe.fit(X_train, Y_train)
    y_pred = pipe.predict(X_test)
    print(accuracy_score(Y_test, y_pred))
    print(classification_report_imbalanced(Y_test, y_pred))
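# A minimal usage sketch for the helper above. The 20 newsgroups corpus and
# the train/test split are illustrative assumptions, not part of the original
# code; any list of raw text documents with labels would work.
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split

bunch = fetch_20newsgroups(subset='all', categories=['sci.space', 'rec.autos'])
X_tr, X_te, y_tr, y_te = train_test_split(bunch.data, bunch.target, random_state=0)
makePipelineImpMultinomialNB(X_tr, y_tr, X_te, y_te)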
def perform(emotion, train_tweets, y_train, task_name):
    # Select the scoring metric, depending upon the task name
    scoring = Dictionaries.scoring.get(task_name)
    # Select the estimator dictionary (renamed from `estimator` so the loop
    # variable below does not shadow it)
    if task_name == 'c':
        estimator_dict = Dictionaries.classifier_dict
    elif task_name == 'r':
        estimator_dict = Dictionaries.regressor_dict

    # Perform the preprocessing and feature engineering tasks
    preprocess_train_df = Preprocessor.perform(train_tweets, emotion, 'train', task_name)
    X_train = Feature_Transformer.perform(preprocess_train_df, emotion, 'train', task_name)

    # Iterate through all the estimators
    for estimator_name, estimator in estimator_dict.items():
        print(estimator_name)
        # Default pipeline contains scaler + estimator
        pipeline = make_pipeline(MinMaxScaler(feature_range=(0, 1), copy=True), estimator)
        scores = cross_validate(pipeline, X_train, y_train, scoring=scoring,
                                cv=5, return_train_score=False)
        print(scores)

        # Classification task
        if task_name == 'c':
            Writer.write_class_feat_anal_results_in_file(
                emotion, 'original', estimator_name, 14, scores)
            # Pipeline with resampler - SMOTE, TomekLinks, SMOTETomek
            for resampler_name, resampler in Dictionaries.resampler_dict.items():
                # Pipeline used for resampling
                pipeline = make_pipeline_imb(
                    MinMaxScaler(feature_range=(0, 1), copy=True), resampler, estimator)
                scores = cross_validate(pipeline, X_train, y_train, scoring=scoring,
                                        cv=5, return_train_score=False)
                print(scores)
                Writer.write_class_feat_anal_results_in_file(
                    emotion, resampler_name, estimator_name, 14, scores)
                gc.collect()
        # Regression task
        elif task_name == 'r':
            Writer.write_reg_feat_anal_results_in_file(
                emotion, estimator_name, 14, scores)
            gc.collect()
def makePipelineImpBernoulliNB(X_train, Y_train, X_test, Y_test, binarize):
    pipe = make_pipeline_imb(TfidfVectorizer(),
                             RandomUnderSampler(),
                             BernoulliNB(binarize=binarize))
    pipe.fit(X_train, Y_train)
    y_pred = pipe.predict(X_test)
    print('binarize', binarize, accuracy_score(Y_test, y_pred))
    print(classification_report_imbalanced(Y_test, y_pred))
def objective_function(params):
    classifier_type = params['type']
    del params['type']
    if classifier_type == 'rf':
        clf = RandomForestClassifier(**params)
    elif classifier_type == 'svm':
        clf = SVC(**params)
    else:
        return 0
    pl = make_pipeline_imb(resampling, clf)
    score = cross_val_score(pl, X_train, y_train, n_jobs=args.cpus, cv=3).mean()
    return {'loss': -score, 'status': STATUS_OK}
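# A minimal driver sketch for the objective above, assuming hyperopt's
# fmin/hp/Trials API. The two-branch search space is a hypothetical example;
# `resampling`, `X_train`, `y_train`, and `args.cpus` must already be defined
# by the surrounding script, as in the snippet above.
from hyperopt import Trials, fmin, hp, tpe

search_space = hp.choice('classifier', [
    {'type': 'rf', 'n_estimators': hp.choice('rf_n_estimators', [100, 300, 500])},
    {'type': 'svm', 'C': hp.loguniform('svm_C', -3, 3)},
])

trials = Trials()
best = fmin(fn=objective_function, space=search_space,
            algo=tpe.suggest, max_evals=50, trials=trials)
print(best)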
def logistic_with_smote():
    print("Start of logistic with smote")
    X_train, X_test, y_train, y_test = data_processor()
    clf = LogisticRegression()
    clf.fit(X_train, y_train)

    # build model with SMOTE imblearn (the pipeline refits clf on the resampled data)
    smote_pipeline = make_pipeline_imb(SMOTE(random_state=42), clf)
    smote_model = smote_pipeline.fit(X_train, y_train)
    smote_prediction = smote_model.predict(X_test)
    smote_prediction_proba = smote_model.predict_proba(X_test)[:, 1]
    print(classification_report_imbalanced(y_test, smote_prediction))
    print('SMOTE Pipeline Score {}'.format(smote_pipeline.score(X_test, y_test)))
    print("SMOTE AUC score: ", roc_auc_score(y_test, smote_prediction_proba))
    print("SMOTE F1 Score: ", f1_score(y_test, smote_prediction))
    print("End of logistic smote")
def evaluatemodel(classifier, name):
    # build model with SMOTE imblearn
    smote_pipeline = make_pipeline_imb(SMOTE(random_state=6), classifier)
    smote_model = smote_pipeline.fit(X_train, y_train)
    smote_prediction = smote_model.predict(X_test)

    print("normal data distribution: {}".format(Counter(y)))
    # fit_resample was named fit_sample in imbalanced-learn < 0.4
    X_smote, y_smote = SMOTE().fit_resample(X, y)
    print("SMOTE data distribution: {}".format(Counter(y_smote)))

    print("Confusion Matrix: ")
    # print(confusion_matrix(y_test, smote_prediction))
    plot_confusion_matrix(confusion_matrix(y_test, smote_prediction))
    print('\nSMOTE Pipeline Score {}'.format(smote_pipeline.score(X_test, y_test)))
    print_results("\nSMOTE + " + name + " classification", y_test, smote_prediction)
def trainModel(modelName, X_train, y_train):
    if modelName == "Decision Tree":
        from sklearn.tree import DecisionTreeClassifier
        model = DecisionTreeClassifier(criterion='entropy', max_depth=10, random_state=0)
        return model.fit(X_train, y_train)
    if modelName == "Neural Network":
        from sklearn.neural_network import MLPClassifier
        mlp = MLPClassifier(hidden_layer_sizes=(24, 24, 24))
        return mlp.fit(X_train, y_train.values.ravel())
    if modelName == "LDA":
        from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
        model = LinearDiscriminantAnalysis(n_components=2)
        return model.fit(X_train, y_train.values.ravel())
    if modelName == "Support Vector Machine":
        from sklearn import svm
        from imblearn.pipeline import make_pipeline as make_pipeline_imb
        from imblearn.over_sampling import SMOTE
        clf = svm.SVC(C=0.5, kernel='rbf', decision_function_shape='ovr')
        # this standalone fit is redundant: the pipeline below refits clf
        # on the resampled data
        clf.fit(X_train, y_train)
        smote = SMOTE(sampling_strategy='all', random_state=42)
        smote_pipeline = make_pipeline_imb(smote, clf)
        return smote_pipeline.fit(X_train, y_train)
def ROC(classifier, name):
    # build model with SMOTE imblearn
    smote_pipeline = make_pipeline_imb(SMOTE(random_state=6), classifier)
    smote_model = smote_pipeline.fit(X_train, y_train)
    smote_prediction = smote_model.predict(X_test)
    # fit_resample was named fit_sample in imbalanced-learn < 0.4
    X_smote, y_smote = SMOTE().fit_resample(X, y)
    print_results("\nSMOTE + " + name + " classification", y_test, smote_prediction)

    # Compute predicted probabilities: y_pred_prob
    y_pred_prob = smote_pipeline.predict_proba(X_test)[:, 1]
    # Generate ROC curve values: fpr, tpr, thresholds
    fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)
    print('AUC:', auc(fpr, tpr))

    # Plot ROC curve
    plt.plot([0, 1], [0, 1], 'k--')
    plt.plot(fpr, tpr)
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    plt.show()
def __init__(self):
    self.clf = make_pipeline_imb(
        Imputer(strategy='median'),
        RandomOverSampler(),
        RandomForestClassifier(n_estimators=126,
                               verbose=True,
                               min_impurity_decrease=10e-5,
                               criterion="entropy"))
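# `Imputer` here is scikit-learn's pre-0.20 imputation transformer; it was
# removed in scikit-learn 0.22. A sketch of the same constructor under that
# assumption, with SimpleImputer as the drop-in replacement:
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import make_pipeline as make_pipeline_imb

clf = make_pipeline_imb(
    SimpleImputer(strategy='median'),
    RandomOverSampler(),
    RandomForestClassifier(n_estimators=126, verbose=True,
                           min_impurity_decrease=10e-5, criterion="entropy"))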
def train(self, X_train, y_train):
    self.clf.fit(X_train, y_train)
    # build model with SMOTE imblearn
    self.smote_pipeline = make_pipeline_imb(SMOTE(random_state=4), self.clf)
    self.smote_model = self.smote_pipeline.fit(X_train, y_train)
print("recall: {}".format(recall_score(true_value, pred))) print("f1: {}".format(f1_score(true_value, pred))) # our classifier to use classifier = RandomForestClassifier X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=42) # build normal model pipeline = make_pipeline(classifier(random_state=42)) model = pipeline.fit(X_train, y_train) prediction = model.predict(X_val) # build model with SMOTE imblearn smote_pipeline = make_pipeline_imb(SMOTE(random_state=4), classifier(random_state=42)) smote_model = smote_pipeline.fit(X_train, y_train) smote_prediction = smote_model.predict(X_val) # build model with undersampling nearmiss_pipeline = make_pipeline_imb(NearMiss(random_state=42), classifier(random_state=42)) nearmiss_model = nearmiss_pipeline.fit(X_train, y_train) nearmiss_prediction = nearmiss_model.predict(X_val) # classification report print(classification_report(y_val, prediction)) print(classification_report_imbalanced(y_val, smote_prediction)) print(classification_report_imbalanced(y_val, nearmiss_prediction)) print()
def predict():
    df_train = pd.read_csv('train-dataset.csv')
    df_test = pd.read_csv('hold-out.csv')
    df_test = df_test[~(df_test['comment'].isnull())]
    X_train = df_train['comment']
    X_test = df_test['comment'][:10000]
    y_train = df_train['offensive']
    y_test = df_test['offensive'][:10000]

    tokenized_train = [nltk.word_tokenize(t) for t in X_train]
    tokenized_test = [nltk.word_tokenize(t) for t in X_test]

    num_features = 256
    w2v_model = gensim.models.Word2Vec(tokenized_train, size=num_features,
                                       window=150, min_count=10, sample=1e-3,
                                       workers=16)
    w2v_model.save('w2v')
    w2v_model = gensim.models.Word2Vec.load('w2v')

    def averaged_word2vec_vectorizer(corpus, model, num_features):
        vocabulary = set(model.wv.index2word)

        def average_word_vectors(words, model, vocabulary, num_features):
            feature_vector = np.zeros((num_features), dtype='float64')
            nwords = 0
            for word in words:
                if word in vocabulary:
                    nwords += 1
                    feature_vector = np.add(feature_vector, model.wv[word])
            if nwords:
                feature_vector = np.divide(feature_vector, nwords)
            return feature_vector

        features = [average_word_vectors(tokenized_sentence, model, vocabulary, num_features)
                    for tokenized_sentence in corpus]
        return np.array(features)

    avg_wv_train_features = averaged_word2vec_vectorizer(
        corpus=tokenized_train, model=w2v_model, num_features=num_features)
    avg_wv_test_features = averaged_word2vec_vectorizer(
        corpus=tokenized_test, model=w2v_model, num_features=num_features)

    lr = LogisticRegression(penalty='l2', max_iter=500, C=1, solver='lbfgs')
    # `kind` was a SMOTE option in imbalanced-learn < 0.6; see the
    # BorderlineSMOTE note after this function for the modern equivalent
    smote_w2v_pipeline = make_pipeline_imb(
        SMOTE(sampling_strategy=.95, k_neighbors=40, kind='borderline2'), lr)
    smote_w2v_model = smote_w2v_pipeline.fit(avg_wv_train_features, y_train)
    smote_w2v_predict = smote_w2v_model.predict(avg_wv_test_features)
    metrics.recall_score(y_test, smote_w2v_predict)

    if request.method == 'POST':
        message = request.form['message']
        data = [message]
        tokenized = [nltk.word_tokenize(t) for t in data]
        avg_wv_features = averaged_word2vec_vectorizer(
            corpus=tokenized, model=w2v_model, num_features=num_features)
        my_prediction = smote_w2v_model.predict(avg_wv_features)
        return render_template('result.html', prediction=my_prediction)
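# The `kind` option of SMOTE used above was deprecated in imbalanced-learn 0.4
# and later removed; the borderline variants now live in BorderlineSMOTE. A
# hedged modern equivalent of that sampler:
from imblearn.over_sampling import BorderlineSMOTE

sampler = BorderlineSMOTE(sampling_strategy=0.95, k_neighbors=40, kind='borderline-2')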
# In[91]:
from sklearn.ensemble import RandomForestClassifier

# In[92]:
# splitting data into training and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2)
scaler = preprocessing.MinMaxScaler()
# NOTE: this rescales X after the split and the result is never used for
# training; scale inside the pipeline (or before splitting) to take effect
X = scaler.fit_transform(X)

# In[93]:
# build RandomForestClassifier model with SMOTE imblearn
rfc_pipeline = make_pipeline_imb(SMOTE(random_state=4),
                                 RandomForestClassifier(n_estimators=50))
smote_model = rfc_pipeline.fit(X_train, y_train)
smote_prediction = smote_model.predict(X_test)

filename = 'rfc_model.pckl'
pickle.dump(rfc_pipeline, open(filename, 'wb'))

print()
print_results("RandomForest classification", y_test, smote_prediction)
print()

# # Logistic Regression

# In[94]:
from sklearn.linear_model import LogisticRegression
y_train = y_train_shift
print("X_train shape:")
print(X_train.shape)
print("y_train length:")
print(len(y_train))
# X_train_ = X_train
# y_train_ = y_train
# y_train_shift_ = y_train_shift
print("len(X_train), len(y_train):")
print(len(X_train), len(y_train))

pipe = make_pipeline_imb(MinMaxScaler(), RandomUnderSampler(), RandomForestClassifier())
pipe.fit(X_train, y_train)

# prediction ##################################----------------------------
index = KPI_ID_test.index(KPI_ID_name)
test_manual_feature = get_manual_feature(KPI_LIST_test[index])
test_manual_feature = fit_window(df=test_manual_feature, window=window)
single_predict = get_single_feature(raw_df=KPI_LIST_test[index], KPI_ID_name=KPI_ID_name)
single_predict = fit_window(single_predict, window)
ts_KPI_ID_test = KPI_LIST_test[index].pop('KPI ID')
ts_timestamp = KPI_LIST_test[index]['timestamp']
def __init__(self):
    self.clf = make_pipeline_imb(
        Imputer(strategy='median'),
        RandomOverSampler(),
        LogisticRegression(C=0.010826367338740546, penalty="l2"))
# use a ``RandomUnderSampler`` to equalize the number of samples in all the
# classes before the training.
#
# Currently, imbalanced-learn does not handle sparse matrices --- we are
# working on bringing this feature --- so an additional transformer that
# converts the sparse matrices to dense ones is required in the pipeline.
#
# It is also important to note that we are using the ``make_pipeline`` function
# implemented in imbalanced-learn to properly handle the samplers.


def densify(X):
    """Function to densify an array."""
    return X.toarray()


pipe = make_pipeline_imb(TfidfVectorizer(),
                         FunctionTransformer(func=densify, accept_sparse=True),
                         RandomUnderSampler(),
                         MultinomialNB())
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)

###############################################################################
# Although the results are almost identical, it can be seen that the resampling
# allowed us to correct the poor recall of class #3, at the cost of reducing
# the other metrics for the other classes. However, the overall results are
# slightly better.

print(classification_report_imbalanced(y_test, y_pred))
def perform(emotion, train_tweets, y_train, task_name, estimator_dict):
    # Select the scoring metric, depending upon the task name
    scoring = Dictionaries.scoring.get(task_name)

    # Perform the preprocessing and feature engineering tasks
    preprocess_train_df = Preprocessor.perform(train_tweets, emotion, 'train', task_name)
    X_train = Feature_Transformer.perform(preprocess_train_df, emotion, 'train', task_name)

    # Iterate through all the estimators
    for estimator_name, estimator in estimator_dict.items():
        # pipeline for original data
        pipeline = make_pipeline(
            MinMaxScaler(feature_range=(0, 1), copy=True),
            RFECV(estimator, step=1, cv=5, scoring=scoring, n_jobs=-1))
        scores = cross_validate(pipeline, X_train, y_train, scoring=scoring,
                                cv=5, return_train_score=False)
        print(scores)
        pipeline.fit(X_train, y_train)
        print(pipeline.steps)

        # Get the number of features selected, the selected features and their ranking
        selected_features = pipeline.steps[1][1].n_features_
        feature_mask = pipeline.steps[1][1].support_
        feature_rank = pipeline.steps[1][1].ranking_

        # Classification task
        if task_name == 'c':
            # Get F1 scores
            cv_feature_scores = pipeline.steps[1][1].grid_scores_  # f1
            Writer.write_class_feat_rank_anal_results_in_file(
                emotion, 'original', estimator_name, selected_features,
                feature_mask, feature_rank, cv_feature_scores)

            # Pipeline with resamplers - SMOTE, TomekLinks, SMOTETomek
            for resampler_name, resampler in Dictionaries.resampler_dict.items():
                # pipeline for resampling
                pipeline = make_pipeline_imb(
                    MinMaxScaler(feature_range=(0, 1), copy=True),
                    resampler,
                    RFECV(estimator, step=1, cv=5, scoring=scoring, n_jobs=-1))
                # Fit the pipeline with data
                pipeline.fit(X_train, y_train)
                print(pipeline.steps)
                selected_features = pipeline.steps[2][1].n_features_
                feature_mask = pipeline.steps[2][1].support_
                feature_rank = pipeline.steps[2][1].ranking_
                cv_feature_scores = pipeline.steps[2][1].grid_scores_  # f1
                Writer.write_class_feat_rank_anal_results_in_file(
                    emotion, resampler_name, estimator_name, selected_features,
                    feature_mask, feature_rank, cv_feature_scores)
                gc.collect()

        # Regression task
        if task_name == 'r':
            # Get RMSE scores
            cv_feature_scores = np.sqrt(
                -pipeline.steps[1][1].grid_scores_)  # sqrt(-neg_mean_squared_error)
            Writer.write_reg_feat_rank_anal_results_in_file(
                emotion, estimator_name, selected_features, feature_mask,
                feature_rank, cv_feature_scores)
            gc.collect()
pipeline = make_pipeline_imb(  # Optimal
    FeatureUnion(
        transformer_list=[
            ('vect1', CountVectorizer(max_df=0.80, min_df=8,
                                      ngram_range=(1, 1),
                                      stop_words=stopwords_complete_lemmatized,
                                      strip_accents='unicode',
                                      tokenizer=LemmaTokenizer())),  # 1-Gram Vectorizer
            ('vect2', CountVectorizer(max_df=0.95, min_df=10,
                                      ngram_range=(2, 2),
                                      stop_words=None,
                                      strip_accents='unicode',
                                      tokenizer=LemmaTokenizer())),  # 2-Gram Vectorizer
        ],
        transformer_weights={
            'vect1': 1.0,
            'vect2': 1.0,
        },
    ),
    TfidfTransformer(use_idf=True),
    # `ratio` was renamed `sampling_strategy` in imbalanced-learn 0.4
    RandomUnderSampler(ratio={1: 19000, 2: 27200, 3: 20000}, random_state=22),
    SelectFromModel(estimator=LinearSVC(), threshold='1.2*mean'),  # Dimensionality Reduction
    # MLPClassifier(verbose=True, hidden_layer_sizes=(200,), max_iter=200, solver='sgd', learning_rate='adaptive', learning_rate_init=0.60, momentum=0.50, alpha=1e-01),
    MLPClassifier(verbose=True, random_state=22, hidden_layer_sizes=(100,),
                  max_iter=200, solver='sgd', learning_rate='constant',
                  learning_rate_init=0.07, momentum=0.90, alpha=1e-01),
)
def __init__(self):
    self.clf = make_pipeline_imb(
        Imputer(strategy='median'),
        RandomUnderSampler(),
        LogisticRegression(C=1e-3, penalty="l2"))
def __init__(self):
    self.clf = make_pipeline_imb(
        Imputer(strategy='median'),
        RandomUnderSampler(),
        RandomForestClassifier(n_estimators=10, verbose=True,
                               min_impurity_decrease=10e-5))
print(classification_report_imbalanced(y_test, y_pred))

###############################################################################
# Balancing the class before classification
###############################################################################

###############################################################################
# To improve the prediction of class #3, it could be interesting to apply
# some balancing before training the naive Bayes classifier. Therefore, we
# will use a ``RandomUnderSampler`` to equalize the number of samples in all
# the classes before the training.
#
# It is also important to note that we are using the ``make_pipeline`` function
# implemented in imbalanced-learn to properly handle the samplers.

pipe = make_pipeline_imb(TfidfVectorizer(), RandomUnderSampler(), MultinomialNB())
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)

###############################################################################
# Although the results are almost identical, it can be seen that the resampling
# allowed us to correct the poor recall of class #3, at the cost of reducing
# the other metrics for the other classes. However, the overall results are
# slightly better.

print(classification_report_imbalanced(y_test, y_pred))
x = data_train2.values
x = StandardScaler().fit_transform(x)

from sklearn.decomposition import PCA
pca = PCA(n_components=254)
principalComponents = pca.fit_transform(x)
var = pca.explained_variance_ratio_

# Cumulative variance explained
var1 = np.cumsum(np.round(pca.explained_variance_ratio_, decimals=4) * 100)
plt.plot(var1)
# consider the top 150 components, as they cross >80% variance
# -----------------------------------------------------

from imblearn.over_sampling import SMOTE
from sklearn.pipeline import make_pipeline
from imblearn.pipeline import make_pipeline as make_pipeline_imb

classifier = RandomForestClassifier
smote_pipeline = make_pipeline_imb(
    SMOTE(random_state=4),
    classifier(min_samples_split=25, n_estimators=700, random_state=42))
smote_model = smote_pipeline.fit(train_features, train_labels)
smote_prediction = smote_model.predict(test_features)
confusion_matrix(test_labels, smote_prediction)

# oversampler = SMOTE(random_state=0)
# os_features, os_labels = oversampler.fit_sample(features_train, labels_train)
def perform(emotion, train_tweets, y_train, task_name, k, estimator_dict, vectorizer_dict):
    parent_dir = Path.cwd().parent
    pipelines_dir = parent_dir.joinpath('new_results', 'pipelines_' + emotion)
    Writer.check_for_directory(pipelines_dir)

    # Select the scoring metric, depending upon the task name
    scoring = Dictionaries.scoring.get(task_name)

    # Perform the preprocessing and feature engineering tasks
    preprocess_train_df = Preprocessor.perform(train_tweets, emotion, 'train', task_name)
    trans_feat_train_df = Feature_Transformer.perform(
        preprocess_train_df, emotion, 'train', task_name)

    # Iterate through all the vectorizers
    for vect_name, vectorizer in vectorizer_dict.items():
        # Convert the preprocessed text into feature vectors using the vectorizer
        train_vect = vectorizer.fit_transform(
            preprocess_train_df['preprocessed_text'].values)
        train_vect_df = pd.DataFrame(train_vect.toarray(),
                                     columns=vectorizer.get_feature_names())
        print('TRAIN_VECTORIZED')
        print(train_vect_df.shape)

        # Final training data: merge feature vector columns with transformed
        # feature columns -> X_train, X_test
        X_train = pd.concat([train_vect_df, trans_feat_train_df], axis=1)
        print('X_train, y_train with vector features + features transformed')
        print(X_train.shape, y_train.shape)

        # Iterate through all the estimators
        for estimator_name, estimator in estimator_dict.items():
            ########################### CLASSIFICATION ##########################
            if task_name == 'c':
                # Default pipeline contains feature selector + estimator, whereas
                # if k == 0 (all_in), the pipeline does not contain the feature selector
                pipeline = make_pipeline(
                    MinMaxScaler(feature_range=(0, 1), copy=True),
                    SelectKBest(chi2, k=k), estimator)
                if k == 0:
                    pipeline = make_pipeline(
                        MinMaxScaler(feature_range=(0, 1), copy=True), estimator)
                scores = cross_validate(pipeline, X_train, y_train, scoring=scoring,
                                        cv=5, return_train_score=False)
                print(scores)
                # Fit the same pipeline on the train data and predict the results
                # for X_test to find the test scores
                pipeline.fit(X_train, y_train)
                # Store the pipeline as a pickle file
                with open(pipelines_dir.joinpath(
                        'class_model_anal_' + emotion + '_original_' + estimator_name
                        + '_' + vect_name + '_' + str(k) + '.pkl'), 'wb') as infile:
                    pick.dump(pipeline, infile, pick.HIGHEST_PROTOCOL)
                Writer.write_class_model_anal_results_in_file(
                    emotion, 'original', estimator_name, vect_name, k, scores)

                ##################### CLASSIFICATION RESAMPLING #####################
                # Pipeline with resampler - SMOTE, TomekLinks, SMOTETomek
                for resampler_name, resampler in Dictionaries.resampler_dict.items():
                    print(estimator_name, vect_name, resampler_name)
                    pipeline = make_pipeline_imb(
                        MinMaxScaler(feature_range=(0, 1), copy=True),
                        SelectKBest(chi2, k=k), resampler, estimator)
                    if k == 0:
                        pipeline = make_pipeline_imb(
                            MinMaxScaler(feature_range=(0, 1), copy=True),
                            resampler, estimator)
                    scores = cross_validate(pipeline, X_train, y_train, scoring=scoring,
                                            cv=5, return_train_score=False)
                    print(scores)
                    # Fit the same pipeline on the train data and predict the results
                    # for the fixed X_test to find the test scores
                    pipeline.fit(X_train, y_train)
                    # Store the pipeline as a pickle file
                    with open(pipelines_dir.joinpath(
                            'class_model_anal_' + emotion + '_' + resampler_name + '_'
                            + estimator_name + '_' + vect_name + '_' + str(k)
                            + '.pkl'), 'wb') as infile:
                        pick.dump(pipeline, infile, pick.HIGHEST_PROTOCOL)
                    Writer.write_class_model_anal_results_in_file(
                        emotion, resampler_name, estimator_name, vect_name, k, scores)
                    gc.collect()

            ############################ REGRESSION #############################
            elif task_name == 'r':
                # Default pipeline contains feature selector + estimator, whereas
                # if k == 0, the pipeline does not contain the feature selector
                pipeline = make_pipeline(
                    MinMaxScaler(feature_range=(0, 1), copy=True),
                    SelectKBest(f_regression, k=k), estimator)
                if k == 0:
                    pipeline = make_pipeline(
                        MinMaxScaler(feature_range=(0, 1), copy=True), estimator)
                scores = cross_validate(pipeline, X_train, y_train, scoring=scoring,
                                        cv=5, return_train_score=False)
                print(scores)
                # Fit the same pipeline on the train data and predict the results
                # for the fixed X_test to find the test scores
                pipeline.fit(X_train, y_train)
                # Store the pipeline as a pickle file
                with open(pipelines_dir.joinpath(
                        'reg_model_anal_' + emotion + '_original_' + estimator_name
                        + '_' + vect_name + '_' + str(k) + '.pkl'), 'wb') as infile:
                    pick.dump(pipeline, infile, pick.HIGHEST_PROTOCOL)
                Writer.write_reg_model_anal_results_in_file(
                    emotion, estimator_name, vect_name, k, scores)
                gc.collect()
def train_the_best_models_again(model_properties):
    """A method to train the best classification models again, using the
    original and the resampled dataset."""
    parent_dir = Path.cwd().parent
    pickle_dir = parent_dir.joinpath('default_results', 'pickle_files_feat_eng')
    results_dir = parent_dir.joinpath('default_results', 'score_files')
    best_scores = {}
    scores_dict = {}

    for i, emotion in Dictionaries.emo_dict.items():
        best_model_original = model_properties[emotion + '_class_original']
        best_model_resampled = model_properties[emotion + '_class_resampled']

        # Fit-transform the vectorizer with the corresponding preprocessed training data
        if os.path.exists(pickle_dir.joinpath(emotion + '_c_train_preprocess_df.pkl')):
            preprocess_train_df = pd.read_pickle(
                pickle_dir.joinpath(emotion + '_c_train_preprocess_df.pkl'))
            trans_feat_train_df = pd.read_pickle(
                pickle_dir.joinpath(emotion + '_c_train_feat_transform_df.pkl'))

            # Use the corresponding vectorizer from the model properties to vectorize
            train_vect_original = Dictionaries.vectorizer_dict[
                best_model_original[2]].fit_transform(
                    preprocess_train_df['preprocessed_text'].values)
            train_vect_df_original = pd.DataFrame(
                train_vect_original.toarray(),
                columns=Dictionaries.vectorizer_dict[
                    best_model_original[2]].get_feature_names())
            train_vect_resampled = Dictionaries.vectorizer_dict[
                best_model_resampled[2]].fit_transform(
                    preprocess_train_df['preprocessed_text'].values)
            train_vect_df_resampled = pd.DataFrame(
                train_vect_resampled.toarray(),
                columns=Dictionaries.vectorizer_dict[
                    best_model_resampled[2]].get_feature_names())

            # Merge vectorized features and transformed features
            X_train_original = pd.DataFrame(
                pd.concat([train_vect_df_original, trans_feat_train_df], axis=1))
            X_train_resampled = pd.DataFrame(
                pd.concat([train_vect_df_resampled, trans_feat_train_df], axis=1))
            y_train = preprocess_train_df['Affect Dimension'].astype(
                'category').cat.rename_categories({emotion: 1, 'other': 0})

            # Pipeline for the original dataset
            pipeline = make_pipeline(
                MinMaxScaler(feature_range=(0, 1), copy=True),
                SelectKBest(chi2, k=int(best_model_original[3])),
                Dictionaries.classifier_dict[best_model_original[1]])
            if best_model_original[3] == 0:
                pipeline = make_pipeline(
                    MinMaxScaler(feature_range=(0, 1), copy=True),
                    Dictionaries.classifier_dict[best_model_original[1]])
            y_pred_original = cross_val_predict(pipeline, X_train_original, y_train, cv=5)

            # Pipeline for the resampled dataset
            pipeline = make_pipeline_imb(
                MinMaxScaler(feature_range=(0, 1), copy=True),
                SelectKBest(chi2, k=int(best_model_resampled[3])),
                Dictionaries.resampler_dict[best_model_resampled[0]],
                Dictionaries.classifier_dict[best_model_resampled[1]])
            if best_model_resampled[3] == 0:
                pipeline = make_pipeline_imb(
                    MinMaxScaler(feature_range=(0, 1), copy=True),
                    Dictionaries.resampler_dict[best_model_resampled[0]],
                    Dictionaries.classifier_dict[best_model_resampled[1]])
            y_pred_resampled = cross_val_predict(pipeline, X_train_resampled, y_train, cv=5)

            scores_original = classification_report(y_train, y_pred_original,
                                                    labels=[1, 0], output_dict=True)
            accuracy_original = accuracy_score(y_train, y_pred_original)
            scores_resampled = classification_report(y_train, y_pred_resampled,
                                                     labels=[1, 0], output_dict=True)
            accuracy_resampled = accuracy_score(y_train, y_pred_resampled)
            print(scores_original, scores_resampled)

            scores_dict[emotion + 'original'] = [scores_original, accuracy_original]
            scores_dict[emotion + 'resampled'] = [scores_resampled, accuracy_resampled]

            emo_f1_original = scores_original['1']['f1-score']
            avg_f1_original = scores_original['macro avg']['f1-score']
            emo_f1_resampled = scores_resampled['1']['f1-score']
            avg_f1_resampled = scores_resampled['macro avg']['f1-score']

            # Add the results needed for analysis to the dict
            best_scores[emotion] = [
                avg_f1_original, emo_f1_original, accuracy_original,
                avg_f1_resampled, emo_f1_resampled, accuracy_resampled
            ]
        else:
            # If the files do not exist, exit the program with instructions
            print('\nRequired files do not exist.\n\n'
                  'Please train the models first by running > Modelling.py and add the '
                  'files created to the \'default_results\' folder')
            sys.exit(1)

    # Store the classification report and accuracy of both models
    with open(results_dir.joinpath('best_class_both_model_scores.pkl'), 'wb') as outfile:
        pickle.dump(scores_dict, outfile)
    return best_scores
# from sklearn.linear_model import LogisticRegression
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.metrics import precision_score, recall_score, fbeta_score, confusion_matrix, precision_recall_curve, accuracy_score

classifier = RandomForestClassifier

# build model with SMOTE imblearn
smote_pipeline = make_pipeline_imb(SMOTE(random_state=4), classifier(random_state=42))
smote_model = smote_pipeline.fit(X_train, y_train)
smote_prediction = smote_model.predict(X_test)

# Showing the difference before and after the transformation used
print("normal data distribution: {}".format(Counter(y)))
# fit_resample was named fit_sample in imbalanced-learn < 0.4
X_smote, y_smote = SMOTE().fit_resample(X, y)
print("SMOTE data distribution: {}".format(Counter(y_smote)))

print("Confusion Matrix: ")
print(confusion_matrix(y_test, smote_prediction))
print('\nSMOTE Pipeline Score {}'.format(smote_pipeline.score(X_test, y_test)))

import sklearn.metrics as met
####################### imbalance learn ####################################
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from imblearn.combine import SMOTEENN
from imblearn.ensemble import BalancedRandomForestClassifier, BalancedBaggingClassifier
from imblearn.pipeline import make_pipeline as make_pipeline_imb
from imblearn.metrics import classification_report_imbalanced

imb_run = False
if imb_run:
    print('****************** imbalance learn ****************')
    clf = RandomForestClassifier()

    print('************** RandomUnderSampler ***********')
    pipe = make_pipeline_imb(vect, RandomUnderSampler(random_state=777), clf)
    pipe.fit(X_train, y_train)
    preds = pipe.predict(X_val)
    preds_train = pipe.predict(X_train)
    print(classification_report_imbalanced(y_val, preds))

    print('************** RandomOverSampler ***********')
    pipe = make_pipeline_imb(vect, RandomOverSampler(random_state=777), clf)
    pipe.fit(X_train, y_train)
    preds = pipe.predict(X_val)
    preds_train = pipe.predict(X_train)
    print(classification_report_imbalanced(y_val, preds))

    print('************** SMOTEENN (combine) ***********')
    pipe = make_pipeline_imb(vect, SMOTEENN(random_state=42), clf)
    pipe.fit(X_train, y_train)
# Neural Network
evaluatemodel(MLPClassifier(random_state=2), "MLP")


# Random Forest
def print_results(headline, true_value, pred):
    print(headline)
    print("accuracy: {}".format(accuracy_score(true_value, pred)))
    print("precision: {}".format(precision_score(true_value, pred)))
    print("recall: {}".format(recall_score(true_value, pred)))
    print("f2: {}".format(fbeta_score(true_value, pred, beta=2)))


# splitting data into training and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=4, test_size=0.20)

# build model with SMOTE imblearn
smote_pipeline = make_pipeline_imb(SMOTE(random_state=6),
                                   RandomForestClassifier(random_state=2))
smote_model = smote_pipeline.fit(X_train, y_train)
smote_prediction = smote_model.predict(X_test)

print("normal data distribution: {}".format(Counter(y)))
# fit_resample was named fit_sample in imbalanced-learn < 0.4
X_smote, y_smote = SMOTE().fit_resample(X, y)
print("SMOTE data distribution: {}".format(Counter(y_smote)))

print("Confusion Matrix: ")
print(confusion_matrix(y_test, smote_prediction))
# plot_confusion_matrix(confusion_matrix(y_test, smote_prediction))
print('\nSMOTE Pipeline Score {}'.format(smote_pipeline.score(X_test, y_test)))

print_results("\nSMOTE + RandomForest classification", y_test, smote_prediction)
# Balancing the class before classification
# -----------------------------------------
#
# To improve the prediction of class #3, it could be interesting to apply
# some balancing before training the naive Bayes classifier. Therefore, we
# will use a :class:`~imblearn.under_sampling.RandomUnderSampler` to equalize
# the number of samples in all the classes before the training.
#
# It is also important to note that we are using the
# :func:`~imblearn.pipeline.make_pipeline` function implemented in
# imbalanced-learn to properly handle the samplers.

# %%
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import make_pipeline as make_pipeline_imb

model = make_pipeline_imb(TfidfVectorizer(), RandomUnderSampler(), MultinomialNB())
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# %% [markdown]
# Although the results are almost identical, it can be seen that the resampling
# allowed us to correct the poor recall of class #3, at the cost of reducing
# the other metrics for the other classes. However, the overall results are
# slightly better.

# %%
print(classification_report_imbalanced(y_test, y_pred))
    df_working['N-S'].values, test_size=0.30, random_state=42)

# # Thinking - Feeling
X_train_TF, X_test_TF, y_train_TF, y_test_TF = train_test_split(
    df_working['posts'].values, df_working['T-F'].values,
    test_size=0.30, random_state=42)

# # Judging - Perceiving
X_train_JP, X_test_JP, y_train_JP, y_test_JP = train_test_split(
    df_working['posts'].values, df_working['J-P'].values,
    test_size=0.30, random_state=42)

# setting up model
pipe = make_pipeline_imb(
    TfidfVectorizer(ngram_range=(1, 2), norm='l1', max_features=100),
    RandomUnderSampler(random_state=420),
    RandomForestClassifier(min_samples_leaf=1, min_samples_split=6,
                           n_estimators=120, criterion='gini',
                           bootstrap=False,  # was the string 'False', which is truthy
                           n_jobs=-1))

# training model
pipe.fit(X_train_JP, y_train_JP)
y_pred = pipe.predict(X_test_JP)
probability = pipe.predict_proba(X_test_JP)

# Model Accuracy
print("Random forest Accuracy:", accuracy_score(y_test_JP, y_pred))
print(classification_report_imbalanced(y_test_JP, y_pred))

# pickle_out = open("model_f.pickle","wb")
# pickle.dump(pipe, pickle_out)
print('Total time - Without Undersampling: ', end - start, ' seconds\n')
print(metrics.classification_report(y_validation, validation_result))
print()
print('Without Undersampling - Pipeline Score {}'.format(
    multiC.fit(X_train, y_train).score(X_validation, y_validation)))
print()
print_results("Without Undersampling - Validation set: ", true_validation, validation_result)
print('===============================Without Undersampling Ends===============================\n')

print('================================With Undersampling Starts===============================\n')
start = time.time()

# build model with undersampling
# (NearMiss's random_state is deprecated in newer imbalanced-learn: it has no effect)
nearmiss_pipeline = make_pipeline_imb(NearMiss(random_state=0), multiC)
nearmiss_model = nearmiss_pipeline.fit(X_train, y_train)
nearmiss_prediction = nearmiss_model.predict(X_validation)

# Print the label distribution for both models
print()
print("Without Undersampling - data distribution: {}".format(Counter(y_train)))
# fit_resample was named fit_sample in imbalanced-learn < 0.4
X_nearmiss, y_nearmiss = NearMiss(random_state=0).fit_resample(X_train, y_train)
print("With Undersampling - data distribution: {}".format(Counter(y_nearmiss)))
print()
end = time.time()

# Here comes the result with Undersampling
print('Total time - With Undersampling: ', end - start, ' seconds\n')
print(classification_report_imbalanced(y_validation, nearmiss_prediction))
def cross_validate(model_name, X, y):
    # y = y.reset_index()
    y = y.to_numpy()  # as_matrix() was removed in pandas 1.0
    kf = KFold(n_splits=5, shuffle=True, random_state=42)  # random_state requires shuffle=True
    accuracy = []
    precision = []
    recall = []
    f1 = []
    auc = []
    if debug:
        print("in cross_validate: 1, size of X, y: ", len(X), len(y))
        print("y type is: ", type(y))

    if 'svc' in model_name.lower():
        classifier = SVC(kernel='linear', probability=True)
    elif 'random forest' in model_name.lower() or 'rf' in model_name.lower():
        classifier = RandomForestClassifier(n_estimators=200)
    elif 'logistic regression' in model_name.lower() or 'lr' in model_name.lower():
        classifier = LogisticRegression()

    try:
        for train_indices, test_indices in kf.split(X):
            X_train, X_test = X[train_indices], X[test_indices]
            y_train, y_test = y[train_indices], y[test_indices]
            # y_train, y_test = y[1293:6460], y[0:1292]
            if 'smote' in model_name.lower():
                pipeline = make_pipeline_imb(SMOTE(), classifier)
            else:
                pipeline = make_pipeline_imb(RandomOverSampler(random_state=0), classifier)
            if debug:
                print("in cross_validate: 2, size of X_train, y_train: ",
                      len(X_train), len(y_train))
                # print("train size, test size: ", len(train_indices), len(test_indices))
            model = pipeline.fit(X_train, y_train)
            # if debug:
            #     print("pipeline returns: ", pipeline.transform(X_train))
            prediction = model.predict(X_test)
            accuracy.append(pipeline.score(X_test, y_test))
            precision.append(precision_score(y_test, prediction, average=None))
            recall.append(recall_score(y_test, prediction, average=None))
            f1.append(f1_score(y_test, prediction, average=None))
    except Exception as e:
        print("error in k-fold validate: ", e)
        print("X[train] is: ", X_train)
        print("Y[train] is: ", y_train)

    print(f"k-fold accuracy: {accuracy}")
    print(f"k-fold recall: {recall}")
    print(f"k-fold precision: {precision}")
    print(f"k-fold f1: {f1}")
    return accuracy, precision, recall, f1
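# Illustrative call for the function above, assuming X is an indexable feature
# array and y a pandas Series of labels; the model-name string selects both the
# classifier branch ('random forest') and the SMOTE oversampling branch.
accuracy, precision, recall, f1 = cross_validate('SMOTE + Random Forest', X, y)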