def ML_analysis(X_train, X_test, y_train, y_test, data):
    """Tune a logistic-regression classifier on count features and report test results.

    The training text is vectorized with ``mo.vectorizer``, a randomized
    hyper-parameter search is run over the regularisation settings of a
    logistic regression, and the refitted best model is evaluated on the
    test split via ``print_summary`` and ``plot.plot_confusion_matrix``.

    Args:
        X_train: Training documents, passed to ``mo.vectorizer``.
        X_test: Test documents, transformed with the vectorizer returned
            for the training data.
        y_train: Training labels.
        y_test: Test labels.
        data: Object supporting ``groupby('Cat_level')``; the group index
            is used as the class labels of the confusion matrix.

    Returns:
        None. Results are printed and plotted.
    """
    # (2, 2) is forwarded to the project helper -- presumably the n-gram
    # range (bigrams only); confirm against mo.vectorizer.
    X_train_counts, count_vectorizer = mo.vectorizer(X_train, (2, 2))

    # Transform the testing data into a document-term matrix using the
    # vectorizer obtained from the training data.
    X_test_counts = count_vectorizer.transform(X_test)

    pipeline = Pipeline([
        ('clf', LogisticRegression(solver='saga',
                                   multi_class='multinomial',
                                   class_weight='balanced')),
    ])

    parameters = {
        'clf__penalty': ['l1', 'l2'],
        'clf__C': [0.001, .009, 0.01, .09, 1, 2, 5, 10, 25, 30, 40],
    }

    scoring = ['precision_macro', 'recall_macro', 'f1_macro',
               'balanced_accuracy']

    # Several metrics are recorded, but the model that is refitted (and
    # therefore used by predict / best_* below) is chosen on macro recall.
    grid_search = RandomizedSearchCV(pipeline, parameters, cv=5, n_iter=10,
                                     n_jobs=-1, verbose=1, scoring=scoring,
                                     refit='recall_macro')
    grid_search.fit(X_train_counts, y_train)

    best_parameters = grid_search.best_params_
    print(best_parameters)
    print(grid_search.best_score_)

    predicted_level = grid_search.predict(X_test_counts)
    print(grid_search.best_estimator_)

    print_summary(y_test, predicted_level, data, "Cat_level")
    plot.plot_confusion_matrix(
        y_test, predicted_level,
        classes=data.groupby('Cat_level').count().index,
        title='Confusion matrix, without normalization')
def ML_analysis_split(cleaned_data, column_target, classifier, label=None):
    """Split a cleaned dataframe, train ``classifier`` on it and summarise the hold-out results.

    Args:
        cleaned_data: DataFrame holding the target column plus the feature
            columns; the preprocessing steps are configured for its
            'Description' column.
        column_target: Name of the column to predict.
        classifier: Zero-argument callable (e.g. an estimator class) that
            produces the final estimator of the pipeline.
        label: Forwarded unchanged to ``print_summary``.

    Returns:
        None. The evaluation is reported through ``print_summary``.
    """
    # The features stay a dataframe because the pipeline steps select
    # their columns by name; the target is converted to a numpy array.
    features = cleaned_data.drop(column_target, axis=1)
    target = cleaned_data[column_target].values

    preprocessing_steps = [
        ('pre_regular_exp', CleaningTextRegularExp('Description')),
        ('pre_stop_words', removing_stop_words('Description', 'english')),
        ('Pre_selector', FeatureSelector('Description')),
        ('vectorized', CountVectorizer()),
    ]
    preprocessing = Pipeline(steps=preprocessing_steps)

    # 80/20 split with a fixed seed so runs are repeatable.
    train_features, test_features, train_target, test_target = train_test_split(
        features, target, test_size=0.2, random_state=42)

    # Nest the preprocessing pipeline inside a second pipeline whose last
    # step is the estimator, so a single fit/predict drives everything.
    model = Pipeline(steps=[
        ('full_pipeline', preprocessing),
        ('model', classifier()),
    ])
    model.fit(train_features, train_target)

    print_summary(test_target, model.predict(test_features), cleaned_data, label)
def ML_analysis(X_train, X_test, y_train, y_test, data):
    """Search vectorizer and classifier settings jointly, then evaluate on the test split.

    A ``CountVectorizer`` + ``LogisticRegression`` pipeline is tuned with
    a randomized search, the refitted best pipeline predicts the test
    documents, and the outcome is printed and plotted.

    Args:
        X_train: Raw training documents (vectorized inside the pipeline).
        X_test: Raw test documents.
        y_train: Training labels.
        y_test: Test labels.
        data: Object supporting ``groupby('Cat_level')``; the group index
            is used as the class labels of the confusion matrix.

    Returns:
        Tuple ``(predicted_level, grid_search)``: the test predictions and
        the fitted search object.
    """
    text_clf = Pipeline([
        ('vectorized', CountVectorizer()),
        ('clf', LogisticRegression(solver='saga',
                                   multi_class='multinomial',
                                   class_weight='balanced')),
    ])

    param_grid = {
        'vectorized__ngram_range': [(1, 1), (1, 2), (1, 3), (1, 4), (1, 5), (2, 3)],
        'vectorized__max_features': [300, 1000, 5000, 8000, 9000],
        'clf__penalty': ['l1', 'l2'],
        'clf__C': [1, 2, 5, 10, 25, 30, 40],
    }

    metrics = ['precision_macro', 'recall_macro', 'f1_macro', 'balanced_accuracy']

    # All metrics are tracked; macro recall decides which candidate is
    # refitted on the full training data.
    grid_search = RandomizedSearchCV(
        text_clf,
        param_grid,
        cv=10,
        n_iter=10,
        n_jobs=-1,
        verbose=1,
        scoring=metrics,
        refit='recall_macro',
    )
    grid_search.fit(X_train, y_train)
    print(grid_search.best_params_)

    predicted_level = grid_search.predict(X_test)

    print_summary(y_test, predicted_level, data, "Cat_level")
    class_labels = data.groupby('Cat_level').count().index
    plot.plot_confusion_matrix(
        y_test, predicted_level,
        classes=class_labels,
        title='Confusion matrix, without normalization')

    # Report only the searched parameters, in alphabetical order.
    winning_params = grid_search.best_estimator_.get_params()
    print('Best Parameters are')
    for key in sorted(param_grid):
        print("\t%s: %r" % (key, winning_params[key]))

    return predicted_level, grid_search
def ML_analysis(X_train, X_test, y_train, y_test, data):
    """Train a logistic regression on count features and report test results.

    Args:
        X_train: Training documents, passed to ``mo.vectorizer``.
        X_test: Test documents, transformed with the vectorizer returned
            for the training data.
        y_train: Training labels.
        y_test: Test labels.
        data: Object supporting ``groupby('Cat_level')``; the group index
            is used as the class labels of the confusion matrix.

    Returns:
        None. Results are printed and plotted.
    """
    # (1, 2) is forwarded to the project helper -- presumably the n-gram
    # range; confirm against mo.vectorizer.
    train_matrix, fitted_vectorizer = mo.vectorizer(X_train, (1, 2))
    test_matrix = fitted_vectorizer.transform(X_test)

    # Settings tried earlier: C=30.0, class_weight='balanced',
    # solver='newton-cg', multi_class='multinomial', n_jobs=-1,
    # random_state=40.
    classifier = LogisticRegression(C=1)

    predicted_level = mo.train_model(classifier, train_matrix, y_train,
                                     test_matrix)

    print_summary(y_test, predicted_level, data, "Cat_level")

    class_labels = data.groupby('Cat_level').count().index
    plot.plot_confusion_matrix(
        y_test, predicted_level,
        classes=class_labels,
        title='Confusion matrix, without normalization')
def ML_analysis_separated_data(training_cleaned_data, cleaned_test_data, column_target, classifier, label="Level"):
    """Train on one cleaned dataframe, evaluate on another, and report the results.

    Args:
        training_cleaned_data: DataFrame used for fitting; must contain
            ``column_target`` and a 'Cat_level' column (used for the
            confusion-matrix class labels).
        cleaned_test_data: DataFrame used for evaluation; must contain
            ``column_target``.
        column_target: Name of the column to predict.
        classifier: Zero-argument callable (e.g. an estimator class) that
            produces the final estimator of the pipeline.
        label: Forwarded unchanged to ``print_summary``.

    Returns:
        None. Results are printed and plotted.
    """
    # Features stay dataframes because the pipeline steps select their
    # columns by name; targets are converted to numpy arrays.
    train_features = training_cleaned_data.drop(column_target, axis=1)
    train_target = training_cleaned_data[column_target].values
    test_features = cleaned_test_data.drop(column_target, axis=1)
    test_target = cleaned_test_data[column_target].values

    preprocessing_steps = [
        ('pre_regular_exp', CleaningTextRegularExp('Description')),
        ('pre_stop_words', removing_stop_words('Description', 'english')),
        ('pre_lemmatize', LemmatizeWord('Description')),
        ('Pre_selector', FeatureSelector('Description')),
        ('vectorized', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
    ]
    preprocessing = Pipeline(steps=preprocessing_steps)

    # Nest the preprocessing pipeline inside a second pipeline whose last
    # step is the estimator, so a single fit/predict drives everything.
    model = Pipeline(steps=[
        ('full_pipeline', preprocessing),
        ('model', classifier()),
    ])
    model.fit(train_features, train_target)
    predictions = model.predict(test_features)

    print_summary(test_target, predictions, training_cleaned_data, label)

    # NOTE(review): the class labels always come from 'Cat_level', even
    # though `label` defaults to "Level" -- confirm this is intended.
    class_labels = training_cleaned_data.groupby('Cat_level').count().index
    plot.plot_confusion_matrix(
        test_target, predictions,
        classes=class_labels,
        title='Confusion matrix, without normalization')
def ML_analysis(X_train, X_test, y_train, y_test, data):
    """Train a logistic regression on TF-IDF features and report test results.

    Args:
        X_train: Training documents.
        X_test: Test documents.
        y_train: Training labels, encoded with ``md.data_encoder``.
        y_test: Test labels, encoded with ``md.data_encoder``.
        data: Object supporting ``groupby('Level')``; the group index is
            used as the class labels of the confusion matrix.

    Returns:
        None. Results are printed and plotted.
    """
    # NOTE(review): the two label sets are encoded by separate calls;
    # confirm md.data_encoder yields the same mapping for both.
    encoded_train_labels = md.data_encoder(y_train)
    encoded_test_labels = md.data_encoder(y_test)

    # English stop words from NLTK (downloaded on every call).
    nltk.download("stopwords")
    english_stop_words = stopwords.words('english')

    # NOTE(review): the vectorizer is built from the module-level
    # `training_data_clean`, not from the `data` argument -- confirm.
    vectorizer = mo.vectorizer_Tfid(
        training_data_clean,
        "Description",
        stop_words=english_stop_words,
        token_pattern=r'\w{1,}',
        ngram_range=(1, 2),
        max_features=1000,
    )
    train_matrix = vectorizer.transform(X_train)
    test_matrix = vectorizer.transform(X_test)

    # Settings tried earlier: C=30.0, class_weight='balanced',
    # solver='newton-cg', multi_class='multinomial', n_jobs=-1,
    # random_state=40.
    classifier = LogisticRegression()

    predicted_level = mo.train_model(classifier, train_matrix,
                                     encoded_train_labels, test_matrix)

    print_summary(encoded_test_labels, predicted_level, training_data_clean,
                  "Level")

    class_labels = data.groupby('Level').count().index
    plot.plot_confusion_matrix(
        encoded_test_labels, predicted_level,
        classes=class_labels,
        title='Confusion matrix, without normalization')

    # Per-class recall (average=None) is handed to the plotting helper
    # together with the training data.
    per_class_recall = recall_score(encoded_test_labels, predicted_level,
                                    average=None)
    plot.precision_number_training_data(training_data_clean, per_class_recall,
                                        'Level')
# Multinomial Naive Bayes baseline: fit on the lemmatized training text and
# evaluate on the lemmatized real data.
X_train_counts, count_vectorizer = mo.vectorizer(
    training_data_clean['lemmatizing'].tolist(), (1, 1))

# Transform the testing data into a document-term matrix using the
# vectorizer obtained from the training data.
X_test_counts = count_vectorizer.transform(
    real_data_clean['lemmatizing'].tolist())

y_train = training_data_clean["Level"].tolist()
y_test = real_data_clean["Level"].tolist()

NB = MultinomialNB(alpha=0.5)
predicted_level = mo.train_model(NB, X_train_counts, y_train, X_test_counts)

# NOTE(review): the targets come from the "Level" column while the summary
# is labelled "Cat_level" -- confirm which one print_summary expects.
print_summary(y_test, predicted_level, training_data_clean, "Cat_level")

# Score the predictions produced just above. The previous code passed
# `y_predicted_counts`, a name that is never assigned in this section.
accuracy, precision, recall, harmonic_mean = mo.get_metrics(
    y_test, predicted_level)

evaluation_list = {
    'Accuracy': accuracy,
    'Precision': precision,
    'Recall': recall,
    'Harmonic mean': harmonic_mean,
}
print(evaluation_list)
def ML_analysis(X_train, X_test, y_train, y_test, data):
    """Grid-search a TF-IDF + Multinomial Naive Bayes pipeline and evaluate it.

    Args:
        X_train: Raw training documents (vectorized inside the pipeline).
        X_test: Raw test documents.
        y_train: Training labels, encoded with ``md.data_encoder``.
        y_test: Test labels, encoded with ``md.data_encoder``.
        data: Object supporting ``groupby('Level')``; the group index is
            used as the class labels of the confusion matrix.

    Returns:
        Tuple ``(predicted_level, grid_search)``: the test predictions and
        the fitted search object.
    """
    # NOTE(review): the two label sets are encoded by separate calls;
    # confirm md.data_encoder yields the same mapping for both.
    y_train_encoder = md.data_encoder(y_train)
    y_test_encoder = md.data_encoder(y_test)

    # English stop words from NLTK (downloaded on every call).
    nltk.download("stopwords")
    stop = stopwords.words('english')

    pipeline = Pipeline([
        ('vectorizer', TfidfVectorizer(stop_words=stop,
                                       token_pattern=r'\w{1,}')),
        ('clf', MultinomialNB(class_prior=None, fit_prior=False)),
    ])

    param_grid = {
        'vectorizer__ngram_range': [(1, 1), (1, 2), (2, 2)],
        'vectorizer__max_features': [9000, 10000],
        # Six evenly spaced alphas from 0.5 to 1.5 inclusive. The former
        # fourth positional argument (7) was bound to `endpoint` and, being
        # truthy, had no effect on the values.
        'clf__alpha': np.linspace(0.5, 1.5, 6),
    }

    scoring = ['precision_macro', 'recall_macro', 'f1_macro',
               'balanced_accuracy']

    # Exhaustive search; a RandomizedSearchCV (cv=50, n_iter=30) was tried
    # earlier. Macro recall decides which candidate is refitted.
    grid_search = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1,
                               verbose=1, scoring=scoring,
                               refit='recall_macro')
    grid_search.fit(X_train, y_train_encoder)

    best_parameters = grid_search.best_params_
    print(best_parameters)

    predicted_level = grid_search.predict(X_test)
    print(grid_search.best_estimator_)

    print_summary(y_test_encoder, predicted_level, data, "Level")
    plot.plot_confusion_matrix(
        y_test_encoder, predicted_level,
        classes=data.groupby('Level').count().index,
        title='Confusion matrix, without normalization')

    # The unpacked metrics are not used further here; the call is kept
    # unchanged in case mo.get_metrics reports as a side effect.
    accuracy, precision, recall, harmonic_mean = mo.get_metrics(
        y_test_encoder, predicted_level)

    return predicted_level, grid_search