def ML_analysis(X_train, X_test, y_train, y_test, data):
    """Tune a multinomial LogisticRegression on bigram counts and report test results.

    Vectorizes the training text with bigrams ((2, 2)), runs a randomized
    hyper-parameter search (penalty, C) with 5-fold CV refit on macro recall,
    then predicts the held-out set, prints a summary and plots a confusion
    matrix.

    NOTE(review): two other functions in this module share the name
    ``ML_analysis``; only the last definition survives at import time.

    Parameters
    ----------
    X_train, X_test : iterable of str
        Raw training / test documents.
    y_train, y_test : array-like
        Target labels for each split.
    data : pandas.DataFrame
        Source frame; its 'Cat_level' groups label the confusion-matrix axes.
    """
    # Bigram document-term matrix; the fitted vectorizer transforms the test set.
    X_train_counts, count_vectorizer = mo.vectorizer(X_train, (2, 2))
    X_test_counts = count_vectorizer.transform(X_test)

    pipeline = Pipeline([
        ('clf', LogisticRegression(solver='saga', multi_class='multinomial',
                                   class_weight='balanced')),
    ])
    parameters = {
        'clf__penalty': ['l1', 'l2'],
        'clf__C': [0.001, .009, 0.01, .09, 1, 2, 5, 10, 25, 30, 40],
    }
    scoring = ['precision_macro', 'recall_macro', 'f1_macro',
               'balanced_accuracy']
    grid_search = RandomizedSearchCV(pipeline, parameters, cv=5, n_iter=10,
                                     n_jobs=-1, verbose=1, scoring=scoring,
                                     refit='recall_macro')
    # BUG FIX: this fit() call was commented out, so every attribute access
    # below (best_params_, best_score_, predict) raised NotFittedError.
    grid_search.fit(X_train_counts, y_train)

    best_parameters = grid_search.best_params_
    print(best_parameters)
    print(grid_search.best_score_)

    predicted_level = grid_search.predict(X_test_counts)
    print(grid_search.best_estimator_)
    print_summary(y_test, predicted_level, data, "Cat_level")
    plot.plot_confusion_matrix(y_test, predicted_level,
                               classes=data.groupby('Cat_level').count().index,
                               title='Confusion matrix, without normalization')
def ML_analysis(X_train, y_train, data, colum_name):
    """Cross-validated LogisticRegression baseline on unigram counts.

    Prints 3-fold cross-validated accuracy and a per-class classification
    report computed on out-of-fold predictions; returns nothing.

    NOTE(review): this redefines ``ML_analysis``; only the last definition in
    the module survives at import time. The ``data`` and ``colum_name``
    parameters are currently unused — kept to preserve the call interface.

    Parameters
    ----------
    X_train : iterable of str
        Raw training documents.
    y_train : array-like
        Target labels.
    data : pandas.DataFrame
        Unused.
    colum_name : str
        Unused.
    """
    # Unigram document-term matrix; the fitted vectorizer is not reused here.
    X_train_counts, count_vectorizer = mo.vectorizer(X_train, (1, 1))
    logreg = LogisticRegression()
    # Out-of-fold predictions so the report reflects data unseen by each fold.
    predicted = cross_val_predict(logreg, X_train_counts, y_train, cv=3)
    print(metrics.accuracy_score(y_train, predicted))
    print(metrics.classification_report(y_train, predicted))
def ML_analysis(X_train, X_test, y_train, y_test, data):
    """Train a LogisticRegression (C=1) on uni+bigram counts and summarize test performance.

    Fits on the vectorized training documents, predicts the held-out set,
    prints a summary, and plots an un-normalized confusion matrix keyed by
    the 'Cat_level' groups of ``data``. Returns nothing.
    """
    # Uni- and bigram document-term matrices; the same fitted vectorizer
    # must transform the test split.
    train_counts, vectorizer = mo.vectorizer(X_train, (1, 2))
    test_counts = vectorizer.transform(X_test)

    classifier = LogisticRegression(C=1)
    predictions = mo.train_model(classifier, train_counts, y_train, test_counts)

    print_summary(y_test, predictions, data, "Cat_level")
    level_classes = data.groupby('Cat_level').count().index
    plot.plot_confusion_matrix(y_test, predictions,
                               classes=level_classes,
                               title='Confusion matrix, without normalization')
# Script section: train a MultinomialNB on the cleaned training corpus and
# evaluate it against an external "real" dataset loaded from disk.
Real_data = pd.read_csv('/Users/ruddirodriguez/Dropbox/Machine_Learning/NLP/' + real_data_name)
real_data_clean = preparing_data(Real_data)
training_data_clean = preparing_data(training_data)
#real_data_clean = real_data_clean[real_data_clean['Level'] ==5]
#training_data_clean = training_data_clean[(
#    training_data_clean['Level'] == 5) | (training_data_clean['Level'] == 2)]
#fig = plt.figure(figsize=(8, 6))
#plot.number_of_levels(real_data_clean)
#
#fig = plt.figure(figsize=(8, 6))
#plot.number_of_levels(training_data_clean)
# Unigram document-term matrix from the lemmatized training text; the fitted
# vectorizer then transforms the real-world documents into the same feature space.
X_train_counts, count_vectorizer = mo.vectorizer(
    training_data_clean['lemmatizing'].tolist(), (1, 1))
# transforming testing data into document-term matrix
X_test_counts = count_vectorizer.transform(
    real_data_clean['lemmatizing'].tolist())
y_train = training_data_clean["Level"].tolist()
y_test = real_data_clean["Level"].tolist()
# Naive Bayes with mild additive smoothing (alpha=0.5).
NB = MultinomialNB(alpha=0.5)
predicted_level = mo.train_model(NB, X_train_counts, y_train, X_test_counts)
print_summary(y_test, predicted_level, training_data_clean, "Cat_level")
# NOTE(review): the following statement is truncated at this chunk boundary —
# the argument list of mo.get_metrics continues beyond the visible source.
accuracy, precision, recall, harmonic_mean = mo.get_metrics(
# #worcloud_generation (text) ## Create and generate a word cloud image: #wordDict = { #'data': '', #'year': '', #'science':''} #text_copy = multipleReplace(text, wordDict) #worcloud_generation (text_copy) list_data_frames = [] cc = 0 for i in training_data_clean['Cat_level'].value_counts().index: cc += 1 data = training_data_clean[training_data_clean['Level'] == cc] X_train_counts, count_vectorizer = mo.vectorizer( data['lemmatizing'].tolist(), (1, 2)) freqss = pd.DataFrame( [(word, X_train_counts.getcol(idx).sum(), i) for word, idx in count_vectorizer.vocabulary_.items()], columns=['word', 'Freq', 'Level']).sort_values(by='Freq', ascending=False) list_data_frames.append(freqss) text = " ".join(item for item in data['lemmatizing'].tolist()) wordDict = {'data': '', 'year': '', 'science': ''} savepath = ( "/Users/ruddirodriguez/Dropbox/Machine_Learning/Data_Science_day_UU/" + i + "." + "svg") text_copy = multipleReplace(text, wordDict) worcloud_generation(text_copy, savepath)