def nonnegative_matrix_factorization(data):
    """Cross-validate the NMF pipeline over a hyperparameter grid.

    Prints the ten best configurations ranked by balanced accuracy and
    hands the full results to ``mlearning.characterize_optimum``.
    """
    param_grid = {
        'vec__stop_words': [None, 'english'],
        'vec__ngram_range': [(1, 1), (1, 2)],
        'vec__min_df': [1, 2],
        'vec__max_df': [0.5, 0.75, 0.9, 0.95],
        'nmf__n_components': [10, 20, 30],
        'nmf__beta_loss': ['kullback-leibler'],
        'nmf__solver': ['mu'],
        'nmf__max_iter': [1000],
        # NOTE(review): 'nmf__alpha' was removed in newer scikit-learn in
        # favour of 'alpha_W'/'alpha_H' — confirm against the installed version.
        'nmf__alpha': [0.0, 0.1],
        # 'nmf__l1_ratio': [0.5],
        'model': [RandomForestClassifier],
        'clf__random_state': [0],
        'clf__n_estimators': [150, 200, 300, 500],
        'clf__max_depth': [10, 15, None]
    }
    results = mlearning.cross_validate(
        mlearning.nonnegative_matrix_factorization,
        ParameterGrid(param_grid),
        data['Sentence'], data['Sentiment'])
    print('--- NMF results ---')
    # Build a compact leaderboard: best first, short column names.
    ranked = results.sort_values(by='balanced_accuracy_score',
                                 ascending=False)
    ranked = ranked.drop(columns=['accuracy_score', 'accuracy_score_std',
                                  'confusion_matrix', 'parameters'])
    ranked = ranked.rename(columns={'balanced_accuracy_score': 'bas',
                                    'balanced_accuracy_score_std': 'bas_std'})
    print(ranked.head(10))
    mlearning.characterize_optimum(
        results, data, mlearning.nonnegative_matrix_factorization)
def latent_dirichlet_allocation(data):
    """Cross-validate the LDA pipeline over a hyperparameter grid.

    Prints the ten best configurations ranked by balanced accuracy and
    hands the full results to ``mlearning.characterize_optimum``.
    """
    param_grid = {
        'vec__stop_words': [None, 'english'],
        'vec__ngram_range': [(1, 1)],  # (1, 2) doesn't perform well
        'vec__min_df': [1, 2],
        'vec__max_df': [0.4, 0.5, 0.6],
        'lda__n_components': [10, 20, 30],
        'lda__learning_offset': [10., 20., 40.],
        'lda__random_state': [0],
        'model': [RandomForestClassifier],
        'clf__random_state': [0],
        'clf__n_estimators': [100, 150, 200],
        'clf__max_depth': [10, 15, None]
    }
    results = mlearning.cross_validate(
        mlearning.latent_dirichlet_allocation,
        ParameterGrid(param_grid),
        data['Sentence'], data['Sentiment'])
    print('--- LDA results ---')
    # Build a compact leaderboard: best first, short column names.
    ranked = results.sort_values(by='balanced_accuracy_score',
                                 ascending=False)
    ranked = ranked.drop(columns=['accuracy_score', 'accuracy_score_std',
                                  'confusion_matrix', 'parameters'])
    ranked = ranked.rename(columns={'balanced_accuracy_score': 'bas',
                                    'balanced_accuracy_score_std': 'bas_std'})
    print(ranked.head(10))
    mlearning.characterize_optimum(
        results, data, mlearning.latent_dirichlet_allocation)
def bag_of_words(data):
    """Cross-validate bag-of-words pipelines (LinearSVC and naive Bayes).

    Prints the twenty best configurations ranked by balanced accuracy and
    hands the sorted results to ``mlearning.characterize_optimum``.
    """
    # Two sub-grids: one for the linear SVM, one shared by the NB variants.
    param_grid = [
        {'vec__stop_words': [None, 'english'],
         'vec__ngram_range': [(1, 1), (1, 2)],
         'vec__min_df': [3, 4, 5],
         'vec__max_df': [0.9, 0.95],
         'model': [LinearSVC],
         'clf__max_iter': [10000],
         'clf__dual': [False, True],
         'clf__class_weight': ['balanced'],
         'clf__random_state': [0]},
        {'vec__stop_words': [None, 'english'],
         'vec__ngram_range': [(1, 1), (1, 2)],
         'vec__min_df': [1, 2, 3, 4],
         'vec__max_df': [0.5, 0.75, 0.9, 0.95],
         'model': [MultinomialNB, ComplementNB, BernoulliNB],
         'clf__alpha': [0.001, 0.01, 0.1, 1.0]}
    ]
    raw = mlearning.cross_validate(
        mlearning.bag_of_words,
        ParameterGrid(param_grid),
        data['Sentence'], data['Sentiment'])
    # Keep the sorted frame: characterize_optimum receives it below.
    results = raw.sort_values(by='balanced_accuracy_score', ascending=False)
    print('--- Bag-of-words results ---')
    ranked = results.drop(columns=['accuracy_score', 'accuracy_score_std',
                                   'confusion_matrix', 'parameters'])
    ranked = ranked.rename(columns={'balanced_accuracy_score': 'bas',
                                    'balanced_accuracy_score_std': 'bas_std'})
    print(ranked.head(20))
    mlearning.characterize_optimum(results, data, mlearning.bag_of_words)
def tf_idf(data):
    """Cross-validate Tf-Idf pipelines and analyse the best configuration.

    Prints the twenty best configurations ranked by balanced accuracy,
    characterizes the optimum, saves (normalized and raw) confusion-matrix
    heatmaps, and plots the score dependency on each hyperparameter.
    """
    # Two sub-grids: one for the linear SVM, one shared by the NB variants.
    param_grid = [
        {'vec__stop_words': [None, 'english'],
         'vec__ngram_range': [(1, 1), (1, 2)],
         'vec__min_df': [3, 4, 5],
         'vec__max_df': [0.5, 0.75, 0.9],
         # 'vec__sublinear_tf': [True, False],
         'model': [LinearSVC],
         'clf__max_iter': [10000],
         'clf__dual': [False, True],
         'clf__class_weight': ['balanced'],
         'clf__random_state': [0]},
        {'vec__stop_words': [None, 'english'],
         'vec__ngram_range': [(1, 1), (1, 2)],
         'vec__min_df': [1, 2, 3, 4],
         'vec__max_df': [0.5, 0.75, 0.9, 0.95],
         'vec__sublinear_tf': [True, False],
         'model': [MultinomialNB, ComplementNB, BernoulliNB],
         'clf__alpha': [0.001, 0.01, 0.1, 1.0]}
    ]
    raw = mlearning.cross_validate(
        mlearning.tf_idf,
        ParameterGrid(param_grid),
        data['Sentence'], data['Sentiment'])
    # Keep the sorted frame: the analysis steps below reuse it.
    results = raw.sort_values(by='balanced_accuracy_score', ascending=False)
    print('--- Tf-Idf results ---')
    ranked = results.drop(columns=['accuracy_score', 'accuracy_score_std',
                                   'confusion_matrix', 'parameters'])
    ranked = ranked.rename(columns={'balanced_accuracy_score': 'bas',
                                    'balanced_accuracy_score_std': 'bas_std'})
    print(ranked.head(20))
    mlearning.characterize_optimum(results, data, mlearning.tf_idf)

    # Confusion matrix of the best row, raw and row-normalized.
    best = results.loc[results['balanced_accuracy_score'].idxmax()]
    plotting.confusion_heatmap(best['confusion_matrix'], sentiments)
    plotting.save_figure(plt.gcf(), 'confusion_heatmap')
    plotting.confusion_heatmap(best['confusion_matrix'], sentiments, True)
    plotting.save_figure(plt.gcf(), 'confusion_heatmap_norm')

    # Hyperparameter dependency plots.
    evaluate_parameter(results, 'clf__alpha', 'tfidf',
                       ylabel='Balanced accuracy score', xlabel='Alpha',
                       log=True)
    evaluate_parameter(results, 'vec__max_df', 'tfidf',
                       ylabel='Balanced accuracy score',
                       xlabel='Max doc frequency')
    evaluate_parameter(results, 'vec__min_df', 'tfidf',
                       ylabel='Balanced accuracy score',
                       xlabel='Min doc frequency')
    evaluate_parameter(results, 'vec__ngram_range', 'tfidf',
                       ylabel='Balanced accuracy score', xlabel='n-gram range',
                       categorical=True)
    evaluate_parameter(
        results, 'model', 'tfidf',
        ylabel='Balanced accuracy score', categorical=True,
        # Turn "<class 'sklearn...LinearSVC'>" into the bare class name.
        xticklabels=lambda x: str(x).split("'")[1].split('.')[-1])
def latent_semantic_analysis(data):
    """Cross-validate the LSA pipeline over a hyperparameter grid.

    Prints the ten best configurations ranked by balanced accuracy and
    hands the full results to ``mlearning.characterize_optimum``.
    """
    param_grid = [
        {'vec__stop_words': [None, 'english'],
         'vec__ngram_range': [(1, 1), (1, 2)],
         'vec__min_df': [1, 2],
         'vec__max_df': [0.5, 0.75, 0.9, 0.95],
         'vec__sublinear_tf': [True, False],
         'lda__n_components': [10, 20, 30],
         'model': [RandomForestClassifier],
         'clf__random_state': [0],
         'clf__n_estimators': [150, 200, 300, 500],
         'clf__max_depth': [10, 15, None]},
        # Not close enough
        # {'vec__stop_words': [None, 'english'],
        #  'vec__ngram_range': [(1, 1), (1, 2)],
        #  'vec__min_df': [1, 2],
        #  'lda__n_components': [30, 50, 70],
        #  'model': [LinearSVC],
        #  'clf__max_iter': [10000],
        #  'clf__dual': [False, True],
        #  'clf__class_weight': ['balanced'],
        #  'clf__random_state': [0]},
        # Does not even come close
        # {'vec__stop_words': [None, 'english'],
        #  'vec__ngram_range': [(1, 1), (1, 2)],
        #  'vec__min_df': [1, 2],
        #  'lda__n_components': [30, 50, 70],
        #  'model': [LogisticRegression],
        #  'clf__multi_class': ['multinomial'],
        #  'clf__solver': ['newton-cg']}
    ]
    results = mlearning.cross_validate(
        mlearning.latent_semantic_analysis,
        ParameterGrid(param_grid),
        data['Sentence'], data['Sentiment'])
    print('--- LSA results ---')
    # Build a compact leaderboard: best first, short column names.
    ranked = results.sort_values(by='balanced_accuracy_score',
                                 ascending=False)
    ranked = ranked.drop(columns=['accuracy_score', 'accuracy_score_std',
                                  'confusion_matrix'])
    ranked = ranked.rename(columns={'balanced_accuracy_score': 'bas',
                                    'balanced_accuracy_score_std': 'bas_std'})
    print(ranked.head(10))
    mlearning.characterize_optimum(
        results, data, mlearning.latent_semantic_analysis)
def word2vec(data):
    """Cross-validate the word2vec pipeline over a hyperparameter grid.

    Prints the ten best configurations ranked by balanced accuracy and
    hands the full results to ``mlearning.characterize_optimum``.
    """
    param_grid = {
        'tok__lowercase': [True],
        'tok__deacc': [True],
        'model': [RandomForestClassifier],
        'clf__random_state': [0],
        'clf__n_estimators': [100, 150, 200, 300, 500],
        'clf__max_depth': [10, 15, None]
    }
    results = mlearning.cross_validate(
        mlearning.word2vec,
        ParameterGrid(param_grid),
        data['Sentence'], data['Sentiment'])
    print('--- W2V results ---')
    # Build a compact leaderboard: best first, short column names.
    ranked = results.sort_values(by='balanced_accuracy_score',
                                 ascending=False)
    ranked = ranked.drop(columns=['accuracy_score', 'accuracy_score_std',
                                  'confusion_matrix'])
    ranked = ranked.rename(columns={'balanced_accuracy_score': 'bas',
                                    'balanced_accuracy_score_std': 'bas_std'})
    print(ranked.head(10))
    mlearning.characterize_optimum(results, data, mlearning.word2vec)