Esempio n. 1
0
def nonnegative_matrix_factorization(data):
    """Grid-search an NMF-based sentiment pipeline and report the top results.

    Cross-validates every combination in the parameter grid over the
    'Sentence'/'Sentiment' columns of *data*, prints the ten best rows by
    balanced accuracy, and characterizes the optimum.
    """
    grid = ParameterGrid({
        'vec__stop_words': [None, 'english'],
        'vec__ngram_range': [(1, 1), (1, 2)],
        'vec__min_df': [1, 2],
        'vec__max_df': [0.5, 0.75, 0.9, 0.95],
        'nmf__n_components': [10, 20, 30],
        'nmf__beta_loss': ['kullback-leibler'],
        'nmf__solver': ['mu'],
        'nmf__max_iter': [1000],
        # NOTE(review): 'alpha' is deprecated in newer scikit-learn
        # (split into alpha_W/alpha_H) — confirm against pinned version.
        'nmf__alpha': [0.0, 0.1],
        # 'nmf__l1_ratio': [0.5],
        'model': [RandomForestClassifier],
        'clf__random_state': [0],
        'clf__n_estimators': [150, 200, 300, 500],
        'clf__max_depth': [10, 15, None],
    })

    cv_results = mlearning.cross_validate(
        mlearning.nonnegative_matrix_factorization, grid,
        data['Sentence'], data['Sentiment'])

    print('--- NMF results ---')
    summary = cv_results.sort_values(by='balanced_accuracy_score', ascending=False)
    summary = summary.drop(
        ['accuracy_score', 'accuracy_score_std', 'confusion_matrix', 'parameters'], axis=1)
    summary = summary.rename(
        {'balanced_accuracy_score': 'bas', 'balanced_accuracy_score_std': 'bas_std'}, axis=1)
    print(summary.head(10))

    mlearning.characterize_optimum(cv_results, data, mlearning.nonnegative_matrix_factorization)
Esempio n. 2
0
def latent_dirichlet_allocation(data):
    """Grid-search an LDA-based sentiment pipeline and report the top results.

    Cross-validates all grid combinations on the 'Sentence'/'Sentiment'
    columns of *data*, prints the ten best rows by balanced accuracy, and
    characterizes the optimum.
    """
    grid = ParameterGrid({
        'vec__stop_words': [None, 'english'],
        'vec__ngram_range': [(1, 1)],  # (1, 2) doesnt perform well
        'vec__min_df': [1, 2],
        'vec__max_df': [0.4, 0.5, 0.6],
        'lda__n_components': [10, 20, 30],
        'lda__learning_offset': [10., 20., 40.],
        'lda__random_state': [0],
        'model': [RandomForestClassifier],
        'clf__random_state': [0],
        'clf__n_estimators': [100, 150, 200],
        'clf__max_depth': [10, 15, None],
    })

    cv_results = mlearning.cross_validate(
        mlearning.latent_dirichlet_allocation, grid,
        data['Sentence'], data['Sentiment'])

    print('--- LDA results ---')
    summary = cv_results.sort_values(by='balanced_accuracy_score', ascending=False)
    summary = summary.drop(
        ['accuracy_score', 'accuracy_score_std', 'confusion_matrix', 'parameters'], axis=1)
    summary = summary.rename(
        {'balanced_accuracy_score': 'bas', 'balanced_accuracy_score_std': 'bas_std'}, axis=1)
    print(summary.head(10))

    mlearning.characterize_optimum(cv_results, data, mlearning.latent_dirichlet_allocation)
Esempio n. 3
0
def bag_of_words(data):
    """Grid-search bag-of-words pipelines (LinearSVC and naive Bayes variants).

    Cross-validates both sub-grids on the 'Sentence'/'Sentiment' columns of
    *data*, prints the twenty best rows by balanced accuracy, and
    characterizes the optimum.
    """
    svc_grid = {
        'vec__stop_words': [None, 'english'],
        'vec__ngram_range': [(1, 1), (1, 2)],
        'vec__min_df': [3, 4, 5],
        'vec__max_df': [0.9, 0.95],
        'model': [LinearSVC],
        'clf__max_iter': [10000],
        'clf__dual': [False, True],
        'clf__class_weight': ['balanced'],
        'clf__random_state': [0],
    }
    bayes_grid = {
        'vec__stop_words': [None, 'english'],
        'vec__ngram_range': [(1, 1), (1, 2)],
        'vec__min_df': [1, 2, 3, 4],
        'vec__max_df': [0.5, 0.75, 0.9, 0.95],
        'model': [MultinomialNB, ComplementNB, BernoulliNB],
        'clf__alpha': [0.001, 0.01, 0.1, 1.0],
    }

    # Sort once up-front; both the printout and the optimum use this frame.
    cv_results = mlearning.cross_validate(
        mlearning.bag_of_words, ParameterGrid([svc_grid, bayes_grid]),
        data['Sentence'], data['Sentiment']
    ).sort_values(by='balanced_accuracy_score', ascending=False)

    print('--- Bag-of-words results ---')
    summary = cv_results.drop(
        ['accuracy_score', 'accuracy_score_std', 'confusion_matrix', 'parameters'], axis=1)
    summary = summary.rename(
        {'balanced_accuracy_score': 'bas', 'balanced_accuracy_score_std': 'bas_std'}, axis=1)
    print(summary.head(20))

    mlearning.characterize_optimum(cv_results, data, mlearning.bag_of_words)
Esempio n. 4
0
def tf_idf(data):
    """Grid-search tf-idf pipelines, then plot diagnostics for the optimum.

    Cross-validates LinearSVC and naive-Bayes sub-grids on the
    'Sentence'/'Sentiment' columns of *data*, prints the twenty best rows
    by balanced accuracy, characterizes the optimum, saves confusion-matrix
    heatmaps, and plots score-vs-hyperparameter dependencies.
    """
    svc_grid = {
        'vec__stop_words': [None, 'english'],
        'vec__ngram_range': [(1, 1), (1, 2)],
        'vec__min_df': [3, 4, 5],
        'vec__max_df': [0.5, 0.75, 0.9],
        # 'vec__sublinear_tf': [True, False],
        'model': [LinearSVC],
        'clf__max_iter': [10000],
        'clf__dual': [False, True],
        'clf__class_weight': ['balanced'],
        'clf__random_state': [0],
    }
    bayes_grid = {
        'vec__stop_words': [None, 'english'],
        'vec__ngram_range': [(1, 1), (1, 2)],
        'vec__min_df': [1, 2, 3, 4],
        'vec__max_df': [0.5, 0.75, 0.9, 0.95],
        'vec__sublinear_tf': [True, False],
        'model': [MultinomialNB, ComplementNB, BernoulliNB],
        'clf__alpha': [0.001, 0.01, 0.1, 1.0],
    }

    # Sort once up-front; printout, optimum and plots all use this frame.
    cv_results = mlearning.cross_validate(
        mlearning.tf_idf, ParameterGrid([svc_grid, bayes_grid]),
        data['Sentence'], data['Sentiment']
    ).sort_values(by='balanced_accuracy_score', ascending=False)

    print('--- Tf-Idf results ---')
    summary = cv_results.drop(
        ['accuracy_score', 'accuracy_score_std', 'confusion_matrix', 'parameters'], axis=1)
    summary = summary.rename(
        {'balanced_accuracy_score': 'bas', 'balanced_accuracy_score_std': 'bas_std'}, axis=1)
    print(summary.head(20))

    mlearning.characterize_optimum(cv_results, data, mlearning.tf_idf)

    # Confusion matrix: raw and row-normalized heatmaps for the best row.
    best = cv_results.loc[cv_results['balanced_accuracy_score'].idxmax()]
    plotting.confusion_heatmap(best['confusion_matrix'], sentiments)
    plotting.save_figure(plt.gcf(), 'confusion_heatmap')
    plotting.confusion_heatmap(best['confusion_matrix'], sentiments, True)
    plotting.save_figure(plt.gcf(), 'confusion_heatmap_norm')

    # Hyperparameter dependency plots.
    evaluate_parameter(cv_results, 'clf__alpha', 'tfidf',
                       ylabel='Balanced accuracy score', xlabel='Alpha', log=True)
    evaluate_parameter(cv_results, 'vec__max_df', 'tfidf',
                       ylabel='Balanced accuracy score', xlabel='Max doc frequency')
    evaluate_parameter(cv_results, 'vec__min_df', 'tfidf',
                       ylabel='Balanced accuracy score', xlabel='Min doc frequency')
    evaluate_parameter(cv_results, 'vec__ngram_range', 'tfidf',
                       ylabel='Balanced accuracy score', xlabel='n-gram range', categorical=True)
    evaluate_parameter(
        cv_results, 'model', 'tfidf', ylabel='Balanced accuracy score', categorical=True,
        xticklabels=lambda x: str(x).split("'")[1].split('.')[-1])
Esempio n. 5
0
def latent_semantic_analysis(data):
    """Grid-search an LSA-based sentiment pipeline and report the top results.

    Cross-validates the random-forest grid on the 'Sentence'/'Sentiment'
    columns of *data*, prints the ten best rows by balanced accuracy, and
    characterizes the optimum.
    """
    forest_grid = {
        'vec__stop_words': [None, 'english'],
        'vec__ngram_range': [(1, 1), (1, 2)],
        'vec__min_df': [1, 2],
        'vec__max_df': [0.5, 0.75, 0.9, 0.95],
        'vec__sublinear_tf': [True, False],
        'lda__n_components': [10, 20, 30],
        'model': [RandomForestClassifier],
        'clf__random_state': [0],
        'clf__n_estimators': [150, 200, 300, 500],
        'clf__max_depth': [10, 15, None],
    }

    # Not close enough
    # {'vec__stop_words': [None, 'english'],
    #  'vec__ngram_range': [(1, 1), (1, 2)],
    #  'vec__min_df': [1, 2],
    #  'lda__n_components': [30, 50, 70],
    #  'model': [LinearSVC],
    #  'clf__max_iter': [10000],
    #  'clf__dual': [False, True],
    #  'clf__class_weight': ['balanced'],
    #  'clf__random_state': [0]},

    # Does not even come close
    # {'vec__stop_words': [None, 'english'],
    #  'vec__ngram_range': [(1, 1), (1, 2)],
    #  'vec__min_df': [1, 2],
    #  'lda__n_components': [30, 50, 70],
    #  'model': [LogisticRegression],
    #  'clf__multi_class': ['multinomial'],
    #  'clf__solver': ['newton-cg']}

    cv_results = mlearning.cross_validate(
        mlearning.latent_semantic_analysis, ParameterGrid([forest_grid]),
        data['Sentence'], data['Sentiment'])

    print('--- LSA results ---')
    summary = cv_results.sort_values(by='balanced_accuracy_score', ascending=False)
    summary = summary.drop(
        ['accuracy_score', 'accuracy_score_std', 'confusion_matrix'], axis=1)
    summary = summary.rename(
        {'balanced_accuracy_score': 'bas', 'balanced_accuracy_score_std': 'bas_std'}, axis=1)
    print(summary.head(10))

    mlearning.characterize_optimum(cv_results, data, mlearning.latent_semantic_analysis)
Esempio n. 6
0
def word2vec(data):
    """Grid-search a word2vec-based sentiment pipeline and report the top results.

    Cross-validates all grid combinations on the 'Sentence'/'Sentiment'
    columns of *data*, prints the ten best rows by balanced accuracy, and
    characterizes the optimum.
    """
    grid = ParameterGrid({
        'tok__lowercase': [True],
        'tok__deacc': [True],
        'model': [RandomForestClassifier],
        'clf__random_state': [0],
        'clf__n_estimators': [100, 150, 200, 300, 500],
        'clf__max_depth': [10, 15, None],
    })

    cv_results = mlearning.cross_validate(
        mlearning.word2vec, grid,
        data['Sentence'], data['Sentiment'])

    print('--- W2V results ---')
    summary = cv_results.sort_values(by='balanced_accuracy_score', ascending=False)
    summary = summary.drop(
        ['accuracy_score', 'accuracy_score_std', 'confusion_matrix'], axis=1)
    summary = summary.rename(
        {'balanced_accuracy_score': 'bas', 'balanced_accuracy_score_std': 'bas_std'}, axis=1)
    print(summary.head(10))

    mlearning.characterize_optimum(cv_results, data, mlearning.word2vec)