# Shared imports these tests rely on; the topic_model_builder import path is
# project-specific and may need adjusting to the actual module layout.
import numpy as np
import pandas as pd
from pandas import DataFrame
from sklearn import linear_model, naive_bayes
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer

from topic_model_builder import topic_model_builder

# Every test exercises the same small preprocessed-tweet fixtures, so the
# instance construction is factored out here instead of being repeated.
_DATA_DIR = ('/Users/yibingyang/Documents/thesis_project_new/Data/Twitter/'
             'after_preprocessing/')


def _make_test_instance():
    return topic_model_builder(
        training_dataset_posi_paths=(
            _DATA_DIR + 'test_positive_tweets_after_preprocessing.txt'),
        training_dataset_nega_paths=(
            _DATA_DIR + 'test_negative_tweets_after_preprocessing.txt'),
        test_dataset_posi_path=(
            _DATA_DIR + 'test_positive_test_tweets_after_preprocessing.txt'),
        test_dataset_nega_path=(
            _DATA_DIR + 'test_negative_test_tweets_after_preprocessing.txt'))


def test_show_sample_tweets():
    test_instance = _make_test_instance()
    df_prep = {
        'tweets': [
            u'tweet 1 here', u'tweet 2 here', u'tweet 3 here', u'tweet 4 here',
            u'tweet 5 here'
        ],
        'label': [1, 1, 1, 0, 0],
        'y_pred': [1, 1, 0, 0, 0],
        'cluster_label': [1, 0, 0, 1, 1],
        'logistic_regression': [1, 1, 1, 1, 1],
        'naive_bayes': [0, 0, 0, 0, 0]
    }
    restructured_X_test_df = pd.DataFrame.from_dict(df_prep)
    test_instance.show_sample_tweets(
        restructured_X_test_df=restructured_X_test_df,
        cluster_label_list=[1, 0, 0, 1, 1])
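

# show_sample_tweets presumably prints a handful of tweets per cluster from the
# restructured test DataFrame. A minimal, illustrative sketch of that selection
# (the grouping logic here is an assumption, not the class's actual code):
def _demo_sample_tweets_per_cluster(df, n=2):
    for cluster, group in df.groupby('cluster_label'):
        # show up to n example tweets for every cluster label
        print(cluster, group['tweets'].head(n).tolist())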


def test_main():
    test_instance = _make_test_instance()

    test_instance.main(feature_extraction_mode='uni_and_bigram',
                       bigram_min_count=3,
                       feature_represent_mode='tfidf',
                       feature_selection_mode='chi2',
                       top_n_feature=150,
                       lda_min_topic_num=3,
                       lda_max_topic_num=10,
                       lsi_min_topic_num=3,
                       lsi_max_topic_num=10,
                       min_cluster_number=1,
                       max_cluster_number=10,
                       number_of_cluster=1,
                       classifier='naive_bayes')


def test_baseline_model_builder():
    test_instance = _make_test_instance()

    token_list_after_feature_selection = [
        [u'positive', u'tweet', u'one', u'pattern', u'here'],
        [u'the', u'second', u'token', u'pattern', u'here'],
        [u'negative', u'token', u'one', u'pattern', u'here'],
        [u'some', u'random', u'things', u'pattern', u'here']
    ]

    vectorizer, baseline_clf_dict, baseline_classifier_building_processing_time = test_instance.baseline_model_builder(
        token_list_after_feature_selection, mode='tfidf')

    assert isinstance(vectorizer, TfidfVectorizer)
    assert sorted(baseline_clf_dict.keys()) == [
        'logistic_regression', 'naive_bayes'
    ]
    assert isinstance(baseline_clf_dict['logistic_regression'],
                      linear_model.LogisticRegression)
    assert isinstance(baseline_clf_dict['naive_bayes'],
                      naive_bayes.MultinomialNB)
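

# A sketch of what a tfidf baseline builder plausibly does: join the token
# lists back into strings, fit a single TfidfVectorizer, and train the two
# baseline classifiers on the same matrix. The labels argument is illustrative;
# the real method derives labels from the training data itself.
def _demo_tfidf_baseline(token_lists, labels):
    docs = [' '.join(tokens) for tokens in token_lists]
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(docs)
    return vectorizer, {
        'logistic_regression': linear_model.LogisticRegression().fit(X, labels),
        'naive_bayes': naive_bayes.MultinomialNB().fit(X, labels),
    }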


def test_get_tweet_topic_matrix_based_on_best_topic_model():
    test_instance = _make_test_instance()

    token_list_after_feature_selection = [
        [u'positive', u'tweet', u'one', u'pattern', u'here'],
        [u'the', u'second', u'token', u'pattern', u'here'],
        [u'negative', u'token', u'one', u'pattern', u'here'],
        [u'some', u'random', u'things', u'pattern', u'here']
    ]

    top_model, model_topics, highest_coherence_score, dictionary, corpus, lda_lsi_processing_time = test_instance.build_lda_lsi_model(
        token_list_after_feature_selection,
        min_topic_num=2,
        max_topic_num=5,
        model='lda')

    tweet_topic_distribution_df = test_instance.get_tweet_topic_matrix_based_on_best_topic_model(
        top_model, corpus)

    assert isinstance(tweet_topic_distribution_df, DataFrame)
    assert tweet_topic_distribution_df.shape == (4, 2)
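

# get_tweet_topic_matrix_based_on_best_topic_model presumably densifies
# gensim's sparse per-document topic distributions into a
# (num_docs, num_topics) DataFrame; a sketch under that assumption:
def _demo_doc_topic_matrix(lda_model, corpus):
    rows = []
    for bow in corpus:
        # minimum_probability=0.0 makes gensim report every topic
        dist = lda_model.get_document_topics(bow, minimum_probability=0.0)
        rows.append({topic_id: prob for topic_id, prob in dist})
    return pd.DataFrame(rows)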


def test_classifier_building():
    test_instance = _make_test_instance()

    token_list_after_feature_selection = [
        [u'positive', u'tweet', u'one', u'pattern', u'here'],
        [u'the', u'second', u'token', u'pattern', u'here'],
        [u'negative', u'token', u'one', u'pattern', u'here'],
        [u'some', u'random', u'things', u'pattern', u'here']
    ]

    tweet_topic_distribution_with_cluster_df = DataFrame(
        data={
            0: [0.135607, 0.882122, 0.12, 0.895495],
            1: [0.864393, 0.117878, 0.88, 0.104505],
            'Y': [1, 1, 0, 0],
            'clustering_labels': [1, 0, 1, 0]
        })

    vectorizer_clf_dict, classifier_building_processing_time = test_instance.classifier_building(
        tweet_topic_distribution_with_cluster_df, 2,
        token_list_after_feature_selection)

    assert sorted(vectorizer_clf_dict.keys()) == [0, 1]  # one entry per cluster
    assert isinstance(vectorizer_clf_dict[0][0], TfidfVectorizer)
    assert isinstance(vectorizer_clf_dict[0][1],
                      linear_model.LogisticRegression)
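

# classifier_building appears to train one (vectorizer, classifier) pair per
# cluster. A sketch, assuming row i of the clustered DataFrame corresponds to
# token_lists[i] and 'clustering_labels' holds the cluster assignment:
def _demo_per_cluster_classifiers(clustered_df, token_lists):
    vectorizer_clf_dict = {}
    for cluster in sorted(clustered_df['clustering_labels'].unique()):
        mask = clustered_df['clustering_labels'] == cluster
        docs = [' '.join(token_lists[i]) for i in clustered_df.index[mask]]
        vectorizer = TfidfVectorizer()
        X = vectorizer.fit_transform(docs)
        clf = linear_model.LogisticRegression().fit(
            X, clustered_df.loc[mask, 'Y'])
        vectorizer_clf_dict[cluster] = (vectorizer, clf)
    return vectorizer_clf_dict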


def test_collect_clustering_info():
    test_instance = _make_test_instance()

    tweet_topic_distribution_df = DataFrame(
        data={
            0: [0.135607, 0.898807, 0.882122, 0.895495, 0.10, 0.20],
            1: [0.864393, 0.101193, 0.117878, 0.104505, 0.90, 0.80]
        })

    list_k, label_list, model_list, collect_clustering_info_processing_time = test_instance.collect_clustering_info(
        tweet_topic_distribution_df,
        min_cluster_number=1,
        max_cluster_number=3)

    assert list_k == [1, 2, 3]

    # exact labels depend on KMeans initialisation, e.g.
    # [[0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 1, 1], [1, 0, 0, 0, 1, 2]]
    assert all(isinstance(model, KMeans) for model in model_list)
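

# collect_clustering_info evidently fits a KMeans model for every k in the
# requested range and keeps the labels and fitted models, elbow-method style.
# A minimal sketch:
def _demo_collect_kmeans_info(df, min_k, max_k):
    list_k, label_list, model_list = [], [], []
    for k in range(min_k, max_k + 1):
        model = KMeans(n_clusters=k, random_state=0).fit(df.values)
        list_k.append(k)
        label_list.append(model.labels_.tolist())
        model_list.append(model)
    return list_k, label_list, model_list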


def test_build_lda_model():
    test_instance = _make_test_instance()

    token_list_after_feature_selection = [
        [u'positive', u'tweet', u'one', u'pattern', u'here'],
        [u'the', u'second', u'token', u'pattern', u'here'],
        [u'negative', u'token', u'one', u'pattern', u'here'],
        [u'some', u'random', u'things', u'pattern', u'here']
    ]
    top_model, model_topics, highest_coherence_score, dictionary, corpus, lda_lsi_processing_time = test_instance.build_lda_lsi_model(
        token_list_after_feature_selection,
        min_topic_num=2,
        max_topic_num=5,
        model='lda')
    assert 2 <= len(model_topics) <= 5

    # 12 distinct words appear across the token lists
    assert len(dictionary.id2token) == 12
    # e.g. {0: u'here', 1: u'one', 2: u'pattern', 3: u'positive', 4: u'tweet', ...}
    assert dictionary.id2token[0] == 'here' and dictionary.id2token[1] == 'one'
    assert dictionary.num_docs == 4

    assert len(corpus) == 4  # the length of the token list
    assert corpus[0] == [(0, 1), (1, 1), (2, 1), (3, 1), (4, 1)]
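

# A sketch of the gensim plumbing build_lda_lsi_model wraps: build a Dictionary
# and bag-of-words corpus, train an LdaModel per candidate topic count, and
# keep the model with the best c_v coherence. Parameter choices here are
# illustrative, not the method's actual defaults.
def _demo_pick_lda_by_coherence(token_lists, min_topic_num, max_topic_num):
    from gensim.corpora import Dictionary
    from gensim.models import CoherenceModel, LdaModel

    dictionary = Dictionary(token_lists)
    corpus = [dictionary.doc2bow(tokens) for tokens in token_lists]
    best_model, best_score = None, float('-inf')
    for num_topics in range(min_topic_num, max_topic_num + 1):
        model = LdaModel(corpus=corpus, id2word=dictionary,
                         num_topics=num_topics, random_state=0)
        score = CoherenceModel(model=model, texts=token_lists,
                               dictionary=dictionary,
                               coherence='c_v').get_coherence()
        if score > best_score:
            best_model, best_score = model, score
    return best_model, best_score, dictionary, corpus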


def test_feature_selection():
    test_instance = _make_test_instance()

    posi_training_token_list = [[
        u'positive', u'tweet', u'one', u'pattern', u'here'
    ], [u'the', u'second', u'token', u'pattern', u'here']]
    nega_training_token_list = [[
        u'negative', u'token', u'one', u'pattern', u'here'
    ], [u'some', u'random', u'things', u'pattern', u'here']]

    X_chi_matrix, feature_name_list_after_chi, token_list_after_chi2, ch2, feature_selection_processing_time = test_instance.feature_selection(
        posi_training_token_list, nega_training_token_list, top_n_feature=5)

    assert X_chi_matrix.shape == (4, 5)
    assert feature_name_list_after_chi == [
        u'negative', u'positive', u'second', u'the', u'tweet'
    ]
    assert token_list_after_chi2 == [[u'positive', u'tweet'],
                                     [u'second', u'the'], [u'negative'], []]
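

# feature_selection apparently scores terms with chi-squared against the
# positive/negative labels and keeps the top n. A sketch with sklearn's
# SelectKBest (recent sklearn API; the pass-through analyzer feeds the
# pre-tokenized lists straight into the vectorizer):
def _demo_chi2_selection(posi_token_lists, nega_token_lists, top_n):
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.feature_selection import SelectKBest, chi2

    token_lists = posi_token_lists + nega_token_lists
    y = [1] * len(posi_token_lists) + [0] * len(nega_token_lists)
    vectorizer = CountVectorizer(analyzer=lambda tokens: tokens)
    X = vectorizer.fit_transform(token_lists)
    ch2 = SelectKBest(chi2, k=top_n)
    X_chi = ch2.fit_transform(X, y)
    kept = set(np.asarray(vectorizer.get_feature_names_out())[ch2.get_support()])
    # drop tokens that did not survive selection, preserving document order
    filtered = [[t for t in tokens if t in kept] for tokens in token_lists]
    return X_chi, sorted(kept), filtered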


def test_prepare_data_for_topic_modelling():
    test_instance = _make_test_instance()

    training_posi_resampled_token_list, training_nega_resampled_token_list, resampling_processing_time = test_instance.data_resampling(
        test_instance.posi_training_data_df,
        test_instance.nega_training_data_df)
    training_token_list = test_instance.prepare_data_for_topic_modelling(
        training_posi_resampled_token_list, training_nega_resampled_token_list)

    combined = (training_posi_resampled_token_list +
                training_nega_resampled_token_list)
    assert len(training_token_list) == len(combined)
    # preparation may drop tokens from a tweet, but never adds any
    assert all(
        len(training_token_list[i]) <= len(combined[i])
        for i in range(len(training_token_list)))
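

# prepare_data_for_topic_modelling seems to concatenate the resampled positive
# and negative token lists, possibly thinning tokens along the way (which is
# what the <= assertion above allows for). A sketch with an optional stopword
# filter standing in for whatever thinning the real method applies:
def _demo_prepare_for_topic_modelling(posi_token_lists, nega_token_lists,
                                      stopwords=frozenset()):
    return [[t for t in tokens if t not in stopwords]
            for tokens in posi_token_lists + nega_token_lists]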


def test_init():
    test_instance = _make_test_instance()

    assert isinstance(test_instance.posi_training_data_list, list)
    assert isinstance(test_instance.posi_training_data_tokens, list)
    assert isinstance(test_instance.posi_training_data_labels, list)
    assert isinstance(test_instance.posi_training_data_df, DataFrame)

    assert test_instance.posi_training_data_list[0] == [[
        'hey', 'really', 'sorry', 'knocked', 'you', 'down', 'but', 'can',
        'pick', 'you', 'up', 'at'
    ], 1]
    assert test_instance.posi_training_data_tokens[0] == [
        'hey', 'really', 'sorry', 'knocked', 'you', 'down', 'but', 'can',
        'pick', 'you', 'up', 'at'
    ]
    assert test_instance.posi_training_data_labels[0] == 1
    assert test_instance.posi_training_data_df['tweets'][0] == \
        'hey really sorry knocked you down but can pick you up at'
    # positive tweets carry label 1, matching posi_training_data_labels above
    assert test_instance.posi_training_data_df['label'][0] == 1
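

# The constructor presumably reads one preprocessed tweet per line and derives
# the token list, label list, and DataFrame views asserted above. A sketch of
# that loading step (the whitespace-separated file format is an assumption):
def _demo_load_preprocessed_tweets(path, label):
    with open(path) as f:
        tokens_per_tweet = [line.split() for line in f if line.strip()]
    labels = [label] * len(tokens_per_tweet)
    df = pd.DataFrame({
        'tweets': [' '.join(tokens) for tokens in tokens_per_tweet],
        'label': labels,
    })
    return tokens_per_tweet, labels, df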


def test_test_data_fit_in_model():
    test_instance = _make_test_instance()

    X_chi_matrix, feature_name_list_after_feature_selection, token_list_after_feature_selection, ch2, feature_selection_processing_time = test_instance.feature_selection(
        test_instance.posi_test_data_tokens,
        test_instance.nega_test_data_tokens,
        top_n_feature=150)

    top_model, model_topics, highest_coherence_score, dictionary, corpus, lda_lsi_processing_time = test_instance.build_lda_lsi_model(
        token_list_after_feature_selection,
        min_topic_num=3,
        max_topic_num=6,
        model='lda')

    tweet_topic_distribution_df = test_instance.get_tweet_topic_matrix_based_on_best_topic_model(
        top_model, corpus)

    list_k, label_list, model_list, collect_clustering_info_processing_time = test_instance.collect_clustering_info(
        tweet_topic_distribution_df,
        min_cluster_number=2,
        max_cluster_number=10)

    tweet_topic_distribution_with_cluster_df, selected_kmeans_model, number_of_cluster, add_clustering_info_to_df_processing_time = test_instance.add_clustering_info_to_df(
        tweet_topic_distribution_df,
        list_k,
        label_list,
        model_list,
        number_of_cluster=2)

    vectorizer_clf_dict, classifier_building_processing_time = test_instance.classifier_building(
        tweet_topic_distribution_with_cluster_df,
        number_of_cluster,
        token_list_after_feature_selection=token_list_after_feature_selection)

    restructured_X_test_df, cluster_label_list, test_data_fit_in_processing_time = test_instance.test_data_fit_in_model(
        vectorizer_clf_dict, top_model, dictionary, selected_kmeans_model)

    confusion_matrix, classification_report, accuracy_score = test_instance.evaluation(
        restructured_X_test_df)

    Y_test = restructured_X_test_df['label'].tolist()
    Y_pred = restructured_X_test_df['y_pred'].tolist()
    cluster_label_list = restructured_X_test_df['cluster_label'].tolist()

    assert len(Y_test) == len(Y_pred)
    assert len(Y_test) == len(cluster_label_list)

    assert isinstance(confusion_matrix, np.ndarray)
    assert isinstance(classification_report, str)
    assert isinstance(accuracy_score, float)
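

# evaluation() plausibly wraps the three standard sklearn metrics checked
# above; a sketch:
def _demo_evaluation(restructured_X_test_df):
    from sklearn import metrics

    y_true = restructured_X_test_df['label']
    y_pred = restructured_X_test_df['y_pred']
    return (metrics.confusion_matrix(y_true, y_pred),
            metrics.classification_report(y_true, y_pred),
            metrics.accuracy_score(y_true, y_pred))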


def test_add_clustering_info_to_df():
    test_instance = _make_test_instance()

    tweet_topic_distribution_df = DataFrame(
        data={
            0: [0.135607, 0.898807, 0.882122, 0.895495, 0.10, 0.20],
            1: [0.864393, 0.101193, 0.117878, 0.104505, 0.90, 0.80]
        })

    list_k, label_list, model_list, collect_clustering_info_processing_time = test_instance.collect_clustering_info(
        tweet_topic_distribution_df,
        min_cluster_number=1,
        max_cluster_number=3)

    tweet_topic_distribution_with_cluster_df, selected_kmeans_model, number_of_cluster, add_clustering_info_to_df_processing_time = test_instance.add_clustering_info_to_df(
        tweet_topic_distribution_df,
        list_k,
        label_list,
        model_list,
        number_of_cluster=3)

    # .all() rather than .tolist(): a non-empty list is always truthy, so the
    # original .tolist() comparison could never fail
    assert (tweet_topic_distribution_df[0] ==
            tweet_topic_distribution_with_cluster_df[0]).all()
    assert (tweet_topic_distribution_df[1] ==
            tweet_topic_distribution_with_cluster_df[1]).all()
    assert set(tweet_topic_distribution_with_cluster_df.columns) == {
        0, 1, 'Y', 'clustering_labels'
    }
    assert tweet_topic_distribution_with_cluster_df['Y'].tolist() == [
        1, 1, 1, 0, 0, 0
    ]
    assert len(
        tweet_topic_distribution_with_cluster_df['clustering_labels']) == 6
    assert number_of_cluster == 3
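

# add_clustering_info_to_df apparently selects the KMeans run whose k equals
# number_of_cluster and attaches its labels, plus the gold labels, to the
# topic-distribution DataFrame. A sketch (the y argument is illustrative; the
# real method gets the gold labels from the instance):
def _demo_add_clustering_info(df, list_k, label_list, model_list, k, y):
    idx = list_k.index(k)
    out = df.copy()
    out['Y'] = y
    out['clustering_labels'] = label_list[idx]
    return out, model_list[idx]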


def test_bigram_or_unigram_extactor():
    test_instance = _make_test_instance()

    posi_training_token_list = [[
        u'positive', u'tweet', u'one', u'pattern', u'here'
    ], [u'the', u'second', u'token', u'pattern', u'here']]
    nega_training_token_list = [[
        u'negative', u'token', u'one', u'pattern', u'here'
    ], [u'some', u'random', u'things', u'pattern', u'here']]

    # test_list = [['tweet','test','one','not','picked'],['one','pattern','here']]

    posi_unigram_training_token_list, nega_unigram_training_token_list, feature_extraction_processing_time = test_instance.bigram_or_unigram_extactor(
        posi_training_token_list,
        nega_training_token_list,
        bigram_min_count=3,
        mode='unigram')
    assert posi_unigram_training_token_list == posi_training_token_list
    assert nega_unigram_training_token_list == nega_training_token_list

    posi_training_token_list_with_unigram_and_bigram, nega_training_token_list_with_unigram_and_bigram, feature_extraction_processing_time = test_instance.bigram_or_unigram_extactor(
        posi_training_token_list,
        nega_training_token_list,
        bigram_min_count=2,
        threshold=1,
        mode='uni_and_bigram')
    assert posi_training_token_list_with_unigram_and_bigram == [[
        u'positive', u'tweet', u'one', u'pattern', u'here', u'pattern_here'
    ], [u'the', u'second', u'token', u'pattern', u'here', u'pattern_here']]
    assert nega_training_token_list_with_unigram_and_bigram == [[
        u'negative', u'token', u'one', u'pattern', u'here', u'pattern_here'
    ], [u'some', u'random', u'things', u'pattern', u'here', u'pattern_here']]
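

# In uni_and_bigram mode the extractor keeps every unigram and *appends* the
# detected bigrams (note 'pattern_here' above). gensim's phraser normally
# replaces the word pair instead, so a sketch matching the append behaviour
# runs the phraser and tacks the joined tokens on (gensim 4 API assumed):
def _demo_append_bigrams(token_lists, bigram_min_count, threshold):
    from gensim.models.phrases import Phrases, Phraser

    phraser = Phraser(Phrases(token_lists, min_count=bigram_min_count,
                              threshold=threshold, delimiter='_'))
    out = []
    for tokens in token_lists:
        # anything containing the delimiter is a newly formed bigram
        bigrams = [t for t in phraser[tokens] if '_' in t]
        out.append(tokens + bigrams)
    return out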


def test_to_string_list_tool():
    test_instance = _make_test_instance()

    test_df = DataFrame(data={
        'tweet': ['This is tweet one', 'This is tweet two'],
        'others': [1, 2]
    })

    test_token_list = [['This', 'is', 'tweet', 'one'],
                       ['This', 'is', 'tweet', 'two']]

    token_list = test_instance.to_string_list_tool(
        input=test_df,
        tweet_column_name_in_df='tweet',
        mode='df_to_token_list')
    string_list_from_df = test_instance.to_string_list_tool(
        input=test_df,
        tweet_column_name_in_df='tweet',
        mode='df_to_string_list')
    string_list_from_token_list = test_instance.to_string_list_tool(
        input=test_token_list, mode='token_list_to_string_list')

    expected_token_list = [['This', 'is', 'tweet', 'one'],
                           ['This', 'is', 'tweet', 'two']]
    expected_string_list_from_df = ['This is tweet one', 'This is tweet two']
    expected_string_list_from_token_list = [
        'This is tweet one', 'This is tweet two'
    ]

    assert token_list == expected_token_list
    assert string_list_from_df == expected_string_list_from_df
    assert string_list_from_token_list == expected_string_list_from_token_list


def test_data_resampling():
    test_instance = _make_test_instance()

    training_posi_resampled_token_list, training_nega_resampled_token_list, resampling_processing_time = test_instance.data_resampling(
        mode='r_under_s')

    assert isinstance(training_posi_resampled_token_list, list)
    assert isinstance(training_nega_resampled_token_list, list)
    assert len(training_posi_resampled_token_list) == len(
        training_nega_resampled_token_list)

    training_posi_resampled_token_list, training_nega_resampled_token_list, resampling_processing_time = test_instance.data_resampling(
        mode='r_upper_s')
    assert len(training_posi_resampled_token_list) == len(
        training_nega_resampled_token_list)
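

# data_resampling balances the two classes; 'r_under_s' reads as random
# under-sampling and 'r_upper_s' as random over-sampling. A sketch of both
# with sklearn.utils.resample:
def _demo_resample(posi_token_lists, nega_token_lists, mode):
    from sklearn.utils import resample

    if mode == 'r_under_s':
        # randomly drop items from each class down to the minority size
        n = min(len(posi_token_lists), len(nega_token_lists))
        replace = False
    else:
        # randomly duplicate items from each class up to the majority size
        n = max(len(posi_token_lists), len(nega_token_lists))
        replace = True
    posi = resample(posi_token_lists, replace=replace, n_samples=n,
                    random_state=0)
    nega = resample(nega_token_lists, replace=replace, n_samples=n,
                    random_state=0)
    return posi, nega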


def test_baseline_test_data_fit_in_model():
    test_instance = _make_test_instance()

    training_posi_resampled_token_list, training_nega_resampled_token_list, resampling_processing_time = test_instance.data_resampling(
        test_instance.posi_training_data_df,
        test_instance.nega_training_data_df)

    posi_bigram_training_token_list, nega_bigram_training_token_list, feature_extraction_processing_time = test_instance.bigram_or_unigram_extactor(
        training_posi_resampled_token_list,
        training_nega_resampled_token_list,
        mode='uni_and_bigram',
        bigram_min_count=3)

    X_chi_matrix, feature_name_list_after_feature_selection, token_list_after_feature_selection, ch2, feature_selection_processing_time = test_instance.feature_selection(
        posi_bigram_training_token_list,
        nega_bigram_training_token_list,
        top_n_feature=150)

    top_model, model_topics, highest_coherence_score, dictionary, corpus, lda_lsi_processing_time = test_instance.build_lda_lsi_model(
        token_list_after_feature_selection,
        min_topic_num=3,
        max_topic_num=6,
        model='lda')

    tweet_topic_distribution_df = test_instance.get_tweet_topic_matrix_based_on_best_topic_model(
        top_model, corpus)

    list_k, label_list, model_list, collect_clustering_info_processing_time = test_instance.collect_clustering_info(
        tweet_topic_distribution_df,
        min_cluster_number=2,
        max_cluster_number=10)

    tweet_topic_distribution_with_cluster_df, selected_kmeans_model, number_of_cluster, add_clustering_info_to_df_processing_time = test_instance.add_clustering_info_to_df(
        tweet_topic_distribution_df,
        list_k,
        label_list,
        model_list,
        number_of_cluster=2)

    vectorizer_clf_dict, classifier_building_processing_time = test_instance.classifier_building(
        tweet_topic_distribution_with_cluster_df,
        number_of_cluster,
        token_list_after_feature_selection=token_list_after_feature_selection)

    restructured_X_test_df, cluster_label_list, test_data_fit_in_processing_time = test_instance.test_data_fit_in_model(
        vectorizer_clf_dict, top_model, dictionary, selected_kmeans_model)

    vectorizer, baseline_clf_dict, baseline_classifier_building_processing_time = test_instance.baseline_model_builder(
        token_list_after_feature_selection, mode='tfidf')

    restructured_X_test_df, baseline_clf_name_list, baseline_test_data_fit_in_processing_time = test_instance.baseline_test_data_fit_in_model(
        vectorizer, baseline_clf_dict, restructured_X_test_df)

    evaluation_dict = test_instance.baseline_evaluation(
        restructured_X_test_df, baseline_clf_name_list)

    assert sorted(evaluation_dict.keys()) == [
        'logistic_regression', 'naive_bayes'
    ]

    assert isinstance(evaluation_dict['logistic_regression'][0],
                      np.ndarray)  # confusion matrix
    assert isinstance(evaluation_dict['logistic_regression'][1],
                      str)  # classification report
    assert isinstance(evaluation_dict['logistic_regression'][2],
                      float)  # accuracy score