Beispiel #1
0
def Experiment2():
    """Run the "wordlength" experiment from raw data to a result file.

    Steps, in order:
      1. Load the four per-class training sets and the test set with
         ``func.read_data`` from ``'Data/hns_2018_2019.csv'``.
      2. Run ``func.e2_get_count`` on each class (story, ask_hn, show_hn,
         poll); each call yields a 4-tuple unpacked here as
         (counts, vocabulary, removed words, total).
      3. Derive one prior per class as that class's total divided by the
         sum of all four totals, using ``Decimal`` arithmetic.
      4. Merge the per-class vocabularies and removed-word lists and hand
         them to ``func.save_file`` under ``wordlength/``.
      5. Build the per-class model with ``func.create_model`` and evaluate
         it on the test set with ``func.test_model``.

    Returns nothing. The helpers in ``func`` presumably write the files
    named by the path arguments; confirm against their definitions.
    """
    rows_story, rows_ask_hn, rows_show_hn, rows_poll, held_out = \
        func.read_data('Data/hns_2018_2019.csv')

    # Per-class statistics, always in the order story, ask_hn, show_hn, poll.
    freq_story, vocab_story, dropped_story, words_story = func.e2_get_count(rows_story)
    freq_ask_hn, vocab_ask_hn, dropped_ask_hn, words_ask_hn = func.e2_get_count(rows_ask_hn)
    freq_show_hn, vocab_show_hn, dropped_show_hn, words_show_hn = func.e2_get_count(rows_show_hn)
    freq_poll, vocab_poll, dropped_poll, words_poll = func.e2_get_count(rows_poll)

    # Class priors: each class's share of the combined total.
    words_all = words_story + words_ask_hn + words_show_hn + words_poll
    denominator = Decimal(words_all)
    prior_story = Decimal(words_story) / denominator
    prior_ask_hn = Decimal(words_ask_hn) / denominator
    prior_show_hn = Decimal(words_show_hn) / denominator
    prior_poll = Decimal(words_poll) / denominator

    merged_vocab = func.get_vocabulary_list(vocab_story, vocab_ask_hn, vocab_show_hn, vocab_poll)
    merged_dropped = func.get_remove_list(dropped_story, dropped_ask_hn, dropped_show_hn, dropped_poll)

    func.save_file('wordlength/wordlength_vocabulary.txt', merged_vocab)
    func.save_file('wordlength/wordlength_remove.txt', merged_dropped)

    # create_model takes (count, total, vocabulary) for each class in turn.
    model_story, model_ask_hn, model_show_hn, model_poll = func.create_model(
        merged_vocab,
        freq_story, words_story, vocab_story,
        freq_ask_hn, words_ask_hn, vocab_ask_hn,
        freq_show_hn, words_show_hn, vocab_show_hn,
        freq_poll, words_poll, vocab_poll,
        'wordlength/wordlength-model.txt')

    func.test_model(
        merged_vocab, held_out,
        model_story, model_ask_hn, model_show_hn, model_poll,
        prior_story, prior_ask_hn, prior_show_hn, prior_poll,
        "wordlength/wordlength-result.txt")
Beispiel #2
0
def main():
    """Run the baseline pipeline: count, build the model, then test it.

    The dataset named by the module-level ``data`` is loaded through
    ``func.read_data`` into four per-class training sets plus a test set.
    ``func.get_count`` is applied to each class (story, ask_hn, show_hn,
    poll) and its 4-tuple result is unpacked as
    (counts, vocabulary, removed words, total).

    Each class prior is that class's total divided by the sum of the four
    totals, computed with ``Decimal``. The merged vocabulary and merged
    removed-word list are passed to ``func.save_file`` under ``baseline/``,
    then ``func.create_model`` and ``func.test_model`` are invoked with the
    ``baseline/`` model and result paths.

    Returns nothing.
    """
    rows_story, rows_ask_hn, rows_show_hn, rows_poll, held_out = func.read_data(data)

    # Per-class statistics, always in the order story, ask_hn, show_hn, poll.
    freq_story, vocab_story, dropped_story, words_story = func.get_count(rows_story)
    freq_ask_hn, vocab_ask_hn, dropped_ask_hn, words_ask_hn = func.get_count(rows_ask_hn)
    freq_show_hn, vocab_show_hn, dropped_show_hn, words_show_hn = func.get_count(rows_show_hn)
    freq_poll, vocab_poll, dropped_poll, words_poll = func.get_count(rows_poll)

    # Class priors: each class's share of the combined total.
    words_all = words_story + words_ask_hn + words_show_hn + words_poll
    denominator = Decimal(words_all)
    prior_story = Decimal(words_story) / denominator
    prior_ask_hn = Decimal(words_ask_hn) / denominator
    prior_show_hn = Decimal(words_show_hn) / denominator
    prior_poll = Decimal(words_poll) / denominator

    merged_vocab = func.get_vocabulary_list(vocab_story, vocab_ask_hn, vocab_show_hn, vocab_poll)
    merged_dropped = func.get_remove_list(dropped_story, dropped_ask_hn, dropped_show_hn, dropped_poll)

    func.save_file('baseline/vocabulary.txt', merged_vocab)
    func.save_file('baseline/remove.txt', merged_dropped)

    # create_model takes (count, total, vocabulary) for each class in turn.
    model_story, model_ask_hn, model_show_hn, model_poll = func.create_model(
        merged_vocab,
        freq_story, words_story, vocab_story,
        freq_ask_hn, words_ask_hn, vocab_ask_hn,
        freq_show_hn, words_show_hn, vocab_show_hn,
        freq_poll, words_poll, vocab_poll,
        'baseline/model-2018.txt')

    func.test_model(
        merged_vocab, held_out,
        model_story, model_ask_hn, model_show_hn, model_poll,
        prior_story, prior_ask_hn, prior_show_hn, prior_poll,
        "baseline/baselineresult.txt")
Beispiel #3
0
def Experiment1():
    """Run the "stopword" experiment from raw data to a result file.

    Same pipeline shape as the baseline, except that the per-class counting
    step is ``func.e1_get_count``, which additionally receives the list
    returned by ``func.read_stop_words('Data/stopwords.txt')``.

    Steps, in order: load data via ``func.read_data(data)``; load the stop
    word list; count each class (story, ask_hn, show_hn, poll); compute one
    ``Decimal`` prior per class as its total over the sum of all totals;
    merge and save the vocabulary and removed-word lists under
    ``stopword/``; build the model with ``func.create_model``; evaluate it
    with ``func.test_model``.

    Returns nothing.
    """
    rows_story, rows_ask_hn, rows_show_hn, rows_poll, held_out = func.read_data(data)
    stop_words = func.read_stop_words('Data/stopwords.txt')

    # Per-class statistics, always in the order story, ask_hn, show_hn, poll.
    freq_story, vocab_story, dropped_story, words_story = \
        func.e1_get_count(rows_story, stop_words)
    freq_ask_hn, vocab_ask_hn, dropped_ask_hn, words_ask_hn = \
        func.e1_get_count(rows_ask_hn, stop_words)
    freq_show_hn, vocab_show_hn, dropped_show_hn, words_show_hn = \
        func.e1_get_count(rows_show_hn, stop_words)
    freq_poll, vocab_poll, dropped_poll, words_poll = \
        func.e1_get_count(rows_poll, stop_words)

    # Class priors: each class's share of the combined total.
    words_all = words_story + words_ask_hn + words_show_hn + words_poll
    denominator = Decimal(words_all)
    prior_story = Decimal(words_story) / denominator
    prior_ask_hn = Decimal(words_ask_hn) / denominator
    prior_show_hn = Decimal(words_show_hn) / denominator
    prior_poll = Decimal(words_poll) / denominator

    merged_vocab = func.get_vocabulary_list(vocab_story, vocab_ask_hn, vocab_show_hn, vocab_poll)
    merged_dropped = func.get_remove_list(dropped_story, dropped_ask_hn, dropped_show_hn, dropped_poll)

    func.save_file('stopword/stopword_vocabulary.txt', merged_vocab)
    func.save_file('stopword/stopword_remove.txt', merged_dropped)

    # create_model takes (count, total, vocabulary) for each class in turn.
    model_story, model_ask_hn, model_show_hn, model_poll = func.create_model(
        merged_vocab,
        freq_story, words_story, vocab_story,
        freq_ask_hn, words_ask_hn, vocab_ask_hn,
        freq_show_hn, words_show_hn, vocab_show_hn,
        freq_poll, words_poll, vocab_poll,
        'stopword/stopword-model.txt')

    func.test_model(
        merged_vocab, held_out,
        model_story, model_ask_hn, model_show_hn, model_poll,
        prior_story, prior_ask_hn, prior_show_hn, prior_poll,
        "stopword/stopword-result.txt")
Beispiel #4
0
def Experiment3_2():
    """Plot merged vocabulary size against five fractional thresholds (part 2).

    For each threshold in 0.05, 0.1, 0.15, 0.2 and 0.25, every class's
    training set (story, ask_hn, show_hn, poll) is passed to
    ``func.e3_2_get_count`` together with the threshold. The four resulting
    vocabularies are merged with ``func.get_vocabulary_list`` and the length
    of the merged list is recorded.

    The recorded sizes are drawn as red circles with matplotlib. The x-axis
    spans 0..1 and the y-axis spans 0 to the largest recorded size plus 50,
    so that no point can fall outside the visible area.

    Returns nothing; shows a plot window via ``plt.show()``.
    """
    # part 2
    thresholds = [0.05, 0.1, 0.15, 0.2, 0.25]

    # The test set is returned by read_data but is not needed for this plot.
    training_data_story, training_data_ask_hn, training_data_show_hn, training_data_poll, _test_set = \
        func.read_data(data)
    training_sets = (training_data_story, training_data_ask_hn,
                     training_data_show_hn, training_data_poll)

    vocabulary_sizes = []
    for threshold in thresholds:
        class_vocabularies = []
        for training_set in training_sets:
            # Only the vocabulary (second element) is used for the size measurement.
            _count, vocabulary, _removed, _total_word = \
                func.e3_2_get_count(training_set, threshold)
            class_vocabularies.append(vocabulary)
        vocabulary_sizes.append(len(func.get_vocabulary_list(*class_vocabularies)))

    plt.plot(thresholds, vocabulary_sizes, 'ro')
    # Upper y bound must consider every measured size, not just the first three.
    plt.axis([0, 1, 0, max(vocabulary_sizes) + 50])
    plt.ylabel('number of vocabulary left')
    plt.xlabel('threshold')
    plt.title("2")
    plt.show()
Beispiel #5
0
def Experiment3_1():
    """Plot merged vocabulary size against four integer thresholds (part 1).

    For each threshold in 1, 5, 10 and 15, every class's training set
    (story, ask_hn, show_hn, poll) is passed to ``func.e3_1_get_count``
    together with the threshold. The four resulting vocabularies are merged
    with ``func.get_vocabulary_list`` and the length of the merged list is
    recorded.

    The recorded sizes are drawn as red circles with matplotlib. The x-axis
    spans 0..20 and the y-axis spans 0 to the largest recorded size plus 50.

    Returns nothing; shows a plot window via ``plt.show()``.
    """
    # part 1
    cutoffs = [1, 5, 10, 15]

    # The test set is returned by read_data but is not needed for this plot.
    rows_story, rows_ask_hn, rows_show_hn, rows_poll, _held_out = func.read_data(data)
    class_rows = (rows_story, rows_ask_hn, rows_show_hn, rows_poll)

    sizes = []
    for cutoff in cutoffs:
        per_class_vocab = []
        for rows in class_rows:
            # Only the vocabulary (second element) is used for the size measurement.
            _freq, vocab, _dropped, _words = func.e3_1_get_count(rows, cutoff)
            per_class_vocab.append(vocab)
        merged_vocab = func.get_vocabulary_list(*per_class_vocab)
        sizes.append(len(merged_vocab))

    # One red circle per threshold; leave 50 units of headroom above the tallest point.
    plt.plot(cutoffs, sizes, 'ro')
    y_upper = max(sizes) + 50
    plt.axis([0, 20, 0, y_upper])
    plt.ylabel('number of vocabulary left')
    plt.xlabel('threshold')
    plt.title("1")
    plt.show()