Example No. 1
from nltk.probability import FreqDist

def get_words_bigrams_frequency(docs, path):
    # Count how often each bigram occurs across all documents.
    fdist = FreqDist()
    for doc in docs:
        for bigram in doc:
            fdist[bigram] += 1
    # Write "bigram: count" lines, most frequent first (append_row_to_text_file is a project helper).
    for bigram, count in fdist.most_common():
        append_row_to_text_file(str(bigram) + ": " + str(count), path)
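The snippet above only does the counting; how docs is built is not shown. A minimal sketch, assuming each document is already tokenized and that nltk.util.bigrams is used to form bigram strings (the toy texts and the space-joining scheme are assumptions, not part of the original):

from nltk.util import bigrams

tokenized = [['the', 'cat', 'sat'], ['the', 'dog', 'sat']]  # toy tokenized documents (assumption)
docs = [[' '.join(pair) for pair in bigrams(tokens)] for tokens in tokenized]
# docs == [['the cat', 'cat sat'], ['the dog', 'dog sat']]
# get_words_bigrams_frequency(docs, 'bigram_frequencies.txt') would then count these strings.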
Example No. 2
from nltk import word_tokenize
from nltk.probability import FreqDist

def get_words_frequency(data, path):
    # Count token frequencies over the Submission_Text column of a DataFrame.
    fdist = FreqDist()
    for text in data.Submission_Text:
        for word in word_tokenize(text):
            fdist[word] += 1
    # Write "word: count" lines, most frequent first (append_row_to_text_file is a project helper).
    for word, count in fdist.most_common():
        append_row_to_text_file(word + ": " + str(count), path)
    return fdist
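A minimal usage sketch for get_words_frequency, assuming a toy DataFrame and a simple stand-in for the project's append_row_to_text_file helper (both are assumptions, not part of the original; word_tokenize also requires the NLTK 'punkt' tokenizer data):

import pandas as pd

def append_row_to_text_file(string, path):
    # Hypothetical stand-in for the project helper: append one line to a text file.
    with open(path, 'a', encoding='utf-8') as f:
        f.write(string + '\n')

# nltk.download('punkt')  # needed once for word_tokenize
data = pd.DataFrame({'Submission_Text': ['the cat sat on the mat', 'the dog sat']})
fdist = get_words_frequency(data, 'word_frequencies.txt')
print(fdist.most_common(2))  # [('the', 3), ('sat', 2)]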
Example No. 3
def get_words_frequency2(dtm, path):
    # Sum each column of the document-term matrix to get the total count per word.
    word_counts = {}
    for word in dtm.columns.tolist():
        word_counts[word] = dtm[word].sum()

    # Write "word: count" lines sorted by descending frequency.
    for word, count in sorted(word_counts.items(), key=lambda item: item[1], reverse=True):
        append_row_to_text_file(word + ": " + str(count), path)
    return word_counts
Example No. 4
def get_top_10_words_each_comment(tdm, path):
    # In the term-document matrix, each column is one comment and each row is a term.
    top_10_words = {}
    for comment in tdm.columns:
        # Keep the 10 highest-count terms for this comment.
        top = tdm[comment].sort_values(ascending=False).head(10)
        top_10_words[comment] = list(zip(top.index, top.values))
        rs = str(comment) + ':\n' + str(top_10_words[comment])
        append_row_to_text_file(rs, path)
    return top_10_words
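Neither of the two snippets above shows how the matrices are built. A minimal sketch, assuming scikit-learn's CountVectorizer and a toy corpus: dtm (documents as rows, terms as columns) feeds get_words_frequency2, and its transpose with string column names matches what get_top_10_words_each_comment expects:

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

texts = ['the cat sat on the mat', 'the dog sat']  # toy comments (assumption)
vectorizer = CountVectorizer()
counts = vectorizer.fit_transform(texts)
dtm = pd.DataFrame(counts.toarray(), columns=vectorizer.get_feature_names_out())
tdm = dtm.transpose()  # rows become terms, columns become comments
tdm.columns = ['comment_' + str(i) for i in range(len(texts))]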
Example No. 5
from gensim.corpora import Dictionary
from gensim.models import CoherenceModel

def print_topic_coherence_to_text_file(text_file_path, num_topics, lda_model, corpus, path_dict):
    create_file(text_file_path)  # project helper that creates the output file
    # top_topics = lda_model.top_topics(corpus)
    # avg_topic_coherence = sum([topic[1] for topic in top_topics]) / num_topics
    # append_row_to_text_file(
    #     str('Average topic coherence: %.9f\n' % avg_topic_coherence),
    #     text_file_path
    # )

    # CoherenceModel expects the Dictionary object itself, not its id2token mapping.
    dictionary = Dictionary.load(path_dict)
    coherence_model_lda = CoherenceModel(model=lda_model, corpus=corpus,
                                         dictionary=dictionary, coherence='u_mass')
    coherence_lda = coherence_model_lda.get_coherence()
    append_row_to_text_file(str('Coherence Score: %.9f\n' % coherence_lda),
                            text_file_path)
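The gensim part of the snippet can be exercised on its own. A minimal sketch with a toy corpus (the texts, topic count, and random_state are assumptions; the project's file-writing helpers are omitted):

from gensim.corpora import Dictionary
from gensim.models import CoherenceModel, LdaModel

docs = [['cat', 'dog', 'mouse'], ['dog', 'bone'], ['cat', 'mouse', 'cheese']]  # toy tokenized docs
dictionary = Dictionary(docs)
corpus = [dictionary.doc2bow(doc) for doc in docs]
lda_model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=2, random_state=1)

# u_mass coherence needs only the corpus and the dictionary, no raw texts.
cm = CoherenceModel(model=lda_model, corpus=corpus, dictionary=dictionary, coherence='u_mass')
print('Coherence Score: %.9f' % cm.get_coherence())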
Example No. 6
import math
import os
import pickle

import pandas
from gensim.corpora import Dictionary

def calculate_mean_std_deviation(raw_dataset_csv_path, corpus_path,
                                 dictionary_path, output_path):
    with open(corpus_path, 'rb') as f:
        corpus = pickle.load(f)  # Bag-of-words corpus
    dictionary = Dictionary.load(dictionary_path)

    # Mean document length (in words) of the raw data.
    raw_dataset = pandas.read_csv(raw_dataset_csv_path)
    raw_words_count = 0
    for text in raw_dataset['Submission_Text']:
        raw_words_count += len(text.split())
    raw_mean = raw_words_count / len(raw_dataset['Submission_Text'])

    # Mean document length of the cleaned data: total term frequency / number of documents.
    words_count = sum(dictionary.cfs.values())
    mean = words_count / len(corpus)

    # Population standard deviation of document length, raw data.
    raw_std_deviation = 0
    for text in raw_dataset['Submission_Text']:
        x = len(text.split())
        raw_std_deviation += (x - raw_mean) * (x - raw_mean)
    raw_std_deviation /= len(raw_dataset['Submission_Text'])
    raw_std_deviation = math.sqrt(raw_std_deviation)

    # Population standard deviation of document length, cleaned data.
    std_deviation = 0
    for doc in corpus:
        x = sum(count for _, count in doc)  # word count for this document
        std_deviation += (x - mean) * (x - mean)
    std_deviation /= len(corpus)
    std_deviation = math.sqrt(std_deviation)

    if not os.path.exists(output_path):
        create_file(output_path)  # project helper
    rs_text = f'raw_mean = {raw_mean}\traw_stdDeviation = {raw_std_deviation}\n' \
              f'mean = {mean}\tstd_deviation = {std_deviation}'
    append_row_to_text_file(string=rs_text,
                            path=output_path)  # Output to txt file
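Both loops compute population statistics (division by N, not N - 1). A quick numpy cross-check of the raw-data figures, under a toy-DataFrame assumption:

import numpy as np
import pandas as pd

raw_dataset = pd.DataFrame({'Submission_Text': ['the cat sat', 'a dog', 'birds fly south today']})
lengths = raw_dataset['Submission_Text'].str.split().str.len()
print(lengths.mean())   # raw_mean
print(np.std(lengths))  # raw_std_deviation (np.std defaults to ddof=0, the population formula)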
Example No. 7
def print_topics_to_text_file(lda_model, text_file_path, num_words):
    # Write each topic id (1-based) and its top num_words terms to the text file.
    for topic_id, topic_terms in lda_model.show_topics(num_words=num_words):
        rs = 'Topic: ' + str(topic_id + 1) + '\n' + str(topic_terms)
        append_row_to_text_file(rs, text_file_path)
Example No. 8
def print_perplexity_to_text_file(text_file_path, lda_model, corpus):
    # log_perplexity returns the per-word likelihood bound on the given corpus.
    append_row_to_text_file(str('Log Perplexity: %.9f\n' % lda_model.log_perplexity(corpus)),
                            text_file_path)
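For reference, gensim's log_perplexity docstring describes the return value as the per-word likelihood bound, with perplexity = 2^(-bound). A self-contained sketch on a toy corpus (toy data and model settings are assumptions):

from gensim.corpora import Dictionary
from gensim.models import LdaModel

docs = [['cat', 'dog'], ['dog', 'bone'], ['cat', 'cheese']]  # toy tokenized docs (assumption)
dictionary = Dictionary(docs)
corpus = [dictionary.doc2bow(d) for d in docs]
lda_model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=2, random_state=1)

bound = lda_model.log_perplexity(corpus)
print('Log Perplexity: %.9f' % bound)
print('Perplexity: %.3f' % (2 ** -bound))  # gensim reports perplexity as 2^(-bound)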