import math
import os
import pickle

import pandas
from gensim.corpora import Dictionary
from gensim.models import CoherenceModel
from nltk import FreqDist
from nltk.tokenize import word_tokenize

# append_row_to_text_file(...) and create_file(...) are small file helpers
# assumed to be defined elsewhere in this project.


def get_words_bigrams_frequency(docs, path):
    """Count token (word or bigram) frequencies across pre-tokenized docs and write them to a text file."""
    fdist = FreqDist()
    for doc in docs:
        for word in doc:
            fdist[word] += 1
    for word, count in fdist.most_common():
        append_row_to_text_file(word + ": " + str(count), path)
def get_words_frequency(data, path):
    """Tokenize every submission text, count word frequencies, write them to a text file and return the FreqDist."""
    fdist = FreqDist()
    for text in data.Submission_Text:
        for word in word_tokenize(text):
            fdist[word] += 1
    for word, count in fdist.most_common():
        append_row_to_text_file(word + ": " + str(count), path)
    return fdist
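# A minimal, hypothetical usage sketch for the frequency helpers above; the file
# names and this demo function are illustrative, not part of the pipeline. The
# only real assumption is a CSV with the 'Submission_Text' column used above.
def _example_word_frequencies():
    data = pandas.read_csv('comments.csv')  # hypothetical input file
    fdist = get_words_frequency(data, 'word_freq.txt')
    print(fdist.most_common(5))  # e.g. the five most frequent tokens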
def get_words_frequency2(dtm, path):
    """Sum each column of a document-term matrix and write the per-word totals in descending order."""
    word_counts = {}
    for word in dtm.columns.tolist():
        word_counts[word] = dtm[word].sum()
    for word, count in sorted(word_counts.items(), key=lambda item: item[1], reverse=True):
        append_row_to_text_file(word + ": " + str(count), path)
    return word_counts
def get_top_10_words_each_comment(tdm, path):
    """For each document (column) of a term-document matrix, write its 10 highest-scoring words."""
    top_10_words = {}
    for doc in tdm.columns:
        top = tdm[doc].sort_values(ascending=False).head(10)
        top_10_words[doc] = list(zip(top.index, top.values))
        append_row_to_text_file(str(doc) + ':\n' + str(top_10_words[doc]), path)
    return top_10_words
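# A hedged sketch of how a matrix for get_words_frequency2 / get_top_10_words_each_comment
# could be built; the texts, output paths and the use of scikit-learn's CountVectorizer are
# assumptions for illustration (any DataFrame with the right orientation works the same way).
def _example_matrix_frequencies(texts):
    from sklearn.feature_extraction.text import CountVectorizer
    vectorizer = CountVectorizer()
    counts = vectorizer.fit_transform(texts)
    dtm = pandas.DataFrame(counts.toarray(), columns=vectorizer.get_feature_names_out())
    get_words_frequency2(dtm, 'word_totals.txt')           # documents as rows, terms as columns
    get_top_10_words_each_comment(dtm.T, 'top_words.txt')  # transposed: terms as rows, documents as columns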
def print_topic_coherence_to_text_file(text_file_path, num_topics, lda_model, corpus, path_dict):
    """Compute the u_mass coherence of an LDA model and write the score to a text file."""
    create_file(text_file_path)
    # top_topics = lda_model.top_topics(corpus)
    # avg_topic_coherence = sum([topic[1] for topic in top_topics]) / num_topics
    # append_row_to_text_file(
    #     str('Average topic coherence: %.9f\n' % avg_topic_coherence),
    #     text_file_path
    # )
    dictionary = Dictionary.load(path_dict)
    _ = dictionary[0]  # Accessing an item forces gensim to populate dictionary.id2token.
    coherence_model_lda = CoherenceModel(model=lda_model,
                                         corpus=corpus,
                                         dictionary=dictionary,  # pass the Dictionary itself, not id2token
                                         coherence='u_mass')
    coherence_lda = coherence_model_lda.get_coherence()
    append_row_to_text_file('Coherence Score: %.9f\n' % coherence_lda, text_file_path)
def calculate_mean_std_deviation(raw_dataset_csv_path, corpus_path, dictionary_path, output_path):
    """Compute the mean and standard deviation of document length for both the raw and the cleaned dataset."""
    with open(corpus_path, 'rb') as f:
        corpus = pickle.load(f)  # Bag-of-words corpus
    dictionary = Dictionary.load(dictionary_path)
    raw_dataset = pandas.read_csv(raw_dataset_csv_path)

    # Mean document length of the raw data
    raw_words_count = 0
    for text in raw_dataset['Submission_Text']:
        raw_words_count += len(text.split())
    raw_mean = raw_words_count / len(raw_dataset['Submission_Text'])

    # Mean document length of the cleaned data (total collection frequency / number of docs)
    words_count = 0
    for word_id, word_count in dictionary.cfs.items():
        words_count += word_count
    mean = words_count / len(corpus)

    # Standard deviation of the raw document lengths
    raw_std_deviation = 0
    for text in raw_dataset['Submission_Text']:
        x = len(text.split())
        raw_std_deviation += (x - raw_mean) ** 2
    raw_std_deviation = math.sqrt(raw_std_deviation / len(raw_dataset['Submission_Text']))

    # Standard deviation of the cleaned document lengths
    std_deviation = 0
    for doc in corpus:
        x = sum(word[1] for word in doc)  # word count of this document
        std_deviation += (x - mean) ** 2
    std_deviation = math.sqrt(std_deviation / len(corpus))

    if not os.path.exists(output_path):
        create_file(output_path)
    rs_text = (f'raw_mean = {raw_mean}\traw_stdDeviation = {raw_std_deviation}\n'
               f'mean = {mean}\tstd_deviation = {std_deviation}')
    append_row_to_text_file(string=rs_text, path=output_path)  # Output to txt file
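# A small, optional cross-check for calculate_mean_std_deviation: the population
# standard deviation it computes by hand should match numpy.std (which also uses
# ddof=0 by default). numpy and the sample lengths are illustrative assumptions.
def _example_check_std(lengths):
    import numpy
    mean = sum(lengths) / len(lengths)
    variance = sum((x - mean) ** 2 for x in lengths) / len(lengths)
    assert abs(math.sqrt(variance) - numpy.std(lengths)) < 1e-9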
def print_topics_to_text_file(lda_model, text_file_path, num_words):
    """Write each LDA topic (numbered from 1) and its top words to a text file."""
    # Note: show_topics() returns only the first 10 topics by default;
    # pass num_topics=-1 if the model has more and all should be listed.
    for topic_id, topic_terms in lda_model.show_topics(num_words=num_words):
        append_row_to_text_file('Topic: ' + str(topic_id + 1) + '\n' + topic_terms, text_file_path)
def print_perplexity_to_text_file(text_file_path, lda_model, corpus):
    """Write the model's per-word log perplexity bound on the corpus to a text file."""
    append_row_to_text_file('Log Perplexity: %.9f\n' % lda_model.log_perplexity(corpus), text_file_path)
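# A hedged end-to-end sketch tying the reporting helpers together; every path
# and this demo function are hypothetical. It assumes a saved gensim LdaModel
# plus the pickled bag-of-words corpus and Dictionary used elsewhere above.
def _example_report_lda(model_path, corpus_path, dict_path):
    from gensim.models import LdaModel
    lda_model = LdaModel.load(model_path)
    with open(corpus_path, 'rb') as f:
        corpus = pickle.load(f)
    print_topics_to_text_file(lda_model, 'topics.txt', num_words=10)
    print_perplexity_to_text_file('perplexity.txt', lda_model, corpus)
    print_topic_coherence_to_text_file('coherence.txt', lda_model.num_topics,
                                       lda_model, corpus, dict_path)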