def freq_baseline(wordcloud=False, venn=False, minimum_count=10):
    """Build candidate aspect-term lists from noun frequency counts.

    For every nouns file discovered by ``check_for_nouns_files``, counts
    noun frequencies and writes every term whose count reaches
    ``minimum_count`` (as ``"term: count"`` lines) to an
    ``aspect_terms_<filename>`` file.

    Args:
        wordcloud: when True, also write a word-cloud input file via
            ``words_for_wordcloud``.
        venn: when True, also write a ``venn_<filename>`` file containing
            just the qualifying terms (no counts).
        minimum_count: inclusive frequency threshold for a candidate
            aspect term (previously hard-coded to 10).
    """
    basicConfig(format='%(levelname)s %(message)s', level=INFO)
    for filename in check_for_nouns_files():
        nouns = post_processing(filename)
        frequencies = FreqDist(nouns).most_common()
        # Keep only terms occurring at least `minimum_count` times.
        candidate_aspect_terms = [
            '{}: {}'.format(term, count)
            for term, count in frequencies
            if count >= minimum_count
        ]
        for_venn = [term for term, count in frequencies if count >= minimum_count]
        # Corpus name is the prefix before the first underscore in the filename.
        corpus = filename.split('_')[0]
        pp.save_to_file('aspect_terms_{}'.format(filename), candidate_aspect_terms)
        if venn:
            pp.save_to_file('venn_{}'.format(filename), for_venn)
        if wordcloud:
            words_for_wordcloud(corpus, frequencies, nouns)
def words_for_wordcloud(corpus, frequencies, nouns, minimum_count=10):
    """Write a space-separated word list for word-cloud generation.

    Keeps every occurrence (with repetition) of the nouns whose total
    frequency reaches ``minimum_count`` and saves them as one
    space-joined string to ``wordcloud_<corpus>.txt``.

    Args:
        corpus: corpus name used in the output filename.
        frequencies: iterable of (term, count) pairs, e.g. from
            ``FreqDist.most_common()``.
        nouns: the full noun sequence (repetitions preserved so the
            word-cloud weights reflect frequency).
        minimum_count: inclusive frequency threshold (previously
            hard-coded to 10).
    """
    # A set gives O(1) membership tests; the original scanned a list
    # for every noun, which is quadratic on large corpora.
    frequent_terms = {term for term, count in frequencies if count >= minimum_count}
    words = ' '.join(noun for noun in nouns if noun in frequent_terms)
    pp.save_to_file('wordcloud_{}.txt'.format(corpus), words)
def write_nouns_to_file(result, corpus, tag, filename):
    """Write the words from tagged pairs matching ``tag`` to a nouns file.

    Args:
        result: iterable of (word, tag) pairs, e.g. output of a POS tagger.
        corpus: corpus name used as a filename prefix.
        tag: POS tag to keep (compared for exact equality).
        filename: source file name; any '/'-separated directory
            components are stripped so only the basename is used.
    """
    words = [word for word, word_tag in result if word_tag == tag]
    print('\nwriting nouns to file...')
    # split('/')[-1] returns the basename, and the unchanged string when
    # there is no '/' — same behavior as the original length check.
    filename = filename.split('/')[-1]
    pp.save_to_file('{}_nouns_{}'.format(corpus, filename), words)
def write_semicolon_nouns(tokens_list, tagger, corpus, corpus_tag, filename):
    """Write one semicolon-joined line of matching words per token list.

    Each entry in ``tokens_list`` is POS-tagged with ``tagger`` and the
    words whose tag equals ``corpus_tag`` are joined with ';' into one
    row of the output file.

    Bug fix: the original tested ``corpus_tag in item``, which is tuple
    membership on a (word, tag) pair — a word literally equal to the tag
    string would be wrongly included.  The tag field is now compared
    explicitly, consistent with ``write_nouns_to_file``.

    Args:
        tokens_list: iterable of token lists (one per tweet/document).
        tagger: POS tagger exposing ``tag(tokens) -> [(word, tag), ...]``.
        corpus: corpus name used as a filename prefix.
        corpus_tag: POS tag to keep (compared for exact equality).
        filename: suffix for the output filename.
    """
    result_csv = []
    for tokens in tokens_list:
        tagged = tagger.tag(tokens)
        matching_words = [word for word, word_tag in tagged if word_tag == corpus_tag]
        result_csv.append(';'.join(matching_words))
    pp.save_to_file('{}_semicolon_{}'.format(corpus, filename), result_csv)