def count_words_unigram_pos(input_filename, output_path=''):
    """Count word frequencies in a text file, POS-tag each distinct word,
    and write a tab-separated report.

    Reads the file via ``get_file_text``, extracts runs of ASCII letters,
    counts each distinct token, tags each token with ``pos_tag`` (in
    isolation, one word at a time), and writes a ``word\\tcount\\tpos``
    table sorted by ascending count to an output file created by
    ``make_output_file`` with suffix ``-words_unigram_pos``.

    Parameters
    ----------
    input_filename : path of the text file to analyze.
    output_path : directory passed through to ``make_output_file``
        (default: same location behavior as the empty string).
    """
    txt = get_file_text(input_filename)
    # [a-zA-Z]+ deliberately drops digits and apostrophes, so "don't"
    # yields the tokens "don" and "t".
    word_regex = '[a-zA-Z]+'
    word_frequency = {}
    # Flags are combined with bitwise OR — the conventional form; the
    # original's re.M + re.S + re.U happens to give the same value here
    # but silently misbehaves if a flag is ever listed twice.
    for token in re.findall(word_regex, txt, re.M | re.S | re.U):
        # Integer counts; the original used floats, but output goes
        # through %d so the written report is byte-identical.
        word_frequency[token] = word_frequency.get(token, 0) + 1
    # .items() replaces the Python-2-only .iteritems(), which raises
    # AttributeError on Python 3. Ascending-count order is preserved.
    sorted_words = sorted(word_frequency.items(), key=operator.itemgetter(1))
    word_analysis = []
    for word, count in sorted_words:
        # pos_tag expects a token list; tagging each word alone means the
        # tag has no sentence context — acceptable for a frequency report.
        pos = pos_tag([word])
        word_analysis.append([word, count, pos[0][1]])
    o_file = make_output_file(input_filename, output_path=output_path,
                              prefix='', suffix='-words_unigram_pos')
    try:
        o_file.write('word\tcount\tpos\n')
        for word, count, tag in word_analysis:
            o_file.write('%s\t%d\t%s\n' % (word, count, tag))
    finally:
        # Release the handle even if a write fails mid-report.
        o_file.close()
def count_words_v0(input_filename, output_path=''):
    """Naive word-frequency counter: split the text on single spaces,
    count tokens, and write a ``word\\tcount`` TSV sorted by ascending
    count to an output file with suffix ``-words``.

    NOTE(review): splitting on ' ' leaves newlines and punctuation glued
    to tokens (e.g. 'end.\\nStart' stays one token). Kept as-is since
    this is the v0 baseline; count_words_unigram_pos is the regex-based
    version.

    Parameters
    ----------
    input_filename : path of the text file to analyze.
    output_path : directory passed through to ``make_output_file``.
    """
    txt = get_file_text(input_filename)
    words = txt.split(' ')  # test with nltk
    word_frequency = {}
    for token in words:
        word_frequency[token] = word_frequency.get(token, 0) + 1
    # .items() replaces the Python-2-only .iteritems(), which raises
    # AttributeError on Python 3. Ascending-count order is preserved.
    sorted_words = sorted(word_frequency.items(), key=operator.itemgetter(1))
    o_file = make_output_file(input_filename, output_path=output_path,
                              prefix='', suffix='-words')
    try:
        for word, count in sorted_words:
            o_file.write('%s\t%d\n' % (word, count))
    finally:
        # Release the handle even if a write fails mid-report.
        o_file.close()