Example #1
import re
import operator

from nltk import pos_tag

def count_words_unigram_pos(input_filename, output_path=''):
    # get_file_text and make_output_file are project helpers assumed to be
    # defined elsewhere in this module.
    txt = get_file_text(input_filename)

    word_regex = '[a-zA-Z]+'
    word_frequency = {}
    total_words = 0  # total token count (computed but not written to the output)

    # Count every alphabetic token in the text.
    matches = re.findall(word_regex, txt, re.M | re.S | re.U)
    for m in matches:
        word_frequency[m] = word_frequency.get(m, 0) + 1
        total_words += 1

    # Sort (word, count) pairs by ascending frequency.
    sorted_words = sorted(word_frequency.items(), key=operator.itemgetter(1))

    # Tag each word in isolation with its most likely part of speech.
    word_analysis = []
    for word, count in sorted_words:
        pos = pos_tag([word])
        word_analysis.append([word, count, pos[0][1]])

    # Write a tab-separated report: word, count, POS tag.
    o_file = make_output_file(input_filename, output_path=output_path,
                              prefix='', suffix='-words_unigram_pos')
    o_file.write('word\tcount\tpos\n')
    for w in word_analysis:
        o_file.write('%s\t%d\t%s\n' % (w[0], w[1], w[2]))

    o_file.close()
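These examples depend on two helpers, get_file_text and make_output_file, that are not shown in the source. A minimal sketch of what they might look like, assuming UTF-8 text files and an output filename derived from the input's basename (both the encoding and the naming scheme are assumptions, not the original implementation):

import os

def get_file_text(input_filename):
    # Read the whole file as UTF-8 text (assumed encoding).
    with open(input_filename, encoding='utf-8') as f:
        return f.read()

def make_output_file(input_filename, output_path='', prefix='', suffix=''):
    # Build an output name like '<prefix><basename><suffix>.txt' in
    # output_path (or alongside the input when output_path is empty).
    base, _ = os.path.splitext(os.path.basename(input_filename))
    out_dir = output_path or os.path.dirname(input_filename)
    out_name = '%s%s%s.txt' % (prefix, base, suffix)
    return open(os.path.join(out_dir, out_name), 'w', encoding='utf-8')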
Example #2
def count_words_v0(input_filename, output_path=''):

    txt = get_file_text(input_filename)
    # Naive tokenization: splitting on single spaces leaves punctuation and
    # newlines attached to tokens (the regex-based version in Example #1
    # avoids this; the original note here read "test with nltk").
    words = txt.split(' ')

    word_frequency = {}
    for w in words:
        word_frequency[w] = word_frequency.get(w, 0) + 1

    # Sort (word, count) pairs by ascending frequency.
    sorted_words = sorted(word_frequency.items(), key=operator.itemgetter(1))

    # Write a tab-separated report: word, count.
    o_file = make_output_file(input_filename, output_path=output_path,
                              prefix='', suffix='-words')
    for w in sorted_words:
        o_file.write('%s\t%d\n' % (w[0], w[1]))

    o_file.close()
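A quick usage sketch, assuming a plain-text file named sample.txt (a hypothetical filename) and the helper sketch above. Note that pos_tag needs NLTK's tagger model, typically fetched once with nltk.download('averaged_perceptron_tagger'):

if __name__ == '__main__':
    # Given the assumed helpers, these write 'sample-words_unigram_pos.txt'
    # and 'sample-words.txt' next to the input file.
    count_words_unigram_pos('sample.txt')
    count_words_v0('sample.txt')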