Esempio n. 1
0
def test_clean_file():
    """Check that TextCleaner.clean_file parses test_file.txt as expected.

    The fixture file is read through a context manager so the handle is
    closed even if clean_file raises.  After cleaning, ``text.text`` must
    be a list with one list of tokens per sentence, matching the expected
    value pinned below.
    """
    text = TextCleaner()
    with open("test_file.txt") as f:
        text.clean_file(f)

    expected = [
        [
            'a', 'bunch', 'of', 'cute', 'and', 'spooky', 'animals', 'are',
            'dropping', 'by.'
        ],
        ['pick', 'trick', 'or', 'treat.'],
        ['trick', 'COMMA', '', 'treat.'],
        [
            'by', 'mr', 'zeng', 'COMMA', 'mrs', 'liao', 'and', 'dr', 'zhang'
        ],
    ]
    assert text.text == expected
Esempio n. 2
0
def main():
    """Print the ten most frequent unigrams, bigrams and trigrams of a file.

    The path of the text file is taken from the first command-line
    argument.  The file is cleaned with TextCleaner, whose ``text``
    attribute is then iterated as a list of token lists.  N-grams are
    built per token list by joining neighbouring tokens with "_".

    Returns None in every case.  A usage message is printed when no path
    is given, and "Can't find <path>" when the file does not exist.
    """
    # Guard against a missing argument instead of crashing with IndexError.
    if len(sys.argv) < 2:
        print("Usage: <script> <text_file>")
        return
    path = sys.argv[1]

    clean_text = TextCleaner()
    try:
        # The context manager closes the file once cleaning is done.
        with open(path) as f:
            clean_text.clean_file(f)
    except FileNotFoundError:
        print("Can't find", path)
        return

    text = clean_text.text

    # Report top ten unigrams by frequency
    unigram = NgramFrequencies()
    print("Top 10 unigram:")
    for line in text:
        for token in line:
            unigram.add_item(token)
    print_output(unigram.frequency(10))

    # Report top ten bigrams by frequency.
    # A token containing "." is treated as a sentence boundary, so it is
    # never joined with the token that follows it.
    bigram = NgramFrequencies()
    print("Top 10 bigram:")
    for line in text:
        for first, second in zip(line, line[1:]):
            if "." in first:
                continue
            bigram.add_item(first + "_" + second)
    print_output(bigram.frequency(10))

    # Report top ten trigrams by frequency.
    # Neither the first nor the middle token may contain ".", otherwise
    # the trigram would run across a sentence boundary.
    trigram = NgramFrequencies()
    print("Top 10 trigram:")
    for line in text:
        for first, second, third in zip(line, line[1:], line[2:]):
            if "." in first or "." in second:
                continue
            trigram.add_item(first + "_" + second + "_" + third)
    print_output(trigram.frequency(10))