Python WordCounter.add_word Examples

Programming Language: Python

Namespace/Package Name: word_counter

Class/Type: WordCounter

Method/Function: add_word

Examples at hotexamples.com: 2

Python WordCounter.add_word - 2 examples found. These are the top-rated real-world Python examples of word_counter.WordCounter.add_word extracted from open source projects. You can rate the examples to help us improve their quality.

Frequently Used Methods

Show Hide

WordCounter(26)

processString(6)

count(5)

getWordCount(5)

count_word(4)

getMaxValues(2)

getWordList(2)

add_word(2)

text_content(2)

finalize(1)

display(1)

count_words(1)

countWords(1)

get_count(1)

get_most_common_words(1)

get_most_frequent(1)

get_total_words(1)

classify_sentence(1)

run(1)

start(1)

getTopOccurrences(1)

Example #1

Show file

File: main.py Project: kail/csep517

def _accuracy(num_correct, total):
    """Return ``num_correct / total``, or 0.0 when ``total`` is zero."""
    if not total:
        return 0.0
    return num_correct / total


def main():
    """Train the HMM taggers on the training data and report tagging accuracy.

    Steps, in order:
      1. Count every word of the training sentences (skipping the first and
         last token of each sentence) and finalize the counter.
      2. Re-read the training sentences, let the counter rewrite them via
         ``classify_sentence`` and feed them to the bigram and trigram models.
      3. Decode the selected evaluation set with the trigram lattice and
         print a running accuracy plus a final summary.

    Accuracy is reported as 0.0 instead of raising ``ZeroDivisionError`` while
    no words have been scored yet (e.g. a sentence made only of its two
    boundary tokens, or an evaluation set where nothing parsed).
    """
    logging.info("Starting...")

    training_parser = InputParser(
        '/Users/skobovm/repos/csep517/hmm/data/twt.train.json')
    dev_parser = InputParser(
        '/Users/skobovm/repos/csep517/hmm/data/twt.dev.json')
    # Not used unless selected as `parser` below.
    test_parser = InputParser(
        '/Users/skobovm/repos/csep517/hmm/data/twt.test.json')

    # First, count the words!
    counter = WordCounter()
    for parsed_sentence in training_parser.get_tokenized_sentences():
        if parsed_sentence:
            # Indices 1 .. len-2: the first and last tokens are skipped
            # (the START and STOP markers, per the note further down).
            for i in range(1, len(parsed_sentence) - 1):
                counter.add_word(parsed_sentence[i][0])

    # Finalize counter and separate high frequency from low frequency
    counter.finalize()

    # Initialize the models
    bigram = BigramHMM()
    trigram = TrigramHMM()

    for parsed_sentence in training_parser.get_tokenized_sentences():
        if parsed_sentence:
            # Convert the low frequency words to classes
            counter.classify_sentence(parsed_sentence)

            bigram.add_sentence(parsed_sentence)
            trigram.add_sentence(parsed_sentence)

    # Models have been initialized at this point, finalize the distributions
    #bigram.finalize()
    trigram.finalize()

    # PICK THE PARSER HERE
    parser = dev_parser

    # Iterate over data and try to predict
    # NOTE: bigram decoding is commented out below, so num_correct_bigram
    # stays 0 and the final "Bigram HMM Accuracy" line always reports 0.
    num_correct_bigram = 0
    num_correct_trigram = 0
    total_words = 0
    for parsed_sentence in parser.get_tokenized_sentences():
        if parsed_sentence:
            # Keep an untouched copy: classify_sentence is called on
            # parsed_sentence next, and the scorer receives both versions.
            original_sentence = copy.deepcopy(parsed_sentence)

            # Convert the low frequency words to classes
            counter.classify_sentence(parsed_sentence)

            # Bigram lattice
            #lattice = Lattice(bigram, parsed_sentence)

            # Trigram lattice
            tri_lattice = TrigramLattice(trigram, parsed_sentence)

            # Calculate best POS using viterbi
            #pos_list_bigram = lattice.get_pos()
            pos_list_trigram = tri_lattice.get_pos()

            # Determine how many were correct
            #num_correct_bigram += get_num_correct(parsed_sentence, pos_list_bigram, lattice)
            num_correct_trigram += get_num_correct(parsed_sentence,
                                                   pos_list_trigram,
                                                   tri_lattice,
                                                   original_sentence, counter)

            # Remove the START and STOP chars
            total_words += (len(parsed_sentence) - 2)

            # total_words can still be 0 here (sentence of only START/STOP),
            # so the ratio goes through the zero-safe helper.
            print("Accuracy: %s" % _accuracy(num_correct_trigram, total_words))
        else:
            print('ERROR! Couldnt parse sentence')

    print("Bigram HMM Accuracy: %s/%s - %s" %
          (num_correct_bigram, total_words,
           _accuracy(num_correct_bigram, total_words)))
    print("Trigram HMM Accuracy: %s/%s - %s" %
          (num_correct_trigram, total_words,
           _accuracy(num_correct_trigram, total_words)))

Example #2

Show file

import re
from word_counter import WordCounter

# Pattern for splitting words: any run of digits, non-word characters,
# whitespace or underscores acts as a separator.  Written as a raw string so
# the regex escapes (\d, \W, \s) reach the `re` module verbatim instead of
# relying on Python passing unknown string escapes through, which newer
# interpreters warn about.  The resulting pattern text is unchanged.
PATTERN = r"[\d\W\s_]+"

wc = WordCounter()

# Feed every non-empty token of the README into the counter.  The encoding is
# given explicitly so decoding does not depend on the platform's locale.
with open('README.md', 'r', encoding='utf-8') as readme:
    for line in readme:
        for word in re.split(PATTERN, line.strip()):
            # re.split yields '' for blank lines and for lines that start or
            # end with a separator; skip those.
            if word:
                wc.add_word(word)

# display the word counts, in sorted word order, count padded to width 3
for word in sorted(wc):
    print(f"{wc[word]:3}  {word}")