Ejemplo n.º 1
0
# Traverse the whole source, adding the canonical form of every valid word
# to a dictionary that counts the number of appearances.
d = {}
for line in txtsrc:
    # Get rid of ASCII em and en dashes. "---" is replaced first so that a
    # triple dash is not split into "--" plus a stray "-".
    line = line.replace("---", " ").replace("--", " ")

    for word in line.split():
        clean_word = clean(word)
        # Ignore words that don't parse (clean() yields None for those).
        if clean_word is None:
            continue
        # Add or update words that do parse.
        incr(clean_word, d)

# When neither args.pdf nor args.gutenberg is set, txtsrc is a file handle
# that has to be closed once all the words are counted. Per the original
# note, the other sources close themselves.
# NOTE(review): the original note mentions "three" self-closing settings but
# only two flags are tested here -- confirm against the argument parser.
if not (args.pdf or args.gutenberg):
    txtsrc.close()

# Abort if the query makes no sense. This can't be checked until the
# dictionary is built: it depends on the number of unique words.
if args.number > len(d):
    raise Exception('trying to compute the ' + str(args.number) +
                    ' most used words, but there are only ' + str(len(d)) +
                    ' unique words in the corpus')

# print out the answer, with more or less verbosity
Ejemplo n.º 2
0
def incr_if_jj(d, word):
    """Count `word` in `d` when nltk tags it "JJ".

    Words matched by the `start` or `end` patterns are skipped without
    being tagged. Any other word is tagged on its own (a one-element
    list passed to `nltk.pos_tag`), and the token is handed to `incr`
    together with `d` only if its tag is exactly "JJ".

    NOTE(review): `start`, `end` and `incr` are defined outside this
    block; presumably `start`/`end` are compiled regexes and `incr`
    bumps a counter in `d` -- confirm against their definitions.
    """
    # Skip the tags that nltk introduces for position in a context.
    if start.search(word) or end.search(word):
        return

    token, tag = nltk.pos_tag([word])[0]
    if tag == "JJ":
        incr(token, d)