so_far = (sortpl(so_far))[:k] return so_far # check while parsing the arguments that the number given is a nat. def nat (string) : value = int(string) if value < 0: msg = "%r is not a natural number" % string raise argparse.ArgumentTypeError(msg) return value ## describe and parse arguments from the command line parser = argparse.ArgumentParser(description="Count the n most-used words in" + "a corpus of English text.") baseargs (parser) parser.add_argument("-n" , "--number", help="number of most frequently used words to compute. " + "defaults to 4.", type=nat, default=4) args = parser.parse_args() txtsrc = opentext (args.pdf , args.gutenberg , args.filename) # traverse the whole file, adding canonical forms of valid words into a # dictionary counting the number of appearances. d = dict() for line in txtsrc: # get rid of ASCII em and en dashes line = (line.replace("---", " ")).replace("--", " ")
print "looking for adjectives near nouns requires that you install nltk" exit(1) # check if a string is an English noun def noun (string): tag = nltk.pos_tag([string]) if not(tag[0][1] == 'NN' or tag[0][1] == 'NNP'): msg = "%r is not a noun according to nltk" % string raise argparse.ArgumentTypeError(msg) return string # describe and parse arguments from the command line parser = argparse.ArgumentParser(description="Count the number of times a " + "noun appears after an adjective in a " + "corpus of English text.") baseargs(parser, "nicer output if you have progressbar installed.") parser.add_argument("noun", help="the noun for which to search", type=noun) args = parser.parse_args() txtsrc = opentext (args.pdf, args.gutenberg, args.filename) # dump the corpus into a string so nltk can tokenize it corpus = "" for line in txtsrc: corpus = corpus + (line.lower()) if not (args.pdf or args.gutenberg): txtsrc.close() if args.verbose: print "read in text"