import sys

import nltk

from parser import CKYParser

if __name__ == "__main__":
    # Usage: <script> [SENTENCE_FILE GRAMMAR_FILE]
    #
    # sys.argv[0] is the script path itself, so the two user-supplied paths
    # are at indices 1 and 2.  Any other argument count falls back to the
    # default file names.
    if len(sys.argv) == 3:
        sent = sys.argv[1]
        cnf = sys.argv[2]
    else:
        sent = "sentences.txt"
        cnf = "grammar.cnf"

    # Load the grammar that was actually selected above (with the default
    # name this is the same "file:grammar.cnf" resource as before).
    grammar = nltk.data.load("file:" + cnf)
    parser = CKYParser(grammar)

    # Counters.  Within this part of the script sentCount and parsCount are
    # only incremented and totalParsCount is only initialised; parsCount is
    # never reset, so it accumulates across all sentences.
    sentCount = 0
    parsCount = 0
    totalParsCount = 0

    # The context manager guarantees the sentence file is closed, even if
    # tokenising or parsing raises.
    with open(sent, 'r') as sentences:
        for sentence in sentences:  # one sentence per line
            # split the sentence into word and punctuation tokens
            tokens = nltk.tokenize.wordpunct_tokenize(sentence)
            sentCount += 1

            # every parse tree the parser finds for this sentence
            parses = parser.nbest_parse(tokens)
            for parse in parses:
                # Single-argument print(...) behaves the same under the
                # Python 2 print statement and the Python 3 function.
                print(str(parse) + "\n")
                parsCount += 1
# Command line: <script> SENTENCE_FILE GRAMMAR_FILE
#
# Each missing positional argument ends the program with its own message.
# sys.exit is used instead of the interactive `exit` helper, which is only
# installed by the `site` module.
try:
    sentence_path = sys.argv[1]
except IndexError:
    sys.exit("Please give a path to a file of sentences.")

try:
    grammar_path = sys.argv[2]
except IndexError:
    sys.exit("Please give a path to a file with a grammar.")

with open(sentence_path) as sentence_file:
    sentence_data = sentence_file.read()

# One sentence per line; surrounding whitespace is stripped first so a
# trailing newline does not produce an extra empty sentence.
sentences = sentence_data.strip().split("\n")
sentences = [nltk.wordpunct_tokenize(sentence) for sentence in sentences]

parser = CKYParser(grammar_path)

# Header: one "# ..." line per metadata entry, then a 40-dash rule.
delimiter = 40 * "-"
metadata = ["Kathryn Nichols", "Stefan Behr", "LING 571", "Project 1"]
print("\n".join("# {0}".format(datum) for datum in metadata))
print(delimiter)

# iterate over tokenized sentences, parse each
# and output parses
for sentence in sentences:
    # make sure tokens are single-quoted
    sentence = ["'{0}'".format(token) for token in sentence]
    parses = parser.get_parses(sentence)
    # print_parses is defined elsewhere; its result is written verbatim,
    # followed by the number of parses on its own line.
    sys.stdout.write(print_parses(parses))
    print(len(parses))
# Command line: <script> SENTENCE_FILE GRAMMAR_FILE
#
# Each missing positional argument ends the program with its own message.
# sys.exit is used instead of the interactive `exit` helper, which is only
# installed by the `site` module.
try:
    sentence_path = sys.argv[1]
except IndexError:
    sys.exit("Please give a path to a file of sentences.")

try:
    grammar_path = sys.argv[2]
except IndexError:
    sys.exit("Please give a path to a file with a grammar.")

with open(sentence_path) as sentence_file:
    sentence_data = sentence_file.read()

# One sentence per line; surrounding whitespace is stripped first so a
# trailing newline does not produce an extra empty sentence.
sentences = sentence_data.strip().split("\n")
sentences = [nltk.wordpunct_tokenize(sentence) for sentence in sentences]

parser = CKYParser(grammar_path)

# Header: one "# ..." line per metadata entry, then a 40-dash rule.
delimiter = 40 * "-"
metadata = ["Kathryn Nichols", "Stefan Behr", "LING 571", "Project 1"]
print("\n".join("# {0}".format(datum) for datum in metadata))
print(delimiter)

# iterate over tokenized sentences, parse each
# and output parses
for sentence in sentences:
    # make sure tokens are single-quoted
    sentence = ["'{0}'".format(token) for token in sentence]
    parses = parser.get_parses(sentence)
    # print_parses is defined elsewhere; its result is written verbatim,
    # followed by the number of parses on its own line.
    sys.stdout.write(print_parses(parses))
    print(len(parses))
    # NOTE(review): the rule is assumed to close each sentence's output
    # (i.e. to sit inside the loop) -- confirm against the original layout.
    print(delimiter)