Code example #1 (score: 0)
File: hw2.py — Project: hehaotian/CKYParser
import nltk
import sys
from parser import CKYParser

if __name__ == "__main__":

    if (len(sys.argv) == 2):
        sent = sys.argv[0]
        cnf = sys.argv[1]
    else:
        sent = "sentences.txt"
        cnf = "grammar.cnf"

    grammar = nltk.data.load("file:grammar.cnf")
    sentences = open(sent, 'r')
    parser = CKYParser(grammar)
    
    # initialize the counting
    sentCount = 0;
    parsCount = 0;
    totalParsCount = 0;
    
    for sentence in sentences: # reads each sentence
    
        tokens = nltk.tokenize.wordpunct_tokenize(sentence) # makes each sentence splitted
        sentCount += 1 # counts the number of sentences in this example
        
        parses = parser.nbest_parse(tokens) # parses the given sentence to get parse trees that represent possible structures for the given sentence
        for parse in parses: # prints the parse trees
            print str(parse) + "\n"
            parsCount += 1 # counts the number of parses for the sentence
Code example #2 (score: 0; the file/project metadata line is missing from the extraction)
        sentence_path = sys.argv[1]  # NOTE(review): the opening ``try:`` was cut off by the extraction — this line is its body
    except IndexError:
        exit("Please give a path to a file of sentences.")

    # The grammar file path is the second CLI argument.
    try:
        grammar_path = sys.argv[2]
    except IndexError:
        exit("Please give a path to a file with a grammar.")

    # Read the whole sentence file; the context manager closes it on exit.
    with open(sentence_path) as sentence_file:
        sentence_data = sentence_file.read()

    # One sentence per line; tokenize each line into word/punctuation tokens.
    sentences = sentence_data.strip().split("\n")
    sentences = [nltk.wordpunct_tokenize(sentence) for sentence in sentences]

    # CKYParser is project-local; presumably it loads the grammar file itself -- TODO confirm
    parser = CKYParser(grammar_path)

    delimiter = 40 * "-"
    metadata = ["Kathryn Nichols", "Stefan Behr", "LING 571", "Project 1"]

    # Emit an identifying header then a separator (Python 2 print statements).
    print "\n".join("# {0}".format(datum) for datum in metadata)
    print delimiter

    # iterate over tokenized sentences, parse each
    # and output parses
    for sentence in sentences:
        sentence = ["'{0}'".format(token) for token in sentence
                    ]  # make sure tokens are single-quoted
        parses = parser.get_parses(sentence)
        # print_parses is defined elsewhere in the original file -- TODO confirm its contract
        sys.stdout.write(print_parses(parses))
        print len(parses)
Code example #3 (score: 0)
File: parse.py — Project: Blick-winkel/cky
		sentence_path = sys.argv[1]
	except IndexError:
		exit("Please give a path to a file of sentences.")

	try:
		grammar_path = sys.argv[2]
	except IndexError:
		exit("Please give a path to a file with a grammar.")

	with open(sentence_path) as sentence_file:
		sentence_data = sentence_file.read()

	sentences = sentence_data.strip().split("\n")
	sentences = [nltk.wordpunct_tokenize(sentence) for sentence in sentences]

	parser = CKYParser(grammar_path)

        delimiter = 40 * "-"
        metadata = ["Kathryn Nichols", "Stefan Behr", "LING 571", "Project 1"]

        print "\n".join("# {0}".format(datum) for datum in metadata)
        print delimiter

	# iterate over tokenized sentences, parse each
	# and output parses
	for sentence in sentences:
		sentence = ["'{0}'".format(token) for token in sentence]	# make sure tokens are single-quoted
		parses = parser.get_parses(sentence)
		sys.stdout.write(print_parses(parses))
		print len(parses)
                print delimiter