def parseNow(sentence_file):
    """Parse each preprocessed sentence and print its parse trees one by one.

    The feature grammar is loaded from 'gaelic.fcfg' and used to build an
    nltk.FeatureChartParser. Every sentence is printed in its reverted
    (postprocessed) orthographic form, followed by all of its parse trees,
    or by a failure message when the parser returns no tree.

    Arguments:
        sentence_file: passed unchanged to gaelicProc.preprocessSentences,
            which supplies the sentences to parse (presumably a path to the
            sentences file -- confirm against gaelicProc).

    Returns:
        none, prints out the trees for each sentence
    """
    # TODO: building the feature grammar from UTF-8 text, i.e.
    #   nltk.grammar.parse_fcfg(codecs.open('gaelic.fcfg', 'rb', encoding='utf8').read())
    # gave parse errors in nltk.grammar. Loading through nltk.data.load
    # works, but without the UTF-8 decoding.
    parser = nltk.FeatureChartParser(nltk.data.load('file:gaelic.fcfg'))

    for sentence in gaelicProc.preprocessSentences(sentence_file):
        # A first-pass correction could be applied here, e.g.
        # sentence = filterOrthography(sentence).

        # Print the reverted orthographic form of the sentence, preceded by a
        # blank line. A .decode('utf8') could be added if desired; the trees
        # print in ASCII on a PC anyway.
        print('\n' + gaelicProc.postprocessSentences(sentence))

        parses = parser.nbest_parse(sentence.split())
        if not parses:
            # Message shown when the parser produced no tree for the sentence.
            print('CHA GHABH AN ROSGRANN PARSADH.')
            continue
        for parse in parses:
            print(parse)
def testParse(sentence_file):
    """Parse the preprocessed sentences and print a summary report.

    Sentences are sorted into three groups by the number of trees the
    parser returns: none (parse errors), exactly one, or more than one.
    Sentences with more than one parse are printed immediately together
    with all of their trees; the other two groups are listed after the
    statistics. Sentences are shown in their reverted (postprocessed)
    orthographic form, decoded from UTF-8.

    Arguments:
        sentence_file: printed as the report heading and passed unchanged to
            gaelicProc.preprocessSentences, which supplies the sentences
            (presumably a path to the sentences file -- confirm against
            gaelicProc). The result must support len().

    Returns:
        none, prints out the report
    """
    grammar = nltk.data.load('file:gaelic.fcfg')
    cp = nltk.FeatureChartParser(grammar)
    sentences = gaelicProc.preprocessSentences(sentence_file)

    parse_errors = []     # sentences for which no tree was returned
    single_parses = []    # sentences with exactly one tree
    multiple_parses = 0   # number of sentences with more than one tree

    print("\n\n" + sentence_file)
    for sentence in sentences:
        trees = cp.nbest_parse(sentence.split())
        if not trees:
            # Track parse errors (append instead of rebuilding the list).
            parse_errors.append(sentence)
        elif len(trees) > 1:
            # Report sentences with multiple parses straight away: the
            # reverted orthographic form first, then every tree.
            print('\n' + gaelicProc.postprocessSentences(sentence).decode("utf8"))
            for tree in trees:
                print(tree)
            multiple_parses += 1
        else:
            # Track parse successes.
            single_parses.append(sentence)

    if multiple_parses == 0:
        print("CHA ROBH ROSGRAINN LE BARRACHD IS 1 TORADH PARSAIDH ANN")
    else:
        print("BARRACHD IS 1 TORADH PARSAIDH: " + str(multiple_parses))

    # Report parse statistics: total number of sentences.
    print("\nROSGRAINN UILE: " + str(len(sentences)))

    # Display sentences with single parses.
    print("\n1 TORADH PARSAIDH A-MHÀIN: " + str(len(single_parses)))
    for sentence in single_parses:
        print(gaelicProc.postprocessSentences(sentence).decode("utf8"))

    # Display sentences with parse errors.
    print("\nCHA GHABH PARSADH: " + str(len(parse_errors)))
    for sentence in parse_errors:
        print(gaelicProc.postprocessSentences(sentence).decode("utf8"))