コード例 #1
0
ファイル: gaelic.py プロジェクト: wojtekdz/gd-fcfg
def parseNow(sentence_file):
    """Parse each preprocessed sentence and print its parse trees.

    The feature grammar is loaded from ``gaelic.fcfg``. Every sentence
    returned by ``gaelicProc.preprocessSentences`` is echoed in its
    post-processed (reverted orthography) form, followed either by all of
    its parse trees or by a message saying it could not be parsed.

    Arguments:
        sentence_file -- handed straight to gaelicProc.preprocessSentences
                         (presumably the path of a sentences file such as
                         sentences.txt -- confirm against gaelicProc)
    Returns: none, prints the results to standard output

    """
    # TODO: building the grammar from UTF-8 text, i.e.
    #   nltk.grammar.parse_fcfg(codecs.open('gaelic.fcfg', 'rb', encoding='utf8').read())
    # gave parse errors inside nltk.grammar. The plain loader below works,
    # but without the explicit UTF-8 decoding.
    fcfg = nltk.data.load('file:gaelic.fcfg')
    chart_parser = nltk.FeatureChartParser(fcfg)

    for prepared in gaelicProc.preprocessSentences(sentence_file):
        # A first-pass orthography correction could be hooked in here,
        # e.g. prepared = filterOrthography(prepared).

        # Echo the sentence in its reverted orthographic form, preceded by a
        # blank line. Appending .decode('utf8') is optional; the trees print
        # in ASCII on a PC anyway.
        print('\n' + gaelicProc.postprocessSentences(prepared))

        parses = chart_parser.nbest_parse(prepared.split())
        if parses:
            for parse in parses:
                print(parse)
        else:
            # No tree was produced for this sentence.
            print('CHA GHABH AN ROSGRANN PARSADH.')
コード例 #2
0
ファイル: gaelicTest.py プロジェクト: wojtekdz/gd-fcfg
def testParse(sentence_file):
    """Parse every preprocessed sentence and print a summary report.

    Sentences are sorted into three groups by the number of trees the
    feature chart parser returns: none (parse errors), exactly one, or more
    than one. Sentences with more than one parse are printed immediately
    together with all of their trees; the other two groups are listed after
    the loop along with their counts and the total number of sentences.

    Arguments:
        sentence_file -- string that is printed as the report header and
                         passed to gaelicProc.preprocessSentences
                         (presumably the path of a sentences file such as
                         sentences.txt -- confirm against gaelicProc)
    Returns: none, prints the report to standard output

    """
    grammar = nltk.data.load('file:gaelic.fcfg')
    cp = nltk.FeatureChartParser(grammar)

    sentences = gaelicProc.preprocessSentences(sentence_file)

    parse_errors = []    # sentences for which no tree was produced
    single_parses = []   # sentences with exactly one tree
    multiple_parses = 0  # number of sentences with more than one tree

    print("\n\n" + sentence_file)

    for sentence in sentences:
        trees = cp.nbest_parse(sentence.split())

        if not trees:
            # append() instead of `lst = lst + [x]`, which copies the whole
            # list on every iteration
            parse_errors.append(sentence)
        elif len(trees) > 1:
            # ambiguous sentence: show its reverted orthographic form and
            # every competing tree right away
            print('\n' + gaelicProc.postprocessSentences(sentence).decode("utf8"))
            for tree in trees:
                print(tree)
            multiple_parses += 1
        else:
            single_parses.append(sentence)

    if multiple_parses == 0:
        print("CHA ROBH ROSGRAINN LE BARRACHD IS 1 TORADH PARSAIDH ANN")
    else:
        print("BARRACHD IS 1 TORADH PARSAIDH: " + str(multiple_parses))

    # report parse statistics: total number of sentences
    print("\nROSGRAINN UILE: " + str(len(sentences)))

    # list the sentences that produced exactly one parse
    print("\n1 TORADH PARSAIDH A-MHÀIN: " + str(len(single_parses)))
    for sentence in single_parses:
        print(gaelicProc.postprocessSentences(sentence).decode("utf8"))

    # list the sentences that could not be parsed at all
    print("\nCHA GHABH PARSADH: " + str(len(parse_errors)))
    for sentence in parse_errors:
        print(gaelicProc.postprocessSentences(sentence).decode("utf8"))