Code Example #1
File: parse.py Project: Deadlyelder/BlockPerf
def _parse(writer, log_file, name, chunk, parsers):
    parsed_objects = {}
    for line in Chunker.parse(Chunker.read(log_file, chunk)):
        # offer each line to the parsers in order; the first that accepts it wins
        for parser in parsers:
            try:
                parsed_object = parser.from_log_line(line, name)
                parsed_objects.setdefault(parsed_object.file_name, []).append(parsed_object)
                break
            except ParseException:
                pass

    # write one CSV per target file name, grouping all objects parsed for it
    for key in parsed_objects:
        writer.append_csv(key, parsed_objects[key])
    logging.info('Parsed {} object types'.format(len(parsed_objects)))
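
The pattern worth noting in this example is first-match dispatch: every log line is offered to each parser in order, and the first parser that does not raise ParseException claims the line. Below is a minimal, self-contained sketch of that flow; TickParser, BlockParser, Parsed, and the sample lines are hypothetical stand-ins, not BlockPerf classes.

from collections import namedtuple

class ParseException(Exception):
    pass

# hypothetical parsed-object shape; BlockPerf's real classes differ
Parsed = namedtuple('Parsed', ['file_name', 'line'])

class TickParser:
    def from_log_line(self, line, name):
        if 'tick' not in line:
            raise ParseException(line)
        return Parsed('ticks', line)

class BlockParser:
    def from_log_line(self, line, name):
        if 'block' not in line:
            raise ParseException(line)
        return Parsed('blocks', line)

parsed_objects = {}
for line in ['tick 1', 'block abc', 'noise']:
    for parser in (TickParser(), BlockParser()):
        try:
            obj = parser.from_log_line(line, 'node-0')
            parsed_objects.setdefault(obj.file_name, []).append(obj)
            break  # first parser that accepts the line claims it
        except ParseException:
            pass  # 'noise' is rejected by every parser and silently dropped

print({k: len(v) for k, v in parsed_objects.items()})  # {'ticks': 1, 'blocks': 1}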
Code Example #2
def parseInput(input,
               parser="RD",
               showAllLevels=False,
               getWordDefinitions=True):
    "parse input string into list of parsed contained sentence structures"
    # parser can be "RD" for recursive descent (currently the most developed) or "NLTK" for the original NLTK chunking-grammar parser

    # clean & build a string for the KHaiii phoneme analyzer
    input = input.strip()
    if input and input[-1] not in ['.', '?', '!']:
        input += '.'
    input = re.sub(
        r'\s+([\.\?\;\,\:])', r'\1',
        input)  # elide spaces preceding clause endings; they throw Khaiii off
    # input = input.replace(',', ' , ').replace(';', ' ; ').replace(':', ' : ') - adding a space before punctuation seems to mess up tagging in Khaiii
    log("* parse {0}".format(input))

    # run Khaiii, grab the parts-of-speech list it generates (morphemes + POS tags) and extract original word-to-morpheme groupings
    sentences = []  # handle possible multiple sentences
    posList = []
    morphemeGroups = []
    for w in khaiiiAPI.analyze(input):
        morphemeGroups.append(
            [w.lex, [m.lex for m in w.morphs if m.tag != 'SF']])
        for m in w.morphs:
            posList.append('{0}:{1}'.format(m.lex.strip(), m.tag))
            if m.tag == 'SF':
                # sentence end, store extractions & reset for possible next sentence
                sentences.append(
                    dict(posList=posList,
                         morphemeGroups=morphemeGroups,
                         posString=';'.join(posList)))
                posList = []
                morphemeGroups = []

    for s in sentences:
        # map POS through synthetic tag mapper & extract word groupings
        mappedPosList, morphemeGroups = TagMap.mapTags(
            s['posString'], s['morphemeGroups'])  #, disableMapping=True)
        log("  {0}".format(s['posString']))
        log("  mapped to {0}".format(mappedPosList))

        if parser == "NLTK":  # NLTK chunking parser
            # perform chunk parsing
            chunkTree = Chunker.parse(mappedPosList, trace=2)
            chunkTree.pprint()
            # apply any synthetic-tag-related node renamings
            TagMap.mapNodeNames(chunkTree)
            # extract popup wiki definitions & reference links & notes for implicated nodes
            references = TagMap.getReferences(chunkTree)
            # build descriptive phrase list
            phrases = Chunker.phraseList(chunkTree)
            #
            parseTreeDict = buildParseTree(chunkTree,
                                           showAllLevels=showAllLevels)

        else:  # recursive-descent parser
            from rd_grammar import KoreanParser
            rdParser = KoreanParser([":".join(p) for p in mappedPosList])
            parseTree = rdParser.parse(verbose=1)
            if parseTree:
                # apply any synthetic-tag-related node renamings
                parseTree.mapNodeNames()
                # extract popup wiki definitions & reference links & notes for implicated nodes
                references = parseTree.getReferences()
                # build descriptive phrase list
                phrases = parseTree.phraseList()
                # get noun & verb translations from Naver
                wordDefs = getWordDefs(
                    mappedPosList) if getWordDefinitions else {}
                # build JSONable parse-tree dict
                parseTreeDict = parseTree.buildParseTree(
                    wordDefs=wordDefs, showAllLevels=showAllLevels)
                log("  {0}".format(parseTree))
            else:
                # parsing failed, return unrecognized token
                parseTree = references = parseTreeDict = phrases = None
                s.update(
                    dict(error="Sorry, failed to parse sentence",
                         lastToken=rdParser.lastTriedToken()))
                log("  ** failed.  Unexpected token {0}".format(
                    rdParser.lastTriedToken()))

        # format debugging data
        debugging = dict(posList=pformat(s['posList']),
                         mappedPosList=pformat(mappedPosList),
                         phrases=pformat(phrases),
                         morphemeGroups=pformat(morphemeGroups),
                         parseTree=pformat(parseTreeDict),
                         references=references)

        # add parsing results to response structure
        s.update(
            dict(mappedPosList=mappedPosList,
                 morphemeGroups=morphemeGroups,
                 parseTree=parseTreeDict,
                 references=references,
                 phrases=phrases,
                 debugging=debugging))
    #
    return sentences
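
A detail that is easy to miss in parseInput is how it splits Khaiii's flat morpheme stream into sentences: it accumulates posList and morphemeGroups word by word and flushes them whenever it sees the SF (sentence-final punctuation) tag. Here is a runnable sketch of just that accumulate-and-reset loop, with a fake analyzer standing in for khaiiiAPI; Morph, Word, and fake_analyze are hypothetical stand-ins.

from collections import namedtuple

# hypothetical stand-ins for Khaiii's word and morpheme objects
Morph = namedtuple('Morph', ['lex', 'tag'])
Word = namedtuple('Word', ['lex', 'morphs'])

def fake_analyze(text):
    # two one-word sentences, each closed by an SF (sentence-final) morpheme
    return [Word('간다.', [Morph('간다', 'VV'), Morph('.', 'SF')]),
            Word('온다.', [Morph('온다', 'VV'), Morph('.', 'SF')])]

sentences, posList, morphemeGroups = [], [], []
for w in fake_analyze('간다. 온다.'):
    morphemeGroups.append([w.lex, [m.lex for m in w.morphs if m.tag != 'SF']])
    for m in w.morphs:
        posList.append('{0}:{1}'.format(m.lex.strip(), m.tag))
        if m.tag == 'SF':
            # sentence boundary: store the accumulators and reset them
            sentences.append(dict(posList=posList,
                                  morphemeGroups=morphemeGroups,
                                  posString=';'.join(posList)))
            posList, morphemeGroups = [], []

print(len(sentences))             # 2
print(sentences[0]['posString'])  # 간다:VV;.:SF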
Code Example #3
def parse():
    "parse POSTed Korean sentence"
    # grab sentence to parse
    sentence = request.form.get('sentence', '').strip()
    if not sentence:
        return jsonify(result="FAIL", msg="Missing sentence")

    # build a string for the KHaiii phoneme analyzer
    if sentence[-1] not in ['.', '?', '!']:
        sentence += '.'

    # run Khaiii
    words = []
    for w in khaiiiAPI.analyze(sentence):
        for m in w.morphs:
            words.append('{0}:{1}'.format(m.lex.strip(), m.tag))
    posString = ';'.join(words)

    # map POS through synthetic tag mapper
    mappedPosList = TagMap.mapTags(posString)

    # perform chunk parsing
    chunkTree = Chunker.parse(mappedPosList)

    # apply any synthetic-tag-related node renamings
    TagMap.mapNodeNames(chunkTree)

    # build descriptive phrase list
    phrases = Chunker.phraseList(chunkTree)

    # recursively turn the chunk tree into a Python nested dict for the JSON response
    def asDict(chunk):
        while isinstance(chunk, nltk.Tree) and len(chunk) == 1:
            # flatten degenerate tree nodes
            chunk = chunk[0]
        if isinstance(chunk, nltk.Tree):
            return dict(
                type='tree',
                tag='Sentence' if chunk.label() == 'S' else chunk.label(),
                children=[asDict(t) for t in chunk])
        else:
            return dict(type='pos', word=chunk[0].strip(), tag=chunk[1])

    #
    parseTree = asDict(chunkTree)
    debugging = dict(posList=pformat(words),
                     mappedPosList=pformat(mappedPosList),
                     phrases=pformat(phrases),
                     parseTree=pformat(parseTree))

    return jsonify(result="OK",
                   posList=words,
                   mappedPosList=mappedPosList,
                   phrases=phrases,
                   parseTree=parseTree,
                   debugging=debugging)

    if False:  # disabled earlier implementation, unreachable after the return above; kept for reference

        # synthetic tag patterns -
        #    patterns of these word:POC strings are preprocessed to define new
        #    synthetic word:POC tags used in the chunking grammar below
        #  at present, these are applied in order from longest to shortest pattern; we should probably make this a list for explicit ordering

        tagMappings = {
            r'들:(TM|XSN)': r'들:PLU',  # pluralizer
            r'기:(ETN|NNG)': r'기:GNOM',  # nominalizer
            r'(ㄴ|는|ㄹ):ETM;것:NNB': r'\1 것:GNOM',  # nominalizer
            r'(은|는):JX':
            r'\1:JKS',  # turn topic-marking particle into subject-marker (I think this is right??)
            r'(ㄹ|을|를):ETM;거:NNB;이:VCP':
            r'\1 거 이:FUT',  # ㄹ/를 거 이다 future-tense conjugator (hack!)
            r'전:NNG;에:JKB': r'전에:BEF',  # before
            r'때문:NNB;에:JKB': r'때문에:BEC',  # because
            r'및:MAG': r'및:ALS',  # also connector (why is it an adverb??)
            r'또는:MAG':
            r'또는:ALT',  # alternative connector (why is it an adverb??)
            r'에:JKB;(대하|관하):VV;([^:]+):EC':
            r'에 \1\2:PRP',  # preposition "about"
        }

        # prepositional phrase suffix-patterns (generate a <PRPP> pos-tag + metadata to label the parsing)

        #    tag-pattern                  replacement          subtree name-mapping             reference links
        # (r'전:NNG;에:JKB',              r'전에:PRP',      "PrepositionalPhrase:Before",  ("ttmik:lessons/level-3-lesson-10", "htsk:unit1/unit-1-lessons-17-25-2/lesson-24/#242"))     # before

        # generate a version of the parser's original word:POC string including synthetic tag mappings above
        tagString = ';' + posString + ';'
        for old, new in sorted(tagMappings.items(),
                               key=lambda x: len(x[0]),
                               reverse=True):
            tagString = re.sub(';' + old + ';', ';' + new + ';', tagString)
        mappedWords = [
            tuple(pos.split(':')) for pos in tagString.strip(';').split(';')
        ]

        # Korean phrase NLTK chunking grammar

        grammar = r"""
    
            HadaVerb:           {<NN.*><XSV>}
            AuxiliaryVerb:      {<EC><VX|VV>}
            Adverb:             {<MAG>}
            NominalizedVerb:    {<VV|HadaVerb><EP|FUT>*<GNOM>}
            Adjective:          {<Adverb>*<VA><ETM>}
            DescriptiveVerb:    {<VA>}
            Verb:               {<VV|VCN|HadaVerb|DescriptiveVerb>}
            VerbSuffix:         {<EP|FUT>*<EF|EC>}
    
            Location:           {<JKB>}
            Title:              {<XSN>}
        
            Preposition:        {<PRP>}
            
            Noun:               {<NN.*|NR|SL>}       
            Pronoun:            {<NP>}
            Substantive:        {<Noun><Noun>*}
                                {<Pronoun>}
                                {<NominalizedVerb>}            
            NounPhrase:         {<XPN>*<MAG>*<Adjective>*<Substantive><Title>*<Location>*<PLU>*<JX>*<Preposition>*}
            
            Possessive:         {<NounPhrase><JKG><NounPhrase>}
            Component:          {<NounPhrase|Possessive><JC|ALS|ALT>}
            Connection:         {<Component><Component>*<NounPhrase|Possessive>}
            
            Constituent:        {<NounPhrase|Possessive|Connection>}
        
            Complement:         {<Constituent><JKC>} 
            Object:             {<Constituent><JKO>}  
            Subject:            {<Constituent><JKS>}
            
            Before:             {<Constituent|Object|Subject>*<Constituent><BEF>}
            Because:            {<Constituent|Object|Subject>*<Constituent><BEC>}
            
            Copula:             {<Constituent><Adverb>*<VCP><AuxiliaryVerb>*<VerbSuffix>}
            Predicate:          {<Adverb>*<Verb><AuxiliaryVerb>*<VerbSuffix>}
    
            """

        # Component: { < NounPhrase > < JC | ALS > }
        # Connection: { < Component > < Component > * < NounPhrase > }
        # Possessive: { < NounPhrase | Connection > < JKG > < NounPhrase | Connection > }

        # Constituent:        {<Subject|Object|Complement>}
        # Clause:             {<Constituent>*<Predicate>}
        # Sentence:           {<Clause><Clause>*}

        # gen chunk tree from the word-POS list under the above chunking grammar
        parser = nltk.RegexpParser(grammar, trace=1)
        print(parser._stages)
        chunkTree = parser.parse(mappedWords)

        # heuristic subtree simplifications
        # toss sentence end node
        if not isinstance(chunkTree[-1],
                          nltk.Tree) and chunkTree[-1][1] == 'SF':
            chunkTree.remove(chunkTree[-1])
        # flatten connection trees
        def flattenConnections(t):
            for st in t:
                if isinstance(st, nltk.Tree):
                    if st.label() == 'Connection':
                        # if Connection node, pull up component tuples into one long connection sequence
                        for i, c in enumerate(list(st)[:-1]):
                            st[2 * i] = c[0]
                            st.insert(2 * i + 1, c[1])
                    else:
                        flattenConnections(st)

        flattenConnections(chunkTree)

        # generate phrase-descriptors from top-level subtrees
        hiddenTags = {
            'Substantive',
            'Constituent',
            'NounPhrase',
            'Connection',
        }

        def flattenPhrase(t, phrase):
            for st in t:
                if isinstance(st, nltk.Tree):
                    phrase = flattenPhrase(st, phrase)
                    if st.label() not in hiddenTags:
                        phrase.append({"type": 'label', "word": st.label()})
                else:
                    phrase.append({
                        "type": 'word',
                        "word": st[0].strip(),
                        "tag": st[1]
                    })  # st[1][0] if st[1][0] in ('N', 'V') else st[0].strip()
            return phrase

        #
        phrases = []
        for t in chunkTree:
            if isinstance(t, nltk.Tree):
                phrase = flattenPhrase(t, [])
                if t.label() not in hiddenTags:
                    phrase.append({"type": 'label', "word": t.label()})
                phrases.append(phrase)
            else:
                phrases.append(('word', t[0].strip()))
        for p in phrases:
            print(p)

        # recursively turn the chunk tree into a Python nested dict for the JSON response
        def asDict(chunk):
            while isinstance(chunk, nltk.Tree) and len(chunk) == 1:
                # flatten degenerate tree nodes
                chunk = chunk[0]
            if isinstance(chunk, nltk.Tree):
                return dict(
                    type='tree',
                    tag='Sentence' if chunk.label() == 'S' else chunk.label(),
                    children=[asDict(t) for t in chunk])
            else:
                return dict(type='pos', word=chunk[0].strip(), tag=chunk[1])

        #
        parseTree = asDict(chunkTree)
        debugging = dict(posList=pformat(words),
                         mappedPosList=pformat(mappedWords),
                         phrases=pformat(phrases),
                         parseTree=pformat(parseTree))

        return jsonify(result="OK",
                       posList=words,
                       mappedPosList=mappedWords,
                       phrases=phrases,
                       parseTree=parseTree,
                       debugging=debugging)
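
Because everything under the if False: guard is unreachable, the chunking grammar above never actually runs. A stripped-down, runnable illustration of its two core ideas, a cascaded nltk.RegexpParser grammar (later rules match the chunk labels produced by earlier ones) and the asDict tree-to-dict conversion, might look like the following; the toy grammar and the hand-tagged three-token sentence are illustrative, not Khaiii output.

import nltk

# a toy two-stage grammar in the same cascaded style as the full grammar above
grammar = r"""
    Noun:       {<NNG>}
    NounPhrase: {<Noun><JKS>}
"""
chunkTree = nltk.RegexpParser(grammar).parse(
    [('고양이', 'NNG'), ('가', 'JKS'), ('잔다', 'VV')])

def asDict(chunk):
    # the same flatten-then-convert recursion used in parse() above
    while isinstance(chunk, nltk.Tree) and len(chunk) == 1:
        chunk = chunk[0]
    if isinstance(chunk, nltk.Tree):
        return dict(type='tree',
                    tag='Sentence' if chunk.label() == 'S' else chunk.label(),
                    children=[asDict(t) for t in chunk])
    return dict(type='pos', word=chunk[0].strip(), tag=chunk[1])

print(asDict(chunkTree))
# {'type': 'tree', 'tag': 'Sentence', 'children': [
#     {'type': 'tree', 'tag': 'NounPhrase', 'children': [
#         {'type': 'pos', 'word': '고양이', 'tag': 'NNG'},
#         {'type': 'pos', 'word': '가', 'tag': 'JKS'}]},
#     {'type': 'pos', 'word': '잔다', 'tag': 'VV'}]}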