def _parse(writer, log_file, name, chunk, parsers): parsed_objects = {} for line in Chunker.parse(Chunker.read(log_file, chunk)): for parser in parsers: try: parsed_object = parser.from_log_line(line, name) parsed_objects.setdefault(parsed_object.file_name, []).append(parsed_object) break except ParseException: pass for key in parsed_objects: writer.append_csv(key, parsed_objects[key]) logging.info('Parsing the type of tick {}' .format(len(parsed_objects)))
def parseInput(input, parser="RD", showAllLevels=False, getWordDefinitions=True): "parse input string into list of parsed contained sentence structures" # parser can be RD for recusrsive descent (currently the most-developed) or "NLTK" for the original NLTK chunking-grammar parser # clean & build a string for the KHaiii phoneme analyzer input = input.strip() if input[-1] not in ['.', '?', '!']: input += '.' input = re.sub( r'\s+([\.\?\;\,\:])', r'\1', input) # elide spaces preceding clause endings, throws Khaiii off # input = input.replace(',', ' , ').replace(';', ' ; ').replace(':', ' : ') - adding a space before punctuation seems to mess tagging in Khaiii log("* parse {0}".format(input)) # run Khaiii, grab the parts-of-speech list it generates (morphemes + POS tags) and extract original word-to-morpheme groupings sentences = [] # handle possible multiple sentences posList = [] morphemeGroups = [] for w in khaiiiAPI.analyze(input): morphemeGroups.append( [w.lex, [m.lex for m in w.morphs if m.tag != 'SF']]) for m in w.morphs: posList.append('{0}:{1}'.format(m.lex.strip(), m.tag)) if m.tag == 'SF': # sentence end, store extractions & reset for possible next sentence sentences.append( dict(posList=posList, morphemeGroups=morphemeGroups, posString=';'.join(posList))) posList = [] morphemeGroups = [] for s in sentences: # map POS through synthetic tag mapper & extract word groupings mappedPosList, morphemeGroups = TagMap.mapTags( s['posString'], s['morphemeGroups']) #, disableMapping=True) log(" {0}".format(s['posString'])) log(" mapped to {0}".format(mappedPosList)) if parser == "NLTK": # NLTK chunking parser # perform chunk parsing chunkTree = Chunker.parse(mappedPosList, trace=2) chunkTree.pprint() # apply any synthetic-tag-related node renamings TagMap.mapNodeNames(chunkTree) # extract popup wiki definitions & references links & notes for implicated nodes references = TagMap.getReferences(chunkTree) # build descriptive phrase list phrases = Chunker.phraseList(chunkTree) # parseTreeDict = buildParseTree(chunkTree, showAllLevels=showAllLevels) else: # recursive-descent parser from rd_grammar import KoreanParser parser = KoreanParser([":".join(p) for p in mappedPosList]) parseTree = parser.parse(verbose=1) if parseTree: # apply any synthetic-tag-related node renamings parseTree.mapNodeNames() # extract popup wiki definitions & references links & notes for implicated nodes references = parseTree.getReferences() # build descriptive phrase list phrases = parseTree.phraseList() # get noun & verb translations from Naver wordDefs = getWordDefs( mappedPosList) if getWordDefinitions else {} # build JSONable parse-tree dict parseTreeDict = parseTree.buildParseTree( wordDefs=wordDefs, showAllLevels=showAllLevels) log(" {0}".format(parseTree)) else: # parsing failed, return unrecognized token parseTree = references = parseTreeDict = phrases = None s.update( dict(error="Sorry, failed to parse sentence", lastToken=parser.lastTriedToken())) log(" ** failed. Unexpected token {0}".format( parser.lastTriedToken())) # format debugging daat debugging = dict(posList=pformat(s['posList']), mappedPosList=pformat(mappedPosList), phrases=pformat(phrases), morphemeGroups=pformat(morphemeGroups), parseTree=pformat(parseTreeDict), references=references) # add parsing results to response structure s.update( dict(mappedPosList=mappedPosList, morphemeGroups=morphemeGroups, parseTree=parseTreeDict, references=references, phrases=phrases, debugging=debugging)) # return sentences
def parse(): "parse POSTed Korean sentence" # grab sentence to parse sentence = request.form.get('sentence') if not sentence: return jsonify(result="FAIL", msg="Missing sentence") # build a string for the KHaiii phoneme analyzer if sentence.strip()[-1] not in ['.', '?', '!']: sentence += '.' # run Khaiii words = [] for w in khaiiiAPI.analyze(sentence): for m in w.morphs: words.append('{0}:{1}'.format(m.lex.strip(), m.tag)) posString = ';'.join(words) # map POS through synthetic tag mapper mappedPosList = TagMap.mapTags(posString) # perform chunk parsing chunkTree = Chunker.parse(mappedPosList) # apply any synthetic-tag-related node renamings TagMap.mapNodeNames(chunkTree) # build descriptive phrase list phrases = Chunker.phraseList(chunkTree) # recursively turn the chunk tree into a Python nested dict for the JSON response def asDict(chunk): while isinstance(chunk, nltk.Tree) and len(chunk) == 1: # flatten degenerate tree nodes chunk = chunk[0] if isinstance(chunk, nltk.Tree): return dict( type='tree', tag='Sentence' if chunk.label() == 'S' else chunk.label(), children=[asDict(t) for t in chunk]) else: return dict(type='pos', word=chunk[0].strip(), tag=chunk[1]) # parseTree = asDict(chunkTree) debugging = dict(posList=pformat(words), mappedPosList=pformat(mappedPosList), phrases=pformat(phrases), parseTree=pformat(parseTree)) return jsonify(result="OK", posList=words, mappedPosList=mappedPosList, phrases=phrases, parseTree=parseTree, debugging=debugging) if False: # synthetic tag patterns - # patterns of these word:POC strings are preprocessed to define new # synthetic word:POC tags used in the chunking grammar below # at present, these are applied in the order longest-to-shortest pattern, we should probably make this a listfor explicit ordering tagMappings = { r'들:(TM|XSN)': r'들:PLU', # pluralizer r'기:(ETN|NNG)': r'기:GNOM', # nominalizer r'(ㄴ|는|ㄹ):ETM;것:NNB': r'\1 것:GNOM', # nominalizer r'(은|는):JX': r'\1:JKS', # turn topic-marking partcile into subject-marker (I think this is right??) r'(ㄹ|을|를):ETM;거:NNB;이:VCP': r'\1 거 이:FUT', # ㄹ/를 거 이다 future-tense conjugator (hack!) r'전:NNG;에:JKB': r'전에:BEF', # before r'때문:NNB;에:JKB': r'때문에:BEC', # because r'및:MAG': r'및:ALS', # also connector (why is it an adverb??) r'또는:MAG': r'또는:ALT', # alternative connector (why is it an adverb??) r'에:JKB;(대하|관하):VV;([^:]+):EC': r'에 \1\2:PRP', # preposition "about } # prepositional phrase suffix-patterns (generate a <PRPP> pos-tag + metadata to label the parsing # tag-pattern replacement subtree name-mapping reference links # (r'전:NNG;에:JKB', r'전에:PRP', "PrepositionalPhrase:Before", ("ttmik:lessons/level-3-lesson-10", "htsk:unit1/unit-1-lessons-17-25-2/lesson-24/#242")) # before # generate a version of the parser's original word:POC string including synthetic tag mappings above tagString = ';' + posString + ';' for old, new in sorted(tagMappings.items(), key=lambda x: len(x[0]), reverse=True): tagString = re.sub(';' + old + ';', ';' + new + ';', tagString) mappedWords = [ tuple(pos.split(':')) for pos in tagString.strip(';').split(';') ] # Korean phrase NLTK chunking grammar grammar = r""" HadaVerb: {<NN.*><XSV>} AuxiliaryVerb: {<EC><VX|VV>} Adverb: {<MAG>} NominalizedVerb: {<VV|HadaVerb><EP|FUT>*<GNOM>} Adjective: {<Adverb>*<VA><ETM>} DescriptiveVerb: {<VA>} Verb: {<VV|VCN|HadaVerb|DescriptiveVerb>} VerbSuffix: {<EP|FUT>*<EF|EC>} Location: {<JKB>} Title: {<XSN>} Preposition: {<PRP>} Noun: {<NN.*|NR|SL>} Pronoun: {<NP>} Substantive: {<Noun><Noun>*} {<Pronoun>} {<NominalizedVerb>} NounPhrase: {<XPN>*<MAG>*<Adjective>*<Substantive><Title>*<Location>*<PLU>*<JX>*<Preposition>*} Possessive: {<NounPhrase><JKG><NounPhrase>} Component: {<NounPhrase|Possessive><JC|ALS|ALT>} Connection: {<Component><Component>*<NounPhrase|Possessive>} Constituent: {<NounPhrase|Possessive|Connection>} Complement: {<Constituent><JKC>} Object: {<Constituent><JKO>} Subject: {<Constituent><JKS>} Before: {<Constituent|Object|Subject>*<Constituent><BEF>} Because: {<Constituent|Object|Subject>*<Constituent><BEC>} Copula: {<Constituent><Adverb>*<VCP><AuxiliaryVerb>*<VerbSuffix>} Predicate: {<Adverb>*<Verb><AuxiliaryVerb>*<VerbSuffix>} """ # Component: { < NounPhrase > < JC | ALS > } # Connection: { < Component > < Component > * < NounPhrase > } # Possessive: { < NounPhrase | Connection > < JKG > < NounPhrase | Connection > } # Constituent: {<Subject|Object|Complement>} # Clause: {<Constituent>*<Predicate>} # Sentence: {<Clause><Clause>*} # gen chunk tree from the word-POS list under the above chunking grammar parser = nltk.RegexpParser(grammar, trace=1) print(parser._stages) chunkTree = parser.parse(mappedWords) # heuristic subtree simplifications # toss sentence end node if not isinstance(chunkTree[-1], nltk.Tree) and chunkTree[-1][1] == 'SF': chunkTree.remove(chunkTree[-1]) # flatten connection trees def flattenConnections(t): for st in t: if isinstance(st, nltk.Tree): if st.label() == 'Connection': # if Connection node, pull up component tuples into one long connection sequence for i, c in enumerate(list(st)[:-1]): st[2 * i] = c[0] st.insert(2 * i + 1, c[1]) else: flattenConnections(st) flattenConnections(chunkTree) # generate phrase-descriptors from top-level subtrees hiddenTags = { 'Substantive', 'Constituent', 'NounPhrase', 'Connection', } def flattenPhrase(t, phrase): for st in t: if isinstance(st, nltk.Tree): phrase = flattenPhrase(st, phrase) if st.label() not in hiddenTags: phrase.append({"type": 'label', "word": st.label()}) else: phrase.append({ "type": 'word', "word": st[0].strip(), "tag": st[1] }) # st[1][0] if st[1][0] in ('N', 'V') else st[0].strip() return phrase # phrases = [] for t in chunkTree: if isinstance(t, nltk.Tree): phrase = flattenPhrase(t, []) if t.label() not in hiddenTags: phrase.append({"type": 'label', "word": t.label()}) phrases.append(phrase) else: phrases.append(('word', t[0].strip())) for p in phrases: print(p) # recursively turn the chunk tree into a Python nested dict for the JSON response def asDict(chunk): while isinstance(chunk, nltk.Tree) and len(chunk) == 1: # flatten degenerate tree nodes chunk = chunk[0] if isinstance(chunk, nltk.Tree): return dict( type='tree', tag='Sentence' if chunk.label() == 'S' else chunk.label(), children=[asDict(t) for t in chunk]) else: return dict(type='pos', word=chunk[0].strip(), tag=chunk[1]) # parseTree = asDict(chunkTree) debugging = dict(posList=pformat(words), mappedPosList=pformat(mappedWords), phrases=pformat(phrases), parseTree=pformat(parseTree)) return jsonify(result="OK", posList=words, mappedPosList=mappedWords, phrases=phrases, parseTree=parseTree, debugging=debugging)