def parse_using_stanfordparser(tokenized_sent, display_tree=False, printNP=False, printLeave=False):
    """Parse a sentence with the module-level ``stanford_parser`` and inspect its NPs.

    Args:
        tokenized_sent: Passed unchanged to ``stanford_parser.tagged_parse``.
            Presumably a list of ``(word, tag)`` pairs -- confirm against
            the parser configured elsewhere in this module.
        display_tree: When true, open the NLTK tree viewer for every parse.
        printNP: When true, walk every ``NP`` subtree of height <= 6.
            When false, nothing beyond the optional drawing happens.
        printLeave: Only consulted when ``printNP`` is true. If true, the
            NP text is printed (single-token NPs only when the token
            contains an upper-case letter, ``_`` or ``-``). If false, the
            NP's ``(word, tag)`` pairs are handed to ``regexp_ner_m2``.

    Returns:
        None.
    """
    # Compiled once up front; the pattern does not depend on the candidate.
    upper_token = re.compile(r'[A-Z_-]+', re.X)

    for parse in stanford_parser.tagged_parse(tokenized_sent):
        if display_tree:
            parse.draw()
        if not printNP:
            continue

        noun_phrases = parse.subtrees(
            filter=lambda t: t.label() == 'NP' and t.height() <= 6)
        for noun_phrase in noun_phrases:
            if printLeave:
                words = noun_phrase.leaves()
                text = ' '.join(words)
                # A lone token is only reported when it matches the pattern;
                # any other NP is always reported.
                if len(words) != 1 or upper_token.search(text):
                    print(text)
            else:
                # Tree.pos() pairs each leaf with the label of its own
                # pre-terminal.  Collecting the labels of all non-NP/S/VP
                # subtrees instead would slip phrase labels (PP, ADJP, ...)
                # into the list and shift every later tag onto the wrong word.
                tagged = [(word, tag.encode('gbk'))
                          for word, tag in noun_phrase.pos()]
                # NOTE(review): the entity list returned here is discarded,
                # exactly as before -- confirm whether callers expect output.
                regexp_ner_m2(regexp_grammar, tagged)
def getSecondLvNPsOfParseTree(parse_tree, nps, display_tree=False):
    """Collect and print every ``NP`` of height 3 found beneath ``parse_tree``.

    For each matching NP the function prints a ``NP: <words>`` header,
    then one ``word<TAB>label<TAB>B-NP|I-NP`` line per subtree whose label
    is in the module-level ``penni_tags`` (presumably the Penn Treebank
    POS tag set -- confirm where it is defined), and finally appends the
    NP's leaves to ``nps``.

    Args:
        parse_tree: An NLTK ``Tree`` (or any iterable of trees and leaves).
        nps: List that is extended in place with one list of leaves per NP.
        display_tree: When true, pretty-print ``parse_tree`` first.

    Returns:
        None; results are delivered through ``nps`` and stdout.
    """
    if display_tree:
        Tree.pretty_print(parse_tree)

    for child in parse_tree:
        if not isinstance(child, Tree):
            # Reached a leaf token: nothing to descend into.
            continue

        if child.label() != 'NP' or child.height() != 3:
            # Any other tree node (non-NP, or an NP of another height) is
            # simply searched further down.
            getSecondLvNPsOfParseTree(child, nps, False)
            continue

        words = child.leaves()
        print('\nNP: ' + ' '.join(words))

        chunk_flag = "B-NP"
        # No recursion is needed in here: every node below a height-3 tree
        # has height <= 2, so it can never satisfy the height == 3 test.
        for node in child.subtrees():
            if node.label() in penni_tags:
                print(node.leaves()[0] + '\t' + node.label() + '\t' + chunk_flag)
                chunk_flag = "I-NP"

        nps.append(words)
def generate_chunks(tagged_sent, expression=r'CHUNK: {(<adj>* <n.*>+ <prp>)? <adj>* <n.*>+}'):
    """Return the leaves of every ``CHUNK`` subtree produced by a regexp chunker.

    Args:
        tagged_sent: Sequence of tagged tokens handed to
            ``RegexpParser.parse``; an empty sequence yields no chunks.
        expression: Chunk grammar for ``RegexpParser``.  Only subtrees
            labelled ``"CHUNK"`` are collected, whatever the grammar defines.

    Returns:
        A list with one list of leaves per chunk, or ``[]`` if parsing
        raises ``ValueError``.
    """
    parser = RegexpParser(expression)
    try:
        parsed = (Tree('S', []) if len(tagged_sent) == 0
                  else parser.parse(tagged_sent, trace=0))
        return [node.leaves()
                for node in parsed.subtrees()
                if node.label() == "CHUNK"]
    except ValueError:
        return []
def regexp_ner_m2(grammar_re, tagged_sentence):
    """Chunk a tagged sentence and return the text of each ``NE`` subtree.

    Args:
        grammar_re: Chunk grammar passed to ``nltk.RegexpParser``.
        tagged_sentence: Sequence of tagged tokens; the first element of
            each token is taken as the word.

    Returns:
        A list of strings, one per ``NE`` subtree of height <= 5, each
        being that subtree's words joined by single spaces.  Empty when no
        such subtree exists.
    """
    parsed = nltk.RegexpParser(grammar_re).parse(tagged_sentence)
    entities = parsed.subtrees(
        filter=lambda t: t.label() == 'NE' and t.height() <= 5)
    # The former ``if nps is not []`` guard compared identity with a fresh
    # list and was therefore always true; iterating an empty result already
    # yields an empty list, so no guard is needed.
    return [' '.join(leaf[0] for leaf in entity.leaves())
            for entity in entities]
def get_noun_phrases(text_list, tagger):
    """Extract noun phrases from whitespace-tokenised texts.

    Each text is split on whitespace, tagged with ``tagger.tag`` and
    chunked with a fixed ``NOUN_PHRASE`` regexp grammar.

    Args:
        text_list: Iterable of strings.
        tagger: Object exposing ``tag(tokens)``, returning tagged tokens
            whose first element is the word.

    Returns:
        A list of noun phrases, each a list of words, in input order.
        A sentence whose chunking raises ``ValueError`` contributes
        nothing; phrases from the other sentences are kept.
    """
    expression = r'NOUN_PHRASE: {(<adj>* <n.*>+ <prp>)? <adj>* <n.*>+}'
    chunk_parser = RegexpParser(expression)
    tagged_texts = [tagger.tag(text.split()) for text in text_list]

    noun_phrases = []
    for tagged_sent in tagged_texts:
        if not tagged_sent:
            # Nothing to chunk in an empty sentence.
            continue
        try:
            tree = chunk_parser.parse(tagged_sent, trace=0)
            sentence_phrases = [
                [leaf[0] for leaf in node.leaves()]
                for node in tree.subtrees()
                if node.label() == "NOUN_PHRASE"
            ]
        except ValueError:
            # Skip only the offending sentence.  Resetting the whole result
            # here used to throw away phrases from earlier sentences.
            continue
        noun_phrases.extend(sentence_phrases)
    return noun_phrases
def getFirstLvNPsOfParseTree(parse_tree, nps, display_tree=False):
    """Collect and print the outermost ``NP`` subtrees beneath ``parse_tree``.

    The search descends through non-NP nodes and stops at the first NP on
    each path.  For every such NP it prints a ``NP: <words>`` header, one
    ``word<TAB>label<TAB>B-NP|I-NP`` line per subtree whose label is in the
    module-level ``penni_tags``, and appends the NP's leaves to ``nps``.

    Args:
        parse_tree: An NLTK ``Tree`` (or any iterable of trees and leaves).
        nps: List that is extended in place with one list of leaves per NP.
        display_tree: When true, pretty-print ``parse_tree`` first.

    Returns:
        None; results are delivered through ``nps`` and stdout.
    """
    if display_tree:
        Tree.pretty_print(parse_tree)

    for child in parse_tree:
        if not isinstance(child, Tree):
            # Leaf token: nothing below it.
            continue

        if child.label() != 'NP':
            # Keep looking for an NP further down this branch.
            getFirstLvNPsOfParseTree(child, nps, False)
            continue

        print('\nNP: ' + ' '.join(child.leaves()))

        chunk_flag = "B-NP"
        # Nested NPs are not searched separately; their tagged words are
        # listed as part of this outer NP.
        for node in child.subtrees():
            if node.label() in penni_tags:
                print(node.leaves()[0] + '\t' + node.label() + '\t' + chunk_flag)
                chunk_flag = "I-NP"

        nps.append(child.leaves())
Tree('List', [ Tree('Item', ['f1']), Tree('List', [Tree('Item', ['f2']), Tree('List', [Tree('Item', ['f3'])])]) ]), 'to', Tree('Item', ['folder']) ]) print(t2) print(t2.flatten()) print(type(t2.flatten())) print(t2.collapse_unary()) max_subtree = Tree('', []) for subtree in t2.subtrees(filter=lambda x: x.label() == 'List'): if len(subtree.flatten().pos()) > len(max_subtree.pos()): max_subtree = subtree print(max_subtree) ''' tmp = t2 i = while tmp.label() != 'List': i += 1 tmp = tmp.pos() print("the " + str(i) + " time:") print(tmp) print(type(tmp)) tmp = tmp.flatten() print(t2)
'copy', Tree('List', [ Tree('Item', ['f1']), Tree('List', [ Tree('Item', ['f2']), Tree('List', [Tree('Item', ['f3'])])])]), 'to', Tree('Item', ['folder'])]) print(t2) print(t2.flatten()) print(type(t2.flatten())) print(t2.collapse_unary()) max_subtree = Tree('', []) for subtree in t2.subtrees(filter = lambda x: x.label() == 'List'): if len(subtree.flatten().pos()) > len(max_subtree.pos()): max_subtree = subtree print(max_subtree) ''' tmp = t2 i = while tmp.label() != 'List': i += 1 tmp = tmp.pos() print("the " + str(i) + " time:") print(tmp) print(type(tmp)) tmp = tmp.flatten()