from nltk.parse.corenlp import CoreNLPParser


def _create_parser(url):
    """Return a CoreNLPParser for `url`, or None if the server is unreachable."""
    try:
        parser = CoreNLPParser(url=url)
        parser.raw_parse('This is a test sentence.')
    except Exception:
        parser = None
    return parser
from nltk.parse.corenlp import CoreNLPDependencyParser, CoreNLPParser


class CNLP:
    CNLPServerURL = 'http://localhost:9000'

    def __init__(self):
        self.parser = CoreNLPParser(url=self.CNLPServerURL)
        self.dep_parser = CoreNLPDependencyParser(url=self.CNLPServerURL)
        self.ner_tagger = CoreNLPParser(url=self.CNLPServerURL, tagtype='ner')
        self.pos_tagger = CoreNLPParser(url=self.CNLPServerURL, tagtype='pos')

    def getParse(self, sentence):
        if isinstance(sentence, list):
            return self.parser.parse(sentence)
        return self.parser.raw_parse(sentence)

    def getDepParse(self, sentence):
        if isinstance(sentence, list):
            return self.dep_parser.parse(sentence)
        return self.dep_parser.raw_parse(sentence)

    def getNERTags(self, sentence):
        if not isinstance(sentence, list):
            sentence = sentence.split()
        return self.ner_tagger.tag(sentence)

    def getPOSTags(self, sentence):
        # Use the tagger interface, mirroring getNERTags; the original called
        # parse()/raw_parse() here, which returns parse trees rather than POS tags.
        if not isinstance(sentence, list):
            sentence = sentence.split()
        return self.pos_tagger.tag(sentence)
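# A minimal usage sketch for the CNLP wrapper above, assuming a CoreNLP server
# is already listening on localhost:9000 with the parse and NER annotators loaded.
nlp = CNLP()
tree = next(nlp.getParse('The quick brown fox jumps over the lazy dog.'))
tree.pretty_print()  # constituency tree
print(nlp.getNERTags('Stanford University is in California'))  # [(word, tag), ...]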
class Parser:
    def __init__(self):
        self.parser = CoreNLPParser()
        # Ignore HTTP(S)_PROXY environment variables when talking to the server.
        self.parser.session.trust_env = False

    def parse(self, sentence):
        return self.parser.raw_parse(sentence)
def get_bigram_and_deep_syntax_feature(review, speller, stop_words, ps, preprocess):
    res = ""
    productions = []
    parser = CoreNLPParser(url='http://localhost:9500')
    for sentence in re.split(r"[.!?]", review):
        try:
            tree = next(parser.raw_parse(sentence))
            # Optimize by creating Chomsky normal form
            tree.collapse_unary(collapsePOS=False)
            tree.chomsky_normal_form(horzMarkov=2)
            productions += tree.productions()
        except StopIteration:
            # End of review reached
            break
    S = Nonterminal('S')
    grammar = induce_pcfg(S, productions)
    count = 0
    for line in str(grammar).split("\n"):
        if count == 0:
            count += 1
            continue
        elif "'" in line:
            res += re.sub(r"[(->) `\'\"\[\d\]]", "", line) + " "
    res += bipos.get_bigrams_and_unigrams_of_sentence(
        bow.sanitize_sentence(review, speller, stop_words, ps, preprocess))
    return res
def parse_tree(self, s):
    parser = CoreNLPParser()
    parse = next(parser.raw_parse(s))
    # parse.draw()
    return parse
from nltk.parse.corenlp import CoreNLPDependencyParser, CoreNLPParser
from nltk.parse.dependencygraph import DependencyGraph


class NLTK_NLP():

    def __init__(self, ip_port):
        self.dep_parser = CoreNLPDependencyParser(url=ip_port)
        self.ner_parser = CoreNLPParser(url=ip_port, tagtype='ner')
        self.parser = CoreNLPParser(url=ip_port)
        self.pos_tagger = CoreNLPParser(url=ip_port, tagtype='pos')

    def generate_dependency_tree(self, sentence):
        '''what is the name of the asteroid ?'''
        dependency_tree, = self.dep_parser.raw_parse(sentence=sentence)
        return dependency_tree

    def generate_dependency_graph(self, sentence):
        '''12 {'address': 12, 'word': '.', 'lemma': '.', 'ctag': '.', 'tag': '.',
        'feats': '', 'head': 1, 'deps': defaultdict(<class 'list'>, {}), 'rel': 'punct'}
        7-tuple, where the values are ``word, lemma, ctag, tag, feats, head, rel``.'''
        dependency_tree, = self.dep_parser.raw_parse(sentence=sentence)
        return DependencyGraph(dependency_tree.to_conll(10))

    def generate_constituency_tree(self, sentence):
        '''input: one question'''
        tree_list = list(self.parser.raw_parse(sentence=sentence))
        return tree_list[0]

    def get_pos(self, sentence):
        '''What is the airspeed of an unladen swallow ?
        [('What', 'WP'), ('is', 'VBZ'), ('the', 'DT'), ('airspeed', 'NN'), ('of', 'IN'),
        ('an', 'DT'), ('unladen', 'JJ'), ('swallow', 'VB'), ('?', '.')]'''
        pos_list = list(self.pos_tagger.tag(sentence.split()))
        # tokens = nltk.word_tokenize(sentence)
        # wordpos = nltk.pos_tag(tokens)
        return pos_list

    def get_pos_by_tokens(self, tokens):
        '''What is the airspeed of an unladen swallow ?'''
        pos_list = list(self.pos_tagger.tag(tokens))
        return pos_list

    def get_ner(self, sentence):
        # tokens = 'Rami Eid is studying at Stony Brook University in NY'.split()
        '''april the 26th, 1882 is the birth date of which athletes ?
        [('april', 'DATE'), ('the', 'DATE'), ('26th', 'DATE'), (',', 'DATE'), ('1882', 'DATE'),
        ('is', 'O'), ('the', 'O'), ('birth', 'O'), ('date', 'O'), ('of', 'O'), ('which', 'O'),
        ('athletes', 'O'), ('?', 'O')]'''
        sequence_ner_tuple_list = self.ner_parser.tag(sentence.split())
        sequence_ner_list = []
        for i, (word, ner_tag) in enumerate(sequence_ner_tuple_list):
            sequence_ner_list.append(ner_tag)
        return sequence_ner_list

    def get_toknizer(self, sentence):
        return list(self.parser.tokenize(sentence))

    def find_phrases(self, tree, phrase_tag='NP'):
        return [subtree.leaves() for subtree in tree.subtrees(lambda t: t.label() == phrase_tag)]
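# A hedged usage sketch for NLTK_NLP above, assuming a CoreNLP server on
# localhost:9000; DependencyGraph.triples() yields (governor, relation, dependent).
nlp = NLTK_NLP('http://localhost:9000')
graph = nlp.generate_dependency_graph('What is the name of the asteroid ?')
for governor, relation, dependent in graph.triples():
    print(governor, relation, dependent)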
def stanford_nlp():
    parser = CoreNLPParser()
    text1 = "There is still a place for mercenaries working for NGOs."
    text2 = "The Rich Poor Gap Silences the Political Voice of the Poor"
    text3 = "Legislation against mercenaries"
    for text in [text1, text2, text3]:
        parse = next(parser.raw_parse(text))
        print(parse)
        has_sent = False
        for item in parse.subtrees():
            if item.label() == "S":
                has_sent = True
        print(has_sent)
def convert_text_tree(sentence):
    """
    Converts a given sentence into a sentiment-treebank-like tree.

    :param sentence: String that needs to be converted.
    :return: String encoding the tree structure.
    """
    parser = CoreNLPParser()
    # Parse the sentence into nltk tree nodes
    root, = next(parser.raw_parse(sentence))
    # Recursively build the text
    return get_node_text(root)
def parse_consituency_tree(sentence_list):
    pos_parent = []
    right_sublings_list = []
    chunk_position = []
    sen = mergeWords(sentence_list)
    parser = CoreNLPParser(url="http://localhost:9000")
    parse, = parser.raw_parse(sen)
    parse.pretty_print()
    newtree = ParentedTree.convert(parse)
    leaf_values = newtree.leaves()
    for i, word in enumerate(sentence_list):
        index = find_closest_words(i, word, leaf_values)
        if index >= 0 and index < len(leaf_values):
            tree_location = newtree.leaf_treeposition(index)
            parent = newtree[tree_location[:-2]].label()
            pos_parent.append(parent)
            # ---------------- find right sibling ----------------
            right_sibling = newtree[tree_location[:-1]].right_sibling()
            # count = calcuate_nodes(right_sibling)
            if parent == "NP" and right_sibling is not None and calcuate_nodes(right_sibling) == 1:
                count = calcuate_nodes(right_sibling)
                # print(count)
                right_sublings_list.append(right_sibling.leaves()[0])
            else:
                right_sublings_list.append(" ")
            # ---------------- find chunk item position ----------------
            height = newtree[tree_location[:-2]].height()
            # Only handle the lowest-level NP (tree height == 3)
            if parent == "NP" and height == 3:
                chunk_item_list = newtree[tree_location[:-2]].leaves()
                print(newtree[tree_location[:-2]].height())
                for i, item in enumerate(chunk_item_list):
                    if item == leaf_values[index]:
                        chunk_position.append(i + 1)
                        break
            else:
                chunk_position.append(" ")
        else:
            pos_parent.append("null")
            right_sublings_list.append("null")
            chunk_position.append(" ")
    return pos_parent, right_sublings_list, chunk_position
from nltk.parse.corenlp import CoreNLPDependencyParser, CoreNLPParser, CoreNLPServer


class TreeParser:
    def __init__(self):
        self.parser = None
        self.server = None
        self.dependency_parser = None

    def setup(self):
        url = settings.CORENLP_URL
        if url is None:
            server = CoreNLPServer(
                settings.CORENLP_PATH,
                settings.CORENLP_MODEL_PATH,
            )
            server.start()
            self.server = server
            url = server.url
        else:
            print("[TreeParser] Using existing CoreNLP Server...")
        self.parser = CoreNLPParser(url=url)
        # maybe separated into another class...
        self.dependency_parser = CoreNLPDependencyParser(url=url)
        return self.parser

    def parse(self, sentence):
        if not self.parser:
            raise AttributeError('parser is not set up')
        return self.parser.raw_parse(sentence)

    def free(self):
        if not self.server:
            return
        self.server.stop()

    def dependency_parse(self, sentence):
        if not self.dependency_parser:
            raise AttributeError('dependency parser is not set up')
        return self.dependency_parser.raw_parse(sentence)
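# A sketch of the intended TreeParser lifecycle (an assumption based on the class
# above): `settings` must provide CORENLP_URL, or CORENLP_PATH and
# CORENLP_MODEL_PATH so a local server can be spawned.
tree_parser = TreeParser()
tree_parser.setup()
try:
    for tree in tree_parser.parse('The old oak tree fell.'):
        tree.pretty_print()
finally:
    tree_parser.free()  # stops the server only if setup() started one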
class JeopardyParser():
    def __init__(self, url='http://localhost:9000', encoding='utf8'):
        """Make sure the CoreNLP server is running before calling the parsers.

        CoreNLP runs by default on port 9000, but if an external server is
        used or a different port is selected at startup, the url will need
        to be passed explicitly.
        """
        self.NERT = CoreNLPNERTagger(url=url)
        self.Parser = CoreNLPParser(url=url, encoding=encoding)
        self.dep_parser = DepParser(url=url)

    def tag(self, sentence):
        """Return the sentence after tagging named entities."""
        sentence = self.clean_sentence(sentence)
        sentence = sentence.split()
        sentence = self.NERT.tag(sentence)
        return sentence

    def lexname(self, word, index=0):
        """Return the lexname entry for a word in WordNet."""
        synset = wordnet.synsets(word)
        lex = synset[index].lexname()
        return lex

    def parse(self, sentence):
        """Return the syntactic Tree object for a sentence."""
        sentence = self.clean_sentence(sentence)
        parse, = self.Parser.raw_parse(sentence)
        return parse

    def clean_sentence(self, sentence):
        """Remove backslash apostrophes from the data."""
        s = sentence.replace("\\'", "'")
        return s

    def check_syntax(self, tree, labels=['SBARQ', 'WHNP']):
        """Return True if the sentence type is in the provided labels."""
        if tree[0].label() not in labels:
            print('Malformed question', tree[0].label())
            return False
        elif tree[0].label() in labels:
            return True
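# A hedged example of driving JeopardyParser; it assumes a CoreNLP server on
# port 9000 and that the WordNet corpus has been downloaded for lexname().
jp = JeopardyParser()
tree = jp.parse('What is the capital of France')
print(jp.tag('What is the capital of France'))
print(jp.check_syntax(tree))  # True only for SBARQ/WHNP question trees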
def test_leaves():
    parser = CoreNLPParser(url="http://localhost:9000")
    parse, = parser.raw_parse(
        "we will collect user informaiton, and google user emails")
    # The misspelling "informaiton" matches sen_list below and appears intentional.
    sen_list = [
        "we", "will", "collect", "user", "informaiton,", "and", "google",
        "user", "emails."
    ]
    parse.pretty_print()
    newtree = ParentedTree.convert(parse)
    leaf_values = newtree.leaves()
    for i, word in enumerate(sen_list):
        node_index = find_closest_words(i, word, leaf_values)
        tree_location = newtree.leaf_treeposition(node_index)
        print(i)
        print(word)
        print("---------------------")
        print(node_index)
        print(newtree[tree_location[:-1]].leaves()[0])
        print("\n")
def get_parser_tree_from_phrase(phrase):
    # ini_path = "/stanford/jars"
    # os.environ['STANFORD_PARSER'] = ini_path
    # os.environ['STANFORD_MODELS'] = ini_path
    '''
    parser = stanford.StanfordParser(model_path= ini_path + "/stanford-parser-3.9.2-models/edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")
    parse_generator = parser.raw_parse(phrase)
    for line in parse_generator:
        parse_tree = line
        break
    '''
    parse_tree = Tree(None, [])
    parser = CoreNLPParser(url=os.environ['STANFORD_NLP_TOOLS'])
    try:
        parse_generator = parser.raw_parse(phrase)
        for line in parse_generator:
            parse_tree = line
            break
    except Exception:
        print('Something went wrong when trying to get the parse tree from the Stanford Parser!')
    return parse_tree
def create_grammar_of_sample(review_type, sample_id):
    # DON'T FORGET TO RUN THE STANFORD CORENLP SERVER BY RUNNING THIS JAVA COMMAND IN THE ROOT FOLDER:
    # java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -annotators "tokenize,ssplit,pos,lemma,parse,sentiment" -port 9500 -timeout 30000
    if review_type == 'regular':
        reader = yelp.get_regular_balanced_sample_reader(sample_id)
    elif review_type == '45stars':
        reader = yelp.get_45stars_balanced_sample_reader(sample_id)
    elif review_type == '12stars':
        reader = yelp.get_12stars_balanced_sample_reader(sample_id)
    parser = CoreNLPParser(url='http://localhost:9500')
    productions = []
    label, review = yelp.get_next_review_and_label(reader)
    while label != "-1":
        for sentence in re.split(r"[.!?]", review):
            try:
                tree = next(parser.raw_parse(sentence))
                # Optimize by creating Chomsky normal form
                tree.collapse_unary(collapsePOS=False)
                tree.chomsky_normal_form(horzMarkov=2)
                productions += tree.productions()
            except StopIteration:
                # End of review reached
                break
        label, review = yelp.get_next_review_and_label(reader)
    S = Nonterminal('S')
    grammar = induce_pcfg(S, productions)
    output = open('../grammars/sample_' + review_type + "_" + str(sample_id), 'wb')
    dump(grammar, output, -1)
    output.close()
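# A small companion sketch: read back a grammar pickled by
# create_grammar_of_sample and inspect it. The path is hypothetical and
# mirrors the dump() call above for review_type 'regular', sample_id 0.
from pickle import load

with open('../grammars/sample_regular_0', 'rb') as f:
    grammar = load(f)
print(grammar.start())            # the Nonterminal S
print(len(grammar.productions())) # number of induced PCFG productions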
def getParser():
    parser = CoreNLPParser(url="http://localhost:9000")
    parse, = parser.raw_parse(
        "Do not interfere with, intercept, disrupt, filter, or disable any features of Google or the Twitter API, including the Twitter Content of embedded Tweets and embedded timelines"
    )
    parse.pretty_print()
    newtree = ParentedTree.convert(parse)
    # for i, child in enumerate(newtree):
    #     print(type(child))
    leaf_values = newtree.leaves()
    print(leaf_values[12:])
    if 'Twitter' in leaf_values[12:]:
        leaf_index = leaf_values.index('Twitter')
        tree_location = newtree.leaf_treeposition(leaf_index)
        print(tree_location[:-1])
        # print(newtree[tree_location[:-2]].right_sibling().leaves())
        # print(newtree[tree_location[:-2]].right_sibling().height())
        # print(calcuate_nodes(newtree[tree_location[:-1]].right_sibling()))
        # print(newtree[tree_location[:-1]].right_sibling().height())
        # print(tree_location[:-2])
        # print(newtree[tree_location[:-1]].label())
        '''
        path = []
        for l in range(1, len(tree_location)):
            print(newtree[tree_location[:-l]].label())
            path.append(newtree[tree_location[:-l]].label())
        print(obtain_dash_list(path[::-1]))
        '''
        # subtree.left_sibling()
        parent_tree = newtree[tree_location[:-2]].leaves()
        print(parent_tree)
import collections
import re

from nltk.parse.corenlp import CoreNLPDependencyParser, CoreNLPParser
from nltk.tree import ParentedTree

# ApplicationConfig and SVONode are provided by the surrounding project.


class SVO():

    def __init__(self, sentence):
        config = ApplicationConfig.get_corenlp_config()
        self._parser = CoreNLPParser(url=f"http://{config['host']}:{config['port']}")
        self._dependency = CoreNLPDependencyParser(url=f"http://{config['host']}:{config['port']}")
        sentence = sentence.replace('  ', ' ')
        sentence = sentence.replace('.', '')
        self._load(sentence)
        self.original = sentence

    def get_dependency_tree(self):
        return self._dependency

    def get_parser_tree(self):
        return self.t

    def _load(self, sentence):
        self.t = list(self._parser.raw_parse(sentence))[0]
        self.t = ParentedTree.convert(self.t)

    def show(self):
        self.t.pretty_print()

    def find_svo(self):
        self._queue = []
        # The sentence must be S or NP before an SVO can be found & find conj
        for i in self.t.subtrees(lambda i: i.label() != 'ROOT'):
            # if i.label() in ['S','NP','SINV','SBAR','FRAG','X','PP']:
            remover = self._find_conj()
            # refresh
            for i in remover:
                self.original = self.original.replace(i, '')
            self._load(self.original)
            self.pos = self.t.pos()
            self._root = SVONode(('main', self.t), None)
            self._queue.append(self._root)
            break
            # else:
            #     return 'Sentence can not find SVO.'
        # find SVO
        while self._queue != []:
            self._data = self._queue.pop(0)
            tmp = list(self._data.data.flatten())
            if ',' in tmp:
                tmp.remove(',')
            if len(tmp) == 1:
                continue
            sentence = ' '.join(self._data.data.flatten())
            self.t = self._data.data
            # Find clauses & coordinating conjunctions & participles
            # self.show()
            if self._data.relation != 'appos':
                self._find_SBAR()
            # self.show()
            # self._remove_comma()
            # self.show()
            self._data.svo = collections.defaultdict(list)
            # Find Subject
            tmp = self._find_subject()
            if isinstance(tmp, list):
                self._data.svo['subject'] = tmp
            else:
                self._data.svo['subject'] = self._add_conj(tmp)
            # Find Predicate
            tmp = self._find_predicate()
            self._data.svo['predicate'] = self._add_conj(tmp)
            # Find Object
            tmp = self._find_object(self._data.svo['predicate'])
            self._data.svo['object'] = self._add_conj(tmp)
            self._all = collections.defaultdict(list)
            self._flatten(self._data.svo['predicate'])
            self._data.svo['object'] = self._filter(self._data.svo['object'])
            for s in self.t.subtrees():
                if s.label() != 'ROOT':
                    break
                else:
                    for i in self.t.subtrees(lambda i: i.label() != 'ROOT'):
                        if i.label() in ['FRAG']:
                            continue
                        if i.label() in ['S', 'SINV']:
                            for n in i.subtrees(lambda n: n.label() == 'S' and n != i):
                                flag = True
                                test = n
                                while test.parent():
                                    if test.parent() == i:
                                        flag = False
                                        break
                                    test = test.parent()
                                if flag:
                                    tmp = self._del(' '.join(n.flatten()))
                                    if tmp:
                                        self._refresh(n)
                                        kid = SVONode(('', self.t), self._data)
                                        self._data.child.append(kid)
                                        self._queue.append(kid)
                                        break
                            break
                        break
                    break
        # Integrate
        self._result = collections.defaultdict(list)
        self._traversal(self._root)
        return self._result

    def _filter(self, x):
        for i in x:
            if i[1] != []:
                for j in i[1]:
                    if isinstance(j, dict):
                        for k in ['predicate', 'object']:
                            tmp = self._filter(j[k])
                            if tmp == []:
                                del j[k]
                    else:
                        if j in self._all['predicate']:
                            i[1].remove(j)
            if i[0] in self._all['predicate']:
                x.remove(i)
        return x

    def _flatten(self, x):
        for i in x:
            self._all['predicate'].append(i[0])
            if i[1] != []:
                for j in i[1]:
                    if isinstance(j, dict):
                        for k in j.keys():
                            self._flatten(j[k])
                    else:
                        self._all['predicate'].append(j)

    def _traversal(self, node):
        if node.svo != None and (node.svo['subject'] != [] or node.svo['predicate'] != [] or node.svo['object'] != []):
            self._result[node.relation].append({'subject': node.svo['subject'],
                                                'predicate': node.svo['predicate'],
                                                'object': node.svo['object']})
        for i in node.child:
            self._traversal(i)

    def _add_conj(self, tmp):
        result = []
        if isinstance(tmp, tuple):
            flag = tmp[0].split(' ')
            if len(flag) <= 5:
                for k in flag:
                    if k in self._dic.keys():
                        # Fill the conjuncts back in
                        for j in self._dic[k]:
                            if j[0] == 'attr':
                                tree = list(self._parser.raw_parse(tmp[0] + ' is ' + j[1]))[0]
                                tree = ParentedTree.convert(tree)
                                kid = SVONode(('appos', tree), self._data)
                                self._data.child.append(kid)
                                self._queue.append(kid)
                                self._dic[k].remove(j)
                                # a = tmp[0]
                                # b = tmp[1]
                                # result.append((a, b+[j[1]]))
                            else:
                                result.append((j[1], j[2]))
        if isinstance(tmp, tuple) and tmp[0] not in [x[0] for x in result]:
            result.append(tmp)
        result.reverse()
        return result

    def _remove_comma(self):
        for i in self.t.subtrees(lambda i: i[0] in [',', ';']):
            if i.left_sibling() and i.left_sibling().label() not in ['NP', 'S', 'VP', 'PP', 'JJ', 'SINV', 'ADJP'] and 'VB' not in i.left_sibling().label():
                if ' '.join(i.left_sibling().flatten()) != ' '.join(self.t.flatten()):
                    self._refresh(i.left_sibling())
            if ' '.join(i.flatten()) != ' '.join(self.t.flatten()):
                self._refresh(i)

    # Put the removed clause into a child node
    def _child(self, a, b):
        kid = SVONode((a, b), self._data)
        self._data.child.append(kid)
        self._queue.append(kid)
        self._refresh(b, a)

    # Can we refresh? (is the removed clause identical to the original sentence?)
    def _del(self, tmp_1):
        tmp = ' '.join(self.t.flatten())
        tmp = tmp.replace(tmp_1, '')
        tmp = tmp.strip(',; ')
        if tmp != '':
            return True
        else:
            return False

    def _find_SBAR(self):
        # Any coordinating conjunction?
        for i in self.t.subtrees(lambda i: i.label() == 'CC'):
            if i.right_sibling() and i.right_sibling().label() in ['S', 'VP']:
                tmp = self._del(i[0] + ' ' + ' '.join(i.right_sibling().flatten()))
                if tmp and [x for x in self._queue if ' '.join(i.right_sibling().flatten()) in ' '.join(x.data.flatten())] == []:
                    self._child(i[0], i.right_sibling())
        # Any subordinate clause?
        for node in self.t.subtrees(lambda node: node.label() == 'SBAR'):
            if 'VB' in node.pos()[0][1]:
                continue
            tmp = self._del(' '.join(node.flatten()))
            if tmp:
                conj = []
                # Conjunction words
                for s in node.subtrees(lambda s: s.label() != 'SBAR'):
                    if s.label() not in ['S', 'ADVP', 'RB'] and 'VB' not in s.label():
                        if s.leaves()[0] not in conj:
                            conj.append(s.leaves()[0])
                    elif s.label() in ['ADVP', 'RB']:
                        continue
                    else:
                        break
                conj = ' '.join(conj)
                for s in node.subtrees(lambda s: s.label() == 'S'):
                    # SBAR can repeat
                    if [x for x in self._queue if ' '.join(s.flatten()) in ' '.join(x.data.flatten())] == []:
                        if node.left_sibling() and node.left_sibling().label() == 'IN' and node.parent().label() != 'S':
                            tmp = self._del(' '.join(node.parent().flatten()))
                            if tmp:
                                self._child(conj, s)
                        else:
                            self._child(conj, s)
                    break
        # Participles
        participle = [x[0] for x in self.t.pos() if x[1] in ['VBG', 'VBN']]
        for i in participle:
            if i in self.t.leaves():
                candidate = [x for x, y in enumerate(self.t.leaves()) if y == i]
                if candidate[-1] == 0:
                    pos = ''
                else:
                    before = self.t.leaves()[candidate[-1] - 1]
                    pos = [x for x in self.t.pos() if x[0] == before][0][1]
                IN = ['when', 'while', 'before', 'after', 'till', 'since', 'because', 'as', 'so',
                      'although', 'though', 'if', 'unless', 'upon', 'once']
                if pos == 'IN' and before.lower() in IN:
                    # candidate[-1]-2 >= 0 and 'VB' not in [x for x in self.t.pos() if x[0] == self.t.leaves()[candidate[-1]-2]][0][1]
                    for j in self.t.subtrees(lambda j: j[0] == before):
                        tmp = self._del(' '.join(j.parent().flatten()))
                        if tmp and j.parent().label() != 'NP' and j.right_sibling() and [x for x in self._queue if ' '.join(j.right_sibling().flatten()) in ' '.join(x.data.flatten())] == []:
                            self._child(before, j.right_sibling())
                if ('VB' not in pos) and (pos not in ['IN', 'RB', 'MD', 'POS', 'TO']):
                    for j in self.t.subtrees(lambda j: j[0] == i):
                        tmp = self._del(' '.join(j.parent().flatten()))
                        if tmp and j.parent().label() not in ['NP', 'ADJP'] and j.right_sibling() and [x for x in self._queue if ' '.join(j.parent().flatten()) in ' '.join(x.data.flatten())] == []:
                            self._child('', j.parent())

    def _refresh(self, node, conj=''):
        sentence = ' '.join(self.t.flatten())
        if conj == '':
            tmp = ' '.join(node.flatten())
        else:
            tmp = conj + ' ' + ' '.join(node.flatten())
        if tmp in sentence:
            idx = sentence.index(tmp)
            if idx - 2 >= 0 and sentence[idx - 2] == ',':
                tmp = ', ' + tmp
            if idx + len(tmp) + 1 < len(sentence) and sentence[idx + len(tmp) + 1] == ',':
                tmp = tmp + ' ,'
        sentence = sentence.replace(tmp, '')
        self._load(sentence)

    def _find_conj(self):
        self._dic = collections.defaultdict(list)
        dep, = self._dependency.raw_parse(self.original)
        remover = []
        pool_conj = []
        pool_appos = []
        for governor, bridge, dependent in dep.triples():
            # Coordinating conjunction
            if bridge == 'conj':
                # NN conj NN
                if 'NN' in governor[1] and 'NN' in dependent[1]:
                    address = [x['deps'] for x in dep.nodes.values() if x['word'] == governor[0]][0]['conj']
                    for add in address:
                        if add not in pool_conj:
                            tmp = []
                            r = []
                            pool_conj.append(add)
                            for key, value in dep.get_by_address(add)['deps'].items():
                                if key not in ['conj', 'cc', 'nmod', 'nmod:poss']:
                                    for j in value:
                                        tmp.append(dep.get_by_address(j)['word'])
                                        r.append(dep.get_by_address(j)['word'])
                                if key in ['nmod']:
                                    r.append(dep.get_by_address(add)['word'])
                                    for j in value:
                                        for key1, value1 in dep.get_by_address(j)['deps'].items():
                                            if key1 not in ['conj', 'cc']:
                                                for k in value1:
                                                    r.append(dep.get_by_address(k)['word'])
                                        r.append(dep.get_by_address(j)['word'])
                                if key in ['nmod:poss']:
                                    for j in value:
                                        for key1, value1 in dep.get_by_address(j)['deps'].items():
                                            if key1 not in ['conj', 'cc', 'case']:
                                                for k in value1:
                                                    tmp.append(dep.get_by_address(k)['word'])
                                                    r.append(dep.get_by_address(k)['word'])
                                            if key1 in ['case']:
                                                tmp.append(dep.get_by_address(j)['word'])
                                                r.append(dep.get_by_address(j)['word'])
                                                for k in value1:
                                                    tmp.append(dep.get_by_address(k)['word'])
                                                    r.append(dep.get_by_address(k)['word'])
                                        if dep.get_by_address(j)['word'] not in tmp:
                                            tmp.append(dep.get_by_address(j)['word'])
                                            r.append(dep.get_by_address(j)['word'])
                            if dep.get_by_address(add)['word'] not in tmp:
                                tmp.append(dep.get_by_address(add)['word'])
                            if dep.get_by_address(add)['word'] not in r:
                                r.append(dep.get_by_address(add)['word'])
                            for i in self.t.subtrees(lambda i: i.leaves() == r):
                                for n in i.subtrees(lambda n: n[0] == dependent[0]):
                                    self._dic[governor[0]].append(('entity', ' '.join(tmp), self._find_attrs(n, ' '.join(tmp))))
                                    remover.append(' '.join(r))
                                    break
                                break
                            if ' '.join(r) not in remover:
                                self._dic[governor[0]].append(('entity', ' '.join(tmp), []))
                                remover.append(' '.join(r))
                # VB conj VB O
                elif 'VB' in governor[1] and 'VB' in dependent[1] and governor[1] == dependent[1]:
                    gov_key = [x['deps'] for x in dep.nodes.values() if x['word'] == governor[0]][0].keys()
                    dep_key = [x['deps'] for x in dep.nodes.values() if x['word'] == dependent[0]][0].keys()
                    if [j for j in gov_key if j in ['dobj', 'xcomp', 'ccomp', 'nmod', 'nsubjpass']] == [] or [j for j in dep_key if j in ['dobj', 'xcomp', 'ccomp', 'nmod', 'nsubjpass', 'nsubj']] == []:
                        for i in self.t.subtrees(lambda i: i[0] == dependent[0]):
                            self._dic[governor[0]].append(('entity', dependent[0], self._find_attrs(i, dependent[0])))
                            remover.append(dependent[0])
                            break
            # Appositive (return the whole span)
            elif bridge == 'appos':
                tmp = []
                address = [x['deps'] for x in dep.nodes.values() if x['word'] == governor[0]][0]['appos']
                for add in address:
                    if add not in pool_appos:
                        tmp = []
                        pool_appos.append(add)
                        for key, value in dep.get_by_address(add)['deps'].items():
                            if key in ['compound', 'amod']:
                                for j in value:
                                    tmp.append(dep.get_by_address(j)['word'])
                            if key in ['nmod']:
                                tmp.append(dep.get_by_address(add)['word'])
                                for j in value:
                                    for key1, value1 in dep.get_by_address(j)['deps'].items():
                                        if key1 not in ['conj', 'cc']:
                                            for k in value1:
                                                tmp.append(dep.get_by_address(k)['word'])
                                    tmp.append(dep.get_by_address(j)['word'])
                        if dep.get_by_address(add)['word'] not in tmp:
                            tmp.append(dep.get_by_address(add)['word'])
                        self._dic[governor[0]].append(('attr', ' '.join(tmp), []))
                        remover.append(' '.join(tmp))
        for i in range(len(remover)):
            # All possible positions
            can = [m.start() for m in re.finditer(remover[i], self.original)]
            flag = False
            for j in can:
                if self.original[j - 2] == ',':
                    remover[i] = ', ' + remover[i]
                    flag = True
                    break
                elif self.original[j - 4:j - 1] == 'and':
                    remover[i] = 'and ' + remover[i]
                    flag = True
                    break
            if not flag:
                remover[i] = ' ' + remover[i]
        return remover

    # Breadth First Search the tree and take the first noun in the NP subtree.
    def _find_subject(self):
        synonym = ['', 'which', 'that', 'who', 'whom', 'where', 'when', 'what', 'why', 'how', 'whether', 'in']
        for i in self.t.subtrees(lambda i: i.label() == 'SBAR'):
            dep, = self._dependency.raw_parse(' '.join(self.t.flatten()))
            sub = [z for x, y, z in dep.triples() if y in ['nsubj', 'nsubjpass']]
            if sub != []:
                for s in self.t.subtrees(lambda s: s[0] == sub[0][0]):
                    return self._find_NOUN(s)
            for s in i.subtrees(lambda s: s.label() == 'NP'):
                for n in s.subtrees(lambda n: n.label().startswith('NN') or n.label() in 'PRP'):
                    return self._find_NOUN(n)
                for n in s.subtrees(lambda n: n.label() == 'DT'):
                    return (n[0], self._find_attrs(n, n[0]))
        for i in self.t.subtrees(lambda i: i.label() not in ['S', 'ROOT', 'PP', 'FRAG']):
            # Has a subject
            dep, = self._dependency.raw_parse(' '.join(self.t.flatten()))
            sub = [z for x, y, z in dep.triples() if y in ['nsubj', 'nsubjpass']]
            if sub != []:
                for s in self.t.subtrees(lambda s: s[0] == sub[0][0]):
                    return self._find_NOUN(s)
            if i.label() not in ['VP', 'PP'] and 'VB' not in i.label():
                for s in self.t.subtrees(lambda s: s.label() == 'NP'):
                    for n in s.subtrees(lambda n: n.label().startswith('NN') or n.label() == 'PRP'):
                        return self._find_NOUN(n)
                    for n in s.subtrees(lambda n: n.label() == 'DT'):
                        return (n[0], self._find_attrs(n, n[0]))
            # Imperative sentence
            elif (i.label() == 'VP' or i.label().startswith('VB')) and self._data.relation == 'main':
                if [x for x in self.t.pos()][0][1] not in ['RB', 'MD'] and 'VB' not in [x for x in self.t.pos()][0][1]:
                    for s in self.t.subtrees(lambda s: s.label() == 'NP'):
                        for n in s.subtrees(lambda n: n.label().startswith('NN') or n.label() == 'PRP'):
                            return self._find_NOUN(n)
                        for n in s.subtrees(lambda n: n.label() == 'DT'):
                            return (n[0], self._find_attrs(n, n[0]))
                    return None
                else:
                    return None
            # No subject & the relation is a pronoun
            elif (i.label() == 'VP' or i.label().startswith('VB')) and self._data.relation in synonym:
                dep, = self._dependency.raw_parse(self.original)
                candidate = [x for x in dep.triples() if x[1] in ['acl:relcl', 'acl'] and x[2][0] in self.t.flatten()]
                if candidate != []:
                    compound = self._find_compound(candidate[0][0][0], dep)
                    sub = []
                    if compound != '':
                        for com in compound:
                            sub.append(com)
                    sub.append(candidate[0][0][0])
                    return (' '.join(sub), [])
                else:
                    sent = [x[0] for x in self.pos]
                    if self._data.relation != '':
                        candidate = [x for x, y in enumerate(sent) if y == self._data.relation.split(' ')[0]]
                        after = self.t.pos()[0][0]
                    else:
                        candidate = [x for x, y in enumerate(sent) if y == self.t.pos()[0][0]]
                        if len(self.t.pos()) > 1:
                            after = self.t.pos()[1][0]
                        else:
                            after = ''
                    before = candidate[0] - 1
                    for x in candidate:
                        if sent[x + 1] == after:
                            before = x - 1
                    if before == -1:
                        return None
                    # Is the previous word in the original sentence an NN?
                    if 'NN' in [x[1] for x in self.pos if x[0] == sent[before]][0] or [x[1] for x in self.pos if x[0] == sent[before]][0] in ['PRP']:
                        sub = [sent[before]]
                        before -= 1
                        while 'NN' in [x[1] for x in self.pos if x[0] == sent[before]][0]:
                            sub.append(sent[before])
                            before -= 1
                        return (' '.join(reversed(sub)), [])
                    elif [x[1] for x in self.pos if x[0] == sent[before]][0] in ['IN', ','] and 'NN' in [x[1] for x in self.pos if x[0] == sent[before - 1]][0]:
                        before -= 1
                        sub = [sent[before]]
                        before -= 1
                        while before != -1 and 'NN' in [x[1] for x in self.pos if x[0] == sent[before]][0]:
                            sub.append(sent[before])
                            before -= 1
                        return (' '.join(reversed(sub)), [])
                    # Find the nearest one in the parent
                    else:
                        target = self.t.pos()[0][0]
                        if self._data.parent.svo['subject'] == []:
                            sub = -1
                        else:
                            sub = self._data.parent.svo['subject'][0][0].split(' ')[-1]
                        if self._data.parent.svo['object'] == []:
                            obj = -1
                        else:
                            obj = self._data.parent.svo['object'][0][0].split(' ')[-1]
                        if sub == -1 and obj != -1:
                            return self._data.parent.svo['object']
                        elif sub != -1 and obj == -1:
                            return self._data.parent.svo['subject']
                        elif sub != -1 and obj != -1:
                            if abs(self.original.find(target) - self.original.find(sub)) <= abs(self.original.find(target) - self.original.find(obj)):
                                return self._data.parent.svo['subject']
                            else:
                                return self._data.parent.svo['object']
            # No subject & the relation is a conjunction
            elif i.label() == 'VP' or i.label().startswith('VB'):
                if self._data.parent != None:
                    return self._data.parent.svo['subject']
                else:
                    return None

    def _find_compound(self, word, dep):
        deps = [x['deps'] for x in dep.nodes.values() if x['word'] == word]
        com = []
        deps = [x for x in deps if 'compound' in x]
        for i in deps:
            for j in i['compound']:
                com.append(dep.get_by_address(j)['word'])
        deps = [x for x in deps if 'dep' in x]
        for i in deps:
            com.append(dep.get_by_address(i['dep'][0])['word'])
        return com

    def _compound(self, compound, before):
        obj = []
        if compound != '':
            for n in self.t.subtrees(lambda n: n[0] == before):
                for com in compound:
                    for s in n.parent().subtrees(lambda s: s[0] == com):
                        obj.append(com)
        return obj

    def _dobj(self, candidate, dep, before):
        if 'dobj' in candidate.keys():
            word = dep.nodes[candidate['dobj'][0]]['word']
            tag = dep.nodes[candidate['dobj'][0]]['tag']
        else:
            word = dep.nodes[candidate['xcomp'][0]]['word']
            tag = dep.nodes[candidate['xcomp'][0]]['tag']
        compound = self._find_compound(word, dep)
        obj = self._compound(compound, before)
        if tag != 'TO':
            for n in self.t.subtrees(lambda n: n[0] == before):
                for s in n.parent().subtrees(lambda s: s[0] == word):
                    obj.append(s[0])
                    return (' '.join(obj), self._find_attrs(s, ' '.join(obj)))

    def _find_object(self, predicate, node='', data=''):
        if node == '':
            node = self.t
        if data == '':
            data = self._data
        synonym = ['which', 'that', 'who', 'whom']
        if data != None and data.relation == 'appos':
            dep, = self._dependency.raw_parse(' '.join(node.flatten()))
        else:
            dep, = self._dependency.raw_parse(self.original)
        for i in predicate:
            pre = i[0].split(' ')
            for j in range(len(pre) - 1, -1, -1):
                if len([x['deps'] for x in dep.nodes.values() if x['word'] == pre[j]]) > 1:
                    dep, = self._dependency.raw_parse(' '.join(node.flatten()))
                candidate = [x['deps'] for x in dep.nodes.values() if x['word'] == pre[j]][0]
                candidate_1 = [x for x in dep.triples() if x[2][0] == pre[j]]
                if 'dobj' in candidate.keys() or 'xcomp' in candidate.keys():
                    return self._dobj(candidate, dep, pre[j])
                elif 'ccomp' in candidate.keys():
                    word = dep.nodes[candidate['ccomp'][0]]['word']
                    tag = dep.nodes[candidate['ccomp'][0]]['tag']
                    dic = collections.defaultdict(list)
                    deps = [x['deps'] for x in dep.nodes.values() if x['word'] == word][0]
                    if 'nsubj' in deps.keys():
                        compound = self._find_compound(dep.get_by_address(deps['nsubj'][0])['word'], dep)
                        obj = self._compound(compound, pre[j])
                        obj.append(dep.get_by_address(deps['nsubj'][0])['word'])
                        if 'dobj' in deps.keys() or 'xcomp' in deps.keys():
                            for n in self.t.subtrees(lambda n: n[0] == word):
                                dic['predicate'].append((word, self._find_attrs(n, word)))
                            dic['object'] = self._add_conj(self._dobj(deps, dep, word))
                        return (' '.join(obj), [dic])
                    elif 'dobj' in deps.keys():
                        compound = self._find_compound(dep.get_by_address(deps['dobj'][0])['word'], dep)
                        obj = self._compound(compound, pre[j])
                        for n in self.t.subtrees(lambda n: n[0] == dep.get_by_address(deps['dobj'][0])['word']):
                            obj.append(n[0])
                            return (' '.join(obj), self._find_attrs(n, ' '.join(obj)))
                    # else:
                    #     return None
                elif 'cop' in [x[1] for x in candidate_1]:
                    tmp = [x for x in candidate_1 if x[1] == 'cop'][0]
                    compound = self._find_compound(tmp[0][0], dep)
                    obj = self._compound(compound, pre[j])
                    for j in self.t.subtrees(lambda j: j[0] == tmp[0][0]):
                        obj.append(j[0])
                        return (' '.join(obj), self._find_attrs(j, ' '.join(obj)))
                elif 'case' in [x[1] for x in candidate_1]:
                    tmp = [x for x in candidate_1 if x[1] == 'case'][0]
                    compound = self._find_compound(tmp[0][0], dep)
                    obj = self._compound(compound, pre[j])
                    for j in self.t.subtrees(lambda j: j[0] == tmp[0][0]):
                        obj.append(j[0])
                        return (' '.join(obj), self._find_attrs(j, ' '.join(obj)))
                elif 'auxpass' in candidate.keys():
                    sent = [x[0] for x in self.pos]
                    if data != None and data.relation in synonym:
                        relation = sent.index(data.relation.split(' ')[0])
                        if 'IN' in [x[1] for x in self.pos if x[0] == sent[relation]][0]:
                            return (sent[relation - 1], [])
                    return None
                # No object
                elif data != None and data.relation in synonym:
                    sent = [x[0] for x in self.pos]
                    before = sent.index(data.relation.split(' ')[0]) - 1
                    # Is the previous word in the original sentence an NN?
                    if 'NN' in [x[1] for x in self.pos if x[0] == sent[before]][0]:
                        return (sent[before], [])
                    elif 'IN' in [x[1] for x in self.pos if x[0] == sent[before]][0] and 'NN' in [x[1] for x in self.pos if x[0] == sent[before - 1]][0]:
                        return (sent[before - 1], [])
                    elif data.child != []:
                        kid = data.child[0]
                        if kid.relation != 'appos':
                            return (kid.relation + ' ' + ' '.join(kid.data.flatten()), [])
                    else:
                        return None
                # The object is a clause
                elif data != None and data.child != []:
                    kid = data.child[0]
                    if kid.relation != 'appos':
                        return (kid.relation + ' ' + ' '.join(kid.data.flatten()), [])
                elif [x for x in dep.nodes.values() if x['word'] == pre[j]][0]['tag'] == 'RP':
                    continue
                else:
                    return None

    def _find_predicate(self):
        tmp = self.t.flatten()
        for n in self.t.subtrees(lambda n: n.label().startswith('VB')):
            if n.parent().label() in ['ADJP']:
                continue
            i = tmp.index(n[0])
            sub = []
            while self.t.pos()[i - 1][1] in ['MD', 'RB']:
                sub.append(self.t.pos()[i - 1][0])
                i -= 1
            sub.reverse()
            i = tmp.index(n[0])
            while i + 1 < len(tmp):
                if [x[1] for x in self.t.pos() if x[0] == tmp[i + 1]][0].startswith('VB') or [x[1] for x in self.t.pos() if x[0] == tmp[i + 1]][0] == 'RP':
                    sub.append(tmp[i])
                    i += 1
                elif [x[1] for x in self.t.pos() if x[0] == tmp[i + 1]][0] in ['RB', 'MD']:
                    if i + 2 >= len(tmp):
                        break
                    count = i + 2
                    while count + 1 < len(tmp) and [x[1] for x in self.t.pos() if x[0] == tmp[count]][0] in ['RB', 'MD']:
                        count += 1
                    if count < len(tmp) and [x[1] for x in self.t.pos() if x[0] == tmp[count]][0].startswith('VB') or [x[1] for x in self.t.pos() if x[0] == tmp[count]][0] == 'TO':
                        sub.append(tmp[i])
                        i += 1
                    else:
                        break
                else:
                    break
            flag = i
            sub.append(tmp[flag])
            # Infinitive
            for j in self.t.subtrees(lambda j: j[0] == tmp[flag]):
                if j.right_sibling() and j.right_sibling().label() == 'PP' and j.right_sibling().leaves()[0] != 'to':
                    start = tmp.index(j.right_sibling().leaves()[-1])
                    has_PP = True
                else:
                    start = flag
                    has_PP = False
                if start + 1 < len(tmp) and tmp[start + 1] == 'to':
                    for i in range(start + 1, len(tmp)):
                        if [x[1] for x in self.t.pos() if x[0] == tmp[i]][0].startswith('VB') or [x[1] for x in self.t.pos() if x[0] == tmp[i]][0] in ['TO', 'RB']:
                            sub.append(tmp[i])
                            if [x[1] for x in self.t.pos() if x[0] == tmp[i]][0].startswith('VB'):
                                flag = i
                        else:
                            break
                    if has_PP:
                        for i in self.t.subtrees(lambda i: i[0] == sub[-1]):
                            return (' '.join(sub), self._find_attrs(i, ' '.join(sub)))
                    else:
                        for i in self.t.subtrees(lambda i: i[0] == tmp[flag]):
                            return (' '.join(sub), self._find_attrs(i, ' '.join(sub)))
                else:
                    for i in self.t.subtrees(lambda i: i[0] == tmp[flag]):
                        return (' '.join(sub), self._find_attrs(i, ' '.join(sub)))

    def _find_NOUN(self, n):
        # Possessive
        if n.parent().right_sibling() and n.parent().right_sibling().label().startswith('NN'):
            sub = n.parent().leaves()
            p = n.parent()
            while p.right_sibling():
                if p.right_sibling().label().startswith('NN') or p.right_sibling().label() in ['PRP', 'CD', 'DT']:
                    p = p.right_sibling()
                    sub.append(p[0])
                else:
                    break
            return (' '.join(sub), self._find_attrs(p, ' '.join(sub)))
        else:
            sub = []
            pp = n.parent()
            flag = ''
            for l in pp:
                if l.label().startswith('NN') or l.label() in ['PRP', 'CD', 'DT']:
                    if l[0] not in sub:
                        sub.append(l[0])
                        flag = l
            if flag == '':
                sub.append(n[0])
                flag = n
            return (' '.join(sub), self._find_attrs(flag, ' '.join(sub)))

    def _find_to(self, node):
        dic = collections.defaultdict(list)
        flag = node.leaves().index('to')
        tmp = node.leaves()[flag:]
        predicate = []
        for i in tmp:
            if [x[1] for x in self.t.pos() if x[0] == i][0] in 'TO' or 'VB' in [x[1] for x in self.t.pos() if x[0] == i][0]:
                predicate.append(i)
            else:
                break
        for n in node.subtrees(lambda n: n[0] == predicate[-1]):
            dic['predicate'].append((' '.join(predicate), self._find_attrs(n, ' '.join(predicate))))
        if predicate[-1] == 'be':
            for n in node.subtrees(lambda n: n.label() in ['NP', 'PP']):
                if n.label() in ['NP', 'PP']:
                    for c in n.subtrees(lambda c: c.label().startswith('NN') or c.label() in ['PRP', 'CD']):
                        a = self._find_NOUN(c)
                        dic['object'] = self._add_conj(a)
                        return dic
        else:
            tmp = self._find_object(dic['predicate'], node, None)
            dic['object'] = self._add_conj(tmp)
            return dic

    def _toV(self, node):
        # There may be several identical words
        flat = list(self.t.flatten())
        candidate = [x for x, y in enumerate(flat) if y == node[0]]
        flag = candidate[0]
        if node.left_sibling():
            before = node.left_sibling().leaves()[-1]
            for i in candidate:
                if flat[i - 1] == before:
                    flag = i
                    break
        elif node.right_sibling():
            after = node.right_sibling().leaves()[0]
            for i in candidate:
                if flat[i + 1] == after:
                    flag = i
                    break
        elif node.parent().left_sibling():
            before = node.parent().left_sibling().leaves()[-1]
            for i in candidate:
                if flat[i - 1] == before:
                    flag = i
                    break
        elif node.parent().right_sibling():
            after = node.parent().right_sibling().leaves()[0]
            for i in candidate:
                if flat[i + 1] == after:
                    flag = i
                    break
        if not node.label().startswith('VB') and flag + 2 < len(flat) and flat[flag + 1] == 'to' and [x[1] for x in self.t.pos() if x[0] == flat[flag + 2]][0] in 'VB':
            for i in self.t.subtrees(lambda i: i[0] == 'to'):
                if flat[flag] not in i.parent().flatten():
                    return i.parent()
        else:
            return None

    def _PP(self, s, name, attrs):
        if ' '.join(s.flatten()) not in name:
            if len(s[0]) != 1:
                for i in s.subtrees(lambda i: i.label() == 'PP'):
                    if i.parent() == s:
                        a = self._proposition(i)
                        if a != []:
                            attrs.append(a)
                        else:
                            attrs.append(' '.join(s.flatten()))
            else:
                a = self._proposition(s)
                if a != []:
                    attrs.append(a)
                else:
                    attrs.append(' '.join(s.flatten()))
        return attrs

    def _find_attrs(self, node, name):
        attrs = []
        p = node.parent()
        toV = self._toV(node)
        name = name.split(' ')
        # Search siblings of adjectives for adverbs
        if node.label().startswith('JJ'):
            for s in p:
                if s.label() == 'RB':
                    if s[0] not in name:
                        attrs.append(s[0])
                elif s.label() == 'PP':
                    attrs = self._PP(s, name, attrs)
                elif s.label() == 'NP':
                    if ' '.join(s.flatten()) not in name:
                        attrs.append(' '.join(s.flatten()))
        elif node.label().startswith('NN') or node.label() in ['PRP', 'CD', 'DT']:
            for s in p:
                if s != node and s.label() in ['DT', 'PRP$', 'POS', 'CD', 'IN'] or s.label().startswith('JJ') or s.label().startswith('NN'):
                    if s[0] not in name:
                        attrs.append(s[0])
                elif s != node and s.label() in ['ADJP', 'NP', 'QP', 'VP']:
                    if ' '.join(s.flatten()) not in name:
                        attrs.append(' '.join(s.flatten()))
                elif s != p and s.label() in ['PP']:
                    attrs = self._PP(s, name, attrs)
        # Search siblings of verbs for adverb phrases
        elif node.label().startswith('VB'):
            for s in p:
                # if s.label() in ['ADVP','MD','RB']:
                if s.label() in ['ADVP', 'RB', 'MD']:
                    if ' '.join(s.flatten()) not in name:
                        attrs.append(' '.join(s.flatten()))
                elif s.label() == 'PP':
                    attrs = self._PP(s, name, attrs)
        # Search uncles
        # if the node is a noun or adjective, search for a prepositional phrase
        if node.label().startswith('JJ') or node.label().startswith('NN') or node.label() in ['PRP', 'CD', 'DT']:
            if p.label() == 'QP':
                p = p.parent()
            for s in p.parent():
                if s != p and s.label() in ['PP']:
                    attrs = self._PP(s, name, attrs)
                elif s != p and 'NN' in s.label() or s.label() == 'JJ':
                    if s[0] not in name:
                        attrs.append(s[0])
                elif s != p and s.label() == 'VP' and s.parent().label() == 'NP':
                    if ' '.join(s.flatten()) not in name:
                        if toV != None:
                            if ' '.join(s.flatten()[:3]) != ' '.join(toV.flatten()[:3]):
                                attrs.append(' '.join(s.flatten()))
                        else:
                            # self._refresh(s)
                            attrs.append(' '.join(s.flatten()))
        elif node.label().startswith('VB') or node.label() == 'RP':
            if p.parent():
                tmp = node
                for s in p.parent():
                    if s != p and s.label().startswith('ADVP'):
                        if ' '.join(s.flatten()) not in name:
                            attrs.append(' '.join(s.flatten()))
                    # elif s != p and s.label() in ['MD','RB']:
                    #     attrs.append(s[0])
                    elif s != p and s.label() == 'PP' and s == tmp.right_sibling():
                        attrs = self._PP(s, name, attrs)
                        tmp = s
        if toV != None:
            attrs.append(self._find_to(toV))
            self._refresh(toV)
        return attrs

    def _proposition(self, node):
        dic = collections.defaultdict(list)
        tmp = node.leaves()
        if len(tmp) == 1:
            return []
        for k in node.subtrees(lambda k: k.label() in ['IN', 'TO']):
            if tmp.index(k[0]) + 1 < len(tmp):
                VB = [x for x in node.pos() if x[0] == tmp[tmp.index(k[0]) + 1]]
                if VB != [] and 'VB' in VB[0][1]:
                    dic['predicate'].append((k[0] + ' ' + VB[0][0], []))
                else:
                    dic['predicate'].append((k[0], []))
            else:
                dic['predicate'].append((k[0], []))
            if k.right_sibling():
                for c in k.right_sibling().subtrees(lambda c: c.label().startswith('NN') or c.label() in ['JJ', 'CD']):
                    # Possessive
                    if c.parent().right_sibling() and c.parent().right_sibling().label().startswith('NN'):
                        sub = c.parent().leaves()
                        p = c.parent()
                        while p.right_sibling():
                            if p.right_sibling().label().startswith('NN') or p.right_sibling().label() in ['PRP', 'CD']:
                                p = p.right_sibling()
                                sub.append(p[0])
                                flag = p
                            else:
                                break
                    else:
                        sub = []
                        pp = c.parent()
                        for l in pp:
                            if l.label().startswith('NN') or l.label() in ['PRP', 'CD', 'JJ']:
                                if l[0] not in sub:
                                    sub.append(l[0])
                                    flag = l
                    dic['object'].append((' '.join(sub), self._find_attrs(flag, ' '.join(sub))))
                    dic['object'] = self._add_conj(dic['object'][0])
                    return dic
                return []
            else:
                return []
        return []
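# Usage sketch for the SVO extractor above: ApplicationConfig and SVONode come
# from the surrounding project, and a CoreNLP server must be reachable at the
# configured host/port. find_svo() may raise on sentences it cannot segment.
svo = SVO('The quick brown fox jumps over the lazy dog')
svo.show()
print(dict(svo.find_svo()))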
import sys

from nltk.parse.corenlp import CoreNLPDependencyParser, CoreNLPParser

parser = CoreNLPParser(url='http://localhost:9000')
dep_parser = CoreNLPDependencyParser(url='http://localhost:9000')

parses = dep_parser.parse('What is the airspeed of an unladen swallow ?'.split())
dp = [[(governor, dep, dependent) for governor, dep, dependent in parse.triples()]
      for parse in parses]

# normal_parse = parser.raw_parse("What does the important inscription on the tomb of Ankhtifi, a nomarch during the early First Intermediate Periodi, describe?")
text = "During the Old Kingdom, the king of Egypt (not called the Pharaoh until the New Kingdom) became a living god who ruled absolutely and could demand the services and wealth of his subjects."
actual_parse = parser.raw_parse(text)
actual_tree = [t for t in actual_parse][0]
actual_tree.pretty_print()


def findRelatives(t, label=None, word=None):
    if label is None and word is None:
        print("please specify either the label or the word to search for")
        return None
    q = [(t, [])]
    curr = 0
    while (curr < len(q)):
import requests
import spacy
import wikipedia
from nltk.parse.corenlp import CoreNLPParser

datum = wikipedia.page(title="School").content
# doc = nlp(datum)
(requests.post(
    'http://[::]:9000/?properties={"annotators":"tokenize,ssplit,pos","outputFormat":"json"}',
    data={'data': datum}).text)
# server.start()

question = []
nlp = spacy.load('en_core_web_sm')
parser = CoreNLPParser()
parse = next(parser.raw_parse("I put the book in the box on the table."))


# Sentence Structure Tree
class SST():
    def __init__(self, label, children):
        self.label = label
        self.children = children


# Sentence Structure Leaf
class SSL():
    def __init__(self, label):
        self.label = label


simple_predicate = SST('ROOT', [SST('S', [SSL('NP'), SSL('VP'), SSL('.')])])
import os
import tempfile
from pathlib import Path
from subprocess import Popen
from sys import stderr
from zipfile import ZipFile

import wget
from nltk.parse.corenlp import CoreNLPParser


class StanfordClient:

    def __init__(self, core_nlp_version: str = '2018-10-05', annotators=None):
        if annotators is None or not isinstance(annotators, list):
            annotators = ['openie', 'dcoref']
        self.remote_url = 'https://nlp.stanford.edu/software/stanford-corenlp-full-{}.zip'.format(core_nlp_version)
        self.install_dir = Path('~/.stanfordnlp_resources/').expanduser()
        self.install_dir.mkdir(exist_ok=True)
        if not (self.install_dir / Path('stanford-corenlp-full-{}'.format(core_nlp_version))).exists():
            print('Downloading from %s.' % self.remote_url)
            output_filename = wget.download(self.remote_url, out=str(self.install_dir))
            print('\nExtracting to %s.' % self.install_dir)
            zf = ZipFile(output_filename)
            zf.extractall(path=self.install_dir)
            zf.close()
        os.environ['CORENLP_HOME'] = str(self.install_dir / 'stanford-corenlp-full-2018-10-05')
        from stanfordnlp.server import CoreNLPClient
        self.client = CoreNLPClient(annotators=annotators, memory='8G')
        self.parser = CoreNLPParser()

    def parse(self, text: str, properties_key: str = None, properties: dict = None, output_format='json'):
        core_nlp_output = self.client.annotate(text=text,
                                               annotators=['parse'],
                                               output_format=output_format,
                                               properties_key=properties_key,
                                               properties=properties)
        return core_nlp_output

    def nltk_parse(self, text: str):
        return [tree for tree in self.parser.raw_parse(text)][0]

    def pos(self, text: str, properties_key: str = None, properties: dict = None):
        core_nlp_output = self.client.annotate(text=text,
                                               annotators=['pos'],
                                               output_format='json',
                                               properties_key=properties_key,
                                               properties=properties)
        return core_nlp_output

    def kbp(self, text: str, properties_key: str = None, properties: dict = None, simple_format: bool = True):
        core_nlp_output = self.client.annotate(text=text,
                                               annotators=['kbp'],
                                               output_format='json',
                                               properties_key=properties_key,
                                               properties=properties)
        if simple_format:
            return self.__parse_triples(core_nlp_output, key='kbp')
        else:
            return core_nlp_output

    def openie(self, text: str, properties_key: str = None, properties: dict = None, simple_format: bool = True):
        """
        :param (str | unicode) text: raw text for the CoreNLPServer to parse
        :param (str) properties_key: key into the properties cache for the client
        :param (dict) properties: additional request properties (written on top of defaults)
        :param (bool) simple_format: whether to return the full CoreNLP format or a simple dict.
        :return: Depending on simple_format: full or simpler format of triples <subject, relation, object>.
        """
        # https://stanfordnlp.github.io/CoreNLP/openie.html
        core_nlp_output = self.client.annotate(text=text,
                                               annotators=['openie'],
                                               output_format='json',
                                               properties_key=properties_key,
                                               properties=properties)
        if simple_format:
            return self.__parse_triples(core_nlp_output, key='openie')
        else:
            return core_nlp_output

    @staticmethod
    def __parse_triples(core_nlp_output, key):
        triples = []
        for sentence in core_nlp_output['sentences']:
            for triple in sentence[key]:
                triples.append({
                    'subject': triple['subject'],
                    'relation': triple['relation'],
                    'object': triple['object']
                })
        return triples

    def coref(self, text: str, properties_key: str = None, properties: dict = None, simple_format: bool = True):
        core_nlp_output = self.client.annotate(text=text,
                                               annotators=['dcoref'],
                                               output_format='json',
                                               properties_key=properties_key,
                                               properties=properties)
        if simple_format:
            chains = []
            for _, chain in core_nlp_output['corefs'].items():
                if len(chain) > 1:  # a coreference was found
                    chains.append([link['text'] for link in chain])
            return chains
        return core_nlp_output

    def generate_graphviz_graph(self, text: str, png_filename: str = './out/graph.png'):
        """
        :param (str | unicode) text: raw text for the CoreNLPServer to parse
        :param (str) png_filename: path of the PNG file to write
        """
        entity_relations = self.openie(text, simple_format=True)
        """digraph G {
        # a -> b [ label="a to b" ];
        # b -> c [ label="another label"];
        }"""
        graph = list()
        graph.append('digraph {')
        for er in entity_relations:
            graph.append('"{}" -> "{}" [ label="{}" ];'.format(er['subject'], er['object'], er['relation']))
        graph.append('}')
        output_dir = os.path.join('..', os.path.dirname(png_filename))
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        out_dot = os.path.join(tempfile.gettempdir(), 'graph.dot')
        with open(out_dot, 'w') as output_file:
            output_file.writelines(graph)
        command = 'dot -Tpng {} -o {}'.format(out_dot, png_filename)
        dot_process = Popen(command, stdout=stderr, shell=True)
        dot_process.wait()
        assert not dot_process.returncode, 'ERROR: Call to dot exited with a non-zero code status.'

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        pass

    def __del__(self):
        self.client.stop()
        if 'CORENLP_HOME' in os.environ:
            del os.environ['CORENLP_HOME']
import os

import nltk
from nltk.parse.corenlp import CoreNLPServer
from nltk.parse.corenlp import CoreNLPParser
from nltk.parse.corenlp import CoreNLPDependencyParser

STANFORD = "stanford-corenlp-full-2018-10-05"
jars = (
    os.path.join(STANFORD, "stanford-corenlp-3.9.2.jar"),
    os.path.join(STANFORD, "stanford-corenlp-3.9.2-models.jar"),
)

text = "turn right and go up the stairs and stand at the top."
# text = "Walk out of the closet and into the hallway. Walk through the hallway entrance on the left. Stop just inside the entryway."
# text = "Turn, putting the exit of the building on your left. Walk to the end of the entrance way and turn left. Travel across the kitchen area with the counter and chairs on your right. Continue straight until you reach the dining room. Enter the room and stop and wait one meter from the closest end of the long dining table."
print(text)

with CoreNLPServer(*jars):
    parser = CoreNLPParser()
    for i in parser.parse_text(text):
        print(i)
    parser = CoreNLPDependencyParser()
    for i in parser.raw_parse(text):
        print(i)
comp_sent_rec = [
    'Install App and import local SMS from the device.',
    'hey i have a problem and that is after building project when i try to edit the saved video, app crashesh on multiple devices (m above).',
    'During multiple input of identical values e.g. 10 km (by day) and 10 litres some calculations fail. E.g. first calculates correcty 100 l/km but then next two results are "infinity l / 100 km.',
    'Plotting will death lock program then and you have to restart device.'
]

result = []
for sent in comp_sent_rec:
    # with open(sys.argv[1], 'r') as file:
    #     lines = file.readlines()
    # for sent in lines:
    parse = next(parser.raw_parse(sent))
    t = Tree.fromstring(str(parse))
    t.pretty_print()
    # print(t.leaves())
    for i in break_up_sent(t):
        result.append(i)
        print(i)
    # result = break_up_sent(t)
    # print(result)
    # print()
    # for i in result:
    #     print(i)
with open(filename, encoding='utf-8') as f:
    file_content = f.read().replace('\n', '')
file_content = ' '.join(file_content.split())
lines = sent_tokenize(file_content)
lines = [line for line in lines
         if line != '' and len(word_tokenize(line)) <= 10
         and line[-1] in '.?!' and line[0].isupper()]
print(len(lines))

wrong_lines_count = 0
pic_count = 0
for i, line in enumerate(lines):
    if wrong_lines_count == 5:
        break
    print('Original line: ' + line)
    tree = next(parser.raw_parse(line))
    if pic_count < 5 and len(word_tokenize(line)) == 10:
        filename = get_valid_filename(line)
        TreeView(tree)._cframe.print_to_file(filename + '.ps')
        pic_count += 1
    errors = ATD.checkDocument(line)
    if len(list(errors)) == 0:
        print('**No errors** ({}/{})'.format(i + 1, len(lines)))
        continue
    else:
        print()
        correct_line = correct(line, errors)
        tree.pretty_print()
        print('Correct line: ' + correct_line)
        correct_tree = next(parser.raw_parse(correct_line))
        correct_tree.pretty_print()
def show_parse_tree(self, query):
    # show it with an nltk tree widget
    from nltk.parse.corenlp import CoreNLPParser
    cnlp = CoreNLPParser('http://localhost:9000')
    next(cnlp.raw_parse(query)).draw()
path_list = []
examples = []
augment_examples = []
res = open(tag_path, 'w')
with open(train_path) as f:
    for i, line in enumerate(f):
        label = line[0]
        text = line[2:]
        if i % 100 == 0:
            print(i)
        path_list = []
        try:
            parsed_sen = next(parser.raw_parse(text))
        except requests.exceptions.HTTPError:
            print(str(i + 1) + ' : pass sentence')
            continue
        except StopIteration:
            print(str(i + 1) + ' : StopIteration')
            continue
        path_list = traverse_tree(parsed_sen, word_path)
        tag_dict = {}
        for p in position_set:
            tag_dict[p] = []
        for words in path_list:
            length = len(words[1])
def parser_generator(_words):
    lemma_sent = " ".join(_words)
    parser = CoreNLPParser(url='http://localhost:9000')
    # next(parser.raw_parse(lemma_sent)).pretty_print()
    return parser.raw_parse(lemma_sent)
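# Usage sketch for parser_generator, assuming the server on port 9000 is up;
# raw_parse returns an iterator of nltk Tree objects, so next() yields the first parse.
tree = next(parser_generator(['the', 'cat', 'sat', 'on', 'the', 'mat']))
tree.pretty_print()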
    else:
        label = tree.label()
        # print(type(label))
        return reduce(lambda x, y: Tree(label, (binarize(x), binarize(y))), tree)


# In[7]:

import os

parser = CoreNLPParser(url='http://localhost:8000')


# In[8]:

my_sentence = "I love you."
t, = parser.raw_parse(my_sentence)
# t.draw()
bt = binarize(t)
# bt.draw()
tree = bt.pformat()
input = SenTreeTest.getTree(tree)
# input.draw()
model.eval()
# print(type(dev[0]))
# print(type(input))
predictions, loss = model.getLoss(input)
# print(predictions.data)
# print(model.labelList.data)
pred = predictions.data
label = model.labelList.data
        lisDic[i] = jaccard_distance(question, token_sen)
    minInd = min(lisDic, key=lisDic.get)
    relevent.append(sentences[minInd])
    return relevent


def ans_type(question, relevent):
    wh_root = determine_wh(question)
    if wh_root in aux_words:
        ans_binary(question, relevent)
    else:
        ans_wh(question, relevent)


q_tree = parser.raw_parse(question)
r_tree = parser.raw_parse(relevent)
q_tree = list(parser.raw_parse(questions[0]))
r_tree = list(parser.raw_parse(relevent[0]))
newtree = ParentedTree.convert(r_tree[0])
newtree.draw()
for subtree in newtree:
    print(subtree.left_sibling())
print(q_tree)
for item in q_tree:
    print(item.label())
import collections
import re

from nltk.parse.corenlp import CoreNLPDependencyParser, CoreNLPParser
from nltk.tree import ParentedTree

# ApplicationConfig and SVONode are provided by the surrounding project.


class SVO():

    def __init__(self, sentence):
        config = ApplicationConfig.get_corenlp_config()
        self._parser = CoreNLPParser(url=f"http://{config['host']}:{config['port']}")
        self._dependency = CoreNLPDependencyParser(url=f"http://{config['host']}:{config['port']}")
        sentence = sentence.replace('  ', ' ')
        sentence = sentence.replace('.', '')
        self._load(sentence)
        self.original = sentence
        # self._ner = self._parser.tag(sentence.split(' '))

    def _load(self, sentence):
        self.t = list(self._parser.raw_parse(sentence))[0]
        self.t = ParentedTree.convert(self.t)

    def show(self):
        self.t.pretty_print()

    def find_svo(self):
        self._queue = []
        # The sentence must be S or NP before an SVO can be found & find conj
        for i in self.t.subtrees(lambda i: i.label() != 'ROOT'):
            if i.label() in ['S', 'NP']:
                remover = self._find_conj()
                # print(remover)
                # refresh
                for i in remover:
                    self.original = self.original.replace(i, '')
                self._load(self.original)
                self.pos = self.t.pos()
                self._root = SVONode(('main', self.t), None)
                self._queue.append(self._root)
                break
            else:
                # return [], []
                return 'Sentence can not find SVO.'
        # find SVO
        while self._queue != []:
            data = self._queue.pop(0)
            sentence = ' '.join(data.data.flatten())
            self._load(sentence)
            # Find clauses & coordinating conjunctions & participles
            # self.show()
            self._find_SBAR(data)
            # self.show()
            self._remove_comma()
            # self.show()
            data.svo = collections.defaultdict(list)
            # Find Subject
            tmp = self._find_subject(data)
            if isinstance(tmp, list):
                data.svo['subject'] = tmp
            else:
                data.svo['subject'] = self._add_conj(tmp)
            # print(data.svo['subject'])
            # Find Predicate
            tmp = self._find_predicate()
            data.svo['predicate'] = self._add_conj(tmp)
            # print(data.svo['predicate'])
            # Find Object
            tmp = self._find_object(data, data.svo['predicate'])
            data.svo['object'] = self._add_conj(tmp)
            # print(data.svo['object'])
        # Integrate
        result = collections.defaultdict(list)
        result = self._traversal(self._root, result)
        return result

    def _traversal(self, node, result):
        if node.svo['subject'] != [] or node.svo['predicate'] != [] or node.svo['object'] != []:
            result[node.relation].append({
                'subject': node.svo['subject'],
                'predicate': node.svo['predicate'],
                'object': node.svo['object']
            })
        for i in node.child:
            result = self._traversal(i, result)
        return result

    def _add_conj(self, tmp):
        result = []
        if isinstance(tmp, tuple):
            flag = tmp[0].split(' ')
            if len(flag) <= 5:
                for k in flag:
                    if k in self._dic.keys():
                        # Fill the conjuncts back in
                        for j in self._dic[k]:
                            if j[0] == 'attr':
                                a = tmp[0]
                                b = tmp[1]
                                result.append((a, b + [j[1]]))
                            else:
                                result.append((j[1], j[2]))
        if isinstance(tmp, tuple) and tmp[0] not in [x[0] for x in result]:
            result.append(tmp)
        return result

    def _remove_comma(self):
        for i in self.t.subtrees(lambda i: i[0] == ','):
            if i.left_sibling() and i.left_sibling().label() not in ['NP', 'S', 'VP']:
                if ' '.join(i.left_sibling().flatten()) != ' '.join(self.t.flatten()):
                    self._refresh(i.left_sibling())
            if ' '.join(i.flatten()) != ' '.join(self.t.flatten()):
                self._refresh(i)

    def _find_SBAR(self, data):
        # Any coordinating conjunction?
        for i in self.t.subtrees(lambda i: i.label() == 'CC'):
            if i.right_sibling() and i.right_sibling().label() in ['S', 'VP']:
                if [x for x in self._queue if ' '.join(i.right_sibling().flatten()) in ' '.join(x.data.flatten())] == [] and i[0] + ' ' + ' '.join(i.right_sibling().flatten()) != ' '.join(self.t.flatten()):
                    kid = SVONode((i[0], i.right_sibling()), data)
                    data.child.append(kid)
                    self._queue.append(kid)
                    # refresh
                    sentence = ' '.join(self.t.flatten())
                    tmp = i[0] + ' ' + ' '.join(i.right_sibling().flatten())
                    sentence = sentence.replace(tmp, '')
                    self._load(sentence)
        # Any subordinate clause?
        for node in self.t.subtrees(lambda node: node.label() == 'SBAR'):
            if ' '.join(node.flatten()) != ' '.join(self.t.flatten()):
                conj = []
                # Conjunction words
                for s in node.subtrees(lambda s: s.label() != 'SBAR'):
                    if s.label() != 'S':
                        if s.leaves()[0] not in conj:
                            conj.append(s.leaves()[0])
                    else:
                        break
                conj = ' '.join(conj)
                for s in node.subtrees(lambda s: s.label() == 'S'):
                    # SBAR can repeat
                    if [x for x in self._queue if ' '.join(s.flatten()) in ' '.join(x.data.flatten())] == []:
                        kid = SVONode((conj, s), data)
                        data.child.append(kid)
                        self._queue.append(kid)
                        if node.left_sibling() and node.left_sibling().label() == 'IN' and node.parent().label() != 'S':
                            self._refresh(node.parent())
                        else:
                            self._refresh(node)
                    break
        # Participles
        participle = [x[0] for x in self.t.pos() if x[1] in ['VBG', 'VBN']]
        for i in participle:
            if i in self.t.leaves():
                candidate = [x for x, y in enumerate(self.t.leaves()) if y == i]
                before = self.t.leaves()[candidate[-1] - 1]
                pos = [x for x in self.t.pos() if x[0] == before][0][1]
                if pos == 'IN' and candidate[-1] - 2 >= 0 and 'VB' not in [x for x in self.t.pos() if x[0] == self.t.leaves()[candidate[-1] - 2]][0][1]:
                    for j in self.t.subtrees(lambda j: j[0] == before):
                        if j.parent().label() != 'NP' and j.right_sibling() and [x for x in self._queue if ' '.join(j.right_sibling().flatten()) in ' '.join(x.data.flatten())] == [] and ' '.join(j.parent().flatten()) != ' '.join(self.t.flatten()):
                            kid = SVONode((before, j.right_sibling()), data)
                            data.child.append(kid)
                            self._queue.append(kid)
                            self._refresh(j.parent())
                elif ('VB' not in pos) and (pos not in ['IN', 'RB', 'MD', 'POS']):
                    for j in self.t.subtrees(lambda j: j[0] == i):
                        if j.parent().label() not in ['NP', 'ADJP'] and j.right_sibling() and [x for x in self._queue if ' '.join(j.parent().flatten()) in ' '.join(x.data.flatten())] == [] and ' '.join(j.parent().flatten()) != ' '.join(self.t.flatten()):
                            kid = SVONode(('', j.parent()), data)
                            data.child.append(kid)
                            self._queue.append(kid)
                            self._refresh(j.parent())

    def _refresh(self, node):
        sentence = ' '.join(self.t.flatten())
        tmp = ' '.join(node.flatten())
        sentence = sentence.replace(tmp, '')
        self._load(sentence)

    def _find_conj(self):
        self._dic = collections.defaultdict(list)
        dep, = self._dependency.raw_parse(self.original)
        remover = []
        for governor, bridge, dependent in dep.triples():
            # Coordinating conjunction
            if bridge == 'conj':
                # NN conj NN
                if 'NN' in governor[1] and 'NN' in dependent[1]:
                    tmp = []
                    for key, value in [x['deps'] for x in dep.nodes.values() if x['word'] == dependent[0]][0].items():
                        if key not in ['conj', 'cc']:
                            tmp.append(dep.get_by_address(value[0])['word'])
                    tmp.append(dependent[0])
                    for i in self.t.subtrees(lambda i: i[0] == dependent[0]):
                        self._dic[governor[0]].append(('entity', ' '.join(tmp), self._find_attrs(i, ' '.join(tmp))))
                        remover.append(' '.join(tmp))
                        break
                # VB conj VB O
                elif 'VB' in governor[1] and 'VB' in dependent[1]:
                    gov_key = [x['deps'] for x in dep.nodes.values() if x['word'] == governor[0]][0].keys()
                    dep_key = [x['deps'] for x in dep.nodes.values() if x['word'] == dependent[0]][0].keys()
                    if [j for j in gov_key if j in ['dobj', 'xcomp', 'ccomp']] == [] or [j for j in dep_key if j in ['dobj', 'xcomp', 'ccomp']] == []:
                        for i in self.t.subtrees(lambda i: i[0] == dependent[0]):
                            self._dic[governor[0]].append(('entity', dependent[0], self._find_attrs(i, dependent[0])))
                            remover.append(dependent[0])
                            break
            # Appositive (return the whole span)
            elif bridge == 'appos':
                tmp = []
                for i in [x['deps'] for x in dep.nodes.values() if x['word'] == dependent[0]][0].values():
                    tmp.append(dep.get_by_address(i[0])['word'])
                tmp.append(dependent[0])
                self._dic[governor[0]].append(('attr', ' '.join(tmp), []))
                remover.append(' '.join(tmp))
        for i in range(len(remover)):
            # All possible positions
            can = [m.start() for m in re.finditer(remover[i], self.original)]
            for j in can:
                if self.original[j - 2] == ',':
                    remover[i] = ', ' + remover[i]
                    break
                elif self.original[j - 4:j - 1] == 'and':
                    remover[i] = 'and ' + remover[i]
                    break
        return remover

    # Breadth First Search the tree and take the first noun in the NP subtree.
    def _find_subject(self, data):
        synonym = ['', 'which', 'that', 'who', 'whom', 'where', 'when', 'what',
                   'why', 'how', 'whether', 'in']
        for i in self.t.subtrees(lambda i: i.label() != 'S' and i.label() != 'ROOT'):
            # Has a subject
            if i.label() not in ['VP', 'PP'] and 'VB' not in i.label():
                for s in self.t.subtrees(lambda t: t.label() == 'NP'):
                    for n in s.subtrees(lambda n: n.label().startswith('NN') or n.label() == 'PRP'):
                        return self._find_NOUN(n)
                    for n in s.subtrees(lambda n: n.label() == 'DT'):
                        return (n[0], self._find_attrs(n, n[0]))
            # No subject & the relation is a pronoun
            elif i.label() != 'S' and i.label() == 'VP' and data.relation in synonym:
                sent = [x[0] for x in self.pos]
                if data.relation != '':
                    candidate = [x for x, y in enumerate(sent) if y == data.relation.split(' ')[0]]
                    after = self.t.pos()[0][0]
                else:
                    candidate = [x for x, y in enumerate(sent) if y == self.t.pos()[0][0]]
                    after = self.t.pos()[1][0]
                before = candidate[0] - 1
                for x in candidate:
                    if sent[x + 1] == after:
                        before = x - 1
                # Is the previous word in the original sentence an NN?
                if 'NN' in [x[1] for x in self.pos if x[0] == sent[before]][0] or [x[1] for x in self.pos if x[0] == sent[before]][0] in ['PRP']:
                    sub = [sent[before]]
                    before -= 1
                    while 'NN' in [x[1] for x in self.pos if x[0] == sent[before]][0]:
                        sub.append(sent[before])
                        before -= 1
                    return (' '.join(reversed(sub)), [])
                elif [x[1] for x in self.pos if x[0] == sent[before]][0] in ['IN', ','] and 'NN' in [x[1] for x in self.pos if x[0] == sent[before - 1]][0]:
                    before -= 1
                    sub = [sent[before]]
                    before -= 1
                    while 'NN' in [x[1] for x in self.pos if x[0] == sent[before]][0]:
                        sub.append(sent[before])
                        before -= 1
                    return (' '.join(reversed(sub)), [])
                # Find the nearest one in the parent
                else:
                    target = self.t.pos()[0][0]
                    if data.parent.svo['subject'] == []:
                        sub = -1
                    else:
                        sub = data.parent.svo['subject'][0][0].split(' ')[-1]
                    if data.parent.svo['object'] == []:
                        obj = -1
                    else:
                        obj = data.parent.svo['object'][0][0].split(' ')[-1]
                    if sub == -1 and obj != -1:
                        return data.parent.svo['object']
                    elif sub != -1 and obj == -1:
                        return data.parent.svo['subject']
                    elif sub != -1 and obj != -1:
                        if abs(self.original.find(target) - self.original.find(sub)) <= abs(self.original.find(target) - self.original.find(obj)):
                            return data.parent.svo['subject']
                        else:
                            return data.parent.svo['object']
            # No subject & the relation is a conjunction
            elif i.label() != 'S' and (i.label() == 'VP' or i.label().startswith('VB')):
                if data.parent != None:
                    return data.parent.svo['subject']
                else:
                    return None

    def _find_compound(self, word, dep):
        deps = [x['deps'] for x in dep.nodes.values() if x['word'] == word][0]
        if 'compound' in deps:
            return dep.get_by_address(deps['compound'][0])['word']
        else:
            return ''

    def _find_object(self, data, predicate):
        synonym = ['which', 'that', 'who', 'whom']
        dep, = self._dependency.raw_parse(' '.join(self.t.flatten()))
        for i in predicate:
            pre = i[0].split(' ')
            for j in range(len(pre) - 1, -1, -1):
                for governor, bridge, dependent in dep.triples():
                    if governor[0] == pre[j] and bridge in ['dobj', 'xcomp']:
                        obj = []
                        compound = self._find_compound(dependent[0], dep)
                        if compound != '':
                            obj.append(compound)
                        if
dependent[1] != 'TO': for j in self.t.subtrees( lambda j: j[0] == dependent[0]): obj.append(j[0]) return (' '.join(obj), self._find_attrs(j, ' '.join(obj))) elif governor[0] == pre[j] and bridge == 'ccomp': dic = collections.defaultdict(list) deps = [ x['deps'] for x in dep.nodes.values() if x['word'] == dependent[0] ][0] if 'nsubj' in deps: obj = [] compound = self._find_compound( dep.get_by_address(deps['nsubj'][0])['word'], dep) if compound != '': obj.append(compound) obj.append( dep.get_by_address(deps['nsubj'][0])['word']) if 'dobj' in deps: dic['predicate'].append(dependent[0]) for j in self.t.subtrees( lambda j: j[0] == dep.get_by_address( deps['dobj'][0])['word']): dic['object'].append( (j[0], self._find_attrs(j, j[0]))) return (' '.join(obj), [dic]) elif 'dobj' in deps: obj = [] compound = self._find_compound( dep.get_by_address(deps['dobj'][0])['word'], dep) if compound != '': obj.append(compound) for j in self.t.subtrees( lambda j: j[0] == dep.get_by_address(deps[ 'dobj'][0])['word']): obj.append(j[0]) return (' '.join(obj), self._find_attrs(j, ' '.join(obj))) elif dependent[0] == pre[j] and bridge == 'cop': obj = [] compound = self._find_compound(governor[0], dep) if compound != '': obj.append(compound) for j in self.t.subtrees( lambda j: j[0] == governor[0]): obj.append(j[0]) return (' '.join(obj), self._find_attrs(j, ' '.join(obj))) # 沒有受詞 if data != None and data.relation in synonym: sent = [x[0] for x in self.pos] before = sent.index(data.relation.split(' ')[0]) - 1 # 原句前一個詞是否為NN if 'NN' in [x[1] for x in self.pos if x[0] == sent[before]][0]: return (sent[before], []) elif 'IN' in [x[1] for x in self.pos if x[0] == sent[before] ][0] and 'NN' in [ x[1] for x in self.pos if x[0] == sent[before - 1] ][0]: return (sent[before - 1], []) # 受詞為子句 elif data != None and data.child != []: kid = data.child[0] return (kid.relation + ' ' + ' '.join(kid.data.flatten()), []) else: return None def _find_predicate(self): for s in self.t.subtrees(lambda s: s.label() == 'VP'): tmp = s.flatten() for n in s.subtrees(lambda n: n.label().startswith('VB')): i = tmp.index(n[0]) sub = [] while i + 1 < len(tmp): if [x[1] for x in self.t.pos() if x[0] == tmp[i + 1]][0].startswith('VB'): sub.append(tmp[i]) i += 1 elif [x[1] for x in self.t.pos() if x[0] == tmp[i + 1]][0] in ['RB', 'MD']: count = i + 2 while count < len(tmp) and [ x[1] for x in self.t.pos() if x[0] == tmp[count] ][0] in ['RB', 'MD']: count += 1 if count < len(tmp) and [ x[1] for x in self.t.pos() if x[0] == tmp[count] ][0].startswith('VB'): sub.append(tmp[i]) i += 1 else: break else: break flag = i sub.append(tmp[flag]) # 不定詞 for j in self.t.subtrees(lambda j: j[0] == tmp[flag]): if j.right_sibling() and j.right_sibling().label( ) == 'PP' and j.right_sibling().leaves()[0] != 'to': start = tmp.index(j.right_sibling().leaves()[-1]) has_PP = True else: start = flag has_PP = False if start + 1 < len(tmp) and tmp[start + 1] == 'to': for i in range(start + 1, len(tmp)): if [x[1] for x in self.t.pos() if x[0] == tmp[i] ][0].startswith('VB') or [ x[1] for x in self.t.pos() if x[0] == tmp[i] ][0] == 'TO': sub.append(tmp[i]) else: break if has_PP: for i in self.t.subtrees( lambda i: i[0] == sub[-1]): return (' '.join(sub), self._find_attrs(i, ' '.join(sub))) else: for i in self.t.subtrees( lambda i: i[0] == tmp[flag]): return (' '.join(sub), self._find_attrs(i, ' '.join(sub))) else: for i in self.t.subtrees(lambda i: i[0] == tmp[flag]): return (' '.join(sub), self._find_attrs(i, ' '.join(sub))) for s in self.t.subtrees(lambda s: 
s.label().startswith('VB')): return (s[0], []) def _find_NOUN(self, n): # 所有格 if n.parent().right_sibling() and n.parent().right_sibling().label( ).startswith('NN'): sub = n.parent().leaves() p = n.parent() while p.right_sibling(): if p.right_sibling().label().startswith( 'NN') or p.right_sibling().label() in ['PRP', 'CD']: p = p.right_sibling() sub.append(p[0]) else: break return (' '.join(sub), self._find_attrs(p, ' '.join(sub))) else: sub = [] pp = n.parent() for l in pp: if l.label().startswith('NN') or l.label() in ['PRP', 'CD']: if l[0] not in sub: sub.append(l[0]) flag = l return (' '.join(sub), self._find_attrs(flag, ' '.join(sub))) def _find_to(self, node): dic = collections.defaultdict(list) tmp = node.flatten() predicate = [] for i in tmp: if [x[1] for x in self.t.pos() if x[0] == i][0] == 'TO' or 'VB' in [ x[1] for x in self.t.pos() if x[0] == i ][0]: predicate.append(i) else: break dic['predicate'].append((' '.join(predicate), [])) if predicate[-1] == 'be': for n in node.subtrees(lambda n: n.label() in ['NP', 'PP']): if n.label() in ['NP', 'PP']: for c in n.subtrees(lambda c: c.label().startswith('NN') or c.label() in ['PRP', 'CD']): a = self._find_NOUN(c) dic['object'] = self._add_conj(a) return dic else: tmp = self._find_object(None, dic['predicate']) dic['object'] = self._add_conj(tmp) return dic def _find_attrs(self, node, name): attrs = [] p = node.parent() flat = list(self.t.flatten()) # 可能有多個一樣的字 candidate = [x for x, y in enumerate(flat) if y == node[0]] flag = candidate[0] if node.left_sibling(): before = node.left_sibling().leaves()[-1] for i in candidate: if flat[i - 1] == before: flag = i break elif node.right_sibling(): after = node.right_sibling().leaves()[0] for i in candidate: if flat[i + 1] == after: flag = i break if not node.label().startswith('VB') and flag + 2 < len(flat) and flat[ flag + 1] == 'to' and [ x[1] for x in self.t.pos() if x[0] == flat[flag + 2] ][0] == 'VB': for i in self.t.subtrees(lambda i: i[0] == 'to'): if flat[flat.index(node[0]) + 2] in i.parent().flatten(): toV = i.parent() else: toV = None # Search siblings of adjective for adverbs if node.label().startswith('JJ'): for s in p: if s.label() == 'RB': if s[0] not in name: attrs.append(s[0]) elif s.label() == 'PP': if ' '.join(s.flatten()) not in name: a = self._proposition(s) if a != []: attrs.append(a) elif s.label() == 'NP': if ' '.join(s.flatten()) not in name: attrs.append(' '.join(s.flatten())) elif node.label().startswith('NN') or node.label() in [ 'PRP', 'CD', 'DT' ]: for s in p: if s != node and s.label() in [ 'DT', 'PRP$', 'POS', 'CD', 'IN', 'VBG', 'VBN' ] or s.label().startswith('JJ'): if s[0] not in name: attrs.append(s[0]) elif s != node and s.label() in ['ADJP', 'NP', 'QP', 'VP']: if ' '.join(s.flatten()) not in name: attrs.append(' '.join(s.flatten())) # Search siblings of verbs for adverb phrase elif node.label().startswith('VB'): tmp = node for s in p: # if s.label() in ['ADVP','MD','RB']: if s.label() in ['ADVP', 'RB', 'MD']: if ' '.join(s.flatten()) not in name: attrs.append(' '.join(s.flatten())) tmp = s elif s.label() == 'PP' and s == tmp.right_sibling(): if ' '.join(s.flatten()) not in name: a = self._proposition(s) if a != []: attrs.append(a) tmp = s # Search uncles # if the node is noun or adjective search for prepositional phrase if node.label().startswith('JJ') or node.label().startswith( 'NN') or node.label() in ['PRP', 'CD', 'DT']: for s in p.parent(): if s != p and s.label() in ['PP', 'IN']: if ' '.join(s.flatten()) not in name: a = self._proposition(s) if a != 
[]: attrs.append(a) elif s != p and s.label() == 'VP' and s.parent().label( ) == 'NP': if ' '.join(s.flatten()) not in name: self._refresh(s) attrs.append(' '.join(s.flatten())) # # 不定詞 # elif s != p and s.label() == 'S' and 'to' in s.flatten() and s.flatten() != toV.flatten(): # attrs.append(self._find_to(s)) elif node.label().startswith('VB'): for s in p.parent(): if s != p and s.label().startswith('ADVP'): if ' '.join(s.flatten()) not in name: attrs.append(' '.join(s.flatten())) # elif s != p and s.label() in ['MD','RB']: # attrs.append(s[0]) elif s != p and s.label() == 'PP' and s == node.right_sibling( ): if ' '.join(s.flatten()) not in name: a = self._proposition(s) if a != []: attrs.append(a) if toV != None: attrs.append(self._find_to(toV)) self._refresh(toV) return attrs def _proposition(self, node): dic = collections.defaultdict(list) for k in node.subtrees(lambda k: k.label() in ['IN', 'TO']): dic['predicate'].append((k[0], [])) if k.right_sibling(): for c in k.right_sibling().subtrees(lambda c: c.label( ).startswith('NN') or c.label() in ['JJ', 'CD']): # 所有格 if c.parent().right_sibling() and c.parent().right_sibling( ).label().startswith('NN'): sub = c.parent().leaves() p = c.parent() while p.right_sibling(): if p.right_sibling().label().startswith( 'NN') or p.right_sibling().label() in [ 'PRP', 'CD' ]: p = p.right_sibling() sub.append(p[0]) else: break else: sub = [] pp = c.parent() for l in pp: if l.label().startswith('NN') or l.label() in [ 'PRP', 'CD', 'JJ' ]: if l[0] not in sub: sub.append(l[0]) flag = l dic['object'].append((' '.join(sub), [])) return dic else: return [] return []
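# The SVO class above depends on an SVONode helper that is not defined in
# this snippet. The sketch below is inferred purely from how SVONode is used
# (SVONode((relation, subtree), parent) plus the .relation, .data, .parent,
# .child and .svo attributes); the project's real class may differ.
import collections


class SVONode:
    def __init__(self, relation_and_tree, parent):
        # e.g. SVONode(('main', tree), None) for the root pushed in find_svo()
        self.relation, self.data = relation_and_tree
        self.parent = parent
        self.child = []  # dependent clauses attached by _find_SBAR()
        self.svo = collections.defaultdict(list)  # filled in by find_svo()


# Hedged usage sketch: assumes a CoreNLP server reachable through
# ApplicationConfig.get_corenlp_config() (e.g. {'host': 'localhost',
# 'port': 9000}); the sentence and output shape are illustrative only.
# svo = SVO('John wrote a book and Mary read it')
# svo.show()             # pretty-print the constituency tree
# print(svo.find_svo())  # defaultdict mapping relation -> list of SVO dicts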
def stanford_parse_tree(sentence):
    parser = CoreNLPParser()
    return next(parser.raw_parse(sentence))
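# Hedged example for the helper above: CoreNLPParser() defaults to a server
# at http://localhost:9000, so this only runs with CoreNLP already started
# (e.g. java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000).
# tree = stanford_parse_tree('The quick brown fox jumps over the lazy dog.')
# tree.pretty_print()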