file_name = '/home/momo/Dropbox/parse_trees/' + file_name
#file_name = "./" + file_name
"""
try:
    tree = pp.parse_sentence(s, 'stat')
    #numerate_non_terminals(tree)
    dot_code = utils.nltk_tree_to_dot(tree)
    print dot_code
    utils.dot_to_image(dot_code, file_name + '_stat')
    print
except:
    print 'cannot parse with stat'
"""
try:
    tree = pp.parse_sentence(s, 'stanford')
    #numerate_non_terminals(tree)
    tree = tree[0]
    dot_code = utils.nltk_tree_to_dot(tree)
    utils.dot_to_image(dot_code, file_name + '_stanford')
    #dot_code = utils.list_of_tripels_to_dot(dep)
    #utils.dot_to_image(dot_code, file_name + '_dep_stanford')
    print "stanford done"
except:
    print 'cannot parse with stanford'
"""
try:
    tree = pp.parse_sentence(s, 'berkeley')
    tree = tree[0]
else:
    print "====================================================================="

pattern_dict = load_pattern_list()
#for i in pattern_dict.items():
#    print i
#raw_input()
#s = "The Anaconda, or Water Boa, is one the world's largest snakes, when born they can be 3 feet (1m) long."
#s = ' '.join(sys.argv[1:])
sentences = sent_tokenize(s)
for s in sentences:
    count += 1
    tree = pp.parse_sentence(s, parser)
    tree = tree[0]
    #tree = Tree('S', [Tree('NP', [Tree('NNP', ['Leon'])]), Tree('VP', [Tree('VBZ', ['hits']), Tree('NP', [Tree('NNP', ['Kai'])])]), Tree('.', ['.'])])
    path = utils.get_knoex_path()
    dot_code = utils.nltk_tree_to_dot(tree)
    utils.dot_to_image(dot_code, 'temptree_' + str(count))
    if show == 2:
        os.popen('gnome-open ' + 'temptree_' + str(count) + '.png')
    g, _ = match_tree(tree, pattern_dict)
    graph += g

# Drop the empty triples left behind by patterns that matched nothing.
while ['', '', ''] in graph:
    graph.remove(['', '', ''])
print graph
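# The loop above leans on utils.nltk_tree_to_dot and utils.dot_to_image to
# render each parse tree. Their real definitions live in utils; the function
# below is only a minimal sketch of the rendering step, assuming Graphviz's
# `dot` binary is on the PATH. The name dot_to_image_sketch is illustrative,
# not part of the module.
import subprocess

def dot_to_image_sketch(dot_code, file_name):
    # Write the DOT source next to the target image, then render it to PNG.
    with open(file_name + '.dot', 'w') as f:
        f.write(dot_code)
    subprocess.call(['dot', '-Tpng', file_name + '.dot', '-o', file_name + '.png'])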
        return tree.leaves()
    for subtree in tree:
        terminals = get_terminals(subtree, node)
        if terminals is not None:
            return terminals


if __name__ == '__main__':
    from nltk import Tree
    import string
    import preprocessor

    #tree = Tree('A', [Tree('A', ['A', 'A']), 'A'])  # toy tree, overridden below
    tree = preprocessor.parse_sentence('Leon hits Kai.')
    print 'Tree:', tree
    print
    numerate_non_terminals(tree)
    print 'Num_Tree:', tree
    print
    print 'NP0:', get_terminals(tree, 'NP0')
    print 'NP1:', get_terminals(tree, 'NP1')
    print 'NP2:', get_terminals(tree, 'NP2')
    print 'NP3:', get_terminals(tree, 'NP3')
    print 'NP4:', get_terminals(tree, 'NP4')
    print
    combis = all_parsing_combinations(tree)
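# For reference: a minimal sketch of the numeration the demo above relies on.
# It assumes numerate_non_terminals renames every non-terminal in place with
# one counter per label (the first NP becomes NP0, the next NP1, and so on),
# which is what the NP0..NP4 queries above expect. The module's own
# implementation may differ; tree.node is the NLTK 2.x label attribute used
# throughout this codebase, and the _sketch name is illustrative.
from nltk import Tree

def numerate_non_terminals_sketch(tree, counts=None):
    if counts is None:
        counts = {}
    if isinstance(tree, Tree):
        label = tree.node
        index = counts.get(label, 0)      # next free index for this label
        tree.node = label + str(index)    # e.g. 'NP' -> 'NP0'
        counts[label] = index + 1
        for subtree in tree:              # string leaves are left untouched
            numerate_non_terminals_sketch(subtree, counts)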
def match_to_joined_terminals(match, parsetree):
    list_ = []
    for path in match:
        list_.append(join(parsetree.get_terminals(path)))
    return list_


def intersect(l1, l2):
    s = set(l1).intersection(set(l2))
    return list(s)


if __name__ == "__main__":
    import preprocessor as pp

    tree = pp.parse_sentence('The python hits Kai.')
    tree = tree[0]
    pt = ParseTree(tree)
    print
    for item in pt.nt_dict.items():
        print item
    print
    for path in pt.nodepaths:
        print path
    print
    print 'test get_subtree'
    print pt.get_subtree((1,))
    print 'test get_node'
    print pt.get_node((1,))
    print 'test get_terminals'
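# Usage sketch for the helpers above (illustrative values; set order is not
# guaranteed, so intersect may return its elements in any order):
#
# >>> intersect([1, 2, 3], [2, 3, 4])
# [2, 3]
#
# match_to_joined_terminals maps a match (a list of node paths into the parse
# tree) to the joined terminal string under each path, so a match covering the
# NP and VBZ nodes of 'The python hits Kai.' would come back roughly as
# ['The python', 'hits'].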
def find_realation(text):
    # HEARST PATTERNS
    hp1 = r'NP\d+ such as ((NP\d+ ,\d+ )+(and |or ))?NP\d+'
    def m2r_1(match, tree):
        NPs = re.findall(r'NP\d+', match)
        NPs = [string.join(tc.get_terminals(tree, NP)) for NP in NPs]
        return [(NP, 'hyponym', NPs[0]) for NP in NPs[1:]]

    hp2 = r'NP0 VBZ0 NP1 \.0'
    def m2r_2(match, tree):
        subject = ' '.join(tc.get_terminals(tree, 'NP0'))
        predicate = ' '.join(tc.get_terminals(tree, 'VBZ0'))
        object_ = ' '.join(tc.get_terminals(tree, 'NP1'))
        return [(subject, predicate, object_)]

    hp3 = r'NP\d+ is NP\d+'
    def m2r_3(match, tree):
        NNs = re.findall(r'NP\d+', match)
        NNs = [string.join(tc.get_terminals(tree, NN)) for NN in NNs]
        return [(NN, 'hyponym', NNs[0]) for NN in NNs[1:]]

    #m2r = [m2r_1, m2r_2, m2r_3]  # functions to map matches onto relations
    #pattern_list = [hp1, hp2, hp3]
    m2r = [m2r_2]  # functions to map matches onto relations
    pattern_list = [hp2]

    sentences = split_into_sentences(text)
    # The next part is rather unclean, but it works as follows: every noun
    # phrase in the sentence is replaced by a numbered label (NP0, NP1, ...),
    # and a regex search for Hearst patterns is then applied to the resulting
    # string to find hyponym relations.
    relations = []
    for s in sentences:
        s = format_sentence(s)
        s += ' .'  # add a full stop at the end to satisfy the Stanford parser
        tree = parse_sentence(s)
        #print type(tree)
        tc.numerate_non_terminals(tree)
        combi = tc.all_parsing_combinations(tree)
        combi = [string.join(c) for c in combi]
        #for c in combi:
        #    print c
        for i, pattern in enumerate(pattern_list):
            pattern = re.compile(pattern)
            #open('combi', 'w').write(str(combi).replace(',', '\n'))
            for c in combi:
                match = re.match(pattern, c)
                if match:
                    print 'match : ', match.group(), ' --> ', c
                    tmp = m2r[i](match.group(), tree)
                    print type(tmp), ' -- ', tmp
                    relations.extend(tmp)
    print 'relations', relations
    return set(relations)
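# Worked example of the matching step in find_realation, with hand-built
# values for illustration. After numeration, 'Leon hits Kai .' flattens to a
# label string like 'NP0 VBZ0 NP1 .0', which hp2 matches; m2r_2 then reads
# the terminals back off the tree and yields [('Leon', 'hits', 'Kai')].
#
# >>> import re
# >>> re.match(r'NP0 VBZ0 NP1 \.0', 'NP0 VBZ0 NP1 .0').group()
# 'NP0 VBZ0 NP1 .0'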