def assign_slots(tokens, tag_tree, word_tree):
    stopword_list = stopwords.words('english')
    tokens_with_slot_tags = []
    word_tree = ParentedTree.convert(word_tree)
    tag_tree = ParentedTree.convert(tag_tree)
    word_tree_with_cats = tag_words_with_categories(word_tree)
    tag_tree_with_cats = tag_words_with_categories(tag_tree)
    for i, word in enumerate(tokens):
        tag = finalize_tags(i, word, tag_tree_with_cats, word_tree_with_cats)
        tokens_with_slot_tags.append((word, tag))
    found_query_focus = False
    for i, item in enumerate(tokens_with_slot_tags):
        word, tag = item
        if tag in ['USER', 'MEDIA', 'NETWORK'] and not found_query_focus:
            tokens_with_slot_tags[i] = (word, 'SEARCH')
            found_query_focus = True
        elif tag == UNK:
            tokens_with_slot_tags[i] = (word, 'KEYWORD')
    slots = {}
    for word, tag in tokens_with_slot_tags:
        if tag == 'SKIP':
            continue
        elif tag == 'KEYWORD':
            if 'KEYWORDS' not in slots:
                slots['KEYWORDS'] = []
            if word not in stopword_list and word not in PUNCTUATION:
                slots['KEYWORDS'].append(word)
        else:
            if tag not in slots:
                slots[tag] = word
            else:
                previous_words = slots[tag]
                slots[tag] = ' '.join([previous_words, word])
    return slots
def convert_sentence(input_sent):
    # Parse the sentence using the Stanford CoreNLP parsers.
    pos_type = pos_tagger.tag(input_sent.split())
    # Take the first (most probable) constituency parse; the trailing comma
    # unpacks the single child under the ROOT node.
    parse_tree, = ParentedTree.convert(
        list(pos_tagger.parse(input_sent.split()))[0])
    # Here the trailing comma unpacks the single dependency graph from the
    # parser's result iterator (convert leaves non-Tree values unchanged).
    dep_type, = ParentedTree.convert(dep_parser.parse(input_sent.split()))
    return pos_type, parse_tree, dep_type
def get_predicate(self, sub_tree):
    """Returns the verb along with its attributes, as a verb phrase."""
    sub_nodes = [each for each in sub_tree.subtrees() if each.pos()]
    predicate = None
    pred_verb_phrase_siblings = []
    sub_tree = ParentedTree.convert(sub_tree)
    for each in sub_nodes:
        if each.label() in self.verb_types:
            sub_tree = each
            predicate = each.leaves()
    # Collect the predicate's verb-phrase siblings so the object can be found later.
    sub_tree = ParentedTree.convert(sub_tree)
    if predicate:
        pred_verb_phrase_siblings = [
            each for each in self.tree_root.subtrees()
            if each.label() in ["NP", "PP", "ADJP", "ADVP"]
        ]
        self.pred_verb_phrase_siblings = pred_verb_phrase_siblings
    return {'predicate': predicate}
def syntax_similarity_two_documents(self, doc1, doc2, average=False):
    # Syntax similarity of two single documents.
    global numnodes
    doc1sents = self.sent_detector.tokenize(doc1.strip())
    doc2sents = self.sent_detector.tokenize(doc2.strip())
    for s in doc1sents:  # to handle unusually long sentences
        if len(s.split()) > 100:
            return "NA"
    for s in doc2sents:
        if len(s.split()) > 100:
            return "NA"
    try:  # to handle parse errors, e.g. when there is an unusually long word in the sentence
        doc1parsed = self.parser.raw_parse_sents(doc1sents)
        doc2parsed = self.parser.raw_parse_sents(doc2sents)
    except Exception as e:
        sys.stderr.write(str(e))
        return "NA"
    costMatrix = []
    doc1parsed = list(doc1parsed)
    for i in range(len(doc1parsed)):
        doc1parsed[i] = list(doc1parsed[i])[0]
    doc2parsed = list(doc2parsed)
    for i in range(len(doc2parsed)):
        doc2parsed[i] = list(doc2parsed[i])[0]
    for i in range(len(doc1parsed)):
        numnodes = 0
        sentencedoc1 = ParentedTree.convert(doc1parsed[i])
        tempnode = Node(sentencedoc1.root().label())
        new_sentencedoc1 = self.convert_mytree(sentencedoc1, tempnode)
        temp_costMatrix = []
        sen1nodes = numnodes
        for j in range(len(doc2parsed)):
            numnodes = 0.0
            sentencedoc2 = ParentedTree.convert(doc2parsed[j])
            tempnode = Node(sentencedoc2.root().label())
            new_sentencedoc2 = self.convert_mytree(sentencedoc2, tempnode)
            ED = simple_distance(new_sentencedoc1, new_sentencedoc2)
            ED = ED / (numnodes + sen1nodes)
            temp_costMatrix.append(ED)
        costMatrix.append(temp_costMatrix)
    costMatrix = np.array(costMatrix)
    if average:
        return 1 - np.mean(costMatrix)
    else:
        indexes = su.linear_assignment(costMatrix)
        total = 0
        rowMarked = [0] * len(doc1parsed)
        colMarked = [0] * len(doc2parsed)
        for row, column in indexes:
            total += costMatrix[row][column]
            rowMarked[row] = 1
            colMarked[column] = 1
        for k in range(len(rowMarked)):
            if rowMarked[k] == 0:
                total += np.min(costMatrix[k])
        for c in range(len(colMarked)):
            if colMarked[c] == 0:
                total += np.min(costMatrix[:, c])
        maxlengraph = max(len(doc1parsed), len(doc2parsed))
        return 1 - (total / maxlengraph)
def getSyntaxFeatures(ct):
    fs = ' '.join([x for x in ct.fullSentence if not re.match(r'^\s+$', x)])
    fs = re.sub(r'\)', ']', re.sub(r'\(', '[', fs))
    if len(fs.split()) > 100:
        return None, None, None, None, None, None  # let's not even try
    pt = None
    if fs in pm:
        pt = ParentedTree.convert(pm[fs])
    else:
        try:
            tree = lexParser.parse(fs.split())
            ptreeiter = ParentedTree.convert(tree)
            for t in ptreeiter:
                ptree = t
                break
            pt = ParentedTree.convert(ptree)
            pm[fs] = pt
        except:  # probably a memory issue with the parser
            sys.stderr.write('Skipped during parsing...\n')
            return None, None, None, None, None, None
    try:
        node = pt.pos()[ct.stid - 1]
        nodePosition = pt.leaf_treeposition(ct.stid - 1)
        parent = pt[nodePosition[:-1]].parent()
        parentCategory = parent.label()
        ls = parent.left_sibling()
        lsCat = False if not ls else ls.label()
        rs = parent.right_sibling()
        rsCat = False if not rs else rs.label()
        rsContainsVP = False
        if rs:
            if list(rs.subtrees(filter=lambda x: x.label() == 'VP')):
                rsContainsVP = True
        rootRoute = utils.getPathToRoot(parent, [])
        cRoute = utils.compressRoute([x for x in rootRoute])
        return parentCategory, lsCat, rsCat, rsContainsVP, rootRoute, cRoute
    except IndexError:
        sys.stderr.write('Skipping due to indexerror...\n')
        return None, None, None, None, None, None
def add_tree(self, datum):
    # parse tree and binarize
    tree = Tree.fromstring(datum["raw_tree"])
    tree.chomsky_normal_form()
    tree.collapse_unary(collapsePOS=True)
    tree = ParentedTree.convert(tree)

    # assign indices to subtrees
    indices = {}
    counter = 0
    for t in tree.subtrees():
        indices[t.treeposition()] = counter
        counter += 1

    # generate parent pointers and labels
    # (labels = one instance of sent in sents, in treelstm terminology)
    parents = [0] * (counter - 1)
    labels = []
    counter = 0
    for t in tree.subtrees():
        parent = t.parent()
        if parent is not None:
            parents[counter] = indices[parent.treeposition()]
            counter += 1
        if isinstance(t[0], str):  # terminal node (the Python 2 original also checked `unicode`)
            labels.append(t[0])

    self.parents_file.write(" ".join(map(str, parents)) + "\n")
    self.sents_file.write(" ".join(labels) + "\n")
    self.trees.append(datum)
    return len(self.trees) - 1  # ID
def convert_eng_to_isl(input_string):
    # get all required packages
    download_required_packages()

    if len(input_string.split(' ')) == 1:
        return input_string.split(' ')

    # Initialize the Stanford parser
    parser = StanfordParser()

    # Generate all possible parse trees for the sentence, sorted by probability
    possible_parse_tree_list = [tree for tree in parser.parse(input_string.split())]

    # Get the most probable parse tree
    parse_tree = possible_parse_tree_list[0]
    print(parse_tree)
    # output: '(ROOT
    #   (S
    #     (PP (IN As) (NP (DT an) (NN accountant)))
    #     (NP (PRP I))
    #     (VP (VBP want) (S (VP (TO to) (VP (VB make) (NP (DT a) (NN payment))))))
    #   )
    # )'

    # Convert into a parented tree data structure
    parent_tree = ParentedTree.convert(parse_tree)

    modified_parse_tree = modify_tree_structure(parent_tree)

    parsed_sent = modified_parse_tree.leaves()
    return parsed_sent
def parse_prereq_text(text):
    sentences = split_normalize_prereq_text(text)
    requirements = {
        'relationship': 'and',
        'children': []
    }
    starting_subject = None
    invalidated_courses = list()
    for sentence in sentences:
        sentence_trees = parser.raw_parse(sentence)
        tree = ParentedTree.convert(next(sentence_trees))
        (sentence_requirements,
         starting_subject,
         sentence_invalidated_courses) = parse_requirements(tree, starting_subject)
        if sentence_invalidated_courses == [-1]:
            # Sentence invalidates the previous course
            remove_last_course(requirements)
        else:
            invalidated_courses += sentence_invalidated_courses
        if sentence_requirements:
            requirements['children'].append(sentence_requirements)
    remove_invalidated_courses(requirements, invalidated_courses)
    requirements = flatten_relationships(requirements)
    if requirements:
        normalize_relationship(requirements)
    return requirements
def merge_tree_nnps(tree):
    """
    Takes a parse tree and merges any consecutive leaf nodes that come from NNPs.
    For example, if there is a segment:
        (NP (JJ old) (NNP Pierre) (NNP Vinken))
    it returns:
        (NP (JJ old) (NNP PierreVinken))
    """
    # require a parented tree to get a subtree's tree position
    p = ParentedTree.convert(tree)

    # iterate subtrees of height 3; this is where NPs leading to NNPs
    # leading to lexicalizations will be
    for s in p.subtrees(filter=lambda s: s.height() == 3):
        # merge NNPs in the list representation of this tree's children: [(POS, word), ...]
        new_noun_phrase = merge_tagged_nnps([(c.label(), c[0]) for c in s])
        child_str = " ".join("(%s %s)" % (pos, word) for pos, word in new_noun_phrase)
        # create a new subtree with the merged NNPs
        new_s = ParentedTree.fromstring("(%s %s)" % (s.label(), child_str))
        # replace the old subtree with the new subtree
        p[s.treeposition()] = new_s

    return Tree.convert(p)
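# Usage sketch (hypothetical input); assumes the merge_tagged_nnps helper from
# the same module is in scope, collapsing consecutive (NNP, word) pairs into one.
from nltk import Tree

nnp_example = Tree.fromstring("(S (NP (JJ old) (NNP Pierre) (NNP Vinken)) (VP (VBD retired)))")
print(merge_tree_nnps(nnp_example))
# expected: (S (NP (JJ old) (NNP PierreVinken)) (VP (VBD retired)))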
def syntax_similarity_conversation(self, documents1, average=False):
    # Syntax similarity of each document with the documents before and after it.
    global numnodes
    documents1parsed = []
    for d1 in range(len(documents1)):
        sys.stderr.write(str(d1) + "\n")
        tempsents = self.sent_detector.tokenize(documents1[d1].strip())
        for s in tempsents:
            if len(s.split()) > 100:
                documents1parsed.append("NA")
                break
        else:
            temp = list(self.parser.raw_parse_sents(tempsents))
            for i in range(len(temp)):
                temp[i] = list(temp[i])[0]
                temp[i] = ParentedTree.convert(temp[i])
            documents1parsed.append(list(temp))
    results = OrderedDict()
    for d1 in range(len(documents1parsed)):
        d2 = d1 + 1
        if d2 == len(documents1parsed):
            break
        if documents1parsed[d1] == "NA" or documents1parsed[d2] == "NA":
            continue
        costMatrix = []
        for i in range(len(documents1parsed[d1])):
            numnodes = 0
            tempnode = Node(documents1parsed[d1][i].root().label())
            new_sentencedoc1 = self.convert_mytree(documents1parsed[d1][i], tempnode)
            temp_costMatrix = []
            sen1nodes = numnodes
            for j in range(len(documents1parsed[d2])):
                numnodes = 0.0
                tempnode = Node(documents1parsed[d2][j].root().label())
                new_sentencedoc2 = self.convert_mytree(documents1parsed[d2][j], tempnode)
                ED = simple_distance(new_sentencedoc1, new_sentencedoc2)
                ED = ED / (numnodes + sen1nodes)
                temp_costMatrix.append(ED)
            costMatrix.append(temp_costMatrix)
        costMatrix = np.array(costMatrix)
        if average:
            return 1 - np.mean(costMatrix)
        else:
            indexes = su.linear_assignment(costMatrix)
            total = 0
            rowMarked = [0] * len(documents1parsed[d1])
            colMarked = [0] * len(documents1parsed[d2])
            for row, column in indexes:
                total += costMatrix[row][column]
                rowMarked[row] = 1
                colMarked[column] = 1
            for k in range(len(rowMarked)):
                if rowMarked[k] == 0:
                    total += np.min(costMatrix[k])
            for c in range(len(colMarked)):
                if colMarked[c] == 0:
                    total += np.min(costMatrix[:, c])
            maxlengraph = max(len(documents1parsed[d1]), len(documents1parsed[d2]))
            results[(d1, d2)] = 1 - total / maxlengraph
    return results
def _add_conj(self, tmp):
    result = []
    if isinstance(tmp, tuple):
        flag = tmp[0].split(' ')
        if len(flag) <= 5:
            for k in flag:
                if k in self._dic.keys():
                    # add the conjuncts back in
                    for j in self._dic[k]:
                        if j[0] == 'attr':
                            tree = list(self._parser.raw_parse(tmp[0] + ' is ' + j[1]))[0]
                            tree = ParentedTree.convert(tree)
                            kid = SVONode(('appos', tree), self._data)
                            self._data.child.append(kid)
                            self._queue.append(kid)
                            self._dic[k].remove(j)
                        else:
                            result.append((j[1], j[2]))
    if isinstance(tmp, tuple) and tmp[0] not in [x[0] for x in result]:
        result.append(tmp)
    result.reverse()
    return result
def path_enclosed_tree_augmented(fr):
    if fr.i_sentence != fr.j_sentence:
        return ParentedTree("None", ["*"])  # just in case
    else:
        s_tree = ParentedTree.convert(
            AUGMENTED_TREES[fr.article][int(fr.i_sentence)])
        return _generate_enclosed_tree(fr, s_tree)
def parse_sentences(self, filename, num_sentences):
    """Parses each one-line sentence into a syntax tree."""
    # Open the file and parse a given number of sentences
    f = open(filename, 'r')
    if num_sentences == 'all':
        num_sentences = -1
    count = 0
    for sentence in f.readlines()[:num_sentences]:
        if count % 10 == 0:
            print("Number of sentences trained: ", count)
        # Get possible parse trees
        trees = self.parser.raw_parse(sentence.lower())
        for tree in trees:
            self.nonterminal_counts['ROOT'] += 1
            tokenized_sentence = self.tokenize_sentence(sentence)
            # Only extract rules from sentences with more than 8 tokens,
            # to avoid adding rules that generate short, ungrammatical sentences
            if len(tokenized_sentence) > 8:
                self.extract_rules(tree)
            # Convert the tree into a ParentedTree, an NLTK tree that keeps
            # a pointer to each node's parent
            ptree = ParentedTree.convert(tree)
            # Calculate the bigram counts for this sentence
            self.get_bigram(ptree, tokenized_sentence)
        count += 1
def check(sent):
    parser = StanfordParser()

    # Parse the example sentence
    print(sent)
    t = list(parser.raw_parse(sent))[0]
    print(t)
    t = ParentedTree.convert(t)
    print(t)
    t.pretty_print()

    try:
        subj = find_subject(t)
    except:
        subj = []
    try:
        pred = find_predicate(t)
    except:
        pred = []
    try:
        obj = find_object(t)
    except:
        obj = []
    print(subj)
    print(pred)
    print(obj)
    return subj, pred, obj
def traverse(self, t):
    t = ParentedTree.convert(t)
    result = []
    cd = ''
    nns = ''
    print("subtree = ")
    print(t)
    for child in t:
        print("child = ")
        print(child)
        if str(child.label()) == 'CD':
            cd = child.leaves()
        if str(child.label()) == 'NNS':
            nns = child.leaves()
        if cd != '' and nns != '':
            print("found pair:")
            # stick things in a dictionary
            pair = {'cd': cd, 'nns': nns}
            print(pair)
            result.append(pair)
        if child.height() > 2:
            # extend the result with the pairs found in this subtree
            result.extend(self.traverse(child))
    return result
def apposition(feats):
    """Works with the examples in the unit tests."""
    if feats.sentence != feats.sentence_ref:
        return "apposition={}".format(False)
    else:
        sentence_tree = TREES_DICTIONARY[feats.article + ".raw"][int(feats.sentence_ref)]
        ptree = ParentedTree.convert(sentence_tree)
        token_ref = set(feats.token_ref.split("_"))
        token = set(feats.token.split("_"))

        def is_j_apposition(curr_tree):
            found = False
            for child in curr_tree:
                if found:
                    break
                elif isinstance(child, ParentedTree):
                    child_leaves = set(child.leaves())
                    # note: the original used `.node`, the pre-NLTK-3 spelling of `.label()`
                    conditions = (len(token_ref.intersection(child_leaves)) > 0
                                  and curr_tree.label() == "NP")
                    if conditions:
                        brother = child.left_sibling()
                        if isinstance(brother, ParentedTree) and brother.label() == ",":
                            antecedent = brother.left_sibling()
                            if isinstance(antecedent, ParentedTree):
                                previous_words = set(antecedent.leaves())
                                if len(token.intersection(previous_words)) > 0:
                                    found = True
                    else:
                        found = is_j_apposition(child)
            return found

        return "apposition={}".format(is_j_apposition(ptree))
def get_triples(self, sentence):
    t = list(self.parser.raw_parse(sentence))[0]
    t = ParentedTree.convert(t)
    s = self.find_subject(t)
    p = self.find_predicate(t)
    o = self.find_object(t)
    return (s, p, o)
def get_pp_old(text):
    # Return a dict of prepositions found inside PPs in the text.
    # If the PP is preceded by a VP/ADJP, the result also includes the
    # verb/adjective; if it is preceded by an NP, the noun is not included.
    phrases = {}
    for structure in parser.parse(nltk.word_tokenize(text)):
        tree = ParentedTree.convert(structure)
        for subtree in tree.subtrees():
            if subtree.label() == "PP":
                preposition = subtree.leaves()[0]
                left_sibling = subtree.left_sibling()
                if left_sibling is not None:
                    left_sibling_label = left_sibling.label()
                    if is_noun(left_sibling_label):
                        phrases[preposition] = True
                    elif is_verb(left_sibling_label):
                        verb = convert_to_base_form(
                            " ".join(left_sibling.leaves()), 'v')
                        word = verb + " " + preposition
                        phrases[word] = True
                    elif is_adj(left_sibling_label):
                        adj = convert_to_base_form(
                            " ".join(left_sibling.leaves()), 'a')
                        word = adj + " " + preposition
                        phrases[word] = True
    return phrases
def get_right_sibling(tree, pos, ct):
    for i, node in enumerate(tree.pos()):
        if i == pos:
            nodepos = tree.leaf_treeposition(i)
            pt = ParentedTree.convert(tree)
            rs = pt[nodepos[:-1]].right_sibling()
            if rs:
                if rs.label() == 'S':
                    # the connective connects one or two S-es; take the right
                    # sibling S as the internal argument
                    return rs.leaves()
                else:
                    parent = pt[nodepos[:-1]].parent()
                    # assuming there are no duplicates of the connective
                    # at this level of detail:
                    leaves = parent.leaves()
                    connindex = leaves.index(ct.token)
                    remainder = [xj for xi, xj in enumerate(leaves) if xi >= connindex]
                    return remainder
            else:
                # it's on the same level as its argument, which is not an S-clause
                parent = pt[nodepos[:-1]].parent()
                right_sibling = parent.right_sibling()
                leaves = parent.leaves()
                # the connective may well be at the end of the clause, in which
                # case the right sibling should probably also be included
                leaves = leaves + right_sibling.leaves()
                connindex = leaves.index(ct.token)
                remainder = [xj for xi, xj in enumerate(leaves) if xi >= connindex]
                return remainder
def j_is_subject(feats):
    """WORKS"""
    sentence_tree = TREES_DICTIONARY[feats.article + ".raw"][int(feats.sentence_ref)]
    ptree = ParentedTree.convert(sentence_tree)
    parent = __get_parent_tree__(feats.token_ref, ptree)
    j_subject = __is_subject__(ptree, feats.token_ref, parent, ptree)
    return "j_is_subject={}".format(j_subject)
def is_pred_nominal(feats):
    """WORKS"""
    if feats.sentence != feats.sentence_ref:
        return "is_pred_nominal={}".format(False)
    else:
        s_tree = ParentedTree.convert(
            TREES_DICTIONARY[feats.article + ".raw"][int(feats.sentence)])
        NP_i = __get_parent_tree__(feats.token, s_tree)
        NP_j = __get_parent_tree__(feats.token_ref, s_tree)
        nominal = __get_max_projection__(s_tree, NP_j)
        copula_verbs = ["is", "are", "were", "was", "am"]

        def check_nominal_construction(tree):
            found = False
            for t in tree:
                if found:
                    break
                elif isinstance(t, ParentedTree):
                    if t == NP_i:
                        brother = t.right_sibling()
                        # note: the original used `.node`, the pre-NLTK-3 spelling of `.label()`
                        if isinstance(brother, ParentedTree) and brother.label() == "VP":
                            verb = brother.leaves()[0]
                            if verb in copula_verbs:
                                for subtree in brother:
                                    if subtree == nominal:
                                        found = True
                                        break
                    else:
                        found = check_nominal_construction(t)
            return found

        return "is_pred_nominal={}".format(check_nominal_construction(s_tree))
def __init__(self, parse_tree):
    self.parse_tree = ParentedTree.convert(tree=parse_tree)
    self.left_bracket_indices = []
    self.right_bracket_indices = []
    self.dominant_nodes = ["S", "S+SBAR", "SBAR+S"]
    self.nonrestrictive_heads = ["because", "since", "after"]
    self.set_indices()
    self.result = self.parse_tree.leaves()
def initialRun(text):
    t = list(parser.raw_parse(text))[0]
    t = ParentedTree.convert(t)
    subject_list = copy.deepcopy(findSubject(t))
    object_list = copy.deepcopy(findObject(t))
    verb_list = copy.deepcopy(findVerb(t))
    return subject_list, verb_list, object_list
def Process(text, file):
    # note: the original parameter was named `str`, shadowing the builtin
    sr = RecursiveDescentParser(pgrammar)
    r = list(sr.parse(text.split()))
    if len(r) > 0:
        cadResult = GenerateCadFile(ParentedTree.convert(r[0]))
        cadResult.write(file)
    else:
        print("************* " + text)
def compute_delta(x, y):
    # Structured loss: count the subtree matches between the predicted tree x
    # and the ground-truth tree y, and scale the shortfall by k.
    k = 0.1
    count = 0
    ptree = ParentedTree.convert(x)
    gt_tree = y
    ptree_gt = ParentedTree.convert(gt_tree)
    for subtree in ptree.subtrees():
        len_tree = 0
        for subtree1 in ptree_gt.subtrees():
            len_tree = len_tree + 1
            if subtree == subtree1:
                count = count + 1
    delta = (len_tree - count) * k
    if delta < 0:
        delta = 0
    return delta
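# Sanity-check sketch (hypothetical trees): identical trees give delta == 0,
# and delta grows roughly by k = 0.1 per gold subtree missing from the prediction.
from nltk import Tree

gold = Tree.fromstring("(S (NP (DT the) (NN cat)) (VP (VBD sat)))")
pred = Tree.fromstring("(S (NP (DT the) (NN cat)) (VP (VBD sat)))")
print(compute_delta(pred, gold))  # 0 for an exact match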
def get_svo(self, sent):
    t = list(self.parser.raw_parse(sent))[0]
    t = ParentedTree.convert(t)
    return {
        'Subjects': self.find_subject(t),
        'Predicates': self.find_predicate(t),
        'Objects': self.find_object(t),
        'Sentence': sent
    }
def parse(text):
    parser = CoreNLPParser(CORENLP_SERVER)
    result = parser.raw_parse(text)
    trees = [tree for tree in result]
    for tree in trees:
        tree.chomsky_normal_form()
        tree.collapse_unary(collapseRoot=True, collapsePOS=True)
    trees = [ParentedTree.convert(tree) for tree in trees]
    return trees
def parse(text):
    parser = CoreNLPParser("http://localhost:9000")
    result = parser.raw_parse(text.lower())
    trees = [tree for tree in result]
    for tree in trees:
        tree.chomsky_normal_form()
        tree.collapse_unary(collapseRoot=True, collapsePOS=True)
    trees = [ParentedTree.convert(tree) for tree in trees]
    return trees
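# Hypothetical usage; assumes a Stanford CoreNLP server is already running and
# listening on localhost:9000 before parse() is called.
for tree in parse("The old table was damaged."):
    tree.pretty_print()  # a binarized, unary-collapsed ParentedTree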
def span(feats):
    """WORKS"""
    if feats.sentence != feats.sentence_ref:
        return "span={}".format(False)
    else:
        s_tree = ParentedTree.convert(
            TREES_DICTIONARY[feats.article + ".raw"][int(feats.sentence)])
        i_parent = __get_parent_tree__(feats.token, s_tree)
        j_parent = __get_parent_tree__(feats.token_ref, s_tree)
        return "span={}".format(i_parent == j_parent)
def getVectorsForTree(tree):
    treeVectors = []
    parentedTree = ParentedTree.convert(tree)
    for i, node in enumerate(parentedTree.pos()):
        features = []
        currWord = node[0]
        currPos = node[1]
        features.append(currWord)
        features.append(currPos)
        # left and right neighbours (word, POS), with sentence-boundary markers
        ln = "SOS" if i == 0 else parentedTree.pos()[i - 1]
        rn = "EOS" if i == len(parentedTree.pos()) - 1 else parentedTree.pos()[i + 1]
        lpos = "_" if ln == "SOS" else ln[1]
        rpos = "_" if rn == "EOS" else rn[1]
        lstr = ln if ln == "SOS" else ln[0]
        rstr = rn if rn == "EOS" else rn[0]
        lbigram = lstr + '_' + currWord
        rbigram = currWord + '_' + rstr
        lposbigram = lpos + '_' + currPos
        rposbigram = currPos + '_' + rpos
        features.append(lbigram)
        features.append(lpos)
        features.append(lposbigram)
        features.append(rbigram)
        features.append(rpos)
        features.append(rposbigram)
        selfcat = currPos  # always POS for single words
        features.append(selfcat)
        nodePosition = parentedTree.leaf_treeposition(i)
        parent = parentedTree[nodePosition[:-1]].parent()
        parentCategory = parent.label()
        features.append(parentCategory)
        ls = parent.left_sibling()
        lsCat = False if not ls else ls.label()
        rs = parent.right_sibling()
        rsCat = False if not rs else rs.label()
        features.append(lsCat)
        features.append(rsCat)
        rsContainsVP = False
        if rs:
            if list(rs.subtrees(filter=lambda x: x.label() == 'VP')):
                rsContainsVP = True
        # TODO: figure out how to check if rs contains a trace (given the tree/grammar)
        features.append(rsContainsVP)
        # featureList.append(rsContainsTrace)  # TODO
        rootRoute = getPathToRoot(parent, [])
        features.append('_'.join(rootRoute))
        cRoute = compressRoute([x for x in rootRoute])
        features.append('_'.join(cRoute))
        treeVectors.append(features)
    return treeVectors
def drop_none(tree):
    # Remove -NONE- (trace) nodes, climbing up through any ancestors
    # that the deletion would leave empty.
    tree = ParentedTree.convert(tree)
    for sub in reversed(list(tree.subtrees())):
        if sub.label() == '-NONE-':
            parent = sub.parent()
            while parent and len(parent) == 1:
                sub = parent
                parent = sub.parent()
            del tree[sub.treeposition()]
    return tree
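# Quick sketch of drop_none on a Penn-Treebank-style trace (hypothetical input):
# the -NONE- leaf and its now-empty NP-SBJ ancestor are both pruned.
from nltk import Tree

traced = Tree.fromstring("(S (NP-SBJ (-NONE- *)) (VP (VB Go)))")
print(drop_none(traced))
# (S (VP (VB Go)))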
def get_parent_phrase(tree, pos, labels, ct):
    for i, node in enumerate(tree.pos()):
        if i == pos:
            nodePosition = tree.leaf_treeposition(i)
            pt = ParentedTree.convert(tree)
            children = pt[nodePosition[:1]]  # note: unused in the current version
            labelnode = climb_tree(tree, nodePosition, labels)
            predictedIntArgTokens = labelnode.leaves()
            return predictedIntArgTokens
def climb_tree(tree, nodePosition, labels):
    pTree = ParentedTree.convert(tree)
    parent = pTree[nodePosition[:-1]].parent()
    # the ROOT check covers the case where the label we're looking for is absent
    if parent.label() in labels or parent.label() == 'ROOT':
        return parent
    else:
        return climb_tree(tree, nodePosition[:-1], labels)
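# Minimal sketch (hypothetical input): climb from the leaf "dog" to the
# nearest ancestor labelled NP.
from nltk.tree import ParentedTree

t = ParentedTree.fromstring("(S (NP (DT the) (NN dog)) (VP (VBD barked)))")
pos = t.leaf_treeposition(1)       # tree position of "dog"
print(climb_tree(t, pos, ['NP']))  # -> (NP (DT the) (NN dog))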
def pos_to_leaves(tree, leaves, tags):
    this_tree = ParentedTree.convert(tree=tree)
    # round-trip through a string to rewrite the leaves
    string = ' '.join(str(this_tree).split())
    for leaf_index in range(0, len(leaves)):
        leaf = leaves[leaf_index]
        tag = tags[leaf_index]
        old = "(" + tag + " " + tag + ")"
        new = "(" + tag + " " + leaf + ")"
        string = string.replace(old, new, 1)
    return Tree.fromstring(string)
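# Round-trip sketch (hypothetical skeleton tree whose leaves are POS tags, as
# produced by leaves_to_pos below): each "(TAG TAG)" is rewritten to "(TAG word)".
from nltk import Tree

skeleton = Tree.fromstring("(S (NP (DT DT) (NN NN)) (VP (VBD VBD)))")
print(pos_to_leaves(skeleton, ["the", "dog", "barked"], ["DT", "NN", "VBD"]))
# (S (NP (DT the) (NN dog)) (VP (VBD barked)))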
def compute_parent_attribs(subtree, parent, heads_dict):
    if parent is None:
        parent_word, parent_word_pos = 'ROOT', 'ROOT'
    else:
        parent_head = heads_dict[str(subtree.parent())]
        parent_head_tree = ParentedTree.convert(
            Tree.fromstring(parent_head.rstrip()))
        parent_word, parent_word_pos = parent_head_tree.pos()[0]
    return (parent_word, parent_word_pos)
def add_indices_to_terminals(ptree):
    indexed = ParentedTree.convert(ptree)
    for idx, _ in enumerate(ptree.leaves()):
        tree_location = ptree.leaf_treeposition(idx)
        non_terminal = indexed[tree_location[:-1]]
        if "_" in non_terminal[0]:
            # an existing underscore would make the appended index ambiguous
            print('NO! There are underscores in PTB!!!')
            breakpoint()
            raise Exception
        else:
            non_terminal[0] = non_terminal[0] + "_" + str(idx)
    return indexed
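# Usage sketch (hypothetical input): each terminal gets "_<leaf index>" appended.
from nltk.tree import ParentedTree

pt = ParentedTree.fromstring("(S (NP (DT the) (NN cat)) (VP (VBD sat)))")
print(add_indices_to_terminals(pt).leaves())
# ['the_0', 'cat_1', 'sat_2']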
def leaves_to_pos(self):
    for tree in self.tree_corpus:
        this_tree = ParentedTree.convert(tree=tree)
        leaves = this_tree.leaves()
        pos_tags = this_tree.pos()
        # round-trip through a string to rewrite the leaves
        string = ' '.join(str(this_tree).split())
        for leaf_index in range(0, len(leaves)):
            leaf = leaves[leaf_index]
            tag = pos_tags[leaf_index][1]
            string = string.replace(leaf, tag, 1)
        self.tree_corpus_pos_leaves.append(Tree.fromstring(string))
def update(self, syntax_tree):
    ptree = ParentedTree.convert(syntax_tree)
    bad_words = [":", ",", ".", "?", ";"]
    for leaf in get_leaves(ptree):
        word = leaf[0]
        if word in bad_words:
            continue
        if word not in self:
            self.add_node(word, num=0, pos=leaf.pos()[0][1])
        self.node[word]["num"] += 1
    central_leaf = None
    for leaf in get_leaves(ptree):
        if leaf[0] == self.target_word:
            central_leaf = leaf
            break
    if not central_leaf:
        print("Error: target word not in sentence")
    for leaf in get_leaves(ptree):
        word = leaf[0]
        if word in bad_words:
            continue
        if word == self.target_word:
            for other_leaf in get_leaves(ptree):
                other_word = other_leaf[0]
                if word == other_word:
                    continue
                if other_word in bad_words:
                    continue
                if (word, other_word) not in self.edges():
                    self.add_edge(word, other_word, weight=0)
                self.edge[word][other_word]["weight"] += \
                    1.0 / math.sqrt(get_distance(leaf, other_leaf))
        else:
            for other_leaf in get_leaves(ptree):
                other_word = other_leaf[0]
                if word == other_word:
                    continue
                if other_word == self.target_word:
                    continue
                if other_word in bad_words:
                    continue
                if (word, other_word) not in self.edges():
                    self.add_edge(word, other_word, weight=0)
                # weight by the cube root of the product of the three pairwise distances
                self.edge[word][other_word]["weight"] += 1.0 / math.pow(
                    get_distance(leaf, other_leaf) *
                    get_distance(leaf, central_leaf) *
                    get_distance(other_leaf, central_leaf), 1 / 3.0)
    self.invalidate_cache()
def same_max_NP(feats):
    """WORKS"""
    if feats.sentence != feats.sentence_ref:
        return "same_max_NP={}".format(False)
    else:
        sentence_tree = TREES_DICTIONARY[feats.article + ".raw"][int(feats.sentence)]
        ptree = ParentedTree.convert(sentence_tree)
        parent1 = __get_parent_tree__(feats.token, ptree)
        parent2 = __get_parent_tree__(feats.token_ref, ptree)
        max_p_i = __get_max_projection__(ptree, parent1)
        max_p_j = __get_max_projection__(ptree, parent2)
        if max_p_i is not None and max_p_j is not None:
            # note: the original used `.node`, the pre-NLTK-3 spelling of `.label()`
            both_NPs = max_p_i.label() == "NP" and max_p_j.label() == "NP"
        else:
            both_NPs = False
        return "same_max_NP={}".format(max_p_i == max_p_j and both_NPs)
def read_treebank_files(files, extractor, fe):
    """Read the listed treebank files and collect function tagging examples
    from each tree. The user-provided feature extractor is applied to each
    phrase in each tree. The extracted feature dicts and the true function
    tags for each phrase are stored in two separate lists, which are returned.
    """
    X = []
    Y = []
    for filename in files:
        scount = 0
        for tree in treebank.parsed_sents(filename):
            tree = ParentedTree.convert(tree)
            treebank_helper.postprocess(tree)
            find_examples_in_tree(tree, X, Y, extractor, fe, filename, scount, 0)
            scount += 1
    return X, Y
def parse_sentences(self, filename, num_sentences):
    """Parse each sentence into a tree."""
    f = open(filename, 'r')
    if num_sentences == 'all':
        num_sentences = -1
    count = 0
    for sentence in f.readlines()[:num_sentences]:
        if count % 10 == 0:
            print(count)
        trees = self.parser.raw_parse(sentence.lower())
        for tree in trees:
            self.nonterminal_counts['ROOT'] += 1
            tokenized_sentence = self.tokenize_sentence(sentence)
            if len(tokenized_sentence) > 5:
                self.extract_rules(tree)
            ptree = ParentedTree.convert(tree)
            self.get_bigram(ptree, tokenized_sentence)
        count += 1
trees.append(parser.parse("a database table stores rows"))
trees.append(parser.parse("kitchen table"))
trees.append(parser.parse("food on kitchen table"))
trees.append(parser.parse("put rows in database table"))
trees.append(parser.parse("damaged wooden kitchen table"))
trees.append(parser.parse("wooden table"))
trees.append(parser.parse("the table that had been forgotten for years became damaged"))
trees.append(parser.parse("the database table for our favorite client, Bob's bakery, needs more rows"))
trees.append(parser.parse("the database table that stores sensitive client data should be secure"))
trees.append(parser.parse("the database table that stores passwords should be secure"))
trees.append(parser.parse("Motion to table discussion until later"))
trees.append(parser.parse("Please table the motion until a later date"))
trees.append(parser.parse("We should table, if not reject, this motion"))

# convert to ParentedTrees (rebinding the loop variable alone would have no effect)
trees = [ParentedTree.convert(tree) for tree in trees]

graphs = []
for tree in trees:
    g = Graph(query)
    g.update(tree)
    graphs.append(g)

print("Merging graphs")
new_graph = merge_graphs(graphs)

print("Drawing graph (fake)")
new_graph.draw("new_graph_" + query)

print("Getting senses")
print(new_graph.get_senses())

print("Prediction is...")
print(new_graph.get_predicted_sense(
    ParentedTree.convert(parser.parse(user_sentence))))
# new_graph.print_relatedness_to_target_in_order()
def minweight_edit_distance(self, doc1, doc2):
    global numnodes
    doc1sents = self.sent_detector.tokenize(doc1.strip())
    doc2sents = self.sent_detector.tokenize(doc2.strip())
    doc1parsed = self.parser.raw_parse_sents(doc1sents)
    doc2parsed = self.parser.raw_parse_sents(doc2sents)
    costMatrix = []
    doc1parsed = list(doc1parsed)
    for i in range(len(doc1parsed)):
        doc1parsed[i] = list(doc1parsed[i])[0]
    doc2parsed = list(doc2parsed)
    for i in range(len(doc2parsed)):
        doc2parsed[i] = list(doc2parsed[i])[0]
    for i in range(len(doc1parsed)):
        numnodes = 0
        sentencedoc1 = ParentedTree.convert(doc1parsed[i])
        tempnode = Node(sentencedoc1.root().label())
        new_sentencedoc1 = self.convert_mytree(sentencedoc1, tempnode)
        temp_costMatrix = []
        sen1nodes = numnodes
        for j in range(len(doc2parsed)):
            numnodes = 0.0
            sentencedoc2 = ParentedTree.convert(doc2parsed[j])
            tempnode = Node(sentencedoc2.root().label())
            new_sentencedoc2 = self.convert_mytree(sentencedoc2, tempnode)
            ED = simple_distance(new_sentencedoc1, new_sentencedoc2)
            ED = ED / (numnodes + sen1nodes)
            temp_costMatrix.append(ED)
        costMatrix.append(temp_costMatrix)
    costMatrix = np.array(costMatrix)
    rownum = costMatrix.shape[0]
    colnum = costMatrix.shape[1]
    if rownum > colnum:
        costMatrixRandom = costMatrix[np.random.randint(rownum, size=colnum), :]
    else:
        costMatrixRandom = costMatrix[:, np.random.randint(colnum, size=rownum)]
    indexes = su.linear_assignment(costMatrix)
    total = 0
    minWeight = 0
    rowMarked = [0] * len(doc1parsed)
    colMarked = [0] * len(doc2parsed)
    for row, column in indexes:
        total += costMatrix[row][column]
        rowMarked[row] = 1
        colMarked[column] = 1
    minWeight = total
    for k in range(len(rowMarked)):
        if rowMarked[k] == 0:
            total += np.min(costMatrix[k])
    for c in range(len(colMarked)):
        if colMarked[c] == 0:
            total += np.min(costMatrix[:, c])
    maxlengraph = max(len(doc1parsed), len(doc2parsed))
    minlengraph = min(len(doc1parsed), len(doc2parsed))
    indexes = su.linear_assignment(costMatrixRandom)
    randtotal = 0
    for row, column in indexes:
        randtotal += costMatrixRandom[row][column]
    lengraph = costMatrixRandom.shape[0]
    return total / maxlengraph
def get_arginstances(self, _pbi):
    '''
    Returns a list of ARGInstances given a PropbankInstance, according to
    self.features. Each feature is normalized according to the rules in its
    if-block.

    parameters:
        _pbi  PropbankInstance  an instance containing the arguments to extract
    return value:
        list of ARGInstances
    '''
    res = []
    for arg in _pbi.arguments:  # iterate through all arguments in _pbi
        argfeatures = {}

        # predicate feature
        if 'predicate' in self.features:
            argfeatures['predicate'] = re.sub(r'(\w+)\..+', r'\1', _pbi.roleset)
            # lemmatize the predicate and then set
            # argfeatures['predicate'] = self.wnl.lemmatize(_pbi.predicate.select(_pbi.tree).leaves()[0], "v")
            # argfeatures['predicate'] = _pbi.predicate.select(_pbi.tree).leaves()[0]

        # path feature
        if 'path' in self.features:
            senTree = ParentedTree.convert(_pbi.tree)
            argTree = arg[0].select(senTree)
            predTree = _pbi.predicate.select(senTree)
            while argTree.label() == "*CHAIN*" or argTree.label() == "*SPLIT*":
                argTree = argTree[0]
            while predTree.label() == "*CHAIN*" or predTree.label() == "*SPLIT*":
                predTree = predTree[0]
            argParents = []
            predParents = []
            while predTree is not None:
                predParents.append(predTree)
                predTree = predTree.parent()
            while argTree is not None:
                argParents.append(argTree)
                argTree = argTree.parent()
            jointNode = None
            for node in argParents:
                if node in predParents:
                    jointNode = node
            stringPath = ""
            for i in range(0, argParents.index(jointNode), 1):
                node = argParents[i]
                stringPath += re.sub(r"(\w+)-.+", r"\1", node.label()) + "^"
            for i in range(predParents.index(jointNode), 0, -1):
                node = predParents[i]
                stringPath += re.sub(r"(\w+)-.+", r"\1", node.label()) + "!"
            argfeatures['path'] = stringPath[:-1]

        # phraseType feature
        if 'phraseType' in self.features:
            argTree = arg[0].select(_pbi.tree)
            # traverse the tree until a real constituent is found
            while argTree.label() == "*CHAIN*" or argTree.label() == "*SPLIT*":
                argTree = argTree[0]
            # normalize (e.g. NP-SUBJ -> NP) and set
            argfeatures['phraseType'] = re.sub(r"(\w+)[-=$\|].+", r"\1", argTree.label())

        # position feature
        if 'position' in self.features:
            predTreePointer = _pbi.predicate
            # traverse the tree while the pointer is not a real constituent
            while not type(predTreePointer) is PropbankTreePointer:
                predTreePointer = predTreePointer.pieces[0]
            pred_wordnum = predTreePointer.wordnum  # set predicate word number
            arg_wordnum = None
            if type(arg[0]) is PropbankTreePointer:
                arg_wordnum = arg[0].wordnum
            # PropbankChainTreePointer and PropbankSplitTreePointer don't have
            # wordnums and must be traversed
            elif (type(arg[0]) is PropbankChainTreePointer) or (type(arg[0]) is PropbankSplitTreePointer):
                arg_pieces = arg[0].pieces
                # traverse the tree (always taking the left-most subtree)
                # until a PropbankTreePointer is found
                while type(arg_pieces[0]) is not PropbankTreePointer:
                    arg_pieces = arg_pieces[0].pieces
                # then get the wordnum
                arg_wordnum = arg_pieces[0].wordnum
            # compare word numbers and normalize to 'before' or 'after'
            if arg_wordnum < pred_wordnum:
                argfeatures['position'] = 'before'
            else:
                argfeatures['position'] = 'after'

        # voice feature
        if 'voice' in self.features:
            # extract voice from the PropbankInstance inflection and
            # normalize to 'active', 'passive' or 'NONE'
            if _pbi.inflection.voice == 'a':
                argfeatures['voice'] = 'active'
            elif _pbi.inflection.voice == 'p':
                argfeatures['voice'] = 'passive'
            else:
                argfeatures['voice'] = 'NONE'

        # class feature
        if 'class' in self.features:
            argfeatures['class'] = arg[1].split("-")[0]
            # argfeatures['class'] = re.sub(r'(ARG[0-5])\-\w+', r'\1', arg[1])

        res.append(ARGInstance(argfeatures))  # append the initialized ARGInstance to the result
    return res
for j, job in enumerate(jobs):
    trees.extend(comm.recv(source=j + 1, tag=11))

pickle.dump(trees, open("trees_cache", "w"), pickle.HIGHEST_PROTOCOL)

relevant_trees = []
for tree in trees:
    if query in tree.leaves():
        relevant_trees.append(tree)

print("Found", len(relevant_trees), "relevant sentences")

# convert to ParentedTrees (rebinding the loop variable alone would have no effect)
relevant_trees = [ParentedTree.convert(tree) for tree in relevant_trees]

graphs = []
for tree in relevant_trees:
    g = Graph(query)
    g.update(tree)
    graphs.append(g)

print("Merging graphs")
num_merges = num_ranks
while len(graphs) > 1:
    jobs = [graphs[i::num_merges] for i in range(num_merges)]
    for j, job in enumerate(jobs):
        comm.send(job, dest=j + 1, tag=11)
# Using the Stanford Parser to get NLTK parse trees
# (this branch runs when no cached trees.pkl exists; the opening `if` precedes this excerpt)
    dataset = open("dataset.txt", "r").read()
    sentences = parser.raw_parse_sents(sent_tokenize(dataset))
    for line in sentences:
        for sentence in line:
            trees.append(sentence)
    pickle.dump(trees, open("trees.pkl", "wb"))
else:
    trees = pickle.load(open("trees.pkl", "rb"))

# Converting NLTK trees into parented trees for easy upward traversal
global ptrees
ptrees = []
for tree in trees:
    ptrees.append(ParentedTree.convert(tree))


# Function that performs the Hobbs algorithm
def hobbs(ptree, cur_sent):
    '''Hobbs' naive algorithm for anaphora resolution'''
    resolutions = {}
    highest_S = None
    for root in ptree.subtrees():
        if root.label() == 'S':
            highest_S = root
            break
    pronouns = []
def syntax_similarity_two_lists(self, documents1, documents2, average=False):
    # Syntax similarity of two lists of documents.
    global numnodes
    documents1parsed = []
    documents2parsed = []
    for d1 in range(len(documents1)):
        tempsents = self.sent_detector.tokenize(documents1[d1].strip())
        for s in tempsents:
            if len(s.split()) > 100:
                documents1parsed.append("NA")
                break
        else:
            temp = list(self.parser.raw_parse_sents(tempsents))
            for i in range(len(temp)):
                temp[i] = list(temp[i])[0]
                temp[i] = ParentedTree.convert(temp[i])
            documents1parsed.append(list(temp))
    for d2 in range(len(documents2)):
        tempsents = self.sent_detector.tokenize(documents2[d2].strip())
        for s in tempsents:
            if len(s.split()) > 100:
                documents2parsed.append("NA")
                break
        else:
            temp = list(self.parser.raw_parse_sents(tempsents))
            for i in range(len(temp)):
                temp[i] = list(temp[i])[0]
                temp[i] = ParentedTree.convert(temp[i])
            documents2parsed.append(list(temp))
    results = {}
    for d1 in range(len(documents1parsed)):
        for d2 in range(len(documents2parsed)):
            if documents1parsed[d1] == "NA" or documents2parsed[d2] == "NA":
                continue
            costMatrix = []
            for i in range(len(documents1parsed[d1])):
                numnodes = 0
                tempnode = Node(documents1parsed[d1][i].root().label())
                new_sentencedoc1 = self.convert_mytree(documents1parsed[d1][i], tempnode)
                temp_costMatrix = []
                sen1nodes = numnodes
                for j in range(len(documents2parsed[d2])):
                    numnodes = 0.0
                    tempnode = Node(documents2parsed[d2][j].root().label())
                    new_sentencedoc2 = self.convert_mytree(documents2parsed[d2][j], tempnode)
                    ED = simple_distance(new_sentencedoc1, new_sentencedoc2)
                    ED = ED / (numnodes + sen1nodes)
                    temp_costMatrix.append(ED)
                costMatrix.append(temp_costMatrix)
            costMatrix = np.array(costMatrix)
            if average:
                return 1 - np.mean(costMatrix)
            else:
                indexes = su.linear_assignment(costMatrix)
                total = 0
                rowMarked = [0] * len(documents1parsed[d1])
                colMarked = [0] * len(documents2parsed[d2])
                for row, column in indexes:
                    total += costMatrix[row][column]
                    rowMarked[row] = 1
                    colMarked[column] = 1
                for k in range(len(rowMarked)):
                    if rowMarked[k] == 0:
                        total += np.min(costMatrix[k])
                for c in range(len(colMarked)):
                    if colMarked[c] == 0:
                        total += np.min(costMatrix[:, c])
                maxlengraph = max(len(documents1parsed[d1]), len(documents2parsed[d2]))
                results[(d1, d2)] = 1 - total / maxlengraph
    return results