Example #1
def assign_slots(tokens, tag_tree, word_tree):
    stopword_list = stopwords.words('english')
    tokens_with_slot_tags = []
    word_tree = ParentedTree.convert(word_tree)
    tag_tree = ParentedTree.convert(tag_tree)
    word_tree_with_cats = tag_words_with_categories(word_tree)
    tag_tree_with_cats = tag_words_with_categories(tag_tree)
    for i, word in enumerate(tokens):
        tag = finalize_tags(i, word, tag_tree_with_cats, word_tree_with_cats) 
        tokens_with_slot_tags.append((word, tag))
    found_query_focus = False
    for i, item in enumerate(tokens_with_slot_tags):
        word, tag = item
        if tag in ['USER','MEDIA','NETWORK'] and not found_query_focus:
            tokens_with_slot_tags[i] = (word, 'SEARCH')
            found_query_focus = True
        elif tag == UNK:
            tokens_with_slot_tags[i] = (word, 'KEYWORD')
    slots = {}
    for word, tag in tokens_with_slot_tags:
        if tag == 'SKIP':
            continue
        elif tag == 'KEYWORD':
            if 'KEYWORDS' not in slots:
                slots['KEYWORDS'] = []
            if word not in stopword_list and word not in PUNCTUATION:
                slots['KEYWORDS'].append(word)
        else:
            if tag not in slots:
                slots[tag] = word
            else:
                previous_words = slots[tag]
                slots[tag] = ' '.join([previous_words, word])
    return slots
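Common to all of these snippets: ParentedTree.convert makes a copy of a plain nltk Tree that supports upward navigation. A minimal self-contained illustration:

from nltk.tree import ParentedTree, Tree

t = Tree.fromstring("(S (NP (DT the) (NN dog)) (VP (VBD barked)))")
pt = ParentedTree.convert(t)   # structurally identical copy with parent pointers
np = pt[0]                     # (NP (DT the) (NN dog))
print(np.parent().label())     # S
print(np.right_sibling())      # (VP (VBD barked))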
Example #2
def convert_sentence(input_sent):
    # Parse the sentence using the Stanford CoreNLP parser
    pos_type = pos_tagger.tag(input_sent.split())
    # constituency parse: convert the best tree and step into its single ROOT child
    parse_tree = ParentedTree.convert(
        list(pos_tagger.parse(input_sent.split()))[0])[0]
    # dependency parse: take the single DependencyGraph from the iterator
    dep_type = next(dep_parser.parse(input_sent.split()))
    return pos_type, parse_tree, dep_type
Example #3
    def get_predicate(self, sub_tree):
        """
        Return the predicate (verb) leaves of the subtree; also collect
        NP/PP/ADJP/ADVP phrases from the root tree for later object extraction.
        """

        sub_nodes = [each for each in sub_tree.subtrees() if each.pos()]
        predicate = None
        pred_verb_phrase_siblings = []
        sub_tree = ParentedTree.convert(sub_tree)
        for each in sub_nodes:
            if each.label() in self.verb_types:
                sub_tree = each
                predicate = each.leaves()

        #get all predicate_verb_phrase_siblings to be able to get the object
        sub_tree = ParentedTree.convert(sub_tree)
        if predicate:
            pred_verb_phrase_siblings = self.tree_root.subtrees()
            pred_verb_phrase_siblings = [
                each for each in pred_verb_phrase_siblings
                if each.label() in ["NP", "PP", "ADJP", "ADVP"]
            ]
            self.pred_verb_phrase_siblings = pred_verb_phrase_siblings

        return {'predicate': predicate}
Example #4
 def syntax_similarity_two_documents(self, doc1, doc2, average=False): #syntax similarity of two single documents
     global numnodes
     doc1sents = self.sent_detector.tokenize(doc1.strip())
     doc2sents = self.sent_detector.tokenize(doc2.strip())
     for s in doc1sents: # to handle unusually long sentences
         if len(s.split())>100:
             return "NA"
     for s in doc2sents:
         if len(s.split())>100:
             return "NA"
     try: # to handle parse errors, which might happen when there is an unusually long word in the sentence
         doc1parsed = self.parser.raw_parse_sents(doc1sents)
         doc2parsed = self.parser.raw_parse_sents(doc2sents)
     except Exception as e:
         sys.stderr.write(str(e))
         return "NA"
     costMatrix = []
     doc1parsed = list(doc1parsed)
     for i in range(len(doc1parsed)):
         doc1parsed[i] = list(doc1parsed[i])[0]
     doc2parsed = list(doc2parsed)
     for i in range(len(doc2parsed)):
         doc2parsed[i] = list(doc2parsed[i])[0]
     for i in range(len(doc1parsed)):
         numnodes = 0
         sentencedoc1 = ParentedTree.convert(doc1parsed[i])
         tempnode = Node(sentencedoc1.root().label())
         new_sentencedoc1 = self.convert_mytree(sentencedoc1,tempnode)
         temp_costMatrix = []
         sen1nodes = numnodes
         for j in range(len(doc2parsed)):
             numnodes=0.0
             sentencedoc2 = ParentedTree.convert(doc2parsed[j])
             tempnode = Node(sentencedoc2.root().label())
             new_sentencedoc2 = self.convert_mytree(sentencedoc2,tempnode)
             ED = simple_distance(new_sentencedoc1, new_sentencedoc2)
             ED = ED / (numnodes + sen1nodes)
             temp_costMatrix.append(ED)
         costMatrix.append(temp_costMatrix)
     costMatrix = np.array(costMatrix)
     if average==True:
         return 1-np.mean(costMatrix)
     else:
         indexes = su.linear_assignment(costMatrix)
         total = 0
         rowMarked = [0] * len(doc1parsed)
         colMarked = [0] * len(doc2parsed)
         for row, column in indexes:
             total += costMatrix[row][column]
             rowMarked[row] = 1
             colMarked [column] = 1
         for k in range(len(rowMarked)):
             if rowMarked[k]==0:
                 total+= np.min(costMatrix[k])
         for c in range(len(colMarked)):
             if colMarked[c]==0:
                 total+= np.min(costMatrix[:,c])
         maxlengraph = max(len(doc1parsed),len(doc2parsed))
         return 1-(total/maxlengraph)
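The edit distance above comes from the zss (Zhang-Shasha) package, whose public API is Node and simple_distance; convert_mytree (not shown here) apparently rebuilds each NLTK tree out of zss Nodes while counting nodes in the global numnodes. A minimal sketch of the distance call alone, assuming zss is installed:

from zss import Node, simple_distance

# (S (NP I) (VP saw)) vs. (S (NP I) (VP ran))
a = Node("S", [Node("NP", [Node("I")]), Node("VP", [Node("saw")])])
b = Node("S", [Node("NP", [Node("I")]), Node("VP", [Node("ran")])])
print(simple_distance(a, b))  # one relabel ("saw" -> "ran"), so the distance is 1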
Example #5
def getSyntaxFeatures(ct):

    fs = ' '.join([x for x in ct.fullSentence if not re.match(r'^\s+$', x)])
    fs = re.sub(r'\)', ']', re.sub(r'\(', '[', fs))

    #print('debugging fs:', fs)
    if len(fs.split()) > 100:
        return None, None, None, None, None, None  # let's not even try
    pt = None
    if fs in pm:
        pt = ParentedTree.convert(pm[fs])
    else:
        try:
            #sys.stderr.write('Reparsing...\n')
            tree = next(lexParser.parse(fs.split()))  # take the best-scoring parse
            pt = ParentedTree.convert(tree)
            pm[fs] = pt
        except Exception:  # probably a memory issue with the parser
            sys.stderr.write('Skipped during parsing...\n')
            return None, None, None, None, None, None

    #print('fs:', fs)
    #print('pt:', pt)
    #print('len pt pos:', len(pt.pos()))
    #print('token:', ct.token)
    #print('ctstid:', ct.stid)
    try:
        node = pt.pos()[ct.stid - 1]
        nodePosition = pt.leaf_treeposition(ct.stid - 1)
        parent = pt[nodePosition[:-1]].parent()
        parentCategory = parent.label()

        ls = parent.left_sibling()
        lsCat = False if not ls else ls.label()
        rs = parent.right_sibling()
        rsCat = False if not rs else rs.label()
        rsContainsVP = False
        if rs:
            if list(rs.subtrees(filter=lambda x: x.label() == 'VP')):
                rsContainsVP = True
        rootRoute = utils.getPathToRoot(parent, [])
        cRoute = utils.compressRoute([x for x in rootRoute])
        return parentCategory, lsCat, rsCat, rsContainsVP, rootRoute, cRoute
    except IndexError:
        sys.stderr.write('Skipping due to indexerror...\n')
        return None, None, None, None, None, None
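The leaf-position bookkeeping used in the try block, in isolation (NLTK only):

from nltk.tree import ParentedTree

pt = ParentedTree.fromstring("(S (NP (DT the) (NN cat)) (VP (VBD sat)))")
leaf_pos = pt.leaf_treeposition(1)   # position of the second token, "cat"
preterminal = pt[leaf_pos[:-1]]      # (NN cat)
parent = preterminal.parent()        # (NP (DT the) (NN cat))
print(parent.label(), parent.left_sibling(), parent.right_sibling())
# NP None (VP (VBD sat))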
Example #6
    def add_tree(self, datum):
        # parse tree and binarize
        tree = Tree.fromstring(datum["raw_tree"])
        tree.chomsky_normal_form()
        tree.collapse_unary(collapsePOS=True)
        tree = ParentedTree.convert(tree)

        # assign indices to subtrees
        indices = {}
        counter = 0
        for t in tree.subtrees():
            indices[t.treeposition()] = counter
            counter += 1

        # generate parent pointers and labels
        # (labels = one instance of sent in sents by treelstm terminology)
        parents = [0] * (counter - 1)
        labels = []
        counter = 0
        for t in tree.subtrees():
            parent = t.parent()
            if parent is not None:
                parents[counter] = indices[parent.treeposition()]
                counter += 1
            if isinstance(t[0], str):  # preterminal node: record its token
                labels.append(t[0])

        self.parents_file.write(" ".join(map(str, parents)) + "\n")
        self.sents_file.write(" ".join(labels) + "\n")
        self.trees.append(datum)
        return len(self.trees) - 1 # ID
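The indexing idiom from add_tree in isolation: treeposition() identifies each subtree uniquely, so it can key a dict of parent pointers:

from nltk.tree import ParentedTree

pt = ParentedTree.fromstring("(S (NP (PRP I)) (VP (VBD ran)))")
indices = {t.treeposition(): i for i, t in enumerate(pt.subtrees())}
for t in pt.subtrees():
    p = t.parent()
    print(t.label(), "->", "ROOT" if p is None else indices[p.treeposition()])
# S -> ROOT
# NP -> 0
# PRP -> 1
# VP -> 0
# VBD -> 3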
Example #7
def convert_eng_to_isl(input_string):
    # get all required packages
    download_required_packages()

    if len(input_string.split(' ')) == 1:
        return input_string.split(' ')

    # Initializing stanford parser
    parser = StanfordParser()

    # Generate all possible parse trees for the sentence, sorted by probability
    possible_parse_tree_list = list(parser.parse(input_string.split()))

    # Get most probable parse tree
    parse_tree = possible_parse_tree_list[0]
    print(parse_tree)
    # output = '(ROOT
    #               (S
    #                   (PP (IN As) (NP (DT an) (NN accountant)))
    #                   (NP (PRP I))
    #                   (VP (VBP want) (S (VP (TO to) (VP (VB make) (NP (DT a) (NN payment))))))
    #                )
    #             )'

    # Convert into tree data structure
    parent_tree = ParentedTree.convert(parse_tree)

    modified_parse_tree = modify_tree_structure(parent_tree)

    parsed_sent = modified_parse_tree.leaves()
    return parsed_sent
Example #8
def parse_prereq_text(text):
  sentences = split_normalize_prereq_text(text)

  requirements = {
    'relationship': 'and',
    'children': []
  }
  starting_subject = None
  invalidated_courses = list()

  for sentence in sentences:
    sentence_trees = parser.raw_parse(sentence)
    tree = ParentedTree.convert(next(sentence_trees))

    (sentence_requirements,
    starting_subject,
    sentence_invalidated_courses) = parse_requirements(tree, starting_subject)
    if sentence_invalidated_courses == [-1]:
      # Sentence invalidates previous course
      remove_last_course(requirements)
    else:
      invalidated_courses += sentence_invalidated_courses

    if sentence_requirements:
      requirements['children'].append(sentence_requirements)

  remove_invalidated_courses(requirements, invalidated_courses)
  requirements = flatten_relationships(requirements)
  if requirements:
    normalize_relationship(requirements)

  return requirements
Example #9
def merge_tree_nnps(tree):
    """
    Takes a parse tree and merges any consecutive leaf nodes that come from NNPs
    For example if there is a segment of:
        (NP
            (JJ old)
            (NNP Pierre)
            (NNP Vinken)
        )
    Returns:
        (NP
            (JJ old)
            (NNP PierreVinken)
        )
    """

    # require a ParentedTree to get a subtree's tree position
    p = ParentedTree.convert(tree)

    # iterate subtrees of height 3: this is where NPs dominating NNPs dominating lexicalizations will be
    for s in p.subtrees(filter=lambda s: s.height() == 3):
        # merge NNPs in the list representation of this tree's children: [(POS, word), ...]
        new_noun_phrase = merge_tagged_nnps([(c.label(), c[0]) for c in s])
        child_str = " ".join("(%s %s)" % (pos, word) for pos, word in new_noun_phrase)
        # create new subtree with merged NNP's
        new_s = ParentedTree.fromstring("(%s %s)" % (s.label(), child_str))

        # replace old subtree with new subtree
        p[s.treeposition()] = new_s
    return Tree.convert(p)
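A usage sketch for merge_tree_nnps. merge_tagged_nnps is not shown in the source, so the stand-in below, which joins runs of consecutive NNP tokens, is an assumption:

from nltk.tree import Tree

def merge_tagged_nnps(tagged):
    # hypothetical stand-in: concatenate adjacent (NNP, word) pairs
    merged = []
    for pos, word in tagged:
        if pos == "NNP" and merged and merged[-1][0] == "NNP":
            merged[-1] = ("NNP", merged[-1][1] + word)
        else:
            merged.append((pos, word))
    return merged

t = Tree.fromstring("(S (NP (JJ old) (NNP Pierre) (NNP Vinken)) (VP (VBD died)))")
print(merge_tree_nnps(t))
# (S (NP (JJ old) (NNP PierreVinken)) (VP (VBD died)))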
Example #10
 def syntax_similarity_conversation(self, documents1, average=False): # syntax similarity of each document with the document that follows it
     global numnodes
     documents1parsed = []
     for d1 in range(len(documents1)):
         sys.stderr.write(str(d1)+"\n")
         # print documents1[d1]
         tempsents = (self.sent_detector.tokenize(documents1[d1].strip()))
         for s in tempsents:
             if len(s.split())>100:
                 documents1parsed.append("NA")
                 break
         else:
             temp = list(self.parser.raw_parse_sents(tempsents))
             for i in range(len(temp)):
                 temp[i] = list(temp[i])[0]
                 temp[i] = ParentedTree.convert(temp[i])
             documents1parsed.append(list(temp))
     results = OrderedDict()
     for d1 in range(len(documents1parsed)):
         d2 = d1+1
         if d2 == len(documents1parsed):
             break
         if documents1parsed[d1] == "NA" or documents1parsed[d2]=="NA":
             continue
         costMatrix = []
         for i in range(len(documents1parsed[d1])):
             numnodes = 0
             tempnode = Node(documents1parsed[d1][i].root().label())
             new_sentencedoc1 = self.convert_mytree(documents1parsed[d1][i],tempnode)
             temp_costMatrix = []
             sen1nodes = numnodes
             for j in range(len(documents1parsed[d2])):
                 numnodes=0.0
                 tempnode = Node(documents1parsed[d2][j].root().label())
                 new_sentencedoc2 = self.convert_mytree(documents1parsed[d2][j],tempnode)
                 ED = simple_distance(new_sentencedoc1, new_sentencedoc2)
                 ED = ED / (numnodes + sen1nodes)
                 temp_costMatrix.append(ED)
             costMatrix.append(temp_costMatrix)
         costMatrix = np.array(costMatrix)
         if average==True:
             return 1-np.mean(costMatrix)
         else:
             indexes = su.linear_assignment(costMatrix)
             total = 0
             rowMarked = [0] * len(documents1parsed[d1])
             colMarked = [0] * len(documents1parsed[d2])
             for row, column in indexes:
                 total += costMatrix[row][column]
                 rowMarked[row] = 1
                 colMarked [column] = 1
             for k in range(len(rowMarked)):
                 if rowMarked[k]==0:
                     total+= np.min(costMatrix[k])
             for c in range(len(colMarked)):
                 if colMarked[c]==0:
                     total+= np.min(costMatrix[:,c])
             maxlengraph = max(len(documents1parsed[d1]),len(documents1parsed[d2]))
             results[(d1,d2)] = 1-total/maxlengraph#, minWeight/minlengraph, randtotal/lengraph
     return results
Example #11
    def _add_conj(self, tmp):
        result = []
        if isinstance(tmp, tuple):
            flag = tmp[0].split(' ')
            if len(flag) <= 5:
                for k in flag:
                    if k in self._dic.keys():
                        # patch the conj back in
                        for j in self._dic[k]:
                            if j[0] == 'attr':
                                tree = list(self._parser.raw_parse(tmp[0]+' is '+j[1]))[0]
                                tree = ParentedTree.convert(tree)
                                kid = SVONode(('appos', tree), self._data)
                                self._data.child.append(kid)
                                self._queue.append(kid)
                                self._dic[k].remove(j)
#                                 a = tmp[0]
#                                 b = tmp[1]
#                                 result.append((a, b+[j[1]]))
                            else:
                                result.append((j[1], j[2]))

        if isinstance(tmp, tuple) and tmp[0] not in [x[0] for x in result]:
            result.append(tmp)
        result.reverse()
        return result
Example #12
def path_enclosed_tree_augmented(fr):
    if fr.i_sentence != fr.j_sentence:
        return ParentedTree("None", ["*"])  #just in case
    else:
        s_tree = ParentedTree.convert(AUGMENTED_TREES[fr.article][int(
            fr.i_sentence)])
        return _generate_enclosed_tree(fr, s_tree)
Example #13
 def parse_sentences(self, filename, num_sentences):
     """Parses each one-line sentence into a syntax tree"""
     # Open the file and parse a given number of sentences
     f = open(filename, 'r')
      if num_sentences == 'all':
          num_sentences = None  # slice [:None] keeps every sentence
     count = 0
     for sentence in f.readlines()[:num_sentences]:
         if count%10==0:
             print("Number of sentences trained: ",count)
         # Get possible parse trees
         trees = self.parser.raw_parse(sentence.lower())
         for tree in trees:
             self.nonterminal_counts['ROOT'] += 1
             tokenized_sentence = self.tokenize_sentence(sentence)
             # Only extract rules from sentences with greater than 8 tokens,
             # to avoid adding rules that generate short, ungrammatical sentences
             if len(tokenized_sentence) > 8:
                 self.extract_rules(tree)
             # Convert the tree into a ParentedTree, 
             # which is an NLTK tree that keeps pointers to each node's parent
             ptree = ParentedTree.convert(tree)
             # Calculate the bigram counts for this sentence
             self.get_bigram(ptree, tokenized_sentence)
         count+=1
Example #14
def check(sent):

    parser = StanfordParser()

    # Parse the example sentence

    print(sent)
    t = list(parser.raw_parse(sent))[0]
    print(t)
    t = ParentedTree.convert(t)
    print(t)
    t.pretty_print()
    try:
        subj = find_subject(t)
    except Exception:
        subj = []
    try:
        pred = find_predicate(t)
    except Exception:
        pred = []
    try:
        obj = find_object(t)
    except Exception:
        obj = []

    print(subj)
    print(pred)
    print(obj)
    return subj, pred, obj
Example #15
 def traverse(self, t):
     t = ParentedTree.convert(t)
     result = []
     cd = ''
     nns = ''
     print "subtree = "
     print t
     for child in t:
         print "child = "
         print child
         if str(child.label()) == 'CD':
             cd = child.leaves()
         if str(child.label()) == 'NNS':
             nns = child.leaves()
         if cd != '' and nns != '':
             print "found pair:"
             pair = {}
             pair['cd'] = cd
             pair['nns'] = nns
             # stick things in a dictionary
             print(pair)
             result.append(pair)
         if child.height() > 2:
             #append the returned dictionary to this dictionary
             result.extend(self.traverse(child))
     return result
Example #16
def apposition(feats): #this was driving me MAD....I SHOULD CORRECT THE STYLE...aarrrrggghhshs
    """WORKS WITH THE EXAMPLES IN UNITTEST, HOPE THEY WERE NOT A COINDIDENCE"""
    if feats.sentence!=feats.sentence_ref:
        return "apposition={}".format(False)
    else:
        sentence_tree = TREES_DICTIONARY[feats.article+".raw"][int(feats.sentence_ref)]
        ptree = ParentedTree.convert(sentence_tree)
        token_ref = set(feats.token_ref.split("_"))
        token = set(feats.token.split("_"))
        def is_j_apposition(curr_tree):
                found = False
                for child in curr_tree:
                    if found:
                        break
                    elif isinstance(child, ParentedTree):
                        child_leaves = set(child.leaves())
                        conditions = len(token_ref.intersection(child_leaves))>0 and curr_tree.label() == "NP"
                        if conditions:
                            brother = child.left_sibling()
                            if isinstance(brother, ParentedTree) and brother.label() == ",":
                                antecedent = brother.left_sibling()
                                if isinstance(antecedent,ParentedTree):
                                    previous_words = set(antecedent.leaves())
                                    if len(token.intersection(previous_words))>0:
                                        found = True
                        else:
                            found = is_j_apposition(child)

                return found
        return "apposition={}".format(is_j_apposition(ptree))
Example #17
 def get_triples(self, sentence):
     t = list(self.parser.raw_parse(sentence))[0]
     t = ParentedTree.convert(t)
     s = self.find_subject(t)
     p = self.find_predicate(t)
     o = self.find_object(t)
     return (s, p, o)
Example #18
def get_pp_old(text):
    # Return a dict keyed by the prepositions that head PPs in the
    # text. If a PP is preceded by a VP/ADJP, the key includes the
    # verb/adj as well; if it is preceded by an NP, the noun is not
    # included.
    phrases = {}

    for structure in parser.parse(nltk.word_tokenize(text)):
        tree = ParentedTree.convert(structure)
        for subtree in tree.subtrees():
            if subtree.label() == "PP":
                preposition = subtree.leaves()[0]
                left_sibling = subtree.left_sibling()

                if left_sibling is not None:
                    left_sibling_label = left_sibling.label()
                    if is_noun(left_sibling_label):
                        phrases[preposition] = True
                    elif is_verb(left_sibling_label):
                        verb = convert_to_base_form(
                            " ".join(left_sibling.leaves()), 'v')
                        word = verb + " " + preposition
                        phrases[word] = True
                    elif is_adj(left_sibling_label):
                        adj = convert_to_base_form(
                            " ".join(left_sibling.leaves()), 'a')
                        word = adj + " " + preposition
                        phrases[word] = True

    return phrases
Example #19
def get_right_sibling(tree, pos, ct):

    for i, node in enumerate(tree.pos()):
        if i == pos:
            nodepos = tree.leaf_treeposition(i)
            pt = ParentedTree.convert(tree)
            rs = pt[nodepos[:-1]].right_sibling()
            if rs:
                # the connective links one or two S-es; take the right sibling S as the internal argument
                if rs.label() == 'S':
                    return rs.leaves()
                else:
                    parent = pt[nodepos[:-1]].parent()
                    # assuming that there are no duplicates of the connective anymore at this level of detail:
                    leaves = parent.leaves()
                    connindex = leaves.index(ct.token)
                    remainder = [
                        xj for xi, xj in enumerate(leaves) if xi >= connindex
                    ]
                    return remainder
            else:  # it's on the same level with its arg, which is not an S-clause
                parent = pt[nodepos[:-1]].parent()
                right_sibling = parent.right_sibling()
                leaves = parent.leaves()
                # the connective may be at the end of the clause, in which case the right sibling should probably also be included
                leaves = leaves + right_sibling.leaves()
                connindex = leaves.index(ct.token)
                remainder = [
                    xj for xi, xj in enumerate(leaves) if xi >= connindex
                ]
                return remainder
Example #20
def j_is_subject(feats):
    "WORKS"
    sentence_tree = TREES_DICTIONARY[feats.article+".raw"][int(feats.sentence_ref)]
    ptree = ParentedTree.convert(sentence_tree)
    parent = __get_parent_tree__(feats.token_ref, ptree)
    j_subject = __is_subject__(ptree,feats.token_ref, parent,ptree)
    return "j_is_subject={}".format(j_subject)
Example #21
def is_pred_nominal(feats):
    """WORKS"""
    if feats.sentence != feats.sentence_ref:
        return "is_pred_nominal={}".format(False)
    else:
        s_tree = ParentedTree.convert(TREES_DICTIONARY[feats.article+".raw"][int(feats.sentence)])
        NP_i = __get_parent_tree__(feats.token, s_tree)
        NP_j = __get_parent_tree__(feats.token_ref,s_tree)
        nominal= __get_max_projection__(s_tree,NP_j)
        copula_verbs = ["is","are","were","was","am"]
        def check_nominal_construction(tree):
            found = False
            for t in tree:
                if found:
                    break
                elif isinstance(t, ParentedTree):
                    if t == NP_i:
                        brother = t.right_sibling()
                        if isinstance(brother,ParentedTree) and brother.label() == "VP":
                            verb = brother.leaves()[0]
                            if verb in copula_verbs:
                                for subtree in brother:
                                    if subtree == nominal:
                                        found = True
                                        break
                    else:
                        found = check_nominal_construction(t)
            return found

        return "is_pred_nominal={}".format(check_nominal_construction(s_tree))
Example #22
def is_pred_nominal(feats):
    """WORKS"""
    if feats.sentence != feats.sentence_ref:
        return "is_pred_nominal={}".format(False)
    else:
        s_tree = ParentedTree.convert(
            TREES_DICTIONARY[feats.article + ".raw"][int(feats.sentence)])
        NP_i = __get_parent_tree__(feats.token, s_tree)
        NP_j = __get_parent_tree__(feats.token_ref, s_tree)
        nominal = __get_max_projection__(s_tree, NP_j)
        copula_verbs = ["is", "are", "were", "was", "am"]

        def check_nominal_construction(tree):
            found = False
            for t in tree:
                if found:
                    break
                elif isinstance(t, ParentedTree):
                    if t == NP_i:
                        brother = t.right_sibling()
                        if isinstance(brother,
                                      ParentedTree) and brother.label() == "VP":
                            verb = brother.leaves()[0]
                            if verb in copula_verbs:
                                for subtree in brother:
                                    if subtree == nominal:
                                        found = True
                                        break
                    else:
                        found = check_nominal_construction(t)
            return found

        return "is_pred_nominal={}".format(check_nominal_construction(s_tree))
Example #23
 def __init__(self, parse_tree):
     self.parse_tree = ParentedTree.convert(tree=parse_tree)
     self.left_bracket_indices = []
     self.right_bracket_indices = []
     self.dominant_nodes = ["S", "S+SBAR", "SBAR+S"]
     self.nonrestrictive_heads = ["because", "since", "after"]
     self.set_indices()
     self.result = self.parse_tree.leaves()
Example #24
def initialRun(text):
    t = list(parser.raw_parse(text))[0]
    t = ParentedTree.convert(t)
    subject_list = copy.deepcopy(findSubject(t))
    object_list = copy.deepcopy(findObject(t))
    verb_list = copy.deepcopy(findVerb(t))

    return subject_list, verb_list, object_list
Example #25
def j_is_subject(feats):
    "WORKS"
    sentence_tree = TREES_DICTIONARY[feats.article + ".raw"][int(
        feats.sentence_ref)]
    ptree = ParentedTree.convert(sentence_tree)
    parent = __get_parent_tree__(feats.token_ref, ptree)
    j_subject = __is_subject__(ptree, feats.token_ref, parent, ptree)
    return "j_is_subject={}".format(j_subject)
Example #26
def Process(text, outfile):
    # renamed parameters: the original shadowed the built-ins str and file
    sr = RecursiveDescentParser(pgrammar)
    r = list(sr.parse(text.split()))
    if len(r) > 0:
        cadResult = GenerateCadFile(ParentedTree.convert(r[0]))
        cadResult.write(outfile)
    else:
        print("************* " + text)
Example #27
def compute_delta(x, y):
    k = 0.1
    count = 0
    ptree = ParentedTree.convert(x)
    ptree_gt = ParentedTree.convert(y)

    len_tree = 0
    for subtree in ptree.subtrees():
        len_tree = 0  # ends up holding the subtree count of the gold tree
        for subtree1 in ptree_gt.subtrees():
            len_tree = len_tree + 1
            if subtree == subtree1:
                count = count + 1

    delta = (len_tree - count) * k
    if delta < 0:
        delta = 0
    return delta
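For intuition, compute_delta on two toy trees (leaves are plain strings, so only labeled nodes count as subtrees):

from nltk.tree import Tree

a = Tree.fromstring("(S (NP I) (VP ran))")
b = Tree.fromstring("(S (NP I) (VP fell))")
# b has 3 subtrees (S, NP, VP); only (NP I) also occurs in a,
# so delta = (3 - 1) * 0.1 = 0.2
print(compute_delta(a, b))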
Example #28
 def get_svo(self, sent):
     t = list(self.parser.raw_parse(sent))[0]
     t = ParentedTree.convert(t)
     return {
         'Subjects': self.find_subject(t),
         'Predicates': self.find_predicate(t),
         'Objects': self.find_object(t),
         'Sentence': sent
     }
Example #29
File: tree.py Project: tTeha/MRMARS
def parse(text):
    parser = CoreNLPParser(CORENLP_SERVER)
    result = parser.raw_parse(text)
    trees = [tree for tree in result]
    for tree in trees:
        tree.chomsky_normal_form()
        tree.collapse_unary(collapseRoot=True, collapsePOS=True)
    trees = [ParentedTree.convert(tree) for tree in trees]
    return trees
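What the normalization above does to a toy tree (collapse_unary with collapseRoot=True merges the unary ROOT-S chain into one ROOT+S node):

from nltk.tree import ParentedTree, Tree

t = Tree.fromstring("(ROOT (S (NP (DT the) (NN cat)) (VP (VBD sat))))")
t.chomsky_normal_form()                                 # binarize n-ary nodes
t.collapse_unary(collapseRoot=True, collapsePOS=True)   # fold unary chains
pt = ParentedTree.convert(t)
print(pt.label())  # ROOT+S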
Example #30
def parse(text):
    parser = CoreNLPParser("http://localhost:9000")
    result = parser.raw_parse(text.lower())
    trees = [tree for tree in result]
    for tree in trees:
        tree.chomsky_normal_form()
        tree.collapse_unary(collapseRoot=True, collapsePOS=True)
    trees = [ParentedTree.convert(tree) for tree in trees]
    return trees
Example #31
def span(feats):
    """WORKS"""
    if feats.sentence != feats.sentence_ref:
        return "span={}".format(False)
    else:
        s_tree = ParentedTree.convert(TREES_DICTIONARY[feats.article+".raw"][int(feats.sentence)])
        i_parent = __get_parent_tree__(feats.token, s_tree)
        j_parent = __get_parent_tree__(feats.token_ref,s_tree)
        return "span={}".format(i_parent==j_parent)
Example #32
def getVectorsForTree(tree):

    treeVectors = []
    parentedTree = ParentedTree.convert(tree)
    for i, node in enumerate(parentedTree.pos()):
        features = []
        currWord = node[0]
        currPos = node[1]
        features.append(currWord)
        features.append(currPos)
        ln = "SOS" if i == 0 else parentedTree.pos()[i - 1]
        rn = "EOS" if i == len(
            parentedTree.pos()) - 1 else parentedTree.pos()[i + 1]
        lpos = "_" if ln == "SOS" else ln[1]
        rpos = "_" if rn == "EOS" else rn[1]
        lstr = ln if ln == "SOS" else ln[0]
        rstr = rn if rn == "EOS" else rn[0]
        lbigram = lstr + '_' + currWord
        rbigram = currWord + '_' + rstr
        lposbigram = lpos + '_' + currPos
        rposbigram = currPos + '_' + rpos
        features.append(lbigram)
        features.append(lpos)
        features.append(lposbigram)
        features.append(rbigram)
        features.append(rpos)
        features.append(rposbigram)

        selfcat = currPos  # always POS for single words
        features.append(selfcat)
        nodePosition = parentedTree.leaf_treeposition(i)
        parent = parentedTree[nodePosition[:-1]].parent()
        parentCategory = parent.label()
        features.append(parentCategory)

        ls = parent.left_sibling()
        lsCat = False if not ls else ls.label()
        rs = parent.right_sibling()
        rsCat = False if not rs else rs.label()
        features.append(lsCat)
        features.append(rsCat)
        rsContainsVP = False
        if rs:
            if list(rs.subtrees(filter=lambda x: x.label() == 'VP')):
                rsContainsVP = True
        # TODO: Figure out how to check if rs contains a trace (given the tree/grammar)
        features.append(rsContainsVP)
        #featureList.append(rsContainsTrace) # TODO
        rootRoute = getPathToRoot(parent, [])
        features.append('_'.join(rootRoute))
        cRoute = compressRoute([x for x in rootRoute])
        features.append('_'.join(cRoute))

        treeVectors.append(features)

    return treeVectors
Example #33
def drop_none(tree):
    tree = ParentedTree.convert(tree)
    for sub in reversed(list(tree.subtrees())):
        if sub.label() == '-NONE-':
            parent = sub.parent()
            while parent and len(parent) == 1:
                sub = parent
                parent = sub.parent()
            del tree[sub.treeposition()]
    return tree
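For example, drop_none removes a trace together with the unary chain above it:

from nltk.tree import ParentedTree

t = ParentedTree.fromstring("(S (NP-SBJ (-NONE- *T*)) (VP (VBD fell)))")
print(drop_none(t))
# (S (VP (VBD fell)))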
Example #34
def span(feats):
    """WORKS"""
    if feats.sentence != feats.sentence_ref:
        return "span={}".format(False)
    else:
        s_tree = ParentedTree.convert(
            TREES_DICTIONARY[feats.article + ".raw"][int(feats.sentence)])
        i_parent = __get_parent_tree__(feats.token, s_tree)
        j_parent = __get_parent_tree__(feats.token_ref, s_tree)
        return "span={}".format(i_parent == j_parent)
Example #35
def get_parent_phrase(tree, pos, labels, ct):

    for i, node in enumerate(tree.pos()):
        if i == pos:
            nodePosition = tree.leaf_treeposition(i)
            pt = ParentedTree.convert(tree)
            labelnode = climb_tree(pt, nodePosition, labels)
            predictedIntArgTokens = labelnode.leaves()
            return predictedIntArgTokens
Example #36
def climb_tree(tree, nodePosition, labels):

    pTree = ParentedTree.convert(tree)
    parent = pTree[nodePosition[:-1]].parent()
    # the ROOT check stops the recursion in case the label we are looking for is absent
    if parent.label() in labels or parent.label() == 'ROOT':
        return parent

    else:
        return climb_tree(tree, nodePosition[:-1], labels)
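A quick check of the recursion:

from nltk.tree import ParentedTree

t = ParentedTree.fromstring("(ROOT (S (NP (PRP I)) (VP (VBD slept))))")
pos = t.leaf_treeposition(1)               # position of "slept"
print(climb_tree(t, pos, ['VP']).label())  # VP (immediate parent matches)
print(climb_tree(t, pos, ['S']).label())   # S  (one more step up)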
Example #37
 def pos_to_leaves(tree, leaves, tags):
     this_tree = ParentedTree.convert(tree=tree)
     string = ' '.join(
         str(this_tree).split())  # weird logic to go from tree to string
     for leaf_index in range(0, len(leaves)):
         leaf = leaves[leaf_index]
         tag = tags[leaf_index]
         old = "(" + tag + " " + tag + ")"
         new = "(" + tag + " " + leaf + ")"
         string = string.replace(old, new, 1)
     return Tree.fromstring(string)
Example #38
def compute_parent_attribs(subtree, parent, heads_dict):
    if parent is None:
        parent_word, parent_word_pos = 'ROOT', 'ROOT'
    else:

        parent_head = heads_dict[str(subtree.parent())]

        parent_head_tree = ParentedTree.convert(
            Tree.fromstring(parent_head.rstrip()))
        parent_word, parent_word_pos = parent_head_tree.pos()[0]
    return (parent_word, parent_word_pos)
Example #39
def add_indices_to_terminals(ptree):
    indexed = ParentedTree.convert(ptree)
    for idx, _ in enumerate(ptree.leaves()):
        tree_location = ptree.leaf_treeposition(idx)
        non_terminal = indexed[tree_location[:-1]]
        if "_" in non_terminal[0]:
            # "_" is used as the index separator, so it must not occur in the token
            raise ValueError("unexpected underscore in PTB token: %s" % non_terminal[0])
        else:
            non_terminal[0] = non_terminal[0] + "_" + str(idx)
    return indexed
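For instance:

from nltk.tree import ParentedTree

t = ParentedTree.fromstring("(S (NP (DT the) (NN cat)) (VP (VBD sat)))")
print(add_indices_to_terminals(t).leaves())
# ['the_0', 'cat_1', 'sat_2']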
Example #40
 def leaves_to_pos(self):
     for tree in self.tree_corpus:
         this_tree = ParentedTree.convert(tree=tree)
         leaves = this_tree.leaves()
         pos_tags = this_tree.pos()
         string = ' '.join(str(
             this_tree).split())  # weird logic to go from tree to string
         for leaf_index in range(0, len(leaves)):
             leaf = leaves[leaf_index]
             tag = pos_tags[leaf_index][1]
             string = string.replace(leaf, tag, 1)
         self.tree_corpus_pos_leaves.append(Tree.fromstring(string))
Example #41
	def update(self,syntax_tree):
		ptree = ParentedTree.convert(syntax_tree)
		bad_words = [":",",",".","?",";"]
		for leaf in get_leaves(ptree):
			word = leaf[0]
			if word in bad_words:
				continue
			if not word in self:
				self.add_node(word,num=0,pos=leaf.pos()[0][1])
			self.node[word]["num"] += 1
		central_leaf = None
		for leaf in get_leaves(ptree):
			if leaf[0] == self.target_word:
				central_leaf = leaf
				break
		if not central_leaf:
			print "Error: target word not in sentence"
		for leaf in get_leaves(ptree):
			word = leaf[0]
			if word in bad_words:
				continue
			if word == self.target_word:
				for other_leaf in get_leaves(ptree):
					other_word = other_leaf[0]
					if word == other_word:
						continue
					if other_word in bad_words:
						continue
					if not (word,other_word) in self.edges():
						self.add_edge(word,other_word,weight=0)
					self.edge[word][other_word]["weight"] += 1.0/math.sqrt(get_distance(leaf,other_leaf))
			else:
				for other_leaf in get_leaves(ptree):
					other_word = other_leaf[0]
					if word == other_word:
						continue
					if other_word == self.target_word:
						continue
					if other_word in bad_words:
						continue
					if not (word,other_word) in self.edges():
						self.add_edge(word,other_word,weight=0)
					self.edge[word][other_word]["weight"] += 1.0/math.pow(
						get_distance(leaf,other_leaf)*
						get_distance(leaf,central_leaf)*
						get_distance(other_leaf,central_leaf),1/float(3)
						)
		self.invalidate_cache()
Example #42
def same_max_NP(feats):
    """WORKS"""
    if feats.sentence !=  feats.sentence_ref:
        return "same_max_NP={}".format(False)
    else:
        sentence_tree = TREES_DICTIONARY[feats.article+".raw"][int(feats.sentence)]
        ptree = ParentedTree.convert(sentence_tree)
        parent1 = __get_parent_tree__(feats.token, ptree)
        parent2 = __get_parent_tree__(feats.token_ref, ptree)
        #print "parent of: ", feats.token, ":", parent1
        #print "parent of: ", feats.token_ref, ":", parent2
        max_p_i = __get_max_projection__(ptree,parent1)
        max_p_j = __get_max_projection__(ptree, parent2)
        if max_p_i is not None and max_p_j is not None:
            both_NPs = max_p_i.label() == "NP" and max_p_j.label() == "NP"
        else:
            both_NPs = False
        return "same_max_NP={}".format(max_p_i == max_p_j and both_NPs)
Example #43
def read_treebank_files(files, extractor, fe):
    """Read the listed treebank files and collect function tagging examples
    from each tree.

    The user-provided feature extractor is applied to each phrase in each 
    tree. The extracted feature dicts and the true function tags for each
    phrase are stored in two separate lists, which are returned.
    """
    X = []
    Y = []
    for filename in files:
        scount = 0
        for tree in treebank.parsed_sents(filename):
            tree = ParentedTree.convert(tree)
            treebank_helper.postprocess(tree)
            find_examples_in_tree(tree, X, Y, extractor, fe, filename, scount, 0)
            scount += 1
    return X, Y
Example #44
 def parse_sentences(self, filename, num_sentences):
     """Parse each sentence into a tree"""
     f = open(filename, 'r')
      if num_sentences == 'all':
          num_sentences = None  # slice [:None] keeps every sentence
     count = 0
     for sentence in f.readlines()[:num_sentences]:
         if count%10==0:
             print(count)
         trees = self.parser.raw_parse(sentence.lower())
         for tree in trees:
             self.nonterminal_counts['ROOT'] += 1
             tokenized_sentence = self.tokenize_sentence(sentence)
             if len(tokenized_sentence) > 5:
                 self.extract_rules(tree)
             ptree = ParentedTree.convert(tree)
             #print(type(ptree))
             self.get_bigram(ptree, tokenized_sentence)
         count+=1
trees.append(parser.parse("a database table stores rows"))
trees.append(parser.parse("kitchen table"))
trees.append(parser.parse("food on kitchen table"))
trees.append(parser.parse("put rows in database table"))
trees.append(parser.parse("damaged wooden kitchen table"))
trees.append(parser.parse("wooden table"))
trees.append(parser.parse("the table that had been forgotten for years became damaged"))
trees.append(parser.parse("the database table for our favorite client, Bob's bakery, needs more rows"))
trees.append(parser.parse("the database table that stores sensitive client data should be secure"))
trees.append(parser.parse("the database table that stores passwords should be secure"))
trees.append(parser.parse("Motion to table discussion until later"))
trees.append(parser.parse("Please table the motion until a later date"))
trees.append(parser.parse("We should table, if not reject, this motion"))


trees = [ParentedTree.convert(tree) for tree in trees]
graphs = []
for tree in trees:
	g = Graph(query)
	g.update(tree)
	graphs.append(g)
print "Merging graphs"
new_graph = merge_graphs(graphs)
print "Drawing graph (fake)"
new_graph.draw("new_graph_"+query)
print "Getting senses"
print new_graph.get_senses()
print "Prediction is..."
print new_graph.get_predicted_sense(ParentedTree.convert(parser.parse(user_sentence)))
#new_graph.print_relatedness_to_target_in_order()
Example #46
 def minweight_edit_distance(self, doc1, doc2):
     global numnodes
     doc1sents = self.sent_detector.tokenize(doc1.strip())
     doc2sents = self.sent_detector.tokenize(doc2.strip())
     doc1parsed = self.parser.raw_parse_sents(doc1sents)
     doc2parsed = self.parser.raw_parse_sents(doc2sents)
     costMatrix = []
     doc1parsed = list(doc1parsed)
     for i in range(len(doc1parsed)):
         doc1parsed[i] = list(doc1parsed[i])[0]
     doc2parsed = list(doc2parsed)
     for i in range(len(doc2parsed)):
         doc2parsed[i] = list(doc2parsed[i])[0]
     for i in range(len(doc1parsed)):
         numnodes = 0
         sentencedoc1 = ParentedTree.convert(doc1parsed[i])
         tempnode = Node(sentencedoc1.root().label())
         new_sentencedoc1 = self.convert_mytree(sentencedoc1,tempnode)
         temp_costMatrix = []
         sen1nodes = numnodes
         for j in range(len(doc2parsed)):
             numnodes=0.0
             sentencedoc2 = ParentedTree.convert(doc2parsed[j])
             tempnode = Node(sentencedoc2.root().label())
             new_sentencedoc2 = self.convert_mytree(sentencedoc2,tempnode)
             ED = simple_distance(new_sentencedoc1, new_sentencedoc2)
             ED = ED / (numnodes + sen1nodes)
             temp_costMatrix.append(ED)
         costMatrix.append(temp_costMatrix)
     costMatrix = np.array(costMatrix)
     rownum= costMatrix.shape[0]
     colnum = costMatrix.shape[1]
     if rownum > colnum:
         costMatrixRandom = costMatrix[np.random.randint(rownum, size=colnum),:]
     else:
         costMatrixRandom = costMatrix[:,np.random.randint(colnum, size=rownum)]
 
     indexes = su.linear_assignment(costMatrix)
     total = 0
     minWeight = 0
     rowMarked = [0] * len(doc1parsed)
     colMarked = [0] * len(doc2parsed)
     for row, column in indexes:
         total += costMatrix[row][column]
         rowMarked[row] = 1
         colMarked [column] = 1
     minWeight = total
 
     for k in range(len(rowMarked)):
         if rowMarked[k]==0:
             total+= np.min(costMatrix[k])
     for c in range(len(colMarked)):
         if colMarked[c]==0:
             total+= np.min(costMatrix[:,c])
     maxlengraph = max(len(doc1parsed),len(doc2parsed))
     minlengraph = min(len(doc1parsed),len(doc2parsed))
 
     indexes = su.linear_assignment(costMatrixRandom)
     randtotal = 0
     for row, column in indexes:
         randtotal +=costMatrixRandom[row][column]
     lengraph = costMatrixRandom.shape[0]
 
     return total/maxlengraph#, minWeight/minlengraph, randtotal/lengraph
Example #47
	def get_arginstances(self, _pbi):
		'''
			returns a list of ARGInstances given a PropbankInstance and according to self.features
			
			Each feature is normalized according to the rules in its if-block.
			
			parameters :
				_pbi PropBankInstance
					a PropbankInstance that contains the arguments to be extracted
			return value :
				list of ARGInstances
		'''
		res = []
		for arg in _pbi.arguments : # iterate through all arguments in _pbi
			argfeatures = {}
			
			# predicate feature
			if 'predicate' in self.features :
				argfeatures['predicate'] = re.sub(r'(\w+)\..+', r'\1', _pbi.roleset) # lemmatize the predicate and then set
				# argfeatures['predicate'] = self.wnl.lemmatize(_pbi.predicate.select(_pbi.tree).leaves()[0], "v")
				# argfeatures['predicate'] = _pbi.predicate.select(_pbi.tree).leaves()[0]
			
			# path feature
			if 'path' in self.features :
				senTree = ParentedTree.convert(_pbi.tree)
				argTree = arg[0].select(senTree)
				predTree = _pbi.predicate.select(senTree)
				while argTree.label() == "*CHAIN*" or argTree.label() == "*SPLIT*":
					argTree = argTree[0]
				while predTree.label() == "*CHAIN*" or predTree.label() == "*SPLIT*":					
					predTree = predTree[0]
				
				argParents = []
				predParents = []
				while predTree is not None:
					predParents.append(predTree)					
					predTree = predTree.parent()
					
				while argTree is not None:
					argParents.append(argTree)
					argTree = argTree.parent()
					
				jointNode = None
				for node in argParents:
					if node in predParents:
						jointNode = node
							
				stringPath = ""
				for i in range(0, argParents.index(jointNode), 1):	 
					node = argParents[i]
					stringPath += re.sub(r"(\w+)-.+", r"\1", node.label()) + "^"
				
				for i in range(predParents.index(jointNode) , 0, -1):
					node = predParents[i]
					stringPath += re.sub(r"(\w+)-.+", r"\1", node.label()) + "!"
				argfeatures['path'] = stringPath[:-1]
			
			# phraseType feature
			if 'phraseType' in self.features :
				argTree = arg[0].select(_pbi.tree)
				while argTree.label() == "*CHAIN*" or argTree.label() == "*SPLIT*": # traverse tree until a real constituent is found
					argTree = argTree[0]
				argfeatures['phraseType'] = re.sub(r"(\w+)[-=$\|].+", r"\1", argTree.label()) # normalize (e.g. NP-SUBJ -> NP) and set
			
			# position feature
			if 'position' in self.features :
				predTreePointer = _pbi.predicate
				while not type(predTreePointer) is PropbankTreePointer: # traverse tree while the pointer is not a real constituent
					predTreePointer = predTreePointer.pieces[0] 
				pred_wordnum = predTreePointer.wordnum # set predicate wordnumber
				arg_wordnum = None
				if type(arg[0]) is PropbankTreePointer :
					arg_wordnum = arg[0].wordnum
				# PropbankChainTreePointer and PropbankSplitTreePointer don't have wordnums and must be traversed
				elif (type(arg[0]) is PropbankChainTreePointer) or (type(arg[0]) is PropbankSplitTreePointer) :
					arg_pieces = arg[0].pieces
					# traverse the tree (always take the left-most subtree) until a PropbankTreePointer is found
					while type(arg_pieces[0]) is not PropbankTreePointer :
						arg_pieces = arg_pieces[0].pieces
					# then get the wordnum
					arg_wordnum = arg_pieces[0].wordnum
				# compare wordnumbers and normalize to 'before' or 'after'
				if arg_wordnum < pred_wordnum :
					argfeatures['position'] = 'before'
				else :
					argfeatures['position'] = 'after'
					
			# voice feature
			if 'voice' in self.features :
				# extract voice from PropBankInstance-inflection and normalize to 'active', 'passive' and 'NONE'
				if _pbi.inflection.voice == 'a' :
					argfeatures['voice'] = 'active'
				elif _pbi.inflection.voice == 'p' :
					argfeatures['voice'] = 'passive'
				else:
					argfeatures['voice'] = 'NONE'
			
			# class feature
			if 'class' in self.features :
				argfeatures['class'] = arg[1].split("-")[0]
				# argfeatures['class'] = re.sub(r'(ARG[0-5])\-\w+', r'\1', arg[1])
			
			res.append(ARGInstance(argfeatures)) # append the initialized ARGInstance to the result
		return res
Example #48
        for j, job in enumerate(jobs):
            trees.extend(comm.recv(source=j+1, tag=11))

        pickle.dump(trees, open("trees_cache", "w"), pickle.HIGHEST_PROTOCOL)

    relevant_trees = []

    for tree in trees:
        if query in tree.leaves():
            relevant_trees.append(tree)

    print "Found", len(relevant_trees), "relevant sentences"

    relevant_trees = [ParentedTree.convert(tree) for tree in relevant_trees]
    graphs = []
    for tree in relevant_trees:
        g = Graph(query)
        g.update(tree)
        graphs.append(g)
    

    print "Merging graphs"

    num_merges = num_ranks

    while len(graphs) > 1:
        jobs = [graphs[i::num_merges] for i in range(num_merges)]
        for j, job in enumerate(jobs):
            comm.send(job, dest=j+1, tag=11)
Example #49
	# Using the Stanford Parser to get NLTK Parse Trees
	dataset = open("dataset.txt","r").read()
	sentences = parser.raw_parse_sents(sent_tokenize(dataset))
	for line in sentences:
		for sentence in line:
			trees.append(sentence)
	
	pickle.dump(trees, open("trees.pkl","wb"))
		
else:
	trees = pickle.load(open("trees.pkl","rb"))
	# Converting NLTK trees into parented trees for easy upward traversal
global ptrees
ptrees = []
for tree in trees:
	ptrees.append(ParentedTree.convert(tree))

# Function that performs Hobbs Algo
def hobbs(ptree, cur_sent):
	'''
		Hobbs Naive Algorithm for Anaphora Resolution
	'''
	#ptree.draw()
	resolutions = {}
	highest_S = None
	for root in ptree.subtrees():
		if root.label() == 'S':
			highest_S = root
			break
			
	pronouns = []	
Example #50
    def syntax_similarity_two_lists(self, documents1, documents2, average=False): # syntax similarity of two lists of documents
        global numnodes
        documents1parsed = []
        documents2parsed = []

        for d1 in range(len(documents1)):
            # print d1
            tempsents = (self.sent_detector.tokenize(documents1[d1].strip()))
            for s in tempsents:
                if len(s.split())>100:
                    documents1parsed.append("NA")
                    break
            else:
                temp = list(self.parser.raw_parse_sents(tempsents))
                for i in range(len(temp)):
                    temp[i] = list(temp[i])[0]
                    temp[i] = ParentedTree.convert(temp[i])
                documents1parsed.append(list(temp))
        for d2 in range(len(documents2)):
            # print d2
            tempsents = (self.sent_detector.tokenize(documents2[d2].strip()))
            for s in tempsents:
                if len(s.split())>100:
                    documents2parsed.append("NA")
                    break
            else:
                temp = list(self.parser.raw_parse_sents(tempsents))
                for i in range(len(temp)):
                    temp[i] = list(temp[i])[0]
                    temp[i] = ParentedTree.convert(temp[i])
                documents2parsed.append(list(temp))
        results ={}
        for d1 in range(len(documents1parsed)):
            # print d1
            for d2 in range(len(documents2parsed)):
                # print d1,d2
                if documents1parsed[d1]=="NA" or documents2parsed[d2] =="NA":
                    # print "skipped"
                    continue
                costMatrix = []
                for i in range(len(documents1parsed[d1])):
                    numnodes = 0
                    tempnode = Node(documents1parsed[d1][i].root().label())
                    new_sentencedoc1 = self.convert_mytree(documents1parsed[d1][i],tempnode)
                    temp_costMatrix = []
                    sen1nodes = numnodes
                    for j in range(len(documents2parsed[d2])):
                        numnodes=0.0
                        tempnode = Node(documents2parsed[d2][j].root().label())
                        new_sentencedoc2 = self.convert_mytree(documents2parsed[d2][j],tempnode)
                        ED = simple_distance(new_sentencedoc1, new_sentencedoc2)
                        ED = ED / (numnodes + sen1nodes)
                        temp_costMatrix.append(ED)
                    costMatrix.append(temp_costMatrix)
                costMatrix = np.array(costMatrix)
                if average==True:
                    return 1-np.mean(costMatrix)
                else:
                    indexes = su.linear_assignment(costMatrix)
                    total = 0
                    rowMarked = [0] * len(documents1parsed[d1])
                    colMarked = [0] * len(documents2parsed[d2])
                    for row, column in indexes:
                        total += costMatrix[row][column]
                        rowMarked[row] = 1
                        colMarked [column] = 1
                    for k in range(len(rowMarked)):
                        if rowMarked[k]==0:
                            total+= np.min(costMatrix[k])
                    for c in range(len(colMarked)):
                        if colMarked[c]==0:
                            total+= np.min(costMatrix[:,c])
                    maxlengraph = max(len(documents1parsed[d1]),len(documents2parsed[d2]))
                    results[(d1,d2)] = 1-total/maxlengraph
        return results
Example #51
def path_enclosed_tree_augmented(fr):
    if fr.i_sentence!=fr.j_sentence:
        return ParentedTree("None",["*"]) #just in case
    else:
        s_tree = ParentedTree.convert(AUGMENTED_TREES[fr.article][int(fr.i_sentence)])
        return _generate_enclosed_tree(fr,s_tree)