Beispiel #1
0
def merge(treesFile,graphFile):
    """
    Load dependency trees and dependency graphs and annotate each graph with
    information taken from the tree found at the same position.

    For every (tree, graph) pair, the graph receives the wsj_id, sent_id,
    original sentence and tree string of the tree's node 0. Graph nodes are
    then matched to tree nodes through the keys of the graph's node map and
    receive features, POS tag, predicate flag, original text and an extended
    surface form. Finally the node stored under key '0' is deleted from the
    graph.

    @type  treesFile: presumably a file path - confirm against load_depTrees_from_file
    @param treesFile: handed unchanged to load_depTrees_from_file

    @type  graphFile: presumably a file path - confirm against load_depGraphs_from_file
    @param graphFile: handed unchanged to load_depGraphs_from_file

    @rtype: list
    @return: the annotated graphs, in the order of the loaded trees
    """
    trees = load_depTrees_from_file(treesFile)
    graphsFromFile = load_depGraphs_from_file(graphFile)
    graphs = []
    # Trees and graphs are paired purely by position. No length check is
    # performed: fewer graphs than trees raises IndexError below, surplus
    # graphs are silently ignored.
    for i, t in enumerate(trees):
        curGraph, nodesMap = graphsFromFile[i]
        top = t[0]
        curGraph.wsj_id = top.wsj_id
        curGraph.sent_id = top.sent_id
        curGraph.originalSentence = top.original_sentence
        # keep everything but the first line of the original-format dump
        curGraph.tree_str = "\n".join(top.to_original_format().split("\n")[1:])
        for node_id in nodesMap:
            # keys are strings; the part before the first "'" is the index
            # of the corresponding node in the tree
            int_node_id = int(node_id.split("'")[0])
            treeNode = t[int_node_id]
            child_dic = treeNode._get_child_dic()
            if 'cc' in child_dic:
                # (words of the 'cc' children joined in id order,
                #  ids of the 'cc' children in child_dic order)
                conj_type = (" ".join([cc.word for cc in sorted(child_dic['cc'], key=lambda cc: cc.id)]),
                             [cc.id for cc in child_dic['cc']])
            else:
                conj_type = False
            # All graph nodes whose key, stripped at the first "'", equals
            # node_id. A node_id that itself contains "'" can never match,
            # so only plain keys select any nodes here.
            graphNodes = [nodesMap[n] for n in nodesMap if n.split("'")[0] == node_id]
            for graphNode in graphNodes:
                # get_verbal_features is called once per graph node
                graphNode.features = get_verbal_features(treeNode)
                if conj_type:
                    graphNode.features["conjType"] = conj_type
                graphNode.features["pos"] = treeNode.pos
                graphNode.isPredicate = treeNode.is_verbal_predicate()
                graphNode.original_text = treeNode.get_text()
                graphNode.surface_form += missing_children(treeNode, graphNode)
        curGraph.del_node(nodesMap['0']) # delete root
        graphs.append(curGraph)
    return graphs
Beispiel #2
0
    def parseVerbal(self, indexes, verbs, arguments, tree):
        """
        Build the subgraph of a verbal predicate and return its head node.

        @type  indexes: list [int]
        @param indexes: sentence position(s) of the verb word(s)

        @type  verbs: list [string]
        @param verbs: the verb word(s), paired positionally with indexes

        @type tree: DepTree
        @param tree: tree used for feature extraction and for locating a
                     time expression

        @type  arguments: list
        @param arguments: DepTrees of the arguments; each one is parsed and
                          attached to the verb node under its parent_relation

        @rtype: Node
        @return: the verb node that was added to the graph
        """

        # features of the verbal head; the lemma is dropped when it equals
        # the first verb word
        verbFeatures = syntactic_item.get_verbal_features(tree)
        if verbFeatures['Lemma'] == verbs[0]:
            del verbFeatures['Lemma']

        # every remaining feature name is recorded in self.types
        for featureName in verbFeatures:
            self.types.add(featureName)

        # one Word per (index, verb) pair
        verbWords = []
        for position, verbWord in zip(indexes, verbs):
            verbWords.append(Word(index=position, word=verbWord))

        verbNode = graph_representation.node.Node(
            valid=True,
            features=verbFeatures,
            text=verbWords,
            isPredicate=True)
        self.gr.add_node(verbNode)

        # arguments: parse each subtree and hang it below the verb
        for argTree in arguments:
            argNode = self.parse(argTree)
            self.gr.add_edge((verbNode, argNode), argTree.parent_relation)

        # time expression: verb -> time node -> parsed time subtree
        timeSubtree, _ignored = tree._VERBAL_PREDICATE_SUBTREE_Time()
        if not timeSubtree:
            return verbNode

        timeNode = graph_representation.node.TimeNode.init(features={})
        self.gr.add_node(timeNode)
        timeSubGraph = self.parse(timeSubtree)
        self.gr.add_edge((verbNode, timeNode))
        self.gr.add_edge((timeNode, timeSubGraph))
        return verbNode
Beispiel #3
0
def treeNode_to_graphNode(treeNode, gr):
    """
    Create a graph node that holds the single word of a tree node.

    @type treeNode: DepTree
    @param treeNode: source of the word, its index, the features, the POS
                     tag and the predicate flag

    @param gr: forwarded to the node constructor as its gr argument

    @return: the new node; its "pos" feature is set to treeNode.pos and its
             original_text is a copy (made with copy()) of its text
    """

    nodeFeatures = get_verbal_features(treeNode)
    nodeWords = [Word(index=treeNode.id, word=treeNode.word)]
    predicateFlag = treeNode.is_verbal_predicate()
    graphNode = newNode.Node(gr=gr,
                             features=nodeFeatures,
                             isPredicate=predicateFlag,
                             text=nodeWords)
    # set through the node so whatever the constructor stored is updated
    graphNode.features["pos"] = treeNode.pos
    graphNode.original_text = copy(graphNode.text)
    return graphNode
Beispiel #4
0
 def parseVerbal(self,indexes,verbs,arguments,tree):
     """
     Add the subgraph of a verbal predicate to the graph and return the
     node created for the verb.

     @type  indexes: list [int]
     @param indexes: sentence position(s) of the verb word(s)

     @type  verbs: list [string]
     @param verbs: the verb word(s), paired positionally with indexes

     @type tree: DepTree
     @param tree: tree used for feature extraction and for locating a
                  time expression

     @type  arguments: list
     @param arguments: DepTrees of the arguments; each one is parsed and
                       attached to the verb node under its parent_relation

     @rtype: Node
     @return: the verb node that was added to the graph
     """

     # head features; a lemma identical to the first verb word is removed
     headFeatures = syntactic_item.get_verbal_features(tree)
     if headFeatures['Lemma'] == verbs[0]:
         del headFeatures['Lemma']

     # remember every feature name that is in use
     for name in headFeatures:
         self.types.add(name)

     # pair each verb word with its sentence index
     headWords = []
     for wordIndex, wordText in zip(indexes, verbs):
         headWords.append(Word(index=wordIndex, word=wordText))

     verbNode = graph_representation.node.Node(text=headWords,
                                               features=headFeatures,
                                               isPredicate=True,
                                               valid=True)
     self.gr.add_node(verbNode)

     # each argument subtree is parsed and linked below the verb
     for argument in arguments:
         argumentNode = self.parse(argument)
         self.gr.add_edge((verbNode, argumentNode), argument.parent_relation)

     # optional time expression: verb -> time node -> parsed time subtree
     timeSubtree, _rest = tree._VERBAL_PREDICATE_SUBTREE_Time()
     if not timeSubtree:
         return verbNode

     timeNode = graph_representation.node.TimeNode.init(features={})
     self.gr.add_node(timeNode)
     timeSubGraph = self.parse(timeSubtree)
     self.gr.add_edge((verbNode, timeNode))
     self.gr.add_edge((timeNode, timeSubGraph))
     return verbNode
Beispiel #5
0
def treeNode_to_graphNode(treeNode,gr):
    """
    Wrap the word of a single tree node in a new graph node.

    @type treeNode: DepTree
    @param treeNode: supplies the word and its index, the features, the POS
                     tag and the predicate flag

    @param gr: passed on to the node constructor as its gr argument

    @return: the new node, with features["pos"] set to treeNode.pos and
             original_text set to a copy (made with copy()) of its text
    """

    features = get_verbal_features(treeNode)
    singleWord = Word(index=treeNode.id, word=treeNode.word)
    isPredicate = treeNode.is_verbal_predicate()
    result = newNode.Node(text=[singleWord],
                          isPredicate=isPredicate,
                          features=features,
                          gr=gr)
    # written via the node, after construction, as in the constructor's
    # stored features
    result.features["pos"] = treeNode.pos
    result.original_text = copy(result.text)
    return result
Beispiel #6
0
def read_dep_graphs_file(constituency_tree_fn,
                         wsjInfo_exists=False,
                         HOME_DIR="./",
                         stanford_json_sent=None):
    """
    Build dependency graphs for the given input and annotate each one with
    information from the dependency tree at the same position.

    @param constituency_tree_fn: passed to convert_to_dep_graph (when no
                                 JSON sentence is given) and to
                                 read_trees_file

    @param wsjInfo_exists: not read anywhere in this function; False is
                           always passed to read_trees_file
                           NOTE(review): confirm whether it should be
                           forwarded instead

    @param HOME_DIR: passed to create_dep_graphs_from_stream

    @param stanford_json_sent: when truthy, the stream is produced by
                               convert_json_to_dep_graph from this value;
                               it is also passed to read_trees_file

    @rtype: list
    @return: the annotated graphs, in the order of the trees
    """

    # pick the converter depending on whether a JSON sentence was supplied
    if stanford_json_sent:
        stream = convert_json_to_dep_graph(stanford_json_sent)
    else:
        stream = convert_to_dep_graph(constituency_tree_fn)

    graphsFromFile = create_dep_graphs_from_stream(stream, HOME_DIR)
    trees = read_trees_file(constituency_tree_fn, False, stanford_json_sent)

    annotated = []
    # trees and graphs are paired by position only
    for position, tree in enumerate(trees):
        graph, nodesMap = graphsFromFile[position]
        graph.originalSentence = tree[0].original_sentence
        # everything but the first line of the original-format dump
        dumpLines = tree[0].to_original_format().split("\n")
        graph.tree_str = "\n".join(dumpLines[1:])
        graph.dep_tree = tree

        for node_id in nodesMap:
            # the part of the key before the first "'" is the tree index
            treeNode = tree[int(node_id.split("'")[0])]
            child_dic = treeNode._get_child_dic()

            conj_type = False
            if 'cc' in child_dic:
                byId = sorted(child_dic['cc'], key=lambda child: child.id)
                ccWords = " ".join([child.word for child in byId])
                ccIds = [child.id for child in child_dic['cc']]
                conj_type = (ccWords, ccIds)

            # collect the graph nodes whose stripped key equals node_id
            matching = []
            for key in nodesMap:
                if key.split("'")[0] == node_id:
                    matching.append(nodesMap[key])

            for graphNode in matching:
                graphNode.features = get_verbal_features(treeNode)
                if conj_type:
                    graphNode.features["conjType"] = conj_type
                graphNode.features["pos"] = treeNode.pos
                graphNode.isPredicate = treeNode.is_verbal_predicate()
                graphNode.original_text = treeNode.get_text()
                graphNode.surface_form += missing_children(treeNode, graphNode)

        graph.del_node(nodesMap['0'])  # delete root
        annotated.append(graph)
    return annotated
Beispiel #7
0
def merge(treesFile, graphFile):
    """
    Load dependency trees and dependency graphs and annotate each graph with
    information taken from the tree found at the same position.

    For every (tree, graph) pair, the graph receives the wsj_id, sent_id,
    original sentence and tree string of the tree's node 0. Graph nodes are
    then matched to tree nodes through the keys of the graph's node map and
    receive features, POS tag, predicate flag, original text and an extended
    surface form. Finally the node stored under key '0' is deleted from the
    graph.

    @type  treesFile: presumably a file path - confirm against load_depTrees_from_file
    @param treesFile: handed unchanged to load_depTrees_from_file

    @type  graphFile: presumably a file path - confirm against load_depGraphs_from_file
    @param graphFile: handed unchanged to load_depGraphs_from_file

    @rtype: list
    @return: the annotated graphs, in the order of the loaded trees
    """
    trees = load_depTrees_from_file(treesFile)
    graphsFromFile = load_depGraphs_from_file(graphFile)
    graphs = []
    # Trees and graphs are paired purely by position. No length check is
    # performed: fewer graphs than trees raises IndexError below, surplus
    # graphs are silently ignored.
    for i, t in enumerate(trees):
        curGraph, nodesMap = graphsFromFile[i]
        top = t[0]
        curGraph.wsj_id = top.wsj_id
        curGraph.sent_id = top.sent_id
        curGraph.originalSentence = top.original_sentence
        # keep everything but the first line of the original-format dump
        curGraph.tree_str = "\n".join(
            top.to_original_format().split("\n")[1:])
        for node_id in nodesMap:
            # keys are strings; the part before the first "'" is the index
            # of the corresponding node in the tree
            int_node_id = int(node_id.split("'")[0])
            treeNode = t[int_node_id]
            child_dic = treeNode._get_child_dic()
            if 'cc' in child_dic:
                # (words of the 'cc' children joined in id order,
                #  ids of the 'cc' children in child_dic order)
                conj_type = (" ".join([
                    cc.word
                    for cc in sorted(child_dic['cc'], key=lambda cc: cc.id)
                ]), [cc.id for cc in child_dic['cc']])
            else:
                conj_type = False
            # All graph nodes whose key, stripped at the first "'", equals
            # node_id. A node_id that itself contains "'" can never match,
            # so only plain keys select any nodes here.
            graphNodes = [
                nodesMap[n] for n in nodesMap if n.split("'")[0] == node_id
            ]
            for graphNode in graphNodes:
                # get_verbal_features is called once per graph node
                graphNode.features = get_verbal_features(treeNode)
                if conj_type:
                    graphNode.features["conjType"] = conj_type
                graphNode.features["pos"] = treeNode.pos
                graphNode.isPredicate = treeNode.is_verbal_predicate()
                graphNode.original_text = treeNode.get_text()
                graphNode.surface_form += missing_children(treeNode, graphNode)
        curGraph.del_node(nodesMap['0'])  # delete root
        graphs.append(curGraph)
    return graphs
Beispiel #8
0
    def parse(self,t):
        """
        Convert a syntactic tree into its graph representation.

        The tree is tested against a fixed sequence of constructions; the
        first test that succeeds selects the parseXxx handler whose result
        is returned. If none succeeds, the tree is packed into a single
        node (see the fall back branch).
        
        @type  t: DepTree
        @param t: syntactic tree to be converted
        
        @rtype: Node
        @return: the node in the graph corresponding to the top node in t
        """
        
        #order matters! checks run top to bottom and the first hit returns
        # NOTE(review): several branches read attributes of t (condPred,
        # adverb_children, baseElm, ...) right after the check - presumably
        # the check itself populates them; confirm in DepTree.
        if t.is_conditional_predicate():
            self.types.add(APPENDIX_COND)
            return self.parseConditional(outcome = t._CONDITIONAL_PREDICATE_FEATURE_Outcome()["Value"],
                                         condList = t.condPred)

        
        # adverb construction; unlike the other branches nothing is added
        # to self.types here
        if t._VERBAL_PREDICATE_SUBTREE_Adv():
            advChildren = t.adverb_children
            advSubj = t.adverb_subj
            return self.parseAdverb(subj=advSubj, 
                             advChildren=advChildren)
        
        if t.is_conjunction_predicate():
            self.types.add(APPENDIX_CONJUNCTION)
            return self.parseConjunction(baseElm = t.baseElm,
                                         conjResult = t.conjResult)
        
        if t.is_appositional_predicate():
            self.types.add(APPENDIX_APPOS)
            firstEntity = t._APPOSITIONAL_PREDICATE_FEATURE_Left_Side()["Value"]
            secondEntity = t._APPOSITIONAL_PREDICATE_FEATURE_Right_Side()["Value"]
            return self.parseApposition(index = t.id,
                                        first_entity=firstEntity,
                                        second_entity=secondEntity)
        
        if t.is_relative_clause():
            self.types.add(APPENDIX_RCMOD)
            return self.parseRcmod(np = t._RELCLAUSE_PREDICATE_FEATURE_Rest()['Value'], 
                                   modList = t.rcmodPred)
        
        if t.is_prepositional_predicate():
            self.types.add(APPENDIX_PREP)
            return self.parsePreposition(psubj=t._PREPOSITIONAL_PREDICATE_FEATURE_psubj()["Value"],
                                          prepChildList=t.prepChildList)
                    
        if t.is_copular_predicate():
            self.types.add(APPENDIX_COP)
            firstEntity = t._COPULAR_PREDICATE_FEATURE_Copular_Predicate()["Value"]
            secondEntity = t._COPULAR_PREDICATE_FEATURE_Copular_Object()["Value"]
            return self.parseCopular(index = t.id,
                                     first_entity=firstEntity,
                                     second_entity=secondEntity,
                                     features = syntactic_item.get_verbal_features(t))
        
        if t.is_possesive_predicate():
            self.types.add(APPENDIX_POSS)
            possessor = t._POSSESSIVE_PREDICATE_FEATURE_Possessor()["Value"]
            possessed = t._POSSESSIVE_PREDICATE_FEATURE_Possessed()["Value"]
            possessive = t._POSSESSIVE_PREDICATE_FEATURE_Possessive()["Value"]
            return self.parsePossessive(possessor = possessor, 
                                        possessed = possessed,
                                        possessive = possessive)
        
            
        if t.is_adjectival_predicate():
            self.types.add(APPENDIX_ADJ)
            return self.parseProp(subject = t._ADJECTIVAL_PREDICATE_FEATURE_Subject()["Value"],
                                  copulaIndex = NO_INDEX,
                                  adjectiveChildList = t.adjectivalChildList,
                                  propAsHead=False)
            
        if t.is_clausal_complement():
            self.types.add(APPENDIX_COMPLEMENT)
            return self.parseComplement(compSubj = t.compSubj,
                                        compChildren = t.compChildList)
        
        if t.unhandled_advcl():
            # put each unhandled advcl as a disconnected subgraph
            # (the results of these parse calls are discarded)
            for c in t.advcl:
                self.parse(c)
            # NOTE(review): parse is re-entered with the same t; this only
            # terminates if unhandled_advcl() is falsy on the next call -
            # confirm that the check consumes or marks the advcl children.
            return self.parse(t)
        
        if t.is_verbal_predicate():
            self.types.add(APPENDIX_VERB)
            head_ret = t._VERBAL_PREDICATE_SUBTREE_Head()
            # the head value is a space separated string of verb words
            return self.parseVerbal(indexes = head_ret["Span"],
                             verbs = head_ret["Value"].split(" "),
                             arguments = t.collect_arguments(),
                             tree = t)
        
            
        
        # this else pairs with the is_verbal_predicate check only; it is
        # reached when none of the checks above returned
        else:
            # fall back - pack all the tree in a single node
            if len(t.children)==1:
                if (t.children[0].parent_relation == "nn") and (t.word.endswith(",")) and (t.children[0].word.endswith(",")):
                    #conjunction in disguise: head and its only "nn" child
                    #both end with a comma
                    child = t.children[0]
                    # the child is detached while the conjunction is built
                    # and re-attached afterwards
                    t.children = []
                    # NOTE(review): called with cc/conjElements here but
                    # with baseElm/conjResult in the conjunction branch
                    # above - confirm parseConjunction accepts both.
                    ret =  self.parseConjunction(cc = [(t.id,"and")], 
                                                conjElements = [t,child])
                    t.children = [child]
                    return ret
            
            # nodes is used as a mapping from word index to word
            nodes = t._get_subtree(filter_labels_ban)
            text = [Word(index=index,
                         word=nodes[index]) for index in sorted(nodes.keys())] 
            # text is already built in ascending index order, so the sort
            # below does not reorder it
            topNode = self.parseBottom(text = sorted(text,key=lambda x:x.index),
                        features = syntactic_item.get_verbal_features(t))

            return topNode
Beispiel #9
0
    def parse(self,t):
        """
        Convert a syntactic tree into its graph representation.

        The tree is tested against a fixed sequence of constructions; the
        first test that succeeds selects the parseXxx handler whose result
        is returned. If none succeeds, the tree is packed into a single
        node (see the fall back branch).
        
        @type  t: DepTree
        @param t: syntactic tree to be converted
        
        @rtype: Node
        @return: the node in the graph corresponding to the top node in t
        """
        
        #order matters! checks run top to bottom and the first hit returns
        # NOTE(review): several branches read attributes of t (condPred,
        # adverb_children, baseElm, ...) right after the check - presumably
        # the check itself populates them; confirm in DepTree.
        if t.is_conditional_predicate():
            self.types.add(APPENDIX_COND)
            return self.parseConditional(outcome = t._CONDITIONAL_PREDICATE_FEATURE_Outcome()["Value"],
                                         condList = t.condPred)

        
        # adverb construction; unlike the other branches nothing is added
        # to self.types here
        if t._VERBAL_PREDICATE_SUBTREE_Adv():
            advChildren = t.adverb_children
            advSubj = t.adverb_subj
            return self.parseAdverb(subj=advSubj, 
                             advChildren=advChildren)
        
        if t.is_conjunction_predicate():
            self.types.add(APPENDIX_CONJUNCTION)
            return self.parseConjunction(baseElm = t.baseElm,
                                         conjResult = t.conjResult)
        
        if t.is_appositional_predicate():
            self.types.add(APPENDIX_APPOS)
            firstEntity = t._APPOSITIONAL_PREDICATE_FEATURE_Left_Side()["Value"]
            secondEntity = t._APPOSITIONAL_PREDICATE_FEATURE_Right_Side()["Value"]
            return self.parseApposition(index = t.id,
                                        first_entity=firstEntity,
                                        second_entity=secondEntity)
        
        if t.is_relative_clause():
            self.types.add(APPENDIX_RCMOD)
            return self.parseRcmod(np = t._RELCLAUSE_PREDICATE_FEATURE_Rest()['Value'], 
                                   modList = t.rcmodPred)
        
        if t.is_prepositional_predicate():
            self.types.add(APPENDIX_PREP)
            return self.parsePreposition(psubj=t._PREPOSITIONAL_PREDICATE_FEATURE_psubj()["Value"],
                                          prepChildList=t.prepChildList)
                    
        if t.is_copular_predicate():
            self.types.add(APPENDIX_COP)
            firstEntity = t._COPULAR_PREDICATE_FEATURE_Copular_Predicate()["Value"]
            secondEntity = t._COPULAR_PREDICATE_FEATURE_Copular_Object()["Value"]
            return self.parseCopular(index = t.id,
                                     first_entity=firstEntity,
                                     second_entity=secondEntity,
                                     features = syntactic_item.get_verbal_features(t))
        
        if t.is_possesive_predicate():
            self.types.add(APPENDIX_POSS)
            possessor = t._POSSESSIVE_PREDICATE_FEATURE_Possessor()["Value"]
            possessed = t._POSSESSIVE_PREDICATE_FEATURE_Possessed()["Value"]
            possessive = t._POSSESSIVE_PREDICATE_FEATURE_Possessive()["Value"]
            return self.parsePossessive(possessor = possessor, 
                                        possessed = possessed,
                                        possessive = possessive)
        
            
        if t.is_adjectival_predicate():
            self.types.add(APPENDIX_ADJ)
            return self.parseProp(subject = t._ADJECTIVAL_PREDICATE_FEATURE_Subject()["Value"],
                                  copulaIndex = NO_INDEX,
                                  adjectiveChildList = t.adjectivalChildList,
                                  propAsHead=False)
            
        if t.is_clausal_complement():
            self.types.add(APPENDIX_COMPLEMENT)
            return self.parseComplement(compSubj = t.compSubj,
                                        compChildren = t.compChildList)
        
        if t.unhandled_advcl():
            # put each unhandled advcl as a disconnected subgraph
            # (the results of these parse calls are discarded)
            for c in t.advcl:
                self.parse(c)
            # NOTE(review): parse is re-entered with the same t; this only
            # terminates if unhandled_advcl() is falsy on the next call -
            # confirm that the check consumes or marks the advcl children.
            return self.parse(t)
        
        if t.is_verbal_predicate():
            self.types.add(APPENDIX_VERB)
            head_ret = t._VERBAL_PREDICATE_SUBTREE_Head()
            # the head value is a space separated string of verb words
            return self.parseVerbal(indexes = head_ret["Span"],
                             verbs = head_ret["Value"].split(" "),
                             arguments = t.collect_arguments(),
                             tree = t)
        
            
        
        # this else pairs with the is_verbal_predicate check only; it is
        # reached when none of the checks above returned
        else:
            # fall back - pack all the tree in a single node
            if len(t.children)==1:
                if (t.children[0].parent_relation == "nn") and (t.word.endswith(",")) and (t.children[0].word.endswith(",")):
                    #conjunction in disguise: head and its only "nn" child
                    #both end with a comma
                    child = t.children[0]
                    # the child is detached while the conjunction is built
                    # and re-attached afterwards
                    t.children = []
                    # NOTE(review): called with cc/conjElements here but
                    # with baseElm/conjResult in the conjunction branch
                    # above - confirm parseConjunction accepts both.
                    ret =  self.parseConjunction(cc = [(t.id,"and")], 
                                                conjElements = [t,child])
                    t.children = [child]
                    return ret
            
            # nodes is used as a mapping from word index to word
            nodes = t._get_subtree(filter_labels_ban)
            text = [Word(index=index,
                         word=nodes[index]) for index in sorted(nodes.keys())] 
            # text is already built in ascending index order, so the sort
            # below does not reorder it
            topNode = self.parseBottom(text = sorted(text,key=lambda x:x.index),
                        features = syntactic_item.get_verbal_features(t))

            return topNode