Exemple #1
0
 def coordPath(this,target):
     """
     Compares the first dependency edges leading from the parent
     tokens to two given tokens.

     Reads depG, parentTokens and NX from the enclosing scope.

     @param this: first token (node of depG)
     @param target: second token (node of depG)
     @return: the set of edge 'type' values that start a shortest path
              from some parent token towards both tokens, or False
              if either token is missing from depG
     """
     if not depG.has_node(this) or not depG.has_node(target):
         return False

     # only parent tokens that are present in the graph can start a path
     origins = [tok for tok in parentTokens if depG.has_node(tok)]
     routes_to_this = [NX.shortest_path(depG, tok, this)
                       for tok in origins]
     routes_to_target = [NX.shortest_path(depG, tok, target)
                         for tok in origins]

     def leading_edge_types(routes):
         # same coordination group if there is a pair of
         # shortest paths from event that start with the same edge
         # and this edge is not between the tokens of the event
         labels = set()
         for route in routes:
             if not route or len(route) < 2 or route[1] in parentTokens:
                 continue
             labels.add(depG[route[0]][route[1]]['type'])
         return labels

     labels_this = leading_edge_types(routes_to_this)
     labels_target = leading_edge_types(routes_to_target)
     return labels_this & labels_target
Exemple #2
0
 def coordPath(this, target):
     """
     Checks whether two tokens are reached from the parent tokens
     through a first edge of the same type.

     depG, parentTokens and NX are taken from the enclosing scope.

     @param this: first token (node of depG)
     @param target: second token (node of depG)
     @return: set of shared first-edge 'type' values (empty when there
              is none), or False if a token is not a node of depG
     """
     if depG.has_node(this) and depG.has_node(target):
         # shortest paths from every known parent token to each token
         to_this = []
         for tok in parentTokens:
             if depG.has_node(tok):
                 to_this.append(NX.shortest_path(depG, tok, this))
         to_target = []
         for tok in parentTokens:
             if depG.has_node(tok):
                 to_target.append(NX.shortest_path(depG, tok, target))

         # same coordination group if there is a pair of
         # shortest paths from event that start with the same edge
         # and this edge is not between the tokens of the event
         kinds_this = set()
         for hops in to_this:
             if hops and len(hops) >= 2 and hops[1] not in parentTokens:
                 kinds_this.add(depG[hops[0]][hops[1]]['type'])
         kinds_target = set()
         for hops in to_target:
             if hops and len(hops) >= 2 and hops[1] not in parentTokens:
                 kinds_target.add(depG[hops[0]][hops[1]]['type'])

         return kinds_this.intersection(kinds_target)
     return False
Exemple #3
0
    def analyse(self):
        """
        Analyses the parse, determines the unflattening strategy
        for each event, and modifies the cache accordingly.

        The semantic graph (self.semG) is processed bottom-up, starting
        from the nodes without out-going edges. Every visited node that
        has out-going edges is replaced by one copy per edge group
        returned by getGrouping(); the id of each copy and of its edges
        gets the suffix '.E<n>', where <n> is drawn from one shared
        Increment counter.
        """
        # event types whose out-going edges always become separate events
        splitTypes = ('Gene_expression', 'Transcription', 'Translation',
                      'Protein_catabolism', 'Localization',
                      'Phosphorylation')
        regulationTypes = ('Regulation', 'Positive_regulation',
                           'Negative_regulation')

        def findPath(graph, source, target):
            """
            Returns a shortest path, or False when there is none.

            The callers test the result for truthiness, i.e. they expect
            'no path' to be reported as a false value. networkx releases
            that raise NetworkXNoPath instead are normalised to that
            convention here.
            """
            try:
                return NX.shortest_path(graph, source, target)
            except NX.NetworkXNoPath:
                return False

        def getCoordGrouping(edges):
            """
            Workhorse function for determining the grouping of arguments.

            @type edges: list of 3-tuples
            @param edges: out-going edges of an event to be grouped
            @rtype: list of lists of 3-tuples
            @return: grouped edges
            """
            def firstEdgeTypes(token):
                # types of the first edge on the shortest paths from
                # the parent tokens to the given token; edges between
                # the tokens of the event itself are ignored
                paths = [findPath(depG, x, token)
                         for x in parentTokens if depG.has_node(x)]
                return set(depG[p[0]][p[1]]['type'] for p in paths
                           if p and len(p) >= 2
                           and p[1] not in parentTokens)

            def coordPath(this, target):
                # same coordination group if there is a pair of
                # shortest paths from event that start with the same edge
                # and this edge is not between the tokens of the event
                if depG.has_node(this) and depG.has_node(target):
                    return firstEdgeTypes(this).intersection(
                        firstEdgeTypes(target))
                return False

            def connected(e1, e2):
                # do not continue if not within sentence
                if e1 not in self.mapping or e2 not in self.mapping:
                    return False
                for t1 in self.mapping[e1]:
                    for t2 in self.mapping[e2]:
                        if coordPath(t1, t2):
                            return True
                return False

            if not edges:
                return []
            # where does the parent node belong to?
            sentence = self.sentences[Analyser.findSentenceId(edges[0][0])]
            depG = self.depGs[sentence]
            edgemap = dict([(x[2].attrib['e2'], x) for x in edges])
            connG = NX.Graph()
            for x in edgemap.values():
                connG.add_node(x)
            pid = edges[0][2].attrib['e1']
            parentTokens = []
            if pid in self.mapping:
                parentTokens = self.mapping[pid]
            for e1 in edgemap.keys():
                for e2 in edgemap.keys():
                    if not e1 == e2 and connected(e1, e2):
                        connG.add_edge(edgemap[e1], edgemap[e2])
            # materialise the components: the caller needs len() and
            # pop(), which a generator of sets would not provide
            return [list(c) for c in NX.connected_components(connG)]

        def getGrouping(node):
            """
            Generates the edge groups describing the unflattened event.

            @type node: cElementTree.Element
            @param node: event to be unflattened
            @rtype: list of lists of 3-tuples
            @return: grouped edges
            """
            t = node.attrib['type']
            # 'neg' edges are not considered and will be
            # removed from the final xml
            edges = [(x[0], x[1], x[2]['xmlnode'])
                     for x in self.semG.out_edges(node, True)
                     if not x[2]['xmlnode'].attrib['type'] == 'neg']
            if t in splitTypes:
                return [[e] for e in edges]
            if t == 'Binding':
                # Binding is not perfectly solvable
                if self.perfect:
                    return [edges]
                groups = getCoordGrouping(edges)
                if len(groups) == 1:
                    # data suggests that regardless of number of members
                    # in the group, the binding should be split
                    # (cases of >2 are very rare)
                    # (the decision to split events with 2 members is about
                    #  1:1 but splitting is still slightly favoured)
                    return [[e] for e in edges]
                # two groups should be split to pairwise combinations
                # (can 'respectively' be ignored?)
                # events with more than two proteins are rare
                # so three or more groups should be treated in a
                # pairwise manner
                result = []
                while groups:
                    g1 = groups.pop()
                    result.extend([(e1, e2) for g2 in groups for e1 in g1
                                   for e2 in g2])
                return result
            if t in regulationTypes:
                # Regulation is not perfectly solvable
                # but for now there no better way than the baseline
                # (can 'respectively' be ignored?)
                cause = [
                    x for x in edges if x[2].attrib['type'].startswith('Cause')
                ]
                theme = [
                    x for x in edges if x[2].attrib['type'].startswith('Theme')
                ]
                if cause and theme:
                    return [(ca, th) for ca in cause for th in theme]
                return [[e] for e in edges]
            # Proteins have sites (etc.) as successors
            # these edges will be processed later --> do nothing at this point
            if t not in ('Protein', 'Entity'):
                sys.stderr.write("Invalid event type: %s\n" % t)
            return [edges]

        # unflatten the graph in the cache
        counter = Increment()
        unprocessed_nodes = set(
            [x for x in self.semG.nodes() if not self.semG.out_edges(x)])
        while unprocessed_nodes:
            next_nodes = set()
            for current in unprocessed_nodes:
                next_nodes.update(set(self.semG.predecessors(current)))
                if self.semG.out_edges(current):
                    for edges in getGrouping(current):
                        evid = counter.get()
                        # one copy of the event per edge group
                        newN = ET.Element('entity', current.attrib)
                        newId = newN.attrib['id'] + '.E' + evid
                        newN.attrib['id'] = newId
                        self.semG.add_node(newN)
                        # out-going edges: only the members of the group
                        for e in edges:
                            newE = ET.Element('interaction', e[2].attrib)
                            newE.attrib['id'] = newE.attrib['id'] + '.E' + evid
                            newE.attrib['e1'] = newId
                            self.semG.add_edge(newN, e[1], xmlnode=newE)
                        # in-coming edges: all of them are duplicated;
                        # iterate over a snapshot as edges are added
                        for e in list(self.semG.in_edges(current, True)):
                            newE = ET.Element('interaction',
                                              e[2]['xmlnode'].attrib)
                            newE.attrib['id'] = newE.attrib['id'] + '.E' + evid
                            newE.attrib['e2'] = newId
                            self.semG.add_edge(e[0], newN, xmlnode=newE)
                    self.semG.remove_node(current)
            # ensure that nodes-to-be-processed have only out-neighbors
            # that have already been processed
            removable = set()
            for x in next_nodes:
                for y in next_nodes:
                    if not x == y and findPath(self.semG, x, y):
                        removable.add(x)
            unprocessed_nodes = next_nodes - removable
Exemple #4
0
    def analyse(self):
        """
        Analyses the parse, determines the unflattening strategy
        for each event, and modifies the cache accordingly.

        Starting from the nodes of self.semG that have no out-going
        edges and moving on to their predecessors, each node with
        out-going edges is replaced by one new 'entity' node per edge
        group (see getGrouping). Ids of the new nodes and edges are the
        old ids followed by '.E' and a value from an Increment counter.
        """
        def shortestPathOrFalse(graph, source, target):
            # The checks below treat a false value as 'no path'.
            # networkx releases that raise NetworkXNoPath for
            # unreachable targets are mapped to that convention.
            try:
                return NX.shortest_path(graph, source, target)
            except NX.NetworkXNoPath:
                return False

        def getCoordGrouping(edges):
            """
            Workhorse function for determining the grouping of arguments.

            @type edges: list of 3-tuples
            @param edges: out-going edges of an event to be grouped
            @rtype: list of lists of 3-tuples
            @return: grouped edges
            """
            def coordPath(this, target):
                if depG.has_node(this) and depG.has_node(target):
                    paths1 = [shortestPathOrFalse(depG, x, this)
                              for x in parentTokens
                              if depG.has_node(x)]
                    paths2 = [shortestPathOrFalse(depG, x, target)
                              for x in parentTokens
                              if depG.has_node(x)]
                    # same coordination group if there is a pair of
                    # shortest paths from event that start with the same edge
                    # and this edge is not between the tokens of the event
                    start1 = set(depG[p[0]][p[1]]['type'] for p in paths1
                                 if (p and
                                     len(p) >= 2 and
                                     p[1] not in parentTokens))
                    start2 = set(depG[p[0]][p[1]]['type'] for p in paths2
                                 if (p and
                                     len(p) >= 2 and
                                     p[1] not in parentTokens))
                    return start1.intersection(start2)
                return False

            def connected(e1, e2):
                # do not continue if not within sentence
                if e1 in self.mapping and e2 in self.mapping:
                    for t1 in self.mapping[e1]:
                        for t2 in self.mapping[e2]:
                            if coordPath(t1, t2):
                                return True
                return False

            if not edges:
                return []
            # where does the parent node belong to?
            sentence = self.sentences[Analyser.findSentenceId(edges[0][0])]
            depG = self.depGs[sentence]
            edgemap = dict([(x[2].attrib['e2'], x) for x in edges])
            connG = NX.Graph()
            for x in edgemap.values():
                connG.add_node(x)
            pid = edges[0][2].attrib['e1']
            parentTokens = []
            if pid in self.mapping:
                parentTokens = self.mapping[pid]
            for e1 in edgemap.keys():
                for e2 in edgemap.keys():
                    if not e1 == e2:
                        if connected(e1, e2):
                            connG.add_edge(edgemap[e1], edgemap[e2])
            # always hand back real lists: getGrouping uses len() and
            # pop() on the result, which a generator does not support
            return [list(group) for group in NX.connected_components(connG)]

        def getGrouping(node):
            """
            Generates the edge groups describing the unflattened event.

            @type node: cElementTree.Element
            @param node: event to be unflattened
            @rtype: list of lists of 3-tuples
            @return: grouped edges
            """
            t = node.attrib['type']
            # 'neg' edges are not considered and will be
            # removed from the final xml
            edges = [(x[0], x[1], x[2]['xmlnode'])
                     for x in self.semG.out_edges(node, True)
                     if not x[2]['xmlnode'].attrib['type'] == 'neg']
            if t in ['Gene_expression', 'Transcription',
                     'Translation', 'Protein_catabolism',
                     'Localization', 'Phosphorylation']:
                # every out-going edge becomes an event of its own
                return [[e] for e in edges]
            elif t == 'Binding':
                # Binding is not perfectly solvable
                if self.perfect:
                    return [edges]
                groups = getCoordGrouping(edges)
                if len(groups) == 1:
                    # data suggests that regardless of number of members
                    # in the group, the binding should be split
                    # (cases of >2 are very rare)
                    # (the decision to split events with 2 members is about
                    #  1:1 but splitting is still slightly favoured)
                    return [[e] for e in edges]
                else:
                    # two groups should be split to pairwise combinations
                    # (can 'respectively' be ignored?)
                    # events with more than two proteins are rare
                    # so three or more groups should be treated in a
                    # pairwise manner
                    result = []
                    while groups:
                        g1 = groups.pop()
                        result.extend([(e1, e2)
                                       for g2 in groups
                                       for e1 in g1
                                       for e2 in g2])
                    return result
            elif t in ['Regulation', 'Positive_regulation',
                       'Negative_regulation']:
                # Regulation is not perfectly solvable
                # but for now there no better way than the baseline
                # (can 'respectively' be ignored?)
                cause = [x for x in edges if
                         x[2].attrib['type'].startswith('Cause')]
                theme = [x for x in edges if
                         x[2].attrib['type'].startswith('Theme')]
                if cause and theme:
                    return [(ca, th) for ca in cause for th in theme]
                else:
                    return [[e] for e in edges]
            elif t in ['Protein']:
                # Proteins have sites (etc.) as successors
                # these edges will be processed later --> do nothing at this point
                pass
            elif t != "Entity":
                sys.stderr.write("Invalid event type: %s\n" % t)
            return [edges]

        # unflatten the graph in the cache
        counter = Increment()
        unprocessed_nodes = set([x for x in self.semG.nodes()
                                 if not self.semG.out_edges(x)])
        while unprocessed_nodes:
            next_nodes = set()
            for current in unprocessed_nodes:
                next_nodes.update(set(self.semG.predecessors(current)))
                if self.semG.out_edges(current):
                    groups = getGrouping(current)
                    for edges in groups:
                        evid = counter.get()
                        # copy of the event for this edge group
                        newN = ET.Element('entity', current.attrib)
                        newId = newN.attrib['id'] + '.E' + evid
                        newN.attrib['id'] = newId
                        self.semG.add_node(newN)
                        # the copy keeps only the out-going edges of the group
                        for e in edges:
                            newE = ET.Element('interaction', e[2].attrib)
                            newEid = newE.attrib['id'] + '.E' + evid
                            newE.attrib['id'] = newEid
                            newE.attrib['e1'] = newId
                            self.semG.add_edge(newN, e[1], xmlnode=newE)
                        # every in-coming edge is redirected to the copy;
                        # a snapshot is taken because edges are added
                        # inside the loop
                        for e in list(self.semG.in_edges(current, True)):
                            newE = ET.Element('interaction',
                                              e[2]['xmlnode'].attrib)
                            newEid = newE.attrib['id'] + '.E' + evid
                            newE.attrib['id'] = newEid
                            newE.attrib['e2'] = newId
                            self.semG.add_edge(e[0], newN, xmlnode=newE)
                    self.semG.remove_node(current)
            # ensure that nodes-to-be-processed have only out-neighbors
            # that have already been processed
            removable = set()
            for x in next_nodes:
                for y in next_nodes:
                    if not x == y and shortestPathOrFalse(self.semG, x, y):
                        removable.add(x)
            unprocessed_nodes = next_nodes - removable