def analyse(self):
    """
    Analyses the parse, determines the unflattening strategy for each
    event, and modifies the cache accordingly.
    """
    def getCoordGrouping(edges):
        """
        Workhorse function for determining the grouping of arguments.

        @type edges: list of 3-tuples
        @param edges: out-going edges of an event to be grouped
        @rtype: list of lists of 3-tuples
        @return: grouped edges
        """
        def coordPath(this, target):
            if depG.has_node(this) and depG.has_node(target):
                paths1 = [NX.shortest_path(depG, x, this)
                          for x in parentTokens if depG.has_node(x)]
                paths2 = [NX.shortest_path(depG, x, target)
                          for x in parentTokens if depG.has_node(x)]
                # same coordination group if there is a pair of shortest
                # paths from the event that start with the same edge and
                # this edge is not between the tokens of the event
                start1 = set([depG[p[0]][p[1]]['type'] for p in paths1
                              if (p and len(p) >= 2
                                  and p[1] not in parentTokens)])
                start2 = set([depG[p[0]][p[1]]['type'] for p in paths2
                              if (p and len(p) >= 2
                                  and p[1] not in parentTokens)])
                return start1.intersection(start2)
            return False

        def connected(e1, e2):
            # do not continue if not within the sentence
            if self.mapping.has_key(e1):
                for t1 in self.mapping[e1]:
                    # do not continue if not within the sentence
                    if self.mapping.has_key(e2):
                        for t2 in self.mapping[e2]:
                            if coordPath(t1, t2):
                                return True
            return False

        if not edges:
            return []
        # which sentence does the parent node belong to?
        sentence = self.sentences[Analyser.findSentenceId(edges[0][0])]
        depG = self.depGs[sentence]
        edgemap = dict([(x[2].attrib['e2'], x) for x in edges])
        connG = NX.Graph()
        for x in edgemap.values():
            connG.add_node(x)
        pid = edges[0][2].attrib['e1']
        parentTokens = []
        if self.mapping.has_key(pid):
            parentTokens = self.mapping[pid]
        # connect arguments that belong to the same coordination group;
        # the groups are then the connected components of this graph
        for e1 in edgemap.keys():
            for e2 in edgemap.keys():
                if not e1 == e2:
                    if connected(e1, e2):
                        connG.add_edge(edgemap[e1], edgemap[e2])
        return NX.connected_components(connG)

    def getGrouping(node):
        """
        Generates the edge groups describing the unflattened event.

        @type node: cElementTree.Element
        @param node: event to be unflattened
        @rtype: list of lists of 3-tuples
        @return: grouped edges
        """
        uid = node.attrib['id']
        t = node.attrib['type']
        # 'neg' edges are not considered and will be
        # removed from the final xml
        edges = [(x[0], x[1], x[2]['xmlnode'])
                 for x in self.semG.out_edges(node, True)
                 if not x[2]['xmlnode'].attrib['type'] == 'neg']
        if t in ['Gene_expression', 'Transcription',
                 'Translation', 'Protein_catabolism']:
            # always one argument per event
            result = [[e] for e in edges]
            #sys.stderr.write("Splitting %s (%s) into %s - %s\n" % (uid, t, len(result), [[y[2].attrib['id'] for y in x] for x in result]))
            return result
        elif t == 'Localization':
            result = [[e] for e in edges]
            #sys.stderr.write("Splitting %s (%s) into %s - %s\n" % (uid, t, len(result), [[y[2].attrib['id'] for y in x] for x in result]))
            return result
        elif t == 'Binding':
            # Binding is not perfectly solvable
            if self.perfect:
                #sys.stderr.write("Skipping %s (%s)\n" % (uid, t))
                return [edges]
            groups = getCoordGrouping(edges)
            if len(groups) == 1:
                # data suggests that regardless of the number of members
                # in the group, the binding should be split
                # (cases of >2 are very rare)
                # (the decision to split events with 2 members is about
                # 1:1 but splitting is still slightly favoured)
                result = [[e] for e in edges]
                #sys.stderr.write("Splitting %s (%s) into %s - %s\n" % (uid, t, len(result), [[y[2].attrib['id'] for y in x] for x in result]))
                return result
            else:
                # two groups should be split into pairwise combinations
                # (can 'respectively' be ignored?)
                # events with more than two proteins are rare,
                # so three or more groups should be treated in a
                # pairwise manner
                result = []
                while groups:
                    g1 = groups.pop()
                    result.extend([(e1, e2) for g2 in groups
                                   for e1 in g1 for e2 in g2])
                #sys.stderr.write("Generating inter-group pairs for %s (%s) - %s\n" % (uid, t, [[y[2].attrib['id'] for y in x] for x in result]))
                return result
        elif t == 'Phosphorylation':
            result = [[e] for e in edges]
            #sys.stderr.write("Splitting %s (%s) into %s - %s\n" % (uid, t, len(result), [[y[2].attrib['id'] for y in x] for x in result]))
            return result
        elif t in ['Regulation', 'Positive_regulation',
                   'Negative_regulation']:
            # Regulation is not perfectly solvable,
            # but for now there is no better way than the baseline
            # (can 'respectively' be ignored?)
            cause = [x for x in edges
                     if x[2].attrib['type'].startswith('Cause')]
            theme = [x for x in edges
                     if x[2].attrib['type'].startswith('Theme')]
            if cause and theme:
                # one event per Cause-Theme combination
                result = [(ca, th) for ca in cause for th in theme]
                #sys.stderr.write("Generating Cause-Theme combinations for %s (%s) - %s\n" % (uid, t, [[y[2].attrib['id'] for y in x] for x in result]))
                return result
            else:
                result = [[e] for e in edges]
                #sys.stderr.write("Splitting %s (%s) into %s - %s\n" % (uid, t, len(result), [[y[2].attrib['id'] for y in x] for x in result]))
                return result
        elif t in ['Protein']:
            # Proteins have sites (etc.) as successors;
            # these edges will be processed later --> do nothing at this point
            result = [edges]
        else:
            if t != "Entity":
                sys.stderr.write("Invalid event type: %s\n" % t)
        return [edges]

    # unflatten the graph in the cache,
    # processing events bottom-up starting from the leaf nodes
    counter = Increment()
    unprocessed_nodes = set([x for x in self.semG.nodes()
                             if not self.semG.out_edges(x)])
    while unprocessed_nodes:
        next_nodes = set()
        for current in unprocessed_nodes:
            next_nodes.update(set(self.semG.predecessors(current)))
            if self.semG.out_edges(current):
                groups = getGrouping(current)
                # replace the flat event with one new node per group
                for edges in groups:
                    evid = counter.get()
                    newN = ET.Element('entity', current.attrib)
                    newId = newN.attrib['id'] + '.E' + evid
                    newN.attrib['id'] = newId
                    self.semG.add_node(newN)
                    for e in edges:
                        newE = ET.Element('interaction', e[2].attrib)
                        newEid = newE.attrib['id'] + '.E' + evid
                        newE.attrib['id'] = newEid
                        newE.attrib['e1'] = newId
                        self.semG.add_edge(newN, e[1], xmlnode=newE)
                    # copy the incoming edges of the original event
                    # to each duplicate
                    for e in self.semG.in_edges(current, True):
                        newE = ET.Element('interaction',
                                          e[2]['xmlnode'].attrib)
                        newEid = newE.attrib['id'] + '.E' + evid
                        newE.attrib['id'] = newEid
                        newE.attrib['e2'] = newId
                        self.semG.add_edge(e[0], newN, xmlnode=newE)
                self.semG.remove_node(current)
        # ensure that nodes-to-be-processed have only out-neighbors
        # that have already been processed
        removable = set()
        for x in next_nodes:
            for y in next_nodes:
                if NX.shortest_path(self.semG, x, y) and not x == y:
                    removable.add(x)
        unprocessed_nodes = next_nodes - removable
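# --------------------------------------------------------------------------
# Illustrative sketch (hypothetical, not part of the Analyser class above):
# the 'Binding' branch groups arguments into coordination groups via
# connected components and then generates events pairwise across groups.
# The argument ids and the single grouping edge below are invented for
# demonstration only; this block is a minimal standalone example and is
# only run when the module is executed directly.
if __name__ == '__main__':
    import networkx as _nx

    def _pairwise_across_groups(groups):
        # mirrors the Binding branch above: pop one group and pair each of
        # its members with every member of the remaining groups
        groups = [list(g) for g in groups]
        pairs = []
        while groups:
            g1 = groups.pop()
            pairs.extend([(a, b) for g2 in groups for a in g1 for b in g2])
        return pairs

    _connG = _nx.Graph()
    _connG.add_nodes_from(['T1', 'T2', 'T3'])   # three Binding arguments
    _connG.add_edge('T1', 'T2')                 # T1 and T2 are coordinated
    _groups = _nx.connected_components(_connG)  # -> {T1, T2} and {T3}
    # expected output (pair order may vary): [('T3', 'T1'), ('T3', 'T2')]
    print(_pairwise_across_groups(_groups))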