def getMatchingPhrases(entity, phraseOffsets, phraseDict):
    """
    Return the phrases whose offset fits between an entity's two spans.

    A phrase offset is accepted when Range.contains(maxOffset, phraseOffset)
    and Range.contains(phraseOffset, minOffset) both hold, where maxOffset
    comes from the entity's "charOffset" attribute and minOffset from its
    "altOffset" attribute (falling back to "charOffset" when "altOffset"
    is not set).

    @param entity: object with a get() method (presumably an XML element)
    @param phraseOffsets: iterable of phrase offsets, also used as keys
                          of phraseDict
    @param phraseDict: maps a phrase offset to a list of phrases
    @return: list of all phrases of the accepted offsets; always empty
             for entities whose "isName" attribute is "True"
    """
    # Named entities are never matched against phrases
    if entity.get("isName") == "True":
        return []

    outerSpan = Range.charOffsetToSingleTuple(entity.get("charOffset"))
    altOffset = entity.get("altOffset")
    if altOffset is None:
        innerSpan = outerSpan
    else:
        innerSpan = Range.charOffsetToSingleTuple(altOffset)

    found = []
    for span in phraseOffsets:
        if not Range.contains(outerSpan, span):
            continue
        if Range.contains(span, innerSpan):
            found.extend(phraseDict[span])
    return found
def getMatchingPhrases(entity, phraseOffsets, phraseDict):
    """
    Collect the phrases lying between the entity's "altOffset" and
    "charOffset" spans.

    Entities with isName == "True" never match and yield an empty list.
    When the entity has no "altOffset", its "charOffset" is used for
    both bounds. phraseDict is indexed with the members of phraseOffsets
    and must map each of them to a list of phrases.
    """
    if entity.get("isName") == "True":
        return []

    widest = Range.charOffsetToSingleTuple(entity.get("charOffset"))
    narrowest = entity.get("altOffset")
    if narrowest is None:
        narrowest = widest
    else:
        narrowest = Range.charOffsetToSingleTuple(narrowest)

    matches = []
    for offset in phraseOffsets:
        # The phrase must fit inside the widest span and itself cover
        # the narrowest one
        fitsInside = Range.contains(widest, offset)
        if fitsInside and Range.contains(offset, narrowest):
            matches += phraseDict[offset]
    return matches
def getNECounts(phrases, entities):
    """
    Count, for each phrase, the given entities located within it.

    Only entities whose "given" attribute is "True" are considered. An
    entity is counted for a phrase when Range.contains() reports that the
    phrase's "charOffset" span contains the entity's "charOffset" span.

    @return: dictionary mapping each phrase object to its count
    """
    counts = {}
    for phrase in phrases:
        phraseSpan = Range.charOffsetToSingleTuple(phrase.get("charOffset"))
        total = 0
        for entity in entities:
            # only check names
            if entity.get("given") == "True":
                entitySpan = Range.charOffsetToSingleTuple(entity.get("charOffset"))
                if Range.contains(phraseSpan, entitySpan):
                    total += 1
        counts[phrase] = total
    return counts
def getNECounts(phrases, entities):
    """
    Build a {phrase: count} dictionary of the given ("given" == "True")
    entities whose "charOffset" span is contained in the phrase's
    "charOffset" span, as judged by Range.contains().
    """
    necCounts = {}
    for phrase in phrases:
        outer = Range.charOffsetToSingleTuple(phrase.get("charOffset"))
        hits = 0
        for candidate in entities:
            if candidate.get("given") != "True":
                # only check names
                continue
            inner = Range.charOffsetToSingleTuple(candidate.get("charOffset"))
            if Range.contains(outer, inner):
                hits += 1
        necCounts[phrase] = hits
    return necCounts
def groupDependencies(elements):
    """
    Wrap each dependency in a small structure and link it to its
    best-scoring nested dependency.

    The tokens are sorted with orderTokens and every dependency is mapped
    to the (low, high) pair of the sorted indices of its "t1" and "t2"
    tokens. For each dependency d1, every other dependency d2 with a
    different range for which Range.contains(d1 range, d2 range) holds is
    scored with

        abs((d2.low - d1.low) - (d1.high - d2.high))

    i.e. the difference between the left and right margins, so a lower
    score means d2 is more evenly centered in d1. The lowest-scoring d2
    becomes d1's child.

    @param elements: object with "tokens" and "dependencies" sequences
                     whose members have a get() method
    @return: list of dictionaries with the keys "range", "dep", "child"
             (another such dictionary or None) and "childScore" (score of
             the chosen child, 9999 when there is none)
    @raise KeyError: if a dependency refers to a token id that is not
                     among elements.tokens
    """
    tokens = sorted(elements.tokens, cmp=orderTokens)
    indexByTokenId = {}
    for index, token in enumerate(tokens):
        indexByTokenId[token.get("id")] = index

    depStructs = []
    for dependency in elements.dependencies:
        begin = indexByTokenId[dependency.get("t1")]
        end = indexByTokenId[dependency.get("t2")]
        if begin > end:
            # Store the range in ascending token order
            begin, end = end, begin
        depStructs.append({"range": (begin, end),
                           "dep": dependency,
                           "child": None,
                           "childScore": 9999})

    for d1 in depStructs:
        for d2 in depStructs:
            # Identity check: comparing the dictionaries by value would
            # recurse through the "child" links
            if d1 is d2:
                continue
            if d1["range"] != d2["range"] and Range.contains(d1["range"], d2["range"]):
                score = abs((d2["range"][0] - d1["range"][0]) - (d1["range"][1] - d2["range"][1]))
                if score < d1["childScore"]:
                    d1["child"] = d2
                    # Remember the best score so far; without this the
                    # last contained dependency would always win
                    d1["childScore"] = score
    return depStructs
def mergeSentences(input, output, verbose=False):
    """
    Merge the sentence elements of each document into the document.

    For every "document" element of the corpus, the child "sentence"
    elements are removed, their texts ("head" + "text" + "tail") are
    concatenated into the document's "text" attribute and their "entity"
    and "interaction" elements are moved directly under the document.
    Entity "charOffset" and "headOffset" values are shifted by the begin
    offset of their sentence, and entities and interactions get new
    document-level ids ("<docId>.e<N>", "<docId>.i<N>"). The "e1", "e2"
    and "siteOf" attributes of the interactions are re-mapped to the new
    ids. Documents without children are left untouched.

    If the sentences have no "charOffset", the sentence offsets are
    calculated by joining the sentence texts with single spaces.

    @param input: the corpus, in any form accepted by ETUtils.ETFromObj
    @param output: target passed to ETUtils.write, or None to skip writing
    @param verbose: if True, warn about documents whose stored text has
                    trailing whitespace missing from the combined text
    @return: the modified corpus element tree
    @raise Exception: if a document has non-sentence children, if only
                      some of its sentences define offsets, or if the
                      mapped offsets or texts are inconsistent with the
                      existing "origOffset" / "text" attributes
    """
    print >> sys.stderr, "Merging sentences into documents"
    print >> sys.stderr, "Loading corpus file", input
    corpusTree = ETUtils.ETFromObj(input)
    corpusRoot = corpusTree.getroot()

    counts = defaultdict(int)
    for document in corpusRoot.findall("document"):
        counts["documents"] += 1
        # Check that the document has only sentence elements as children
        children = [x for x in document]
        docChildTypes = sorted(set([x.tag for x in children]))
        if len(docChildTypes) == 0:
            counts["documents-with-no-sentences"] += 1
            continue
        elif len(docChildTypes) > 1 or docChildTypes[0] != "sentence":
            raise Exception("Document '" + str(document.get("id")) + "' has non-sentence children: " + str(docChildTypes))

        # Process all the child sentence elements
        docId = document.get("id")
        interactions = []
        entities = []
        entityById = {}
        interactionById = {}
        combinedText = ""
        calculatedOffset = (0, 0)
        for sentence in children:
            document.remove(sentence)
            sentenceText = sentence.get("head", "") + sentence.get("text", "") + sentence.get("tail", "")
            sentOffset = sentence.get("charOffset")
            # Either all or none of the sentences must define an offset
            if sentence == children[0]:
                noDefinedOffsets = sentOffset is None
            elif (sentOffset is None) != noDefinedOffsets:
                raise Exception("Only some sentences in document '" + str(docId) + "' have defined offsets")
            if sentOffset is None:
                # Sentences are joined with a single space
                if sentence != children[-1]:
                    sentenceText = sentenceText + " "
                calculatedOffset = (calculatedOffset[1], calculatedOffset[1] + len(sentenceText))
                sentOffset = calculatedOffset
            else:
                sentOffset = Range.charOffsetToSingleTuple(sentOffset)
            combinedText += sentenceText

            # Collect and update the entity elements
            for entity in sentence.findall("entity"):
                # Map sentence-level entity offsets to document level
                for offsetKey in ("charOffset", "headOffset"):
                    if entity.get(offsetKey) is not None:
                        offset = Range.charOffsetToTuples(entity.get(offsetKey))
                        for i in range(len(offset)):
                            offset[i] = (offset[i][0] + sentOffset[0], offset[i][1] + sentOffset[0])
                        entity.set(offsetKey, Range.tuplesToCharOffset(offset))
                # Compare mapped offsets to origOffset, if available
                if entity.get("origOffset") is not None:
                    if entity.get("charOffset") != entity.get("origOffset"):
                        raise Exception("Document '" + str(document.get("id")) + "' entity '" + str(entity.get("id")) + "' new charOffset differs from origOffset: " + str([entity.get("charOffset"), entity.get("origOffset")]))
                    counts["checked-origOffsets"] += 1
                    del entity.attrib["origOffset"]
                assert entity.get("id") not in entityById
                entityById[entity.get("id")] = entity  # For re-mapping the interaction 'e1' and 'e2' attributes
                entities.append(entity)
                counts["moved-entities"] += 1

            # Collect and update the interaction elements
            for interaction in sentence.findall("interaction"):
                assert interaction.get("id") not in interactionById
                interactionById[interaction.get("id")] = interaction  # For re-mapping the interaction 'siteOf' attributes
                interactions.append(interaction)
                counts["moved-interactions"] += 1

        # Check that the combined sentence text matches the document text, if available
        docText = document.get("text")
        if docText is not None:
            if docText != combinedText:
                if combinedText == docText[0:len(combinedText)] and docText[len(combinedText):].strip() == "":
                    if verbose:
                        print >> sys.stderr, "Warning, document '" + str(document.get("id")) + "' text has trailing whitespace not included in the combined sentence text"
                    combinedText = docText
                    counts["missing-trailing-whitespace"] += 1
                else:
                    raise Exception("Document '" + str(document.get("id")) + "' text differs from combined sentence text: " + str([docText, combinedText]))
            # NOTE(review): the original nesting of this counter was lost;
            # it is incremented for each document whose stored text was
            # compared - confirm this is the intended statistic
            counts["checked-document-texts"] += 1

        # Check that the entities' texts match the document text
        for entity in entities:
            offset = Range.charOffsetToTuples(entity.get("charOffset"))
            if len(offset) == 1:  # Compare only continuous entities
                if not Range.contains((0, len(combinedText)), offset[0]):
                    raise Exception("Document '" + str(document.get("id")) + "' entity '" + str(entity.get("id")) + "' offset is not contained in combined sentence text: " + str([entity.attrib, offset, [0, len(combinedText)], combinedText]))
                combTextSpan = combinedText[offset[0][0]:offset[0][1]]
                if entity.get("text") != combTextSpan:
                    raise Exception("Document '" + str(document.get("id")) + "' entity '" + str(entity.get("id")) + "' text does not match combined sentence text: " + str([entity.get("text"), combTextSpan]))
                counts["checked-charOffsets"] += 1

        # Set the combined text as the document text
        document.set("text", combinedText)

        # Update entity and interaction ids (not done earlier so that possible
        # error messages will refer to original ids, also because of siteOf-remapping)
        for i, entity in enumerate(entities):
            entity.set("id", docId + ".e" + str(i))  # Update the id for the document level
        for i, interaction in enumerate(interactions):
            # Every interaction must be renamed, not just the one the
            # collection loop happened to visit last
            interaction.set("id", docId + ".i" + str(i))  # Update the id for the document level

        # Update interaction e1 and e2 ids (cannot be done earlier because
        # interactions may refer to entities from multiple sentences)
        for interaction in interactions:
            for entKey in ("e1", "e2"):
                interaction.set(entKey, entityById[interaction.get(entKey)].get("id"))
            if interaction.get("siteOf") is not None:
                interaction.set("siteOf", interactionById[interaction.get("siteOf")].get("id"))

        # Add the entity and interaction elements to the document
        document.extend(entities)
        document.extend(interactions)

    print >> sys.stderr, "Counts:", dict(counts)

    if output is not None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree
def buildFeatures(self, sentenceGraph, entity1, entity2, token1, token2, path):
    """
    Build the feature dictionary for a candidate edge from entity1
    (head token token1) to entity2 (head token token2).

    Each feature group is switched on or off by an entry of self.styles.
    Most groups are delegated to a feature builder object: the shared
    'features' dictionary is attached with setFeatureVector(), the
    builder's build methods are called, and the dictionary is detached
    again with setFeatureVector(None). The remaining groups set
    features[self.featureSet.getId(<name>)] = 1 directly.

    NOTE(review): feature ids come from self.featureSet.getId(); if they
    are allocated on first use, the order of the statements below
    determines the ids - confirm before reordering anything.

    @param sentenceGraph: sentence whose 'tokens' and 'dependencyGraph'
                          are read
    @param entity1, entity2: the entity elements of the edge
    @param token1, token2: the tokens of the two entities
    @param path: token path between token1 and token2
    @return: the feature dictionary
    """
    features = {}
    if not self.styles["no_trigger_features"]: # F 85.52 -> 85.55
        self.triggerFeatureBuilder.setFeatureVector(features)
        self.triggerFeatureBuilder.tag = "trg1_"
        self.triggerFeatureBuilder.buildFeatures(token1)
        self.triggerFeatureBuilder.tag = "trg2_"
        self.triggerFeatureBuilder.buildFeatures(token2)
        self.triggerFeatureBuilder.setFeatureVector(None)
    # REL features
    if self.styles["rel_features"] and not self.styles["no_task"]:
        self.relFeatureBuilder.setFeatureVector(features)
        self.relFeatureBuilder.tag = "rel1_"
        self.relFeatureBuilder.buildAllFeatures(sentenceGraph.tokens, sentenceGraph.tokens.index(token1))
        self.relFeatureBuilder.tag = "rel2_"
        self.relFeatureBuilder.buildAllFeatures(sentenceGraph.tokens, sentenceGraph.tokens.index(token2))
        self.relFeatureBuilder.setFeatureVector(None)
    if self.styles["bacteria_renaming"] and not self.styles["no_task"]:
        self.bacteriaRenamingFeatureBuilder.setFeatureVector(features)
        self.bacteriaRenamingFeatureBuilder.buildPairFeatures(entity1, entity2)
        #self.bacteriaRenamingFeatureBuilder.buildSubstringFeatures(entity1, entity2) # decreases perf. 74.76 -> 72.41
        self.bacteriaRenamingFeatureBuilder.setFeatureVector(None)
    # Containment features between the character spans of the two entities
    if self.styles["co_features"] and not self.styles["no_task"]:
        e1Offset = Range.charOffsetToSingleTuple(entity1.get("charOffset"))
        e2Offset = Range.charOffsetToSingleTuple(entity2.get("charOffset"))
        if Range.contains(e1Offset, e2Offset):
            features[self.featureSet.getId("e1_contains_e2")] = 1
            if entity2.get("given") == "True":
                features[self.featureSet.getId("e1_contains_e2name")] = 1
        if Range.contains(e2Offset, e1Offset):
            features[self.featureSet.getId("e2_contains_e1")] = 1
            if entity1.get("given") == "True":
                features[self.featureSet.getId("e2_contains_e1name")] = 1
    if self.styles["drugbank_features"]:
        self.drugFeatureBuilder.setFeatureVector(features)
        self.drugFeatureBuilder.tag = "ddi_"
        self.drugFeatureBuilder.buildPairFeatures(entity1, entity2)
        if self.styles["ddi_mtmx"]:
            self.drugFeatureBuilder.buildMTMXFeatures(entity1, entity2)
        self.drugFeatureBuilder.setFeatureVector(None)
    if self.styles["graph_kernel"]:
        self.graphKernelFeatureBuilder.setFeatureVector(features, entity1, entity2)
        self.graphKernelFeatureBuilder.buildGraphKernelFeatures(sentenceGraph, path)
        self.graphKernelFeatureBuilder.setFeatureVector(None)
    if self.styles["entity_type"]:
        e1Type = self.multiEdgeFeatureBuilder.getEntityType(entity1)
        e2Type = self.multiEdgeFeatureBuilder.getEntityType(entity2)
        features[self.featureSet.getId("e1_"+e1Type)] = 1
        features[self.featureSet.getId("e2_"+e2Type)] = 1
        # The distance is the number of tokens on the path
        features[self.featureSet.getId("distance_"+str(len(path)))] = 1
    if not self.styles["no_dependency"]:
        #print "Dep features"
        self.multiEdgeFeatureBuilder.setFeatureVector(features, entity1, entity2)
        #self.multiEdgeFeatureBuilder.buildStructureFeatures(sentenceGraph, paths) # remove for fast
        if not self.styles["disable_entity_features"]:
            self.multiEdgeFeatureBuilder.buildEntityFeatures(sentenceGraph)
        self.multiEdgeFeatureBuilder.buildPathLengthFeatures(path)
        if not self.styles["disable_terminus_features"]:
            self.multiEdgeFeatureBuilder.buildTerminusTokenFeatures(path, sentenceGraph) # remove for fast
        if not self.styles["disable_single_element_features"]:
            self.multiEdgeFeatureBuilder.buildSingleElementFeatures(path, sentenceGraph)
        if not self.styles["disable_ngram_features"]:
            #print "NGrams"
            self.multiEdgeFeatureBuilder.buildPathGrams(2, path, sentenceGraph) # remove for fast
            self.multiEdgeFeatureBuilder.buildPathGrams(3, path, sentenceGraph) # remove for fast
            self.multiEdgeFeatureBuilder.buildPathGrams(4, path, sentenceGraph) # remove for fast
        #self.buildEdgeCombinations(path, edges, sentenceGraph, features) # remove for fast
        #if edges != None:
        #    self.multiEdgeFeatureBuilder.buildTerminusFeatures(path[0], edges[0][1]+edges[1][0], "t1", sentenceGraph) # remove for fast
        #    self.multiEdgeFeatureBuilder.buildTerminusFeatures(path[-1], edges[len(path)-1][len(path)-2]+edges[len(path)-2][len(path)-1], "t2", sentenceGraph) # remove for fast
        if not self.styles["disable_path_edge_features"]:
            self.multiEdgeFeatureBuilder.buildPathEdgeFeatures(path, sentenceGraph)
        self.multiEdgeFeatureBuilder.buildSentenceFeatures(sentenceGraph)
        self.multiEdgeFeatureBuilder.setFeatureVector(None)
    if self.styles["nodalida"]:
        self.nodalidaFeatureBuilder.setFeatureVector(features, entity1, entity2)
        shortestPaths = self.nodalidaFeatureBuilder.buildShortestPaths(sentenceGraph.dependencyGraph, path)
        # NOTE(review): looks like a leftover debug print to stdout
        print shortestPaths
        if len(shortestPaths) > 0:
            self.nodalidaFeatureBuilder.buildNGrams(shortestPaths, sentenceGraph)
        self.nodalidaFeatureBuilder.setFeatureVector(None)
    if self.styles["linear_features"]:
        self.tokenFeatureBuilder.setFeatureVector(features)
        # Find the sentence positions of the two tokens.
        # NOTE(review): token1Index / token2Index stay unbound if a token
        # is not in sentenceGraph.tokens - presumably cannot happen; verify
        for i in range(len(sentenceGraph.tokens)):
            if sentenceGraph.tokens[i] == token1:
                token1Index = i
            if sentenceGraph.tokens[i] == token2:
                token2Index = i
        # linearPreTag is only used by the commented-out token grams below
        linearPreTag = "linfw_"
        if token1Index > token2Index:
            # Make token1Index the leftmost of the two positions
            token1Index, token2Index = token2Index, token1Index
            linearPreTag = "linrv_"
        self.tokenFeatureBuilder.buildLinearOrderFeatures(token1Index, sentenceGraph, 2, 2, preTag="linTok1")
        self.tokenFeatureBuilder.buildLinearOrderFeatures(token2Index, sentenceGraph, 2, 2, preTag="linTok2")
        # Before, middle, after
        # self.tokenFeatureBuilder.buildTokenGrams(0, token1Index-1, sentenceGraph, "bf")
        # self.tokenFeatureBuilder.buildTokenGrams(token1Index+1, token2Index-1, sentenceGraph, "bw")
        # self.tokenFeatureBuilder.buildTokenGrams(token2Index+1, len(sentenceGraph.tokens)-1, sentenceGraph, "af")
        # before-middle, middle, middle-after
        # self.tokenFeatureBuilder.buildTokenGrams(0, token2Index-1, sentenceGraph, linearPreTag+"bf", max=2)
        # self.tokenFeatureBuilder.buildTokenGrams(token1Index+1, token2Index-1, sentenceGraph, linearPreTag+"bw", max=2)
        # self.tokenFeatureBuilder.buildTokenGrams(token1Index+1, len(sentenceGraph.tokens)-1, sentenceGraph, linearPreTag+"af", max=2)
        self.tokenFeatureBuilder.setFeatureVector(None)
    if self.styles["random"]:
        self.randomFeatureBuilder.setFeatureVector(features)
        self.randomFeatureBuilder.buildRandomFeatures(100, 0.01)
        self.randomFeatureBuilder.setFeatureVector(None)
    if self.styles["genia_features"] and not self.styles["no_task"]:
        e1Type = entity1.get("type")
        e2Type = entity2.get("type") # not used in this branch
        # The first entity must not be a given entity
        assert(entity1.get("given") in (None, "False"))
        if entity2.get("given") == "True":
            features[self.featureSet.getId("GENIA_target_protein")] = 1
        else:
            features[self.featureSet.getId("GENIA_nested_event")] = 1
        if e1Type.find("egulation") != -1: # leave r out to avoid problems with capitalization
            if entity2.get("given") == "True":
                features[self.featureSet.getId("GENIA_regulation_of_protein")] = 1
            else:
                features[self.featureSet.getId("GENIA_regulation_of_event")] = 1
    if self.styles["bi_features"]:
        # Make features based on entity types
        e1Type = entity1.get("type")
        e2Type = entity2.get("type")
        e1SuperType = str(self.getBISuperType(e1Type))
        e2SuperType = str(self.getBISuperType(e2Type))
        features[self.featureSet.getId("BI_e1_"+e1Type)] = 1
        features[self.featureSet.getId("BI_e2_"+e2Type)] = 1
        features[self.featureSet.getId("BI_e1sup_"+e1SuperType)] = 1
        features[self.featureSet.getId("BI_e2sup_"+e2SuperType)] = 1
        features[self.featureSet.getId("BI_e1e2_"+e1Type+"_"+e2Type)] = 1
        features[self.featureSet.getId("BI_e1e2sup_"+e1SuperType+"_"+e2SuperType)] = 1
    if self.styles["sdb_features"]:
        e1Type = entity1.get("type")
        e2Type = entity2.get("type")
        features[self.featureSet.getId("SDB_e1_"+e1Type)] = 1
        features[self.featureSet.getId("SDB_e2_"+e2Type)] = 1
        features[self.featureSet.getId("SDB_e1e2_"+e1Type+"_"+e2Type)] = 1
        if e1Type == e2Type:
            features[self.featureSet.getId("SDB_e1e2_equal")] = 1
            features[self.featureSet.getId("SDB_e1e2_equal_" + e1Type)] = 1
        # NOTE(review): the super types are converted with str() and then
        # iterated, so the loops below run over the individual characters
        # of the string - confirm whether iterating the values returned by
        # getSeeDevSuperTypes() was intended (changing this would change
        # the generated feature names)
        e1SuperTypes = str(self.getSeeDevSuperTypes(e1Type))
        e2SuperTypes = str(self.getSeeDevSuperTypes(e2Type))
        for e1SuperType in e1SuperTypes:
            for e2SuperType in e2SuperTypes:
                features[self.featureSet.getId("SDB_e1sup_"+e1SuperType)] = 1
                features[self.featureSet.getId("SDB_e2sup_"+e2SuperType)] = 1
                features[self.featureSet.getId("SDB_e1e2sup_"+e1SuperType+"_"+e2SuperType)] = 1
                if e1SuperType == e2SuperType:
                    features[self.featureSet.getId("SDB_e1e2sup_equal")] = 1
                    features[self.featureSet.getId("SDB_e1e2sup_equal_" + e1SuperType)] = 1
    if self.styles["ontobiotope_features"]:
        self.ontobiotopeFeatureBuilder.setFeatureVector(features)
        self.ontobiotopeFeatureBuilder.buildOBOFeaturesForEntityPair(entity1, entity2)
        self.ontobiotopeFeatureBuilder.setFeatureVector(None)
    if self.styles["full_entities"]:
        # Features for the lowercased entity texts and for each pair of
        # their whitespace-separated parts
        e1Text = entity1.get("text").lower()
        e2Text = entity2.get("text").lower()
        features[self.featureSet.getId("FULL_e1_"+e1Text)] = 1
        features[self.featureSet.getId("FULL_e2_"+e2Text)] = 1
        for ep1 in e1Text.split():
            for ep2 in e2Text.split():
                features[self.featureSet.getId("FULL_e1_"+ep1)] = 1
                features[self.featureSet.getId("FULL_e2_"+ep2)] = 1
                features[self.featureSet.getId("FULL_e1e2_"+ep1+"_"+ep2)] = 1
    if self.styles["evex"]:
        self.evexFeatureBuilder.setFeatureVector(features, entity1, entity2)
        self.evexFeatureBuilder.buildEdgeFeatures(entity1, entity2, token1, token2, path, sentenceGraph)
        self.evexFeatureBuilder.setFeatureVector(None)
    if self.styles["wordnet"]:
        self.wordNetFeatureBuilder.setFeatureVector(features, entity1, entity2)
        self.wordNetFeatureBuilder.buildFeaturesForEntityPair(token1, token2)
        self.wordNetFeatureBuilder.buildLinearFeatures(token1, sentenceGraph.tokens, tag="t1_")
        self.wordNetFeatureBuilder.buildLinearFeatures(token2, sentenceGraph.tokens, tag="t2_")
        self.wordNetFeatureBuilder.buildPathFeatures(path)
        self.wordNetFeatureBuilder.setFeatureVector(None)
    if self.styles["wordvector"]:
        self.wordVectorFeatureBuilder.setFeatureVector(features, entity1, entity2)
        self.wordVectorFeatureBuilder.buildFeatures(token1, "t1_")
        self.wordVectorFeatureBuilder.buildFeatures(token2, "t2_")
        self.wordVectorFeatureBuilder.buildLinearFeatures(token1, sentenceGraph.tokens, tag="t1_")
        self.wordVectorFeatureBuilder.buildLinearFeatures(token2, sentenceGraph.tokens, tag="t2_")
        self.wordVectorFeatureBuilder.buildPathFeatures(path)
        self.wordVectorFeatureBuilder.buildFBAFeatures(sentenceGraph.tokens, sentenceGraph.tokens.index(token1), sentenceGraph.tokens.index(token2))
        self.wordVectorFeatureBuilder.setFeatureVector(None)
    if self.styles["giuliano"]:
        self.giulianoFeatureBuilder.setFeatureVector(features, entity1, entity2)
        self.giulianoFeatureBuilder.buildEdgeFeatures(entity1, entity2, token1, token2, path, sentenceGraph)
        self.giulianoFeatureBuilder.setFeatureVector(None)

    return features
def buildExample(self, token1, token2, paths, sentenceGraph, categoryName, exampleIndex, entity1=None, entity2=None):
    """
    Build a single directed example for the potential edge between token1 and token2

    The feature groups are selected by the entries of self.styles, in the
    same builder-delegation pattern throughout: attach the 'features'
    dictionary with setFeatureVector(), build, detach with
    setFeatureVector(None).

    NOTE(review): feature ids come from self.featureSet.getId(); if they
    are allocated on first use, the order of the statements below
    determines the ids - confirm before reordering anything.

    @param token1, token2: the tokens of the candidate edge
    @param paths: object whose getPaths(token1, token2) returns the list
                  of token paths between the tokens
    @param sentenceGraph: the sentence the tokens belong to
    @param categoryName: name of the class of the example
    @param exampleIndex: used as the running number in the example id
    @param entity1, entity2: the entities of the edge. NOTE(review):
           several branches call entity1.get() / entity2.get() without
           checking for None - presumably those styles are only used
           when the entities are given; verify against callers
    @return: the tuple (example id, category, features, extra) where the
             id is "<sentence id>.x<exampleIndex>"
    """
    # dummy return for speed testing
    #return (sentenceGraph.getSentenceId()+".x"+str(exampleIndex),1,{},{})

    # define features
    features = {}
    # The condition is always true, so the matching 'else' below is never run
    if True: #token1 != token2 and paths.has_key(token1) and paths[token1].has_key(token2):
        #if token1 != token2 and paths.has_key(token1) and paths[token1].has_key(token2):
        #    path = paths[token1][token2]
        #else:
        #    path = [token1, token2]
        if not self.styles["no_path"]:
            # directedPath reduces performance by 0.01 pp
            #directedPath = sentenceGraph.dependencyGraph.getPaths(token1, token2)
            #if len(directedPath) == 0:
            #    directedPath = sentenceGraph.dependencyGraph.getPaths(token2, token1)
            #    for dp in directedPath:
            #        dp.reverse()
            #if len(directedPath) == 0:
            #    path = paths.getPaths(token1, token2)
            #else:
            #    path = directedPath

            path = paths.getPaths(token1, token2)
            if len(path) > 0:
                #if len(path) > 1:
                #    print len(path)
                # Use the first of the returned paths
                path = path[0]
                pathExists = True
            else:
                # No path: fall back to the two tokens themselves
                path = [token1, token2]
                pathExists = False
        else:
            path = [token1, token2]
            pathExists = False
        #print token1.get("id"), token2.get("id")
        # With this assertion the condition below always holds, so its
        # 'else' branch is only reachable when assertions are disabled
        assert(self.pathLengths == None)
        if self.pathLengths == None or len(path)-1 in self.pathLengths:
            # if not "no_ontology" in self.styles:
            #     self.ontologyFeatureBuilder.setFeatureVector(features)
            #     self.ontologyFeatureBuilder.buildOntologyFeaturesForPath(sentenceGraph, path)
            #     self.ontologyFeatureBuilder.setFeatureVector(None)
            if self.styles["trigger_features"]: # F 85.52 -> 85.55
                self.triggerFeatureBuilder.setFeatureVector(features)
                self.triggerFeatureBuilder.tag = "trg1_"
                self.triggerFeatureBuilder.buildFeatures(token1)
                self.triggerFeatureBuilder.tag = "trg2_"
                self.triggerFeatureBuilder.buildFeatures(token2)
                self.triggerFeatureBuilder.setFeatureVector(None)
            # REL features
            if self.styles["rel_features"] and not self.styles["no_task"]:
                self.relFeatureBuilder.setFeatureVector(features)
                self.relFeatureBuilder.tag = "rel1_"
                self.relFeatureBuilder.buildAllFeatures(sentenceGraph.tokens, sentenceGraph.tokens.index(token1))
                self.relFeatureBuilder.tag = "rel2_"
                self.relFeatureBuilder.buildAllFeatures(sentenceGraph.tokens, sentenceGraph.tokens.index(token2))
                self.relFeatureBuilder.setFeatureVector(None)
            if self.styles["bacteria_renaming"] and not self.styles["no_task"]:
                self.bacteriaRenamingFeatureBuilder.setFeatureVector(features)
                self.bacteriaRenamingFeatureBuilder.buildPairFeatures(entity1, entity2)
                #self.bacteriaRenamingFeatureBuilder.buildSubstringFeatures(entity1, entity2) # decreases perf. 74.76 -> 72.41
                self.bacteriaRenamingFeatureBuilder.setFeatureVector(None)
            # Containment features between the character spans of the two entities
            if self.styles["co_limits"] and not self.styles["no_task"]:
                e1Offset = Range.charOffsetToSingleTuple(entity1.get("charOffset"))
                e2Offset = Range.charOffsetToSingleTuple(entity2.get("charOffset"))
                if Range.contains(e1Offset, e2Offset):
                    features[self.featureSet.getId("e1_contains_e2")] = 1
                    if entity2.get("isName") == "True":
                        features[self.featureSet.getId("e1_contains_e2name")] = 1
                if Range.contains(e2Offset, e1Offset):
                    features[self.featureSet.getId("e2_contains_e1")] = 1
                    if entity1.get("isName") == "True":
                        features[self.featureSet.getId("e2_contains_e1name")] = 1
            if self.styles["ddi_features"]:
                self.drugFeatureBuilder.setFeatureVector(features)
                self.drugFeatureBuilder.tag = "ddi_"
                self.drugFeatureBuilder.buildPairFeatures(entity1, entity2)
                if self.styles["ddi_mtmx"]:
                    self.drugFeatureBuilder.buildMTMXFeatures(entity1, entity2)
                self.drugFeatureBuilder.setFeatureVector(None)
            #if "graph_kernel" in self.styles or not "no_dependency" in self.styles:
            #    #print "Getting edges"
            #    if token1 != token2 and pathExists:
            #        #print "g1"
            #        edges = self.multiEdgeFeatureBuilder.getEdges(sentenceGraph.dependencyGraph, path)
            #        #print "g2"
            #    else:
            #        edges = None
            if self.styles["graph_kernel"]:
                self.graphKernelFeatureBuilder.setFeatureVector(features, entity1, entity2)
                self.graphKernelFeatureBuilder.buildGraphKernelFeatures(sentenceGraph, path)
                self.graphKernelFeatureBuilder.setFeatureVector(None)
            if self.styles["entity_type"]:
                e1Type = self.multiEdgeFeatureBuilder.getEntityType(entity1)
                e2Type = self.multiEdgeFeatureBuilder.getEntityType(entity2)
                features[self.featureSet.getId("e1_"+e1Type)] = 1
                features[self.featureSet.getId("e2_"+e2Type)] = 1
                # The distance is the number of tokens on the path
                features[self.featureSet.getId("distance_"+str(len(path)))] = 1
            if not self.styles["no_dependency"]:
                #print "Dep features"
                self.multiEdgeFeatureBuilder.setFeatureVector(features, entity1, entity2)
                #self.multiEdgeFeatureBuilder.buildStructureFeatures(sentenceGraph, paths) # remove for fast
                if not self.styles["disable_entity_features"]:
                    self.multiEdgeFeatureBuilder.buildEntityFeatures(sentenceGraph)
                self.multiEdgeFeatureBuilder.buildPathLengthFeatures(path)
                if not self.styles["disable_terminus_features"]:
                    self.multiEdgeFeatureBuilder.buildTerminusTokenFeatures(path, sentenceGraph) # remove for fast
                if not self.styles["disable_single_element_features"]:
                    self.multiEdgeFeatureBuilder.buildSingleElementFeatures(path, sentenceGraph)
                if not self.styles["disable_ngram_features"]:
                    #print "NGrams"
                    self.multiEdgeFeatureBuilder.buildPathGrams(2, path, sentenceGraph) # remove for fast
                    self.multiEdgeFeatureBuilder.buildPathGrams(3, path, sentenceGraph) # remove for fast
                    self.multiEdgeFeatureBuilder.buildPathGrams(4, path, sentenceGraph) # remove for fast
                #self.buildEdgeCombinations(path, edges, sentenceGraph, features) # remove for fast
                #if edges != None:
                #    self.multiEdgeFeatureBuilder.buildTerminusFeatures(path[0], edges[0][1]+edges[1][0], "t1", sentenceGraph) # remove for fast
                #    self.multiEdgeFeatureBuilder.buildTerminusFeatures(path[-1], edges[len(path)-1][len(path)-2]+edges[len(path)-2][len(path)-1], "t2", sentenceGraph) # remove for fast
                if not self.styles["disable_path_edge_features"]:
                    self.multiEdgeFeatureBuilder.buildPathEdgeFeatures(path, sentenceGraph)
                self.multiEdgeFeatureBuilder.buildSentenceFeatures(sentenceGraph)
                self.multiEdgeFeatureBuilder.setFeatureVector(None)
            if self.styles["nodalida"]:
                self.nodalidaFeatureBuilder.setFeatureVector(features, entity1, entity2)
                shortestPaths = self.nodalidaFeatureBuilder.buildShortestPaths(sentenceGraph.dependencyGraph, path)
                # NOTE(review): looks like a leftover debug print to stdout
                print shortestPaths
                if len(shortestPaths) > 0:
                    self.nodalidaFeatureBuilder.buildNGrams(shortestPaths, sentenceGraph)
                self.nodalidaFeatureBuilder.setFeatureVector(None)
            if not self.styles["no_linear"]:
                self.tokenFeatureBuilder.setFeatureVector(features)
                # Find the sentence positions of the two tokens.
                # NOTE(review): token1Index / token2Index stay unbound if a
                # token is not in sentenceGraph.tokens - presumably cannot
                # happen; verify
                for i in range(len(sentenceGraph.tokens)):
                    if sentenceGraph.tokens[i] == token1:
                        token1Index = i
                    if sentenceGraph.tokens[i] == token2:
                        token2Index = i
                # linearPreTag is only used by the commented-out token grams below
                linearPreTag = "linfw_"
                if token1Index > token2Index:
                    # Make token1Index the leftmost of the two positions
                    token1Index, token2Index = token2Index, token1Index
                    linearPreTag = "linrv_"
                self.tokenFeatureBuilder.buildLinearOrderFeatures(token1Index, sentenceGraph, 2, 2, preTag="linTok1")
                self.tokenFeatureBuilder.buildLinearOrderFeatures(token2Index, sentenceGraph, 2, 2, preTag="linTok2")
                # Before, middle, after
                # self.tokenFeatureBuilder.buildTokenGrams(0, token1Index-1, sentenceGraph, "bf")
                # self.tokenFeatureBuilder.buildTokenGrams(token1Index+1, token2Index-1, sentenceGraph, "bw")
                # self.tokenFeatureBuilder.buildTokenGrams(token2Index+1, len(sentenceGraph.tokens)-1, sentenceGraph, "af")
                # before-middle, middle, middle-after
                # self.tokenFeatureBuilder.buildTokenGrams(0, token2Index-1, sentenceGraph, linearPreTag+"bf", max=2)
                # self.tokenFeatureBuilder.buildTokenGrams(token1Index+1, token2Index-1, sentenceGraph, linearPreTag+"bw", max=2)
                # self.tokenFeatureBuilder.buildTokenGrams(token1Index+1, len(sentenceGraph.tokens)-1, sentenceGraph, linearPreTag+"af", max=2)
                self.tokenFeatureBuilder.setFeatureVector(None)
            if self.styles["random"]:
                self.randomFeatureBuilder.setFeatureVector(features)
                self.randomFeatureBuilder.buildRandomFeatures(100, 0.01)
                self.randomFeatureBuilder.setFeatureVector(None)
            if self.styles["genia_limits"] and not self.styles["no_task"]:
                e1Type = entity1.get("type")
                e2Type = entity2.get("type") # not used in this branch
                # The first entity must not be a name
                assert(entity1.get("isName") == "False")
                if entity2.get("isName") == "True":
                    features[self.featureSet.getId("GENIA_target_protein")] = 1
                else:
                    features[self.featureSet.getId("GENIA_nested_event")] = 1
                if e1Type.find("egulation") != -1: # leave r out to avoid problems with capitalization
                    if entity2.get("isName") == "True":
                        features[self.featureSet.getId("GENIA_regulation_of_protein")] = 1
                    else:
                        features[self.featureSet.getId("GENIA_regulation_of_event")] = 1
            if self.styles["bi_limits"]:
                # Make features based on entity types
                e1Type = entity1.get("type")
                e2Type = entity2.get("type")
                e1SuperType = str(self.getBISuperType(e1Type))
                e2SuperType = str(self.getBISuperType(e2Type))
                features[self.featureSet.getId("BI_e1_"+e1Type)] = 1
                features[self.featureSet.getId("BI_e2_"+e2Type)] = 1
                features[self.featureSet.getId("BI_e1sup_"+e1SuperType)] = 1
                features[self.featureSet.getId("BI_e2sup_"+e2SuperType)] = 1
                features[self.featureSet.getId("BI_e1e2_"+e1Type+"_"+e2Type)] = 1
                features[self.featureSet.getId("BI_e1e2sup_"+e1SuperType+"_"+e2SuperType)] = 1
            if self.styles["evex"]:
                self.evexFeatureBuilder.setFeatureVector(features, entity1, entity2)
                self.evexFeatureBuilder.buildEdgeFeatures(entity1, entity2, token1, token2, path, sentenceGraph)
                self.evexFeatureBuilder.setFeatureVector(None)
            if self.styles["giuliano"]:
                self.giulianoFeatureBuilder.setFeatureVector(features, entity1, entity2)
                self.giulianoFeatureBuilder.buildEdgeFeatures(entity1, entity2, token1, token2, path, sentenceGraph)
                self.giulianoFeatureBuilder.setFeatureVector(None)
        else:
            # Path length is outside self.pathLengths
            features[self.featureSet.getId("always_negative")] = 1
            if self.styles["subset"]:
                features[self.featureSet.getId("out_of_scope")] = 1
    else:
        features[self.featureSet.getId("always_negative")] = 1
        if self.styles["subset"]:
            features[self.featureSet.getId("out_of_scope")] = 1
        path = [token1, token2]
    # define extra attributes
    # "t1" is always the path end whose charOffset begins first; "deprev"
    # records whether that reverses the direction of the path
    #if int(path[0].get("id").split("_")[-1]) < int(path[-1].get("id").split("_")[-1]):
    if int(path[0].get("charOffset").split("-")[0]) < int(path[-1].get("charOffset").split("-")[0]):
        #extra = {"xtype":"edge","type":"i","t1":path[0],"t2":path[-1]}
        extra = {"xtype":"edge","type":"i","t1":path[0].get("id"),"t2":path[-1].get("id")}
        extra["deprev"] = False
    else:
        #extra = {"xtype":"edge","type":"i","t1":path[-1],"t2":path[0]}
        extra = {"xtype":"edge","type":"i","t1":path[-1].get("id"),"t2":path[0].get("id")}
        extra["deprev"] = True
    if entity1 != None:
        #extra["e1"] = entity1
        extra["e1"] = entity1.get("id")
        if sentenceGraph.mergedEntityToDuplicates != None:
            #extra["e1GoldIds"] = mergedEntityIds[entity1]
            extra["e1DuplicateIds"] = ",".join([x.get("id") for x in sentenceGraph.mergedEntityToDuplicates[entity1]])
    if entity2 != None:
        #extra["e2"] = entity2
        extra["e2"] = entity2.get("id")
        if sentenceGraph.mergedEntityToDuplicates != None:
            extra["e2DuplicateIds"] = ",".join([x.get("id") for x in sentenceGraph.mergedEntityToDuplicates[entity2]])
            #extra["e2GoldIds"] = mergedEntityIds[entity2]
    extra["categoryName"] = categoryName
    if self.styles["bacteria_renaming"]:
        # Store the entity texts with spaces and colons replaced
        if entity1.get("text") != None and entity1.get("text") != "":
            extra["e1t"] = entity1.get("text").replace(" ", "---").replace(":","-COL-")
        if entity2.get("text") != None and entity2.get("text") != "":
            extra["e2t"] = entity2.get("text").replace(" ", "---").replace(":","-COL-")
    sentenceOrigId = sentenceGraph.sentenceElement.get("origId")
    if sentenceOrigId != None:
        extra["SOID"] = sentenceOrigId
    # make example
    if self.styles["binary"]:
        # Binary classification: everything except "neg" is positive
        if categoryName != "neg":
            category = 1
        else:
            category = -1
        # This has no effect on the returned example, as
        # extra["categoryName"] has already been set above
        categoryName = "i"
    else:
        category = self.classSet.getId(categoryName)

    # NOTE: temporarily disable for replicating 110310 experiment
    #features[self.featureSet.getId("extra_constant")] = 1
    return (sentenceGraph.getSentenceId()+".x"+str(exampleIndex),category,features,extra)
def buildExample(self, token1, token2, paths, sentenceGraph, categoryName, entity1=None, entity2=None, structureAnalyzer=None, isDirected=True):
    """
    Build a single example for the potential edge between token1 and token2.

    Which feature groups are generated is controlled by the flags in
    self.styles; each enabled group delegates to one of the feature builder
    objects held by this instance, all of which write into one shared
    feature dictionary.

    @param token1: first token; must provide get("id") and get("charOffset")
    @param token2: second token, same interface as token1
    @param paths: object providing getPaths(token1, token2); only consulted
                  when the "no_path" style is off
    @param sentenceGraph: sentence-level container; this method reads its
                  tokens, dependencyGraph, mergedEntityToDuplicates and
                  sentenceElement attributes
    @param categoryName: class name of the example, returned unchanged
    @param entity1: optional entity for token1 (several feature groups
                  require it to be given)
    @param entity2: optional entity for token2 (same remark)
    @param structureAnalyzer: accepted for interface compatibility, not used
                  by this method
    @param isDirected: stored as a string in extra["directed"]
    @return: tuple (categoryName, features, extra) where features maps
             feature ids to values and extra holds example metadata
    """
    features = {}

    def setFeature(name):
        # Binary feature: look up (or allocate) the id and switch it on.
        features[self.featureSet.getId(name)] = 1

    # Use the first path returned for the token pair; fall back to the bare
    # token pair when path lookup is disabled or nothing is returned.
    path = [token1, token2]
    if not self.styles["no_path"]:
        foundPaths = paths.getPaths(token1, token2)
        if len(foundPaths) > 0:
            path = foundPaths[0]

    if not self.styles["no_trigger_features"]:  # F 85.52 -> 85.55 (original author's note)
        self.triggerFeatureBuilder.setFeatureVector(features)
        self.triggerFeatureBuilder.tag = "trg1_"
        self.triggerFeatureBuilder.buildFeatures(token1)
        self.triggerFeatureBuilder.tag = "trg2_"
        self.triggerFeatureBuilder.buildFeatures(token2)
        self.triggerFeatureBuilder.setFeatureVector(None)
    # REL features
    if self.styles["rel_features"] and not self.styles["no_task"]:
        self.relFeatureBuilder.setFeatureVector(features)
        self.relFeatureBuilder.tag = "rel1_"
        self.relFeatureBuilder.buildAllFeatures(sentenceGraph.tokens, sentenceGraph.tokens.index(token1))
        self.relFeatureBuilder.tag = "rel2_"
        self.relFeatureBuilder.buildAllFeatures(sentenceGraph.tokens, sentenceGraph.tokens.index(token2))
        self.relFeatureBuilder.setFeatureVector(None)
    if self.styles["bacteria_renaming"] and not self.styles["no_task"]:
        self.bacteriaRenamingFeatureBuilder.setFeatureVector(features)
        self.bacteriaRenamingFeatureBuilder.buildPairFeatures(entity1, entity2)
        # NOTE: buildSubstringFeatures(entity1, entity2) is deliberately not
        # called here; the original author recorded "decreases perf. 74.76 -> 72.41".
        self.bacteriaRenamingFeatureBuilder.setFeatureVector(None)
    if self.styles["co_features"] and not self.styles["no_task"]:
        # Containment features between the character spans of the entities
        e1Offset = Range.charOffsetToSingleTuple(entity1.get("charOffset"))
        e2Offset = Range.charOffsetToSingleTuple(entity2.get("charOffset"))
        if Range.contains(e1Offset, e2Offset):
            setFeature("e1_contains_e2")
            if entity2.get("given") == "True":
                setFeature("e1_contains_e2name")
        if Range.contains(e2Offset, e1Offset):
            setFeature("e2_contains_e1")
            if entity1.get("given") == "True":
                setFeature("e2_contains_e1name")
    if self.styles["drugbank_features"]:
        self.drugFeatureBuilder.setFeatureVector(features)
        self.drugFeatureBuilder.tag = "ddi_"
        self.drugFeatureBuilder.buildPairFeatures(entity1, entity2)
        if self.styles["ddi_mtmx"]:
            self.drugFeatureBuilder.buildMTMXFeatures(entity1, entity2)
        self.drugFeatureBuilder.setFeatureVector(None)
    if self.styles["graph_kernel"]:
        self.graphKernelFeatureBuilder.setFeatureVector(features, entity1, entity2)
        self.graphKernelFeatureBuilder.buildGraphKernelFeatures(sentenceGraph, path)
        self.graphKernelFeatureBuilder.setFeatureVector(None)
    if self.styles["entity_type"]:
        e1Type = self.multiEdgeFeatureBuilder.getEntityType(entity1)
        e2Type = self.multiEdgeFeatureBuilder.getEntityType(entity2)
        setFeature("e1_" + e1Type)
        setFeature("e2_" + e2Type)
        setFeature("distance_" + str(len(path)))
    if not self.styles["no_dependency"]:
        depBuilder = self.multiEdgeFeatureBuilder
        depBuilder.setFeatureVector(features, entity1, entity2)
        if not self.styles["disable_entity_features"]:
            depBuilder.buildEntityFeatures(sentenceGraph)
        depBuilder.buildPathLengthFeatures(path)
        if not self.styles["disable_terminus_features"]:
            depBuilder.buildTerminusTokenFeatures(path, sentenceGraph)  # remove for fast
        if not self.styles["disable_single_element_features"]:
            depBuilder.buildSingleElementFeatures(path, sentenceGraph)
        if not self.styles["disable_ngram_features"]:
            for gramLength in (2, 3, 4):
                depBuilder.buildPathGrams(gramLength, path, sentenceGraph)  # remove for fast
        if not self.styles["disable_path_edge_features"]:
            depBuilder.buildPathEdgeFeatures(path, sentenceGraph)
        depBuilder.buildSentenceFeatures(sentenceGraph)
        depBuilder.setFeatureVector(None)
    if self.styles["nodalida"]:
        self.nodalidaFeatureBuilder.setFeatureVector(features, entity1, entity2)
        shortestPaths = self.nodalidaFeatureBuilder.buildShortestPaths(sentenceGraph.dependencyGraph, path)
        # Existing diagnostic output, kept as-is; the call form prints the
        # same single value on Python 2 and is also valid Python 3.
        print(shortestPaths)
        if len(shortestPaths) > 0:
            self.nodalidaFeatureBuilder.buildNGrams(shortestPaths, sentenceGraph)
        self.nodalidaFeatureBuilder.setFeatureVector(None)
    if self.styles["linear_features"]:
        self.tokenFeatureBuilder.setFeatureVector(features)
        # Both tokens are expected to be members of sentenceGraph.tokens;
        # the last matching position is used for each.
        for index, token in enumerate(sentenceGraph.tokens):
            if token == token1:
                token1Index = index
            if token == token2:
                token2Index = index
        # Linear features are always built in left-to-right sentence order
        if token1Index > token2Index:
            token1Index, token2Index = token2Index, token1Index
        self.tokenFeatureBuilder.buildLinearOrderFeatures(token1Index, sentenceGraph, 2, 2, preTag="linTok1")
        self.tokenFeatureBuilder.buildLinearOrderFeatures(token2Index, sentenceGraph, 2, 2, preTag="linTok2")
        self.tokenFeatureBuilder.setFeatureVector(None)
    if self.styles["random"]:
        self.randomFeatureBuilder.setFeatureVector(features)
        self.randomFeatureBuilder.buildRandomFeatures(100, 0.01)
        self.randomFeatureBuilder.setFeatureVector(None)
    if self.styles["genia_features"] and not self.styles["no_task"]:
        e1Type = entity1.get("type")
        assert entity1.get("given") in (None, "False")
        e2IsGiven = entity2.get("given") == "True"
        if e2IsGiven:
            setFeature("GENIA_target_protein")
        else:
            setFeature("GENIA_nested_event")
        if e1Type.find("egulation") != -1:  # leave r out to avoid problems with capitalization
            if e2IsGiven:
                setFeature("GENIA_regulation_of_protein")
            else:
                setFeature("GENIA_regulation_of_event")
    if self.styles["bi_features"]:
        # Make features based on entity types
        e1Type = entity1.get("type")
        e2Type = entity2.get("type")
        e1SuperType = str(self.getBISuperType(e1Type))
        e2SuperType = str(self.getBISuperType(e2Type))
        setFeature("BI_e1_" + e1Type)
        setFeature("BI_e2_" + e2Type)
        setFeature("BI_e1sup_" + e1SuperType)
        setFeature("BI_e2sup_" + e2SuperType)
        setFeature("BI_e1e2_" + e1Type + "_" + e2Type)
        setFeature("BI_e1e2sup_" + e1SuperType + "_" + e2SuperType)
    if self.styles["evex"]:
        self.evexFeatureBuilder.setFeatureVector(features, entity1, entity2)
        self.evexFeatureBuilder.buildEdgeFeatures(entity1, entity2, token1, token2, path, sentenceGraph)
        self.evexFeatureBuilder.setFeatureVector(None)
    if self.styles["giuliano"]:
        self.giulianoFeatureBuilder.setFeatureVector(features, entity1, entity2)
        self.giulianoFeatureBuilder.buildEdgeFeatures(entity1, entity2, token1, token2, path, sentenceGraph)
        self.giulianoFeatureBuilder.setFeatureVector(None)

    # define extra attributes
    # t1/t2 are stored in order of the tokens' starting character offsets;
    # "deprev" records that the path endpoints had to be swapped for that.
    startOffset = int(path[0].get("charOffset").split("-")[0])
    endOffset = int(path[-1].get("charOffset").split("-")[0])
    if startOffset < endOffset:
        leftToken, rightToken, isReversed = path[0], path[-1], False
    else:
        leftToken, rightToken, isReversed = path[-1], path[0], True
    extra = {"xtype": "edge", "type": "i", "t1": leftToken.get("id"), "t2": rightToken.get("id")}
    extra["deprev"] = isReversed
    # Identity checks against None on purpose: truth-testing an XML element
    # would depend on its number of children.
    for key, entity in (("e1", entity1), ("e2", entity2)):
        if entity is None:
            continue
        extra[key] = entity.get("id")
        if sentenceGraph.mergedEntityToDuplicates is not None:
            extra[key + "DuplicateIds"] = ",".join([x.get("id") for x in sentenceGraph.mergedEntityToDuplicates[entity]])
    extra["categoryName"] = categoryName
    if self.styles["bacteria_renaming"]:
        # Entity texts are stored with spaces and colons escaped. Entities
        # are optional parameters, so a missing one is skipped here just
        # like in the id section above.
        for key, entity in (("e1t", entity1), ("e2t", entity2)):
            if entity is None:
                continue
            text = entity.get("text")
            if text is not None and text != "":
                extra[key] = text.replace(" ", "---").replace(":", "-COL-")
    sentenceOrigId = sentenceGraph.sentenceElement.get("origId")
    if sentenceOrigId is not None:
        extra["SOID"] = sentenceOrigId
    extra["directed"] = str(isDirected)

    return (categoryName, features, extra)
def buildExample(self, token1, token2, paths, sentenceGraph, categoryName, entity1=None, entity2=None, structureAnalyzer=None, isDirected=True):
    """
    Build a single example for the potential edge between token1 and token2.

    Which feature groups are generated is controlled by the flags in
    self.styles; each enabled group delegates to one of the feature builder
    objects held by this instance, all of which write into one shared
    feature dictionary.

    @param token1: first token; must provide get("id") and get("charOffset")
    @param token2: second token, same interface as token1
    @param paths: object providing getPaths(token1, token2); only consulted
                  when the "no_path" style is off
    @param sentenceGraph: sentence-level container; this method reads its
                  tokens, dependencyGraph, mergedEntityToDuplicates and
                  sentenceElement attributes
    @param categoryName: class name of the example, returned unchanged
    @param entity1: optional entity for token1 (several feature groups
                  require it to be given)
    @param entity2: optional entity for token2 (same remark)
    @param structureAnalyzer: accepted for interface compatibility, not used
                  by this method
    @param isDirected: stored as a string in extra["directed"]
    @return: tuple (categoryName, features, extra) where features maps
             feature ids to values and extra holds example metadata
    """
    features = {}

    def setFeature(name):
        # Binary feature: look up (or allocate) the id and switch it on.
        features[self.featureSet.getId(name)] = 1

    # Use the first path returned for the token pair; fall back to the bare
    # token pair when path lookup is disabled or nothing is returned.
    path = [token1, token2]
    if not self.styles["no_path"]:
        foundPaths = paths.getPaths(token1, token2)
        if len(foundPaths) > 0:
            path = foundPaths[0]

    if not self.styles["no_trigger_features"]:  # F 85.52 -> 85.55 (original author's note)
        self.triggerFeatureBuilder.setFeatureVector(features)
        self.triggerFeatureBuilder.tag = "trg1_"
        self.triggerFeatureBuilder.buildFeatures(token1)
        self.triggerFeatureBuilder.tag = "trg2_"
        self.triggerFeatureBuilder.buildFeatures(token2)
        self.triggerFeatureBuilder.setFeatureVector(None)
    # REL features
    if self.styles["rel_features"] and not self.styles["no_task"]:
        self.relFeatureBuilder.setFeatureVector(features)
        self.relFeatureBuilder.tag = "rel1_"
        self.relFeatureBuilder.buildAllFeatures(
            sentenceGraph.tokens, sentenceGraph.tokens.index(token1))
        self.relFeatureBuilder.tag = "rel2_"
        self.relFeatureBuilder.buildAllFeatures(
            sentenceGraph.tokens, sentenceGraph.tokens.index(token2))
        self.relFeatureBuilder.setFeatureVector(None)
    if self.styles["bacteria_renaming"] and not self.styles["no_task"]:
        self.bacteriaRenamingFeatureBuilder.setFeatureVector(features)
        self.bacteriaRenamingFeatureBuilder.buildPairFeatures(
            entity1, entity2)
        # NOTE: buildSubstringFeatures(entity1, entity2) is deliberately not
        # called here; the original author recorded "decreases perf. 74.76 -> 72.41".
        self.bacteriaRenamingFeatureBuilder.setFeatureVector(None)
    if self.styles["co_features"] and not self.styles["no_task"]:
        # Containment features between the character spans of the entities
        e1Offset = Range.charOffsetToSingleTuple(entity1.get("charOffset"))
        e2Offset = Range.charOffsetToSingleTuple(entity2.get("charOffset"))
        if Range.contains(e1Offset, e2Offset):
            setFeature("e1_contains_e2")
            if entity2.get("given") == "True":
                setFeature("e1_contains_e2name")
        if Range.contains(e2Offset, e1Offset):
            setFeature("e2_contains_e1")
            if entity1.get("given") == "True":
                setFeature("e2_contains_e1name")
    if self.styles["ddi_features"]:
        self.drugFeatureBuilder.setFeatureVector(features)
        self.drugFeatureBuilder.tag = "ddi_"
        self.drugFeatureBuilder.buildPairFeatures(entity1, entity2)
        if self.styles["ddi_mtmx"]:
            self.drugFeatureBuilder.buildMTMXFeatures(entity1, entity2)
        self.drugFeatureBuilder.setFeatureVector(None)
    if self.styles["graph_kernel"]:
        self.graphKernelFeatureBuilder.setFeatureVector(
            features, entity1, entity2)
        self.graphKernelFeatureBuilder.buildGraphKernelFeatures(
            sentenceGraph, path)
        self.graphKernelFeatureBuilder.setFeatureVector(None)
    if self.styles["entity_type"]:
        e1Type = self.multiEdgeFeatureBuilder.getEntityType(entity1)
        e2Type = self.multiEdgeFeatureBuilder.getEntityType(entity2)
        setFeature("e1_" + e1Type)
        setFeature("e2_" + e2Type)
        setFeature("distance_" + str(len(path)))
    if not self.styles["no_dependency"]:
        depBuilder = self.multiEdgeFeatureBuilder
        depBuilder.setFeatureVector(features, entity1, entity2)
        if not self.styles["disable_entity_features"]:
            depBuilder.buildEntityFeatures(sentenceGraph)
        depBuilder.buildPathLengthFeatures(path)
        if not self.styles["disable_terminus_features"]:
            depBuilder.buildTerminusTokenFeatures(
                path, sentenceGraph)  # remove for fast
        if not self.styles["disable_single_element_features"]:
            depBuilder.buildSingleElementFeatures(path, sentenceGraph)
        if not self.styles["disable_ngram_features"]:
            for gramLength in (2, 3, 4):
                depBuilder.buildPathGrams(
                    gramLength, path, sentenceGraph)  # remove for fast
        if not self.styles["disable_path_edge_features"]:
            depBuilder.buildPathEdgeFeatures(path, sentenceGraph)
        depBuilder.buildSentenceFeatures(sentenceGraph)
        depBuilder.setFeatureVector(None)
    if self.styles["nodalida"]:
        self.nodalidaFeatureBuilder.setFeatureVector(
            features, entity1, entity2)
        shortestPaths = self.nodalidaFeatureBuilder.buildShortestPaths(
            sentenceGraph.dependencyGraph, path)
        # Existing diagnostic output, kept as-is; the call form prints the
        # same single value on Python 2 and is also valid Python 3.
        print(shortestPaths)
        if len(shortestPaths) > 0:
            self.nodalidaFeatureBuilder.buildNGrams(
                shortestPaths, sentenceGraph)
        self.nodalidaFeatureBuilder.setFeatureVector(None)
    if self.styles["linear_features"]:
        self.tokenFeatureBuilder.setFeatureVector(features)
        # Both tokens are expected to be members of sentenceGraph.tokens;
        # the last matching position is used for each.
        for index, token in enumerate(sentenceGraph.tokens):
            if token == token1:
                token1Index = index
            if token == token2:
                token2Index = index
        # Linear features are always built in left-to-right sentence order
        if token1Index > token2Index:
            token1Index, token2Index = token2Index, token1Index
        self.tokenFeatureBuilder.buildLinearOrderFeatures(
            token1Index, sentenceGraph, 2, 2, preTag="linTok1")
        self.tokenFeatureBuilder.buildLinearOrderFeatures(
            token2Index, sentenceGraph, 2, 2, preTag="linTok2")
        self.tokenFeatureBuilder.setFeatureVector(None)
    if self.styles["random"]:
        self.randomFeatureBuilder.setFeatureVector(features)
        self.randomFeatureBuilder.buildRandomFeatures(100, 0.01)
        self.randomFeatureBuilder.setFeatureVector(None)
    if self.styles["genia_features"] and not self.styles["no_task"]:
        e1Type = entity1.get("type")
        assert entity1.get("given") in (None, "False")
        e2IsGiven = entity2.get("given") == "True"
        if e2IsGiven:
            setFeature("GENIA_target_protein")
        else:
            setFeature("GENIA_nested_event")
        # leave r out to avoid problems with capitalization
        if e1Type.find("egulation") != -1:
            if e2IsGiven:
                setFeature("GENIA_regulation_of_protein")
            else:
                setFeature("GENIA_regulation_of_event")
    if self.styles["bi_features"]:
        # Make features based on entity types
        e1Type = entity1.get("type")
        e2Type = entity2.get("type")
        e1SuperType = str(self.getBISuperType(e1Type))
        e2SuperType = str(self.getBISuperType(e2Type))
        setFeature("BI_e1_" + e1Type)
        setFeature("BI_e2_" + e2Type)
        setFeature("BI_e1sup_" + e1SuperType)
        setFeature("BI_e2sup_" + e2SuperType)
        setFeature("BI_e1e2_" + e1Type + "_" + e2Type)
        setFeature("BI_e1e2sup_" + e1SuperType + "_" + e2SuperType)
    if self.styles["evex"]:
        self.evexFeatureBuilder.setFeatureVector(features, entity1, entity2)
        self.evexFeatureBuilder.buildEdgeFeatures(
            entity1, entity2, token1, token2, path, sentenceGraph)
        self.evexFeatureBuilder.setFeatureVector(None)
    if self.styles["giuliano"]:
        self.giulianoFeatureBuilder.setFeatureVector(
            features, entity1, entity2)
        self.giulianoFeatureBuilder.buildEdgeFeatures(
            entity1, entity2, token1, token2, path, sentenceGraph)
        self.giulianoFeatureBuilder.setFeatureVector(None)

    # define extra attributes
    # t1/t2 are stored in order of the tokens' starting character offsets;
    # "deprev" records that the path endpoints had to be swapped for that.
    startOffset = int(path[0].get("charOffset").split("-")[0])
    endOffset = int(path[-1].get("charOffset").split("-")[0])
    if startOffset < endOffset:
        leftToken, rightToken, isReversed = path[0], path[-1], False
    else:
        leftToken, rightToken, isReversed = path[-1], path[0], True
    extra = {
        "xtype": "edge",
        "type": "i",
        "t1": leftToken.get("id"),
        "t2": rightToken.get("id"),
    }
    extra["deprev"] = isReversed
    # Identity checks against None on purpose: truth-testing an XML element
    # would depend on its number of children.
    for key, entity in (("e1", entity1), ("e2", entity2)):
        if entity is None:
            continue
        extra[key] = entity.get("id")
        if sentenceGraph.mergedEntityToDuplicates is not None:
            extra[key + "DuplicateIds"] = ",".join([
                x.get("id")
                for x in sentenceGraph.mergedEntityToDuplicates[entity]
            ])
    extra["categoryName"] = categoryName
    if self.styles["bacteria_renaming"]:
        # Entity texts are stored with spaces and colons escaped. Entities
        # are optional parameters, so a missing one is skipped here just
        # like in the id section above.
        for key, entity in (("e1t", entity1), ("e2t", entity2)):
            if entity is None:
                continue
            text = entity.get("text")
            if text is not None and text != "":
                extra[key] = text.replace(" ", "---").replace(":", "-COL-")
    sentenceOrigId = sentenceGraph.sentenceElement.get("origId")
    if sentenceOrigId is not None:
        extra["SOID"] = sentenceOrigId
    extra["directed"] = str(isDirected)

    return (categoryName, features, extra)