def classify(self, data, model, output, parse=None, task=None, goldData=None, workDir=None, fromStep=None, omitSteps=None, validate=False):
    """
    Classify a corpus with a trained model and write the predictions to disk.

    data: input corpus (file path or XML; passed through to classifyToXML)
    model: model path or object; opened read-only here
    output: path stem for all output files ("<output>-pred.xml.gz" etc.)
    parse: parse name; read from the model if None
    task: BioNLP Shared Task id; read from the model if None and ST evaluation is requested
    goldData: optional gold corpus forwarded to classifyToXML
    workDir: working directory; a temporary one is created (and later deleted) if None
    fromStep/omitSteps: accepted for interface compatibility; not used in this body
    validate: if True, run the structure analyzer on the predictions before writing
    """
    model = self.openModel(model, "r")
    self.enterState(self.STATE_CLASSIFY)
    self.setWorkDir(workDir)
    if workDir == None:
        self.setTempWorkDir()
    # NOTE(review): the model is opened twice (also above) — presumably harmless, but verify
    model = self.openModel(model, "r")
    if parse == None:
        parse = self.getStr(self.tag+"parse", model)
    # All intermediate files share this prefix inside the working directory
    workOutputTag = os.path.join(self.workDir, os.path.basename(output) + "-")
    xml = self.classifyToXML(data, model, None, workOutputTag,
                             model.get(self.tag+"classifier-model", defaultIfNotExist=None),
                             goldData, parse,
                             float(model.getStr("recallAdjustParameter", defaultIfNotExist=1.0)))
    if (validate):
        # Validation mutates/filters the prediction XML, so write it out explicitly
        self.structureAnalyzer.load(model)
        self.structureAnalyzer.validate(xml)
        ETUtils.write(xml, output+"-pred.xml.gz")
    else:
        # Without validation the work file is already the final result; just copy it
        shutil.copy2(workOutputTag+self.tag+"pred.xml.gz", output+"-pred.xml.gz")
    EvaluateInteractionXML.run(self.evaluator, xml, data, parse)
    # Optional BioNLP Shared Task format export and official evaluation
    stParams = self.getBioNLPSharedTaskParams(self.bioNLPSTParams, model)
    if stParams["convert"]: #self.useBioNLPSTFormat:
        extension = ".zip" if (stParams["convert"] == "zip") else ".tar.gz"
        Utils.STFormat.ConvertXML.toSTFormat(xml, output+"-events" + extension,
                                             outputTag=stParams["a2Tag"],
                                             writeExtra=(stParams["scores"] == True))
        if stParams["evaluate"]: #self.stEvaluator != None:
            if task == None:
                task = self.getStr(self.tag+"task", model)
            self.stEvaluator.evaluate(output+"-events" + extension, task)
    self.deleteTempWorkDir()
    self.exitState()
def evaluateChemProt(xml, gold):
    """
    Evaluate ChemProt predictions both with the internal interaction-XML
    evaluator and with the official ChemProt TSV evaluator.

    xml: predicted corpus (interaction XML, path or element)
    gold: gold corpus for the internal evaluation
    """
    # Internal evaluation against the gold interaction XML
    EvaluateIXML.run(AveragingMultiClassEvaluator, xml, gold, "McCC")
    # Official evaluation requires the predictions exported as a TSV file
    preprocessor = Preprocessor(steps=["EXPORT_CHEMPROT"])
    tempDir = tempfile.mkdtemp()
    print >> sys.stderr, "Using temporary evaluation directory", tempDir
    # Fix: guarantee the temporary directory is removed even if the export
    # or the evaluator raises (previously it leaked on any exception).
    try:
        tsvPath = os.path.join(tempDir, "predictions.tsv")
        preprocessor.process(xml, tsvPath)
        ChemProtEvaluator().evaluateTSV(tsvPath, tempDir)
    finally:
        print >> sys.stderr, "Removing temporary evaluation directory", tempDir
        shutil.rmtree(tempDir)
def classify(self, data, model, output, parse=None, task=None):
    """
    Run classification on a corpus, evaluate the interaction XML and
    export/evaluate the BioNLP ST format archive.
    """
    self.enterState(self.STATE_CLASSIFY)
    model = self.openModel(model, "r")
    # Fall back to the settings stored inside the model when not given
    if parse is None:
        parse = self.getStr(self.tag+"parse", model)
    if task is None:
        task = self.getStr(self.tag+"task", model)
    classifierModel = model.get(self.tag+"classifier-model")
    recallAdjust = float(model.get("recallAdjustParameter"))
    predictions = self.classifyToXML(data, model, None, output + "-",
                                     classifierModel, None, parse, recallAdjust)
    EvaluateInteractionXML.run(self.evaluator, predictions, data, parse)
    # Convert to the shared task archive and optionally score it
    archivePath = output + ".tar.gz"
    STFormat.ConvertXML.toSTFormat(predictions, archivePath, outputTag="a2")
    if self.stEvaluator is not None:
        self.stEvaluator.evaluate(archivePath, task)
    self.exitState()
def trainUnmergingDetector(self):
    """
    Train the unmerging (event duplication) detector, optionally using
    self-classified training data, driven by the checkStep state machine.
    """
    xml = None
    if not self.unmerging:
        print >> sys.stderr, "No unmerging"
    if self.checkStep("SELF-TRAIN-EXAMPLES-FOR-UNMERGING", self.unmerging) and self.unmerging:
        # Self-classified train data for unmerging
        if self.doUnmergingSelfTraining:
            # This allows limiting to a subcorpus
            triggerStyle = copy.copy(Parameters.get(self.triggerExampleStyle))
            edgeStyle = copy.copy(Parameters.get(self.edgeExampleStyle))
            unmergingStyle = Parameters.get(self.unmergingExampleStyle)
            if "sentenceLimit" in unmergingStyle and unmergingStyle["sentenceLimit"]:
                triggerStyle["sentenceLimit"] = unmergingStyle["sentenceLimit"]
                edgeStyle["sentenceLimit"] = unmergingStyle["sentenceLimit"]
            # Build the examples: triggers first, then edges on top of the trigger predictions
            xml = self.triggerDetector.classifyToXML(self.trainData, self.model, None, self.workDir+"unmerging-extra-", exampleStyle=triggerStyle)#, recallAdjust=0.5)
            xml = self.edgeDetector.classifyToXML(xml, self.model, None, self.workDir+"unmerging-extra-", exampleStyle=edgeStyle)#, recallAdjust=0.5)
            assert xml != None
            EvaluateInteractionXML.run(self.edgeDetector.evaluator, xml, self.trainData, self.parse)
        else:
            print >> sys.stderr, "No self-training for unmerging"
    if self.checkStep("UNMERGING-EXAMPLES", self.unmerging) and self.unmerging:
        # Unmerging example generation
        # Gold files are the corresponding corpora with duplicates kept ("-nodup" removed)
        GOLD_TEST_FILE = self.optData.replace("-nodup", "")
        GOLD_TRAIN_FILE = self.trainData.replace("-nodup", "")
        if self.doUnmergingSelfTraining:
            if xml == None:
                # Re-use the self-training predictions written to disk by an earlier run
                xml = self.workDir+"unmerging-extra-edge-pred.xml.gz"
            self.unmergingDetector.buildExamples(self.model, [self.optData.replace("-nodup", ""), [self.trainData.replace("-nodup", ""), xml]], [self.workDir+"unmerging-opt-examples.gz", self.workDir+"unmerging-train-examples.gz"], [GOLD_TEST_FILE, [GOLD_TRAIN_FILE, GOLD_TRAIN_FILE]], exampleStyle=self.unmergingExampleStyle, saveIdsToModel=True)
            xml = None
        else:
            self.unmergingDetector.buildExamples(self.model, [self.optData.replace("-nodup", ""), self.trainData.replace("-nodup", "")], [self.workDir+"unmerging-opt-examples.gz", self.workDir+"unmerging-train-examples.gz"], [GOLD_TEST_FILE, GOLD_TRAIN_FILE], exampleStyle=self.unmergingExampleStyle, saveIdsToModel=True)
            xml = None
        #UnmergingExampleBuilder.run("/home/jari/biotext/EventExtension/TrainSelfClassify/test-predicted-edges.xml", GOLD_TRAIN_FILE, UNMERGING_TRAIN_EXAMPLE_FILE, PARSE, TOK, UNMERGING_FEATURE_PARAMS, UNMERGING_IDS, append=True)
    if self.checkStep("BEGIN-UNMERGING-MODEL", self.unmerging) and self.unmerging:
        self.unmergingDetector.beginModel(None, self.model, self.workDir+"unmerging-train-examples.gz", self.workDir+"unmerging-opt-examples.gz")
    if self.checkStep("END-UNMERGING-MODEL", self.unmerging) and self.unmerging:
        self.unmergingDetector.endModel(None, self.model, self.workDir+"unmerging-opt-examples.gz")
        print >> sys.stderr, "Adding unmerging classifier model to test-set event model"
        if self.combinedModel != None:
            # Copy the trained unmerging components into the combined (test-set) model
            self.combinedModel.addStr("unmerging-example-style", self.model.getStr("unmerging-example-style"))
            self.combinedModel.insert(self.model.get("unmerging-ids.classes"), "unmerging-ids.classes")
            self.combinedModel.insert(self.model.get("unmerging-ids.features"), "unmerging-ids.features")
            self.unmergingDetector.addClassifierModel(self.combinedModel, self.model.get("unmerging-classifier-model", True), self.model.getStr("unmerging-classifier-parameter"))
            self.combinedModel.save()
def classify(self, data, model, output, parse=None, task=None):
    """
    Classify *data* with *model*, evaluate against the input corpus and
    produce a BioNLP ST ".tar.gz" export (evaluated if an ST evaluator exists).
    """
    self.enterState(self.STATE_CLASSIFY)
    model = self.openModel(model, "r")
    # Model-stored defaults are used when the caller does not override them
    parse = self.getStr(self.tag + "parse", model) if parse is None else parse
    task = self.getStr(self.tag + "task", model) if task is None else task
    resultXML = self.classifyToXML(
        data, model, None, output + "-",
        model.get(self.tag + "classifier-model"), None, parse,
        float(model.get("recallAdjustParameter")))
    EvaluateInteractionXML.run(self.evaluator, resultXML, data, parse)
    STFormat.ConvertXML.toSTFormat(resultXML, output + ".tar.gz", outputTag="a2")
    if self.stEvaluator is not None:
        self.stEvaluator.evaluate(output + ".tar.gz", task)
    self.exitState()
def classify(self, data, model, output, parse=None, task=None, goldData=None, workDir=None, fromStep=None, omitSteps=None, validate=False):
    """
    Classify a corpus with a trained model, writing "<output>-pred.xml.gz"
    and optionally a BioNLP ST event archive plus its official evaluation.

    parse/task default to the values stored in the model. A temporary work
    directory is created (and removed at the end) when workDir is None.
    fromStep/omitSteps are accepted for interface compatibility but unused here.
    """
    model = self.openModel(model, "r")
    self.enterState(self.STATE_CLASSIFY)
    self.setWorkDir(workDir)
    if workDir == None:
        self.setTempWorkDir()
    # NOTE(review): second openModel call appears redundant — confirm intent
    model = self.openModel(model, "r")
    if parse == None:
        parse = self.getStr(self.tag + "parse", model)
    # Prefix for intermediate files inside the working directory
    workOutputTag = os.path.join(self.workDir, os.path.basename(output) + "-")
    xml = self.classifyToXML(
        data, model, None, workOutputTag,
        model.get(self.tag + "classifier-model", defaultIfNotExist=None),
        goldData, parse,
        float(model.getStr("recallAdjustParameter", defaultIfNotExist=1.0)))
    if (validate):
        # Validate (and thereby possibly modify) predictions before writing
        self.structureAnalyzer.load(model)
        self.structureAnalyzer.validate(xml)
        ETUtils.write(xml, output + "-pred.xml.gz")
    else:
        # The unvalidated work file is the final result; copy it into place
        shutil.copy2(workOutputTag + self.tag + "pred.xml.gz",
                     output + "-pred.xml.gz")
    EvaluateInteractionXML.run(self.evaluator, xml, data, parse)
    # Optional BioNLP Shared Task conversion / evaluation
    stParams = self.getBioNLPSharedTaskParams(self.bioNLPSTParams, model)
    if stParams["convert"]: #self.useBioNLPSTFormat:
        extension = ".zip" if (stParams["convert"] == "zip") else ".tar.gz"
        Utils.STFormat.ConvertXML.toSTFormat(
            xml, output + "-events" + extension,
            outputTag=stParams["a2Tag"],
            writeExtra=(stParams["scores"] == True))
        if stParams["evaluate"]: #self.stEvaluator != None:
            if task == None:
                task = self.getStr(self.tag + "task", model)
            self.stEvaluator.evaluate(output + "-events" + extension, task)
    self.deleteTempWorkDir()
    self.exitState()
def evaluateGrid(self, xml, params, bestResults):
    """
    Score one grid-search parameter combination and return the running best.

    xml: predicted corpus for this parameter combination (None = no predictions)
    params: the parameter combination being evaluated
    bestResults: current best as (params, evaluation, score) or None
    Returns the (possibly updated) bestResults triple.
    """
    if xml != None:
        # TODO: Where should the EvaluateInteractionXML evaluator come from?
        EIXMLResult = EvaluateInteractionXML.run(self.edgeDetector.evaluator, xml, self.optData, self.parse)
        # Convert to ST-format
        STFormat.ConvertXML.toSTFormat(xml, self.workDir+"grid-flat-geniaformat", "a2") #getA2FileTag(options.task, subTask))
        stFormatDir = self.workDir+"grid-flat-geniaformat"
        if self.unmerging:
            # Re-run unmerging on the predictions and evaluate those instead
            xml = self.unmergingDetector.classifyToXML(xml, self.model, None, self.workDir+"grid-", goldData=self.optData.replace("-nodup", ""))
            STFormat.ConvertXML.toSTFormat(xml, self.workDir+"grid-unmerging-geniaformat", "a2")
            stFormatDir = self.workDir+"grid-unmerging-geniaformat"
        stEvaluation = self.stEvaluator.evaluate(stFormatDir, self.task)
        if stEvaluation != None:
            # Shared task evaluation succeeded: compare by its primary score
            if bestResults == None or stEvaluation[0] > bestResults[1][0]:
                bestResults = (params, stEvaluation, stEvaluation[0])
        else:
            # Fall back to internal interaction-XML fscore comparison
            if bestResults == None or EIXMLResult.getData().fscore > bestResults[1].getData().fscore:
                bestResults = (params, EIXMLResult, EIXMLResult.getData().fscore)
        # Clean up the ST-format directories for the next grid point
        shutil.rmtree(self.workDir+"grid-flat-geniaformat")
        if os.path.exists(self.workDir+"grid-unmerging-geniaformat"):
            shutil.rmtree(self.workDir+"grid-unmerging-geniaformat")
    else:
        print >> sys.stderr, "No predicted edges"
    return bestResults
def evaluateGrid(self, xml, params, bestResults):
    """
    Score one grid-search parameter combination and return the running best.
    ST-format conversion/evaluation is done only when the shared task
    evaluation is enabled in bioNLPSTParams; otherwise the internal
    interaction-XML fscore decides.
    """
    if xml != None:
        # TODO: Where should the EvaluateInteractionXML evaluator come from?
        EIXMLResult = EvaluateInteractionXML.run(self.edgeDetector.evaluator, xml, self.optData, self.parse)
        # Convert to ST-format
        if self.unmerging:
            xml = self.unmergingDetector.classifyToXML(xml, self.model, None, self.workDir+"grid-", goldData=self.optData)
            #self.structureAnalyzer.validate(xml)
            if self.bioNLPSTParams["evaluate"]:
                Utils.STFormat.ConvertXML.toSTFormat(xml, self.workDir+"grid-unmerging-geniaformat", "a2")
                stFormatDir = self.workDir+"grid-unmerging-geniaformat"
        elif self.bioNLPSTParams["evaluate"]:
            #self.structureAnalyzer.validate(xml)
            Utils.STFormat.ConvertXML.toSTFormat(xml, self.workDir+"grid-flat-geniaformat", "a2") #getA2FileTag(options.task, subTask))
            stFormatDir = self.workDir+"grid-flat-geniaformat"
        # Evaluation
        # Attempt shared task evaluation
        stEvaluation = None
        if self.bioNLPSTParams["evaluate"]:
            # stFormatDir was set in one of the branches above when "evaluate" is on
            stEvaluation = self.stEvaluator.evaluate(stFormatDir, self.task)
        if stEvaluation != None:
            if bestResults == None or stEvaluation[0] > bestResults[1][0]:
                bestResults = (params, stEvaluation, stEvaluation[0])
        else: # If shared task evaluation was not done (failed or not requested) fall back to internal evaluation
            if bestResults == None or EIXMLResult.getData().fscore > bestResults[1].getData().fscore:
                bestResults = (params, EIXMLResult, EIXMLResult.getData().fscore)
        # Remove ST-format files
        if os.path.exists(self.workDir+"grid-flat-geniaformat"):
            shutil.rmtree(self.workDir+"grid-flat-geniaformat")
        if os.path.exists(self.workDir+"grid-unmerging-geniaformat"):
            shutil.rmtree(self.workDir+"grid-unmerging-geniaformat")
    else:
        print >> sys.stderr, "No predicted edges"
    return bestResults
def classify(self, data, model, output, parse=None, task=None, fromStep=None, toStep=None):
    """
    Full multi-stage classification pipeline: TRIGGERS -> EDGES -> UNMERGING
    -> MODIFIERS -> ST-CONVERT, driven by the checkStep state machine so
    that processing can be resumed from / stopped at a given step.

    data: input corpus; model: trained model (path or object);
    output: path stem for all per-step prediction files.
    """
    BINARY_RECALL_MODE = False # TODO: make a parameter
    xml = None
    self.initVariables(classifyData=data, model=model, xml=None, task=task, parse=parse)
    self.enterState(self.STATE_CLASSIFY, ["TRIGGERS", "EDGES", "UNMERGING", "MODIFIERS", "ST-CONVERT"], fromStep, toStep)
    #self.enterState(self.STATE_CLASSIFY, ["TRIGGERS", "RECALL-ADJUST", "EDGES", "UNMERGING", "MODIFIERS", "ST-CONVERT"], fromStep, toStep)
    self.model = self.openModel(self.model, "r")
    if self.checkStep("TRIGGERS"):
        xml = self.triggerDetector.classifyToXML(self.classifyData, self.model, None, output + "-", parse=self.parse, recallAdjust=float(self.getStr("recallAdjustParameter", self.model)))
    if self.checkStep("EDGES"):
        # When resuming at this step xml is None and the on-disk file is used
        xml = self.getWorkFile(xml, output + "-recall-adjusted.xml.gz")
        xml = self.edgeDetector.classifyToXML(xml, self.model, None, output + "-", parse=self.parse)
        assert xml != None
        if self.parse == None:
            edgeParse = self.getStr(self.edgeDetector.tag+"parse", self.model)
        else:
            edgeParse = self.parse
        #EvaluateInteractionXML.run(self.edgeDetector.evaluator, xml, self.classifyData, edgeParse)
        EvaluateInteractionXML.run(self.edgeDetector.evaluator, xml, None, edgeParse)
    if self.checkStep("UNMERGING"):
        if self.model.hasMember("unmerging-classifier-model"):
            #xml = self.getWorkFile(xml, output + "-edge-pred.xml.gz")
            # To avoid running out of memory, always use file on disk
            xml = self.getWorkFile(None, output + "-edge-pred.xml.gz")
            goldData = None
            if type(self.classifyData) in types.StringTypes:
                # Use the corresponding with-duplicates corpus as gold, if present
                if os.path.exists(self.classifyData.replace("-nodup", "")):
                    goldData = self.classifyData.replace("-nodup", "")
            xml = self.unmergingDetector.classifyToXML(xml, self.model, None, output + "-", goldData=goldData, parse=self.parse)
        else:
            print >> sys.stderr, "No model for unmerging"
    if self.checkStep("MODIFIERS"):
        if self.model.hasMember("modifier-classifier-model"):
            # Prefer unmerging output; fall back to edge output
            xml = self.getWorkFile(xml, [output + "-unmerging-pred.xml.gz", output + "-edge-pred.xml.gz"])
            xml = self.modifierDetector.classifyToXML(xml, self.model, None, output + "-", parse=self.parse)
        else:
            print >> sys.stderr, "No model for modifier detection"
    if self.checkStep("ST-CONVERT"):
        # Use the most processed prediction file available
        xml = self.getWorkFile(xml, [output + "-modifier-pred.xml.gz", output + "-unmerging-pred.xml.gz", output + "-edge-pred.xml.gz"])
        STFormat.ConvertXML.toSTFormat(xml, output+"-events.tar.gz", outputTag="a2", writeScores=self.stWriteScores)
        if self.stEvaluator != None:
            task = self.task
            if task == None:
                task = self.getStr(self.edgeDetector.tag+"task", self.model)
            self.stEvaluator.evaluate(output + "-events.tar.gz", task)
    self.exitState()
def classify(self, data, model, output, parse=None, task=None, goldData=None, workDir=None, fromStep=None, omitSteps=None):
    """
    Classify a corpus with a trained model and copy the predictions to
    "<output>-pred.xml.gz"; optionally convert to and evaluate the
    BioNLP ST event archive. parse/task default to model-stored values.
    fromStep/omitSteps are accepted for interface compatibility but unused here.
    """
    model = self.openModel(model, "r")
    self.enterState(self.STATE_CLASSIFY)
    self.setWorkDir(workDir)
    if workDir == None:
        self.setTempWorkDir()
    # NOTE(review): model is opened twice — presumably harmless, verify
    model = self.openModel(model, "r")
    if parse == None:
        parse = self.getStr(self.tag+"parse", model)
    if task == None:
        task = self.getStr(self.tag+"task", model)
    # Prefix for intermediate files in the working directory
    workOutputTag = os.path.join(self.workDir, os.path.basename(output) + "-")
    xml = self.classifyToXML(data, model, None, workOutputTag, model.get(self.tag+"classifier-model"), goldData, parse, float(model.getStr("recallAdjustParameter", defaultIfNotExist=1.0)))
    shutil.copy2(workOutputTag+self.tag+"pred.xml.gz", output+"-pred.xml.gz")
    EvaluateInteractionXML.run(self.evaluator, xml, data, parse)
    # Optional BioNLP Shared Task conversion / evaluation
    stParams = self.getBioNLPSharedTaskParams(self.bioNLPSTParams, model)
    if stParams["convert"]: #self.useBioNLPSTFormat:
        Utils.STFormat.ConvertXML.toSTFormat(xml, output+"-events.tar.gz", outputTag="a2")
        if stParams["evaluate"]: #self.stEvaluator != None:
            self.stEvaluator.evaluate(output+"-events.tar.gz", task)
    self.deleteTempWorkDir()
    self.exitState()
def evaluateGrid(self, xml, params, bestResults):
    """
    Score one grid-search parameter combination and return the running best
    as (params, evaluation, score). Shared task evaluation is attempted only
    when enabled in bioNLPSTParams; otherwise the internal interaction-XML
    fscore is used for the comparison.
    """
    #traceback.print_stack()
    #pdb.set_trace()
    if xml != None:
        # TODO: Where should the EvaluateInteractionXML evaluator come from?
        EIXMLResult = EvaluateInteractionXML.run(
            self.edgeDetector.evaluator, xml, self.optData, self.parse)
        # Convert to ST-format
        if self.unmerging:
            xml = self.unmergingDetector.classifyToXML(
                xml, self.model, None, self.workDir + "grid-",
                goldData=self.optData)
            #self.structureAnalyzer.validate(xml)
            if self.bioNLPSTParams["evaluate"]:
                Utils.STFormat.ConvertXML.toSTFormat(
                    xml, self.workDir + "grid-unmerging-geniaformat", "a2")
                stFormatDir = self.workDir + "grid-unmerging-geniaformat"
        elif self.bioNLPSTParams["evaluate"]:
            #self.structureAnalyzer.validate(xml)
            Utils.STFormat.ConvertXML.toSTFormat(
                xml, self.workDir + "grid-flat-geniaformat", "a2") #getA2FileTag(options.task, subTask))
            stFormatDir = self.workDir + "grid-flat-geniaformat"
        # Evaluation
        # Attempt shared task evaluation
        stEvaluation = None
        if self.bioNLPSTParams["evaluate"]:
            # stFormatDir was assigned above whenever "evaluate" is enabled
            stEvaluation = self.stEvaluator.evaluate(stFormatDir, self.task)
        if stEvaluation != None:
            if bestResults == None or stEvaluation[0] > bestResults[1][0]:
                bestResults = (params, stEvaluation, stEvaluation[0])
        else: # If shared task evaluation was not done (failed or not requested) fall back to internal evaluation
            if bestResults == None or EIXMLResult.getData().fscore > bestResults[1].getData().fscore:
                bestResults = (params, EIXMLResult, EIXMLResult.getData().fscore)
        # Remove ST-format files
        if os.path.exists(self.workDir + "grid-flat-geniaformat"):
            shutil.rmtree(self.workDir + "grid-flat-geniaformat")
        if os.path.exists(self.workDir + "grid-unmerging-geniaformat"):
            shutil.rmtree(self.workDir + "grid-unmerging-geniaformat")
    else:
        print >> sys.stderr, "No predicted edges"
    return bestResults
def evaluateGrid(self, xml, params, bestResults):
    """
    Score one grid-search parameter combination and return the running best
    as (params, evaluation, score). Predictions are always converted to
    ST-format; the shared task evaluator decides the winner when it
    produces a result, otherwise the internal fscore is compared.
    """
    if xml != None:
        # TODO: Where should the EvaluateInteractionXML evaluator come from?
        EIXMLResult = EvaluateInteractionXML.run(
            self.edgeDetector.evaluator, xml, self.optData, self.parse)
        # Convert to ST-format
        STFormat.ConvertXML.toSTFormat(
            xml, self.workDir + "grid-flat-geniaformat", "a2") #getA2FileTag(options.task, subTask))
        stFormatDir = self.workDir + "grid-flat-geniaformat"
        if self.unmerging:
            # Re-run unmerging on the predictions and evaluate those instead
            xml = self.unmergingDetector.classifyToXML(
                xml, self.model, None, self.workDir + "grid-",
                goldData=self.optData.replace("-nodup", ""))
            STFormat.ConvertXML.toSTFormat(
                xml, self.workDir + "grid-unmerging-geniaformat", "a2")
            stFormatDir = self.workDir + "grid-unmerging-geniaformat"
        stEvaluation = self.stEvaluator.evaluate(stFormatDir, self.task)
        if stEvaluation != None:
            # Shared task evaluation succeeded: compare by its primary score
            if bestResults == None or stEvaluation[0] > bestResults[1][0]:
                bestResults = (params, stEvaluation, stEvaluation[0])
        else:
            # Fall back to internal interaction-XML fscore comparison
            if bestResults == None or EIXMLResult.getData().fscore > bestResults[1].getData().fscore:
                bestResults = (params, EIXMLResult,
                               EIXMLResult.getData().fscore)
        # Clean up ST-format directories for the next grid point
        shutil.rmtree(self.workDir + "grid-flat-geniaformat")
        if os.path.exists(self.workDir + "grid-unmerging-geniaformat"):
            shutil.rmtree(self.workDir + "grid-unmerging-geniaformat")
    else:
        print >> sys.stderr, "No predicted edges"
    return bestResults
def buildExamplesFromGraph(self, sentenceGraph, outfile, goldGraph=None):
    """
    Build one example for each token of the sentence

    One classification example is produced per non-name entity (the entity's
    head token carries the features). Examples are appended to *outfile*;
    returns the number of examples written.
    """
    examples = []
    exampleIndex = 0
    self.tokenFeatures = {}
    # Map predicted entities to gold entities when a gold graph is available
    if goldGraph != None:
        entityToGold = EvaluateInteractionXML.mapEntities(sentenceGraph.entities, goldGraph.entities)
    # Sentence-level counts of named vs. other entities, used as features
    namedEntityCount = 0
    entityCount = 0
    for entity in sentenceGraph.entities:
        if entity.get("isName") == "True": # known data which can be used for features
            namedEntityCount += 1
        else: # known data which can be used for features
            entityCount += 1
    namedEntityCountFeature = "nameCount_" + str(namedEntityCount)
    entityCountFeature = "entityCount_" + str(entityCount)
    # Bag-of-words over the sentence, with extra variants for name tokens
    # ("ne_"), entity-head tokens ("ge_") and speculation words ("spec_bow_")
    bagOfWords = {}
    for token in sentenceGraph.tokens:
        text = "bow_" + token.get("text")
        if not bagOfWords.has_key(text):
            bagOfWords[text] = 0
        bagOfWords[text] += 1
        if sentenceGraph.tokenIsName[token]:
            text = "ne_" + text
            if not bagOfWords.has_key(text):
                bagOfWords[text] = 0
            bagOfWords[text] += 1
        if len(sentenceGraph.tokenIsEntityHead) > 0:
            text = "ge_" + text
            if not bagOfWords.has_key(text):
                bagOfWords[text] = 0
            bagOfWords[text] += 1
        text = token.get("text")
        if self.styles["speculation_words"] and text in self.specWords:
            if not bagOfWords.has_key("spec_bow_" + text):
                bagOfWords["spec_bow_" + text] = 0
            bagOfWords["spec_bow_" + text] += 1
            bagOfWords["spec_sentence"] = 1
    bowFeatures = {}
    for k, v in bagOfWords.iteritems():
        bowFeatures[self.featureSet.getId(k)] = v
    # Pre-index dependency edges per token
    self.inEdgesByToken = {}
    self.outEdgesByToken = {}
    self.edgeSetByToken = {}
    for token in sentenceGraph.tokens:
        inEdges = sentenceGraph.dependencyGraph.getInEdges(token)
        self.inEdgesByToken[token] = inEdges
        outEdges = sentenceGraph.dependencyGraph.getOutEdges(token)
        self.outEdgesByToken[token] = outEdges
        self.edgeSetByToken[token] = set(inEdges + outEdges)
    for entity in sentenceGraph.entities:
        #token = sentenceGraph.tokens[i]
        token = sentenceGraph.entityHeadTokenByEntity[entity]
        # Recognize only non-named entities (i.e. interaction words)
        if entity.get("isName") == "True":
            continue
        # CLASS
        # NOTE(review): if self.styles["classification"] is none of the three
        # values below, categoryName/task3Type stay unbound — confirm callers
        # always use a supported classification style
        if self.styles["classification"] == "multiclass":
            task3Type = "multiclass"
            categoryName = ""
            if entity.get("negation") == "True":
                categoryName += "negation"
            if entity.get("speculation") == "True":
                if categoryName != "":
                    categoryName += "---"
                categoryName += "speculation"
            if categoryName == "":
                categoryName = "neg"
            category = self.classSet.getId(categoryName)
        elif self.styles["classification"] == "speculation":
            task3Type = "speculation"
            if entity.get("speculation") == "True":
                category = self.classSet.getId("speculation")
            else:
                category = 1
            # With a gold graph, the gold annotation overrides the predicted one
            if goldGraph != None:
                if len(entityToGold[entity]) > 0 and entityToGold[entity][0].get("speculation") == "True":
                    category = self.classSet.getId("speculation")
                else:
                    category = 1
            categoryName = self.classSet.getName(category)
        elif self.styles["classification"] == "negation":
            task3Type = "negation"
            if entity.get("negation") == "True":
                category = self.classSet.getId("negation")
            else:
                category = 1
            if goldGraph != None:
                if len(entityToGold[entity]) > 0 and entityToGold[entity][0].get("negation") == "True":
                    category = self.classSet.getId("negation")
                else:
                    category = 1
            categoryName = self.classSet.getName(category)
        self.exampleStats.beginExample(categoryName)
        # FEATURES
        features = {}
        # ENTITY TYPE
        #entityType = self.classSet.getId(self.getMergedEntityType(entity))
        #del self.classSet.Ids[self.getMergedEntityType(entity)]
        #IF LOCAL
        # There's a mistake here. The entityType should be the string, not
        # the id of the type. But there's also another issue. getMergedEntityType
        # expects a list, not an item. Therefore the type is always empty ->
        # types don't get used in classification. But this is the code used in
        # the publication, so it will now be published as is, and fixed in a later
        # release.
        #
        # Besides, using the classSet here generates an unneeded
        # additional class, that shows up in evaluations etc. However, to be
        # able to publish the exact models used for the publication experiments,
        # this can't be fixed so it breaks feature id consistency. Therefore I'll
        # now just remove the redundant class id from the classSet.
        #ENDIF
        #features[self.featureSet.getId(entityType)] = 1
        features[self.featureSet.getId(namedEntityCountFeature)] = 1
        features[self.featureSet.getId(entityCountFeature)] = 1
        #for k,v in bagOfWords.iteritems():
        #    features[self.featureSet.getId(k)] = v
        # pre-calculated bow features
        features.update(bowFeatures)
        #    for j in range(len(sentenceGraph.tokens)):
        #        text = "bow_" + sentenceGraph.tokens[j].get("text")
        #        if j < i:
        #            features[self.featureSet.getId("bf_" + text)] = 1
        #        elif j > i:
        #            features[self.featureSet.getId("af_" + text)] = 1
        # Main features
        text = token.get("text")
        features[self.featureSet.getId("txt_" + text)] = 1
        features[self.featureSet.getId("POS_" + token.get("POS"))] = 1
        stem = PorterStemmer.stem(text)
        features[self.featureSet.getId("stem_" + stem)] = 1
        features[self.featureSet.getId("nonstem_" + text[len(stem):])] = 1
        if self.styles["speculation_words"]:
            if text in self.specWords:
                features[self.featureSet.getId("ent_spec")] = 1
            if stem in self.specWordStems:
                features[self.featureSet.getId("ent_spec_stem")] = 1
        # Linear order features: locate the head token's index i ...
        for i in range(len(sentenceGraph.tokens)):
            if token == sentenceGraph.tokens[i]:
                break
        # ... then add features for a +-3 token window around it
        for index in [-3, -2, -1, 1, 2, 3]:
            if i + index > 0 and i + index < len(sentenceGraph.tokens):
                self.buildLinearOrderFeatures(sentenceGraph, i + index, str(index), features)
        # Content
        if i > 0 and text[0].isalpha() and text[0].isupper():
            features[self.featureSet.getId("upper_case_start")] = 1
        for j in range(len(text)):
            if j > 0 and text[j].isalpha() and text[j].isupper():
                features[self.featureSet.getId("upper_case_middle")] = 1
            # numbers and special characters
            if text[j].isdigit():
                features[self.featureSet.getId("has_digits")] = 1
                if j > 0 and text[j - 1] == "-":
                    features[self.featureSet.getId("has_hyphenated_digit")] = 1
            elif text[j] == "-":
                features[self.featureSet.getId("has_hyphen")] = 1
            elif text[j] == "/":
                features[self.featureSet.getId("has_fslash")] = 1
            elif text[j] == "\\":
                features[self.featureSet.getId("has_bslash")] = 1
            # duplets
            if j > 0:
                features[self.featureSet.getId("dt_" + text[j - 1:j + 1].lower())] = 1
            # triplets
            if j > 1:
                features[self.featureSet.getId("tt_" + text[j - 2:j + 1].lower())] = 1
        # Attached edges (Hanging in and out edges)
        t1InEdges = self.inEdgesByToken[token]
        for edge in t1InEdges:
            edgeType = edge[2].get("type")
            features[self.featureSet.getId("t1HIn_" + edgeType)] = 1
            features[self.featureSet.getId("t1HIn_" + edge[0].get("POS"))] = 1
            features[self.featureSet.getId("t1HIn_" + edgeType + "_" + edge[0].get("POS"))] = 1
            tokenText = sentenceGraph.getTokenText(edge[0])
            features[self.featureSet.getId("t1HIn_" + tokenText)] = 1
            features[self.featureSet.getId("t1HIn_" + edgeType + "_" + tokenText)] = 1
        t1OutEdges = self.outEdgesByToken[token]
        for edge in t1OutEdges:
            edgeType = edge[2].get("type")
            features[self.featureSet.getId("t1HOut_" + edgeType)] = 1
            features[self.featureSet.getId("t1HOut_" + edge[1].get("POS"))] = 1
            features[self.featureSet.getId("t1HOut_" + edgeType + "_" + edge[1].get("POS"))] = 1
            tokenText = sentenceGraph.getTokenText(edge[1])
            features[self.featureSet.getId("t1HOut_" + tokenText)] = 1
            features[self.featureSet.getId("t1HOut_" + edgeType + "_" + tokenText)] = 1
        self.buildChains(token, sentenceGraph, features)
        extra = {
            "xtype": "task3",
            "t3type": task3Type,
            "t": token.get("id"),
            "entity": entity.get("id")
        }
        #examples.append( (sentenceGraph.getSentenceId()+".x"+str(exampleIndex),category,features,extra) )
        example = (sentenceGraph.getSentenceId() + ".x" + str(exampleIndex), category, features, extra)
        ExampleUtils.appendExamples([example], outfile)
        exampleIndex += 1
        self.exampleStats.endExample()
    #return examples
    return exampleIndex
def buildExamplesFromGraph(self, sentenceGraph, outfile, goldGraph = None):
    """
    Build examples for a single sentence. Returns a list of examples.
    See Core/ExampleUtils for example format.

    (Note: despite the historical comment above, examples are appended to
    *outfile* and the number of examples written is returned.)
    """
    #examples = []
    exampleIndex = 0
    if self.styles["trigger_features"]:
        self.triggerFeatureBuilder.initSentence(sentenceGraph)
    if self.styles["evex"]:
        self.evexFeatureBuilder.initSentence(sentenceGraph)
    # Filter entities, if needed
    #mergedIds = None
    #duplicateEntities = None
    #entities = sentenceGraph.entities
    #entities, mergedIds, duplicateEntities = self.mergeEntities(sentenceGraph, False) # "no_duplicates" in self.styles)
    # Merge duplicate entities so each (span, type) is processed once
    sentenceGraph.mergeInteractionGraph(True)
    entities = sentenceGraph.mergedEntities
    entityToDuplicates = sentenceGraph.mergedEntityToDuplicates
    self.exampleStats.addValue("Duplicate entities skipped", len(sentenceGraph.entities) - len(entities))
    # Connect to optional gold graph
    if goldGraph != None:
        entityToGold = EvaluateInteractionXML.mapEntities(entities, goldGraph.entities)
    paths = None
    if not self.styles["no_path"]:
        ##undirected = sentenceGraph.getUndirectedDependencyGraph()
        #undirected = self.nxMultiDiGraphToUndirected(sentenceGraph.dependencyGraph)
        ###undirected = sentenceGraph.dependencyGraph.to_undirected()
        ####undirected = NX10.MultiGraph(sentenceGraph.dependencyGraph) This didn't work
        undirected = sentenceGraph.dependencyGraph.toUndirected()
        #paths = NX10.all_pairs_shortest_path(undirected, cutoff=999)
        paths = undirected
    #for edge in sentenceGraph.dependencyGraph.edges:
    #    assert edge[2] != None
    #for edge in undirected.edges:
    #    assert edge[2] != None
    #if sentenceGraph.sentenceElement.get("id") == "GENIA.d70.s5":
    #    print [(x[0].get("id"), x[1].get("id"), x[2].get("id")) for x in sentenceGraph.dependencyGraph.edges]
    # Generate examples based on interactions between entities or interactions between tokens
    if self.styles["entities"]:
        loopRange = len(entities)
    else:
        loopRange = len(sentenceGraph.tokens)
    # Iterate over all unordered pairs; directed mode emits both directions
    for i in range(loopRange-1):
        for j in range(i+1,loopRange):
            eI = None
            eJ = None
            if self.styles["entities"]:
                eI = entities[i]
                eJ = entities[j]
                tI = sentenceGraph.entityHeadTokenByEntity[eI]
                tJ = sentenceGraph.entityHeadTokenByEntity[eJ]
                #if "no_ne_interactions" in self.styles and eI.get("isName") == "True" and eJ.get("isName") == "True":
                #    continue
                if eI.get("type") == "neg" or eJ.get("type") == "neg":
                    continue
                if self.styles["skip_extra_triggers"]:
                    if eI.get("source") != None or eJ.get("source") != None:
                        continue
            else:
                tI = sentenceGraph.tokens[i]
                tJ = sentenceGraph.tokens[j]
            # only consider paths between entities (NOTE! entities, not only named entities)
            if self.styles["headsOnly"]:
                if (len(sentenceGraph.tokenIsEntityHead[tI]) == 0) or (len(sentenceGraph.tokenIsEntityHead[tJ]) == 0):
                    continue
            if self.styles["directed"]:
                # define forward
                if self.styles["entities"]:
                    categoryName = self.getCategoryName(sentenceGraph, eI, eJ, True)
                    if goldGraph != None:
                        categoryName = self.getGoldCategoryName(goldGraph, entityToGold, eI, eJ, True)
                else:
                    categoryName = self.getCategoryNameFromTokens(sentenceGraph, tI, tJ, True)
                # make forward
                self.exampleStats.beginExample(categoryName)
                makeExample = True
                # Task-specific filters decide whether this pair can be an example
                if self.styles["genia_limits"] and not self.isPotentialGeniaInteraction(eI, eJ):
                    makeExample = False
                    self.exampleStats.filter("genia_limits")
                if self.styles["genia_task1"] and (eI.get("type") == "Entity" or eJ.get("type") == "Entity"):
                    makeExample = False
                    self.exampleStats.filter("genia_task1")
                if self.styles["rel_limits"] and not self.isPotentialRELInteraction(eI, eJ):
                    makeExample = False
                    self.exampleStats.filter("rel_limits")
                if self.styles["co_limits"] and not self.isPotentialCOInteraction(eI, eJ, sentenceGraph):
                    makeExample = False
                    self.exampleStats.filter("co_limits")
                if self.styles["bb_limits"] and not self.isPotentialBBInteraction(eI, eJ, sentenceGraph):
                    makeExample = False
                    self.exampleStats.filter("bb_limits")
                    if categoryName != "neg":
                        self.exampleStats.filter("bb_limits(" + categoryName + ":" + eI.get("type") + "/" + eJ.get("type") + ")")
                if self.styles["bi_limits"] and not self.isPotentialBIInteraction(eI, eJ, sentenceGraph, self.exampleStats):
                    makeExample = False
                    #self.exampleStats.filter("bi_limits")
                if self.styles["epi_limits"] and not self.isPotentialEPIInteraction(eI, eJ, sentenceGraph):
                    makeExample = False
                    self.exampleStats.filter("epi_limits")
                if self.styles["id_limits"] and not self.isPotentialIDInteraction(eI, eJ, sentenceGraph):
                    makeExample = False
                    self.exampleStats.filter("id_limits")
                #if self.styles["selftrain_limits"] and (eI.get("selftrain") == "False" or eJ.get("selftrain") == "False"):
                #    makeExample = False
                #    self.exampleStats.filter("selftrain_limits")
                #if self.styles["selftrain_group"] and (eI.get("selftraingroup") not in self.selfTrainGroups or eJ.get("selftraingroup") not in self.selfTrainGroups):
                #    makeExample = False
                #    self.exampleStats.filter("selftrain_group")
                if self.styles["pos_only"] and categoryName == "neg":
                    makeExample = False
                    self.exampleStats.filter("pos_only")
                if makeExample:
                    #examples.append( self.buildExample(tI, tJ, paths, sentenceGraph, categoryName, exampleIndex, eI, eJ) )
                    ExampleUtils.appendExamples([self.buildExample(tI, tJ, paths, sentenceGraph, categoryName, exampleIndex, eI, eJ)], outfile)
                    exampleIndex += 1
                self.exampleStats.endExample()
                # define reverse
                if self.styles["entities"]:
                    categoryName = self.getCategoryName(sentenceGraph, eJ, eI, True)
                    if goldGraph != None:
                        categoryName = self.getGoldCategoryName(goldGraph, entityToGold, eJ, eI, True)
                else:
                    categoryName = self.getCategoryNameFromTokens(sentenceGraph, tJ, tI, True)
                # make reverse
                self.exampleStats.beginExample(categoryName)
                makeExample = True
                # Same filters as the forward direction, with the pair swapped
                if self.styles["genia_limits"] and not self.isPotentialGeniaInteraction(eJ, eI):
                    makeExample = False
                    self.exampleStats.filter("genia_limits")
                if self.styles["genia_task1"] and (eI.get("type") == "Entity" or eJ.get("type") == "Entity"):
                    makeExample = False
                    self.exampleStats.filter("genia_task1")
                if self.styles["rel_limits"] and not self.isPotentialRELInteraction(eJ, eI):
                    makeExample = False
                    self.exampleStats.filter("rel_limits")
                if self.styles["co_limits"] and not self.isPotentialCOInteraction(eJ, eI, sentenceGraph):
                    makeExample = False
                    self.exampleStats.filter("co_limits")
                if self.styles["bb_limits"] and not self.isPotentialBBInteraction(eJ, eI, sentenceGraph):
                    makeExample = False
                    self.exampleStats.filter("bb_limits")
                    if categoryName != "neg":
                        self.exampleStats.filter("bb_limits(" + categoryName + ":" + eJ.get("type") + "/" + eI.get("type") + ")")
                if self.styles["bi_limits"] and not self.isPotentialBIInteraction(eJ, eI, sentenceGraph, self.exampleStats):
                    makeExample = False
                    #self.exampleStats.filter("bi_limits")
                if self.styles["epi_limits"] and not self.isPotentialEPIInteraction(eJ, eI, sentenceGraph):
                    makeExample = False
                    self.exampleStats.filter("epi_limits")
                if self.styles["id_limits"] and not self.isPotentialIDInteraction(eJ, eI, sentenceGraph):
                    makeExample = False
                    self.exampleStats.filter("id_limits")
                #if self.styles["selftrain_limits"] and (eI.get("selftrain") == "False" or eJ.get("selftrain") == "False"):
                #    makeExample = False
                #    self.exampleStats.filter("selftrain_limits")
                #if self.styles["selftrain_group"] and (eI.get("selftraingroup") not in self.selfTrainGroups or eJ.get("selftraingroup") not in self.selfTrainGroups):
                #    makeExample = False
                #    self.exampleStats.filter("selftrain_group")
                if self.styles["pos_only"] and categoryName == "neg":
                    makeExample = False
                    self.exampleStats.filter("pos_only")
                if makeExample:
                    #examples.append( self.buildExample(tJ, tI, paths, sentenceGraph, categoryName, exampleIndex, eJ, eI) )
                    ExampleUtils.appendExamples([self.buildExample(tJ, tI, paths, sentenceGraph, categoryName, exampleIndex, eJ, eI)], outfile)
                    exampleIndex += 1
                self.exampleStats.endExample()
            else:
                # Undirected mode: one example combining both directions' features
                if self.styles["entities"]:
                    categoryName = self.getCategoryName(sentenceGraph, eI, eJ, False)
                else:
                    categoryName = self.getCategoryNameFromTokens(sentenceGraph, tI, tJ, False)
                self.exampleStats.beginExample(categoryName)
                forwardExample = self.buildExample(tI, tJ, paths, sentenceGraph, categoryName, exampleIndex, eI, eJ)
                if not self.styles["graph_kernel"]:
                    reverseExample = self.buildExample(tJ, tI, paths, sentenceGraph, categoryName, exampleIndex, eJ, eI)
                    forwardExample[2].update(reverseExample[2])
                #examples.append(forwardExample)
                ExampleUtils.appendExamples([forwardExample], outfile)
                exampleIndex += 1
                self.exampleStats.endExample()
    #return examples
    return exampleIndex
def buildExamplesFromGraph(self, sentenceGraph, outfile, goldGraph=None, structureAnalyzer=None):
    """
    Build one task-3 (speculation/negation modifier) example for each
    non-given entity of the sentence and append them to outfile.

    Returns the number of examples built (exampleIndex).

    @param sentenceGraph: the sentence graph to build examples from
    @param outfile: open file handle that ExampleUtils.appendExamples writes to
    @param goldGraph: optional gold-standard graph; when given, classes for
        "speculation"/"negation" modes are taken from the mapped gold entities
    @param structureAnalyzer: unused here, kept for interface compatibility
    """
    examples = []
    exampleIndex = 0
    self.tokenFeatures = {}
    if goldGraph != None:
        entityToGold = EvaluateInteractionXML.mapEntities(sentenceGraph.entities, goldGraph.entities)
    # Count given (known input) vs. predicted entities; used as whole-sentence features
    namedEntityCount = 0
    entityCount = 0
    for entity in sentenceGraph.entities:
        if entity.get("given") == "True": # known data which can be used for features
            namedEntityCount += 1
        else: # entities that are not given, i.e. targets of prediction
            entityCount += 1
    namedEntityCountFeature = "nameCount_" + str(namedEntityCount)
    entityCountFeature = "entityCount_" + str(entityCount)
    # Bag-of-words over the whole sentence, with extra "ne_"/"ge_" variants
    bagOfWords = {}
    for token in sentenceGraph.tokens:
        text = "bow_" + token.get("text")
        if not bagOfWords.has_key(text):
            bagOfWords[text] = 0
        bagOfWords[text] += 1
        if sentenceGraph.tokenIsName[token]:
            text = "ne_" + text
            if not bagOfWords.has_key(text):
                bagOfWords[text] = 0
            bagOfWords[text] += 1
        # NOTE(review): this tests the size of the whole tokenIsEntityHead map,
        # not tokenIsEntityHead[token] — so "ge_" fires for every token whenever
        # the sentence has any entity head; possibly intentional, confirm.
        if len(sentenceGraph.tokenIsEntityHead) > 0:
            text = "ge_" + text
            if not bagOfWords.has_key(text):
                bagOfWords[text] = 0
            bagOfWords[text] += 1
        text = token.get("text")
        if self.styles["speculation_words"] and text in self.specWords:
            if not bagOfWords.has_key("spec_bow_"+text):
                bagOfWords["spec_bow_"+text] = 0
            bagOfWords["spec_bow_"+text] += 1
            bagOfWords["spec_sentence"] = 1
    bowFeatures = {}
    for k,v in bagOfWords.iteritems():
        bowFeatures[self.featureSet.getId(k)] = v
    # Pre-index dependency edges by token for the feature builders below
    self.inEdgesByToken = {}
    self.outEdgesByToken = {}
    self.edgeSetByToken = {}
    for token in sentenceGraph.tokens:
        inEdges = sentenceGraph.dependencyGraph.getInEdges(token)
        self.inEdgesByToken[token] = inEdges
        outEdges = sentenceGraph.dependencyGraph.getOutEdges(token)
        self.outEdgesByToken[token] = outEdges
        self.edgeSetByToken[token] = set(inEdges + outEdges)
    for entity in sentenceGraph.entities:
        #token = sentenceGraph.tokens[i]
        token = sentenceGraph.entityHeadTokenByEntity[entity]
        # Recognize only non-named entities (i.e. interaction words)
        if entity.get("given") == "True":
            continue
        # CLASS: determine the example's class from the configured mode
        if self.styles["classification"] == "multiclass":
            task3Type = "multiclass"
            categoryName = ""
            if entity.get("negation") == "True":
                categoryName += "negation"
            if entity.get("speculation") == "True":
                if categoryName != "":
                    categoryName += "---"
                categoryName += "speculation"
            if categoryName == "":
                categoryName = "neg"
            category = self.classSet.getId(categoryName)
        elif self.styles["classification"] == "speculation":
            task3Type = "speculation"
            if entity.get("speculation") == "True":
                category = self.classSet.getId("speculation")
            else:
                category = 1
            # Gold graph, when available, overrides the entity's own attribute
            if goldGraph != None:
                if len(entityToGold[entity]) > 0 and entityToGold[entity][0].get("speculation") == "True":
                    category = self.classSet.getId("speculation")
                else:
                    category = 1
            categoryName = self.classSet.getName(category)
        elif self.styles["classification"] == "negation":
            task3Type = "negation"
            if entity.get("negation") == "True":
                category = self.classSet.getId("negation")
            else:
                category = 1
            if goldGraph != None:
                if len(entityToGold[entity]) > 0 and entityToGold[entity][0].get("negation") == "True":
                    category = self.classSet.getId("negation")
                else:
                    category = 1
            categoryName = self.classSet.getName(category)
        self.exampleStats.beginExample(categoryName)
        # FEATURES
        features = {}
        # ENTITY TYPE
        #entityType = self.classSet.getId(self.getMergedEntityType(entity))
        #del self.classSet.Ids[self.getMergedEntityType(entity)]
        #IF LOCAL
        # There's a mistake here. The entityType should be the string, not
        # the id of the type. But there's also another issue. getMergedEntityType
        # expects a list, not an item. Therefore the type is always empty ->
        # types don't get used in classification. But this is the code used in
        # the publication, so it will now be published as is, and fixed in a later
        # release.
        #
        # Besides, using the classSet here generates an unneeded
        # additional class, that shows up in evaluations etc. However, to be
        # able to publish the exact models used for the publication experiments,
        # this can't be fixed so it breaks feature id consistency. Therefore I'll
        # now just remove the redundant class id from the classSet.
        #ENDIF
        #features[self.featureSet.getId(entityType)] = 1
        features[self.featureSet.getId(namedEntityCountFeature)] = 1
        features[self.featureSet.getId(entityCountFeature)] = 1
        #for k,v in bagOfWords.iteritems():
        #    features[self.featureSet.getId(k)] = v
        # pre-calculated bag-of-words features (shared by all examples of this sentence)
        features.update(bowFeatures)
        #for j in range(len(sentenceGraph.tokens)):
        #    text = "bow_" + sentenceGraph.tokens[j].get("text")
        #    if j < i:
        #        features[self.featureSet.getId("bf_" + text)] = 1
        #    elif j > i:
        #        features[self.featureSet.getId("af_" + text)] = 1
        # Main features of the entity head token
        text = token.get("text")
        features[self.featureSet.getId("txt_"+text)] = 1
        features[self.featureSet.getId("POS_"+token.get("POS"))] = 1
        stem = PorterStemmer.stem(text)
        features[self.featureSet.getId("stem_"+stem)] = 1
        features[self.featureSet.getId("nonstem_"+text[len(stem):])] = 1
        if self.styles["speculation_words"]:
            if text in self.specWords:
                features[self.featureSet.getId("ent_spec")] = 1
            if stem in self.specWordStems:
                features[self.featureSet.getId("ent_spec_stem")] = 1
        # Linear order features: find the head token's index i in the sentence
        for i in range(len(sentenceGraph.tokens)):
            if token == sentenceGraph.tokens[i]:
                break
        for index in [-3,-2,-1,1,2,3]:
            if i + index > 0 and i + index < len(sentenceGraph.tokens):
                self.buildLinearOrderFeatures(sentenceGraph, i + index, str(index), features)
        # Content (surface-form) features of the head token text
        if i > 0 and text[0].isalpha() and text[0].isupper():
            features[self.featureSet.getId("upper_case_start")] = 1
        for j in range(len(text)):
            if j > 0 and text[j].isalpha() and text[j].isupper():
                features[self.featureSet.getId("upper_case_middle")] = 1
            # numbers and special characters
            if text[j].isdigit():
                features[self.featureSet.getId("has_digits")] = 1
                if j > 0 and text[j-1] == "-":
                    features[self.featureSet.getId("has_hyphenated_digit")] = 1
            elif text[j] == "-":
                features[self.featureSet.getId("has_hyphen")] = 1
            elif text[j] == "/":
                features[self.featureSet.getId("has_fslash")] = 1
            elif text[j] == "\\":
                features[self.featureSet.getId("has_bslash")] = 1
            # duplets (character bigrams)
            if j > 0:
                features[self.featureSet.getId("dt_"+text[j-1:j+1].lower())] = 1
            # triplets (character trigrams)
            if j > 1:
                features[self.featureSet.getId("tt_"+text[j-2:j+1].lower())] = 1
        # Attached edges (Hanging in and out edges)
        t1InEdges = self.inEdgesByToken[token]
        for edge in t1InEdges:
            edgeType = edge[2].get("type")
            features[self.featureSet.getId("t1HIn_"+edgeType)] = 1
            features[self.featureSet.getId("t1HIn_"+edge[0].get("POS"))] = 1
            features[self.featureSet.getId("t1HIn_"+edgeType+"_"+edge[0].get("POS"))] = 1
            tokenText = sentenceGraph.getTokenText(edge[0])
            features[self.featureSet.getId("t1HIn_"+tokenText)] = 1
            features[self.featureSet.getId("t1HIn_"+edgeType+"_"+tokenText)] = 1
        t1OutEdges = self.outEdgesByToken[token]
        for edge in t1OutEdges:
            edgeType = edge[2].get("type")
            features[self.featureSet.getId("t1HOut_"+edgeType)] = 1
            features[self.featureSet.getId("t1HOut_"+edge[1].get("POS"))] = 1
            features[self.featureSet.getId("t1HOut_"+edgeType+"_"+edge[1].get("POS"))] = 1
            tokenText = sentenceGraph.getTokenText(edge[1])
            features[self.featureSet.getId("t1HOut_"+tokenText)] = 1
            features[self.featureSet.getId("t1HOut_"+edgeType+"_"+tokenText)] = 1
        self.buildChains(token, sentenceGraph, features)
        extra = {"xtype":"task3","t3type":task3Type,"t":token.get("id"),"entity":entity.get("id")}
        #examples.append( (sentenceGraph.getSentenceId()+".x"+str(exampleIndex),category,features,extra) )
        example = (sentenceGraph.getSentenceId()+".x"+str(exampleIndex),category,features,extra)
        ExampleUtils.appendExamples([example], outfile)
        exampleIndex += 1
        self.exampleStats.endExample()
    #return examples
    return exampleIndex
def train(output, task=None, detector=None, inputFiles=None, models=None, parse=None,
          processUnmerging=None, processModifiers=None,
          bioNLPSTParams=None, preprocessorParams=None, exampleStyles=None,
          classifierParams=None, doFullGrid=False, deleteOutput=False, copyFrom=None,
          log="log.txt", step=None, omitSteps=None, debug=False, connection=None,
          subset=None, folds=None, corpusDir=None, corpusPreprocessing=None, evaluator=None):
    """
    Train a new model for event or relation detection.

    @param output: A directory where output files will appear.
    @param task: If defined, overridable default settings are used for many of the training parameters. Must be one of the supported TEES tasks.
    @param detector: a Detector object, or a string defining one to be imported
    @param inputFiles: A dictionary of file names, with keys "train", "devel" and, "test"
    @param models: A dictionary of file names defining the place for the new models, with keys "devel" and, "test"
    @param parse: The parse element name in the training interaction XML
    @param processUnmerging: Use the unmerging step of EventDetector. True, False or None for task default.
    @param processModifiers: Use the modifier detection step of EventDetector. True, False or None for task default.
    @param bioNLPSTParams: Parameters controlling BioNLP ST format output.
    @param preprocessorParams: Parameters controlling the preprocessor. Not used for training, but saved to the model for use when classifying.
    @param exampleStyles: A parameter set for controlling example builders.
    @param classifierParams: A parameter set for controlling classifiers.
    @param doFullGrid: Whether all parameters, as opposed to just recall adjustment, are tested in the EventDetector grid search.
    @param deleteOutput: Remove an existing output directory
    @param copyFrom: Copy an existing output directory for use as a template
    @param log: An optional alternative name for the log file. None is for no logging.
    @param step: A step=substep pair, where the steps are "TRAIN", "DEVEL", "EMPTY" and "TEST"
    @param omitSteps: step=substep parameters, where multiple substeps can be defined.
    @param debug: In debug mode, more output is shown, and some temporary intermediate files are saved
    @param connection: A parameter set defining a local or remote connection for training the classifier
    @param subset: A parameter set for making subsets of input files
    @param folds: A dictionary of fold definitions with keys "train", "devel" and "test"
    @param corpusDir: Optional alternative corpus directory passed to getTaskSettings
    @param corpusPreprocessing: Optional preprocessing step string; when given, the corpus is preprocessed before training
    @param evaluator: An optional evaluator class, or a string defining one to be imported
    """
    # Insert default arguments where needed
    inputFiles = setDictDefaults(inputFiles, {"train":None, "devel":None, "test":None})
    models = setDictDefaults(models, {"devel":"model-devel", "test":"model-test"})
    exampleStyles = setDictDefaults(exampleStyles, {"examples":None, "trigger":None, "edge":None, "unmerging":None, "modifiers":None})
    classifierParams = setDictDefaults(classifierParams, {"examples":None, "trigger":None, "recall":None, "edge":None, "unmerging":None, "modifiers":None})
    subset = setDictDefaults(Parameters.get(subset), {"train":None, "devel":None, "test":None, "seed":0, "all":None})
    folds = setDictDefaults(folds, {"train":None, "devel":None, "test":None})
    processUnmerging = getDefinedBool(processUnmerging)
    processModifiers = getDefinedBool(processModifiers)
    # Initialize working directory
    workdir(output, deleteOutput, copyFrom, log)
    # Get task specific parameters
    useKerasDetector = False
    if detector != None and "keras" in detector.lower():
        print >> sys.stderr, "Using a Keras Detector"
        useKerasDetector = True
        if detector.lower() == "keras":
            detector = None
    detector, bioNLPSTParams, preprocessorParams, folds = getTaskSettings(task, detector,
        bioNLPSTParams, preprocessorParams, inputFiles, exampleStyles, classifierParams,
        folds, corpusDir=corpusDir, useKerasDetector=useKerasDetector)
    # Learn training settings from input files
    detector = learnSettings(inputFiles, detector, classifierParams, task, exampleStyles, useKerasDetector=useKerasDetector)
    # Get corpus subsets
    getFolds(inputFiles, folds)
    getSubsets(inputFiles, subset)
    if task != None:
        task = task.replace("-FULL", "")
        if "." in task: # task of the form "task.subtask"
            _, subTask = getSubTask(task)
            if subTask != 3: # modifiers are only predicted for subtask 3
                processModifiers = False
    # Preprocess the corpus if required
    if corpusPreprocessing != None:
        preprocessor = Preprocessor(steps=corpusPreprocessing)
        assert preprocessor.steps[0].name == "MERGE_SETS"
        assert preprocessor.steps[-1].name == "DIVIDE_SETS"
        preprocessedCorpusDir = os.path.join(output, "corpus")
        #outputFiles = {x:os.path.join(preprocessedCorpusDir, os.path.basename(inputFiles[x])) for x in inputFiles}
        preprocessor.process(inputFiles, os.path.join(preprocessedCorpusDir, task))
        #inputFiles = outputFiles
        for setName in inputFiles.keys():
            if inputFiles[setName] != None:
                inputFiles[setName] = os.path.join(preprocessedCorpusDir, task + "-" + setName + ".xml")
    # Define processing steps
    selector, detectorSteps, omitDetectorSteps = getSteps(step, omitSteps, ["TRAIN", "DEVEL", "EMPTY", "TEST"])
    # Initialize the detector
    detector, detectorName = getDetector(detector, evaluator=evaluator)
    evaluator, evaluatorName = importClass(evaluator, "evaluator")
    detector = detector() # initialize object
    if evaluator != None:
        print >> sys.stderr, "Using evaluator", evaluator.__name__
        detector.evaluator = evaluator
    detector.debug = debug
    detector.bioNLPSTParams = detector.getBioNLPSharedTaskParams(bioNLPSTParams)
    #detector.useBioNLPSTFormat = useBioNLPSTFormat # classify-output and grid evaluation in ST-format
    #detector.stWriteScores = True # write confidence scores into additional st-format files
    connection = getConnection(connection)
    detector.setConnection(connection)
    connection.debug = debug
    if deleteOutput:
        connection.clearWorkDir()

    # Train
    if selector.check("TRAIN"):
        print >> sys.stderr, "----------------------------------------------------"
        print >> sys.stderr, "------------------ Train Detector ------------------"
        print >> sys.stderr, "----------------------------------------------------"
        if not isinstance(detector, EventDetector):
            detector.train(inputFiles["train"], inputFiles["devel"], models["devel"], models["test"],
                           exampleStyles["examples"], classifierParams["examples"], parse, None, task,
                           fromStep=detectorSteps["TRAIN"], workDir="training", testData=inputFiles["test"])
        else:
            detector.train(inputFiles["train"], inputFiles["devel"], models["devel"], models["test"],
                           exampleStyles["trigger"], exampleStyles["edge"], exampleStyles["unmerging"], exampleStyles["modifiers"],
                           classifierParams["trigger"], classifierParams["edge"], classifierParams["unmerging"], classifierParams["modifiers"],
                           classifierParams["recall"], processUnmerging, processModifiers,
                           doFullGrid, task, parse, None,
                           fromStep=detectorSteps["TRAIN"], workDir="training", testData=inputFiles["test"])
        # Save the detector type
        for model in [models["devel"], models["test"]]:
            if model != None and os.path.exists(model):
                model = Model(model, "a")
                model.addStr("detector", detectorName)
                if evaluatorName != None:
                    # BUGFIX: was addStr("detector", evaluatorName), which overwrote
                    # the detector class name saved just above with the evaluator's
                    # name; store the evaluator under its own key instead.
                    model.addStr("evaluator", evaluatorName)
                if preprocessorParams != None:
                    preprocessor = Preprocessor()
                    model.addStr("preprocessorParams", Parameters.toString(preprocessor.getParameters(preprocessorParams)))
                model.save()
                model.close()
    if selector.check("DEVEL"):
        print >> sys.stderr, "----------------------------------------------------"
        print >> sys.stderr, "------------ Check devel classification ------------"
        print >> sys.stderr, "----------------------------------------------------"
        #detector.bioNLPSTParams["scores"] = False # the evaluation server doesn't like additional files
        detector.classify(inputFiles["devel"], models["devel"], "classification-devel/devel",
                          goldData=inputFiles["devel"], fromStep=detectorSteps["DEVEL"], workDir="classification-devel")
    if selector.check("EMPTY"):
        # By passing an emptied devel set through the prediction system, we can check that we get the same predictions
        # as in the DEVEL step, ensuring the model does not use leaked information.
        print >> sys.stderr, "----------------------------------------------------"
        print >> sys.stderr, "------------ Empty devel classification ------------"
        print >> sys.stderr, "----------------------------------------------------"
        #detector.bioNLPSTParams["scores"] = False # the evaluation server doesn't like additional files
        removalScope = "non-given"
        if "names" in str(exampleStyles["examples"]) or "names" in str(exampleStyles["trigger"]):
            removalScope = "all"
        elif "Edge" in detector.__class__.__name__:
            removalScope = "interactions"
        detector.classify(getEmptyCorpus(inputFiles["devel"], scope=removalScope), models["devel"],
                          "classification-empty/devel-empty", fromStep=detectorSteps["EMPTY"], workDir="classification-empty")
        print >> sys.stderr, "*** Evaluate empty devel classification ***"
        if os.path.exists("classification-empty/devel-empty-pred.xml.gz"):
            EvaluateInteractionXML.run(detector.evaluator, "classification-empty/devel-empty-pred.xml.gz", inputFiles["devel"], parse)
        else:
            print >> sys.stderr, "No output file for evaluation"
    if selector.check("TEST"):
        print >> sys.stderr, "----------------------------------------------------"
        print >> sys.stderr, "------------- Test set classification --------------"
        print >> sys.stderr, "----------------------------------------------------"
        if inputFiles["test"] == None or not os.path.exists(inputFiles["test"]):
            print >> sys.stderr, "Skipping, test file", inputFiles["test"], "does not exist"
        else:
            #detector.bioNLPSTParams["scores"] = False # the evaluation server doesn't like additional files
            detector.classify(inputFiles["test"], models["test"] if models["test"] != None else models["devel"],
                              "classification-test/test", fromStep=detectorSteps["TEST"], workDir="classification-test")
            if detector.bioNLPSTParams["convert"]:
                extension = ".zip" if (detector.bioNLPSTParams["convert"] == "zip") else ".tar.gz"
                Utils.STFormat.Compare.compare("classification-test/test-events" + extension,
                                               "classification-devel/devel-events" + extension, "a2")
    # Stop logging
    if log != None:
        Stream.closeLog(log)
def classify(self, data, model, output, parse=None, task=None, goldData=None, fromStep=None, toStep=None, omitSteps=None, workDir=None):
    """
    Classify data with a complete event detection pipeline:
    TRIGGERS -> EDGES -> UNMERGING -> MODIFIERS -> ST-CONVERT.

    Each step writes a "*-pred.xml.gz" file into the work directory; the
    next step picks up the newest such file via getWorkFile. The final
    prediction is copied to output+"-pred.xml.gz".

    @param data: input corpus (interaction XML) to classify
    @param model: model file name or Model object, opened read-only
    @param output: prefix for the output files
    @param parse: parse element name; read from the model if None
    @param task: BioNLP ST task name; read from the model if None
    @param goldData: optional gold corpus used for evaluation after edge/unmerging steps
    @param fromStep, toStep, omitSteps: step selection for partial runs
    @param workDir: working directory; a temporary one is created if None
    """
    #BINARY_RECALL_MODE = False # TODO: make a parameter
    xml = None
    model = self.openModel(model, "r")
    self.initVariables(classifyData=data, model=model, xml=None, task=task, parse=parse)
    self.enterState(self.STATE_CLASSIFY, ["TRIGGERS", "EDGES", "UNMERGING", "MODIFIERS", "ST-CONVERT"], fromStep, toStep, omitSteps)
    #self.enterState(self.STATE_CLASSIFY, ["TRIGGERS", "RECALL-ADJUST", "EDGES", "UNMERGING", "MODIFIERS", "ST-CONVERT"], fromStep, toStep)
    self.setWorkDir(workDir)
    if workDir == None:
        self.setTempWorkDir()
    # All intermediate files share this prefix inside the work directory
    workOutputTag = os.path.join(self.workDir, os.path.basename(output) + "-")
    self.model = self.openModel(self.model, "r")
    stParams = self.getBioNLPSharedTaskParams(self.bioNLPSTParams, model)
    if self.checkStep("TRIGGERS"):
        xml = self.triggerDetector.classifyToXML(self.classifyData, self.model, None, workOutputTag,
            goldData=goldData, parse=self.parse,
            recallAdjust=float(self.getStr("recallAdjustParameter", self.model)))
    if self.checkStep("EDGES"):
        xml = self.getWorkFile(xml, workOutputTag + "trigger-pred.xml.gz")
        xml = self.edgeDetector.classifyToXML(xml, self.model, None, workOutputTag, goldData=goldData, parse=self.parse)
        assert xml != None
        if self.parse == None:
            edgeParse = self.getStr(self.edgeDetector.tag+"parse", self.model)
        else:
            edgeParse = self.parse
        #EvaluateInteractionXML.run(self.edgeDetector.evaluator, xml, self.classifyData, edgeParse)
        # Evaluate against gold if available, otherwise against the input data
        if goldData != None:
            EvaluateInteractionXML.run(self.edgeDetector.evaluator, xml, goldData, edgeParse)
        else:
            EvaluateInteractionXML.run(self.edgeDetector.evaluator, xml, self.classifyData, edgeParse)
    if self.checkStep("UNMERGING"):
        if self.model.getStr("unmerging-classifier-parameter", None) != None: #self.model.hasMember("unmerging-classifier-model"):
            #xml = self.getWorkFile(xml, output + "-edge-pred.xml.gz")
            # To avoid running out of memory, always use file on disk
            xml = self.getWorkFile(None, workOutputTag + "edge-pred.xml.gz")
            #goldData = None
            #if type(self.classifyData) in types.StringTypes:
            #    if os.path.exists(self.classifyData.replace("-nodup", "")):
            #        goldData = self.classifyData.replace("-nodup", "")
            xml = self.unmergingDetector.classifyToXML(xml, self.model, None, workOutputTag, goldData=goldData, parse=self.parse)
            # Evaluate after unmerging
            if self.parse == None:
                edgeParse = self.getStr(self.edgeDetector.tag+"parse", self.model)
            else:
                edgeParse = self.parse
            if goldData != None:
                EvaluateInteractionXML.run(self.edgeDetector.evaluator, xml, goldData, edgeParse)
            else:
                EvaluateInteractionXML.run(self.edgeDetector.evaluator, xml, self.classifyData, edgeParse)
        else:
            print >> sys.stderr, "No model for unmerging"
    if self.checkStep("MODIFIERS"):
        if self.model.hasMember("modifier-classifier-model"):
            xml = self.getWorkFile(xml, [workOutputTag + "unmerging-pred.xml.gz", workOutputTag + "edge-pred.xml.gz"])
            xml = self.modifierDetector.classifyToXML(xml, self.model, None, workOutputTag, goldData=goldData, parse=self.parse)
        else:
            print >> sys.stderr, "No model for modifier detection"
    #if self.checkStep("VALIDATE"):
    #    xml = self.getWorkFile(xml, [workOutputTag + "modifier-pred.xml.gz", workOutputTag + "unmerging-pred.xml.gz", workOutputTag + "edge-pred.xml.gz"])
    #    self.structureAnalyzer.load(model)
    #    self.structureAnalyzer.validate(xml)
    #    ETUtils.write(xml, workOutputTag + "validate-pred.xml.gz")
    if self.checkStep("ST-CONVERT"):
        if stParams["convert"]:
            #xml = self.getWorkFile(xml, [workOutputTag + "validate-pred.xml.gz", workOutputTag + "modifier-pred.xml.gz", workOutputTag + "unmerging-pred.xml.gz", workOutputTag + "edge-pred.xml.gz"])
            xml = self.getWorkFile(xml, [workOutputTag + "modifier-pred.xml.gz", workOutputTag + "unmerging-pred.xml.gz", workOutputTag + "edge-pred.xml.gz"])
            Utils.STFormat.ConvertXML.toSTFormat(xml, output+"-events.tar.gz", outputTag=stParams["a2Tag"], writeExtra=(stParams["scores"] == True))
            if stParams["evaluate"]: #self.stEvaluator != None:
                task = self.task
                if task == None:
                    task = self.getStr(self.edgeDetector.tag+"task", self.model)
                self.stEvaluator.evaluate(output + "-events.tar.gz", task)
        else:
            print >> sys.stderr, "No BioNLP shared task format conversion"
    # Copy the most advanced intermediate prediction (modifier > unmerging > edge)
    # as the final output file
    finalXMLFile = self.getWorkFile(None, [workOutputTag + "modifier-pred.xml.gz", workOutputTag + "unmerging-pred.xml.gz", workOutputTag + "edge-pred.xml.gz"])
    if finalXMLFile != None:
        shutil.copy2(finalXMLFile, output+"-pred.xml.gz")
    self.deleteTempWorkDir()
    self.exitState()
CLASSIFIER_PARAMS="c:25000,50000,87500" WORKDIR="/usr/share/biotext/GeniaChallenge/SharedTaskTriggerTest" PARSE_TOK="split-Charniak-Lease" workdir(WORKDIR, False) log() # Trigger detection #Gazetteer.run(TRAIN_FILE, "gazetteer-train") #GeneralEntityTypeRecognizer.run(TRAIN_FILE, "trigger-train-examples", PARSE_TOK, PARSE_TOK, "style:typed", "trigger-ids") GeneralEntityTypeRecognizer.run(TEST_FILE, "trigger-test-examples", PARSE_TOK, PARSE_TOK, "style:typed", "trigger-ids") Cls.test("trigger-test-examples", "trigger-param-opt/model-c_75000", "trigger-test-classifications") evaluator = Ev.evaluate("trigger-test-examples", "trigger-test-classifications", "trigger-ids.class_names") #evaluator = optimize(Cls, Ev, "trigger-train-examples", "trigger-test-examples",\ # "trigger-ids.class_names", CLASSIFIER_PARAMS, "trigger-param-opt")[0] ExampleUtils.writeToInteractionXML(evaluator.classifications, TEST_FILE, "test-predicted-triggers.xml", "trigger-ids.class_names", PARSE_TOK, PARSE_TOK) # RecallAdjust.run("test-predicted-triggers.xml",1.0,"test-predicted-triggers-adj.xml") # ix.splitMergedElements("test-predicted-triggers-adj.xml", "test-predicted-triggers-adj-split.xml") # ix.recalculateIds("test-predicted-triggers-adj-split.xml", "test-predicted-triggers-adj-split-recids.xml", True) # EvaluateInteractionXML.run(Ev, "test-predicted-triggers-adj-split-recids.xml", GOLD_TEST_FILE, PARSE_TOK, PARSE_TOK) ix.splitMergedElements("test-predicted-triggers.xml", "test-predicted-triggers-split.xml") ix.recalculateIds("test-predicted-triggers-split.xml", "test-predicted-triggers-split-recids.xml", True) EvaluateInteractionXML.run(Ev, "test-predicted-triggers-split-recids.xml", GOLD_TEST_FILE, PARSE_TOK, PARSE_TOK)
def buildExamplesFromGraph(self, sentenceGraph, outfile, goldGraph = None, structureAnalyzer=None):
    """
    Build edge examples for all entity (or token) pairs of a single sentence
    and append them to outfile. Returns the number of examples built.
    See Core/ExampleUtils for example format.

    @param sentenceGraph: the sentence graph to build examples from
    @param outfile: open file handle for ExampleUtils.appendExamples
    @param goldGraph: optional gold graph used to map predicted entities to gold
    @param structureAnalyzer: used to decide directedness when no style forces it
    """
    #examples = []
    exampleIndex = 0
    # example directionality
    if self.styles["directed"] == None and self.styles["undirected"] == None: # determine directedness from corpus
        examplesAreDirected = structureAnalyzer.hasDirectedTargets() if structureAnalyzer != None else True
    elif self.styles["directed"]:
        assert self.styles["undirected"] in [None, False]
        examplesAreDirected = True
    elif self.styles["undirected"]:
        assert self.styles["directed"] in [None, False]
        examplesAreDirected = False
    if not self.styles["no_trigger_features"]:
        self.triggerFeatureBuilder.initSentence(sentenceGraph)
    if self.styles["evex"]:
        self.evexFeatureBuilder.initSentence(sentenceGraph)
    #if self.styles["sdb_merge"]:
    #    self.determineNonOverlappingTypes(structureAnalyzer)
    # Filter entities, if needed: merge duplicate entities sharing a head token
    sentenceGraph.mergeInteractionGraph(True)
    entities = sentenceGraph.mergedEntities
    entityToDuplicates = sentenceGraph.mergedEntityToDuplicates
    self.exampleStats.addValue("Duplicate entities skipped", len(sentenceGraph.entities) - len(entities))
    # Connect to optional gold graph
    entityToGold = None
    if goldGraph != None:
        entityToGold = EvaluateInteractionXML.mapEntities(entities, goldGraph.entities)
    paths = None
    if not self.styles["no_path"]:
        undirected = sentenceGraph.dependencyGraph.toUndirected()
        paths = undirected
        if self.styles["filter_shortest_path"] != None: # For DDI use filter_shortest_path=conj_and
            paths.resetAnalyses() # just in case
            paths.FloydWarshall(self.filterEdge, {"edgeTypes":self.styles["filter_shortest_path"]})
    # Generate examples based on interactions between entities or interactions between tokens
    if self.styles["token_nodes"]:
        loopRange = len(sentenceGraph.tokens)
    else:
        loopRange = len(entities)
    for i in range(loopRange-1):
        for j in range(i+1,loopRange):
            eI = None
            eJ = None
            if self.styles["token_nodes"]:
                tI = sentenceGraph.tokens[i]
                tJ = sentenceGraph.tokens[j]
            else:
                eI = entities[i]
                eJ = entities[j]
                tI = sentenceGraph.entityHeadTokenByEntity[eI]
                tJ = sentenceGraph.entityHeadTokenByEntity[eJ]
                if eI.get("type") == "neg" or eJ.get("type") == "neg":
                    continue
                if self.styles["skip_extra_triggers"]:
                    if eI.get("source") != None or eJ.get("source") != None:
                        continue
            # only consider paths between entities (NOTE! entities, not only named entities)
            if self.styles["headsOnly"]:
                if (len(sentenceGraph.tokenIsEntityHead[tI]) == 0) or (len(sentenceGraph.tokenIsEntityHead[tJ]) == 0):
                    continue
            examples = self.buildExamplesForPair(tI, tJ, paths, sentenceGraph, goldGraph, entityToGold, eI, eJ, structureAnalyzer, examplesAreDirected)
            for categoryName, features, extra in examples:
                # make example
                if self.styles["binary"]:
                    if categoryName != "neg":
                        category = 1
                    else:
                        category = -1
                    extra["categoryName"] = "i"
                else:
                    category = self.classSet.getId(categoryName)
                example = [sentenceGraph.getSentenceId()+".x"+str(exampleIndex), category, features, extra]
                ExampleUtils.appendExamples([example], outfile)
                exampleIndex += 1
    return exampleIndex
def makeExampleGraphWithGold(self, builder, sentenceGraph, goldGraph, sentenceIndex):
    """
    Draw a sentence's predicted entities/interactions against the gold graph
    as an SVG visualization, and count TP/FP/FN statistics.

    Colors: green = correct prediction, red = false positive,
    "#79BAEC" (light blue) = false negative, brown = given name entity.

    @param builder: HTML page builder receiving the header and the SVG
    @param sentenceGraph: graph with the predicted annotation
    @param goldGraph: graph with the gold annotation
    @param sentenceIndex: index used in the generated SVG file name
    @return: dict of counts with keys "entities", "edges", "tp", "fp", "tn", "fn"
    """
    exampleGraph = NX10.MultiDiGraph()
    for token in goldGraph.tokens:
        exampleGraph.add_node(token)
    arcStyles = {}
    labelStyles = {}
    extraByToken = {}
    edgeTypes = {}
    # NOTE: "tn" is never incremented below; only tp/fp/fn are tracked
    stats = {"entities":0,"edges":0,"tp":0,"fp":0,"tn":0,"fn":0}
    entityMap = EvaluateInteractionXML.mapEntities(sentenceGraph.entities, goldGraph.entities, goldGraph.tokens)
    tokenMap = self.getTokenMap(sentenceGraph, goldGraph)
    toEntitiesWithPredictions = set()
    # Entity-level comparison: color gold head tokens by match status
    for entityFrom, entitiesTo in entityMap.iteritems():
        stats["entities"] += 1
        entityFromHeadToken = sentenceGraph.entityHeadTokenByEntity[entityFrom]
        for entityTo in entitiesTo:
            toEntitiesWithPredictions.add(entityTo)
            entityToHeadToken = goldGraph.entityHeadTokenByEntity[entityTo]
            style = None
            eFromType = entityFrom.get("type")
            eToType = entityTo.get("type")
            if extraByToken.has_key(entityToHeadToken):
                style = extraByToken[entityToHeadToken]
            if eFromType == eToType:
                if eToType != "neg":
                    if style == None:
                        style = [entityTo.get("type"),{"fill":"green"}]
                    elif style[1]["fill"] == "#79BAEC":
                        style = [entityTo.get("type"),{"fill":"green"}]
                    if entityTo.get("isName") == "True":
                        style = [entityTo.get("type"),{"fill":"brown"}]
                    else:
                        stats["tp"] += 1
            else:
                if eToType == "neg":
                    pass
            extraByToken[entityToHeadToken] = style
        if len(entitiesTo) == 0: # prediction matched no gold entity
            stats["fp"] += 1
            if extraByToken.has_key(tokenMap[entityFromHeadToken]):
                style = extraByToken[tokenMap[entityFromHeadToken]]
                if style[1]["fill"] != "green":
                    style = [entityFrom.get("type"),{"fill":"red"}]
                    extraByToken[tokenMap[entityFromHeadToken]] = style
            else:
                extraByToken[tokenMap[entityFromHeadToken]] = [entityFrom.get("type"),{"fill":"red"}]
    # Gold entities with no matching prediction are false negatives
    for entity in goldGraph.entities:
        if entity not in toEntitiesWithPredictions:
            stats["fn"] += 1
            extraByToken[goldGraph.entityHeadTokenByEntity[entity]] = [entity.get("type"),{"fill":"#79BAEC"}]
    # Interaction-level comparison: draw edges colored by match status
    toInteractionsWithPredictions = set()
    for interactionFrom in sentenceGraph.interactions:
        if interactionFrom.get("type") == "neg":
            continue
        stats["edges"] += 1
        e1s = entityMap[sentenceGraph.entitiesById[interactionFrom.get("e1")]]
        e1Ids = []
        for e1 in e1s:
            e1Ids.append(e1.get("id"))
        e2s = entityMap[sentenceGraph.entitiesById[interactionFrom.get("e2")]]
        e2Ids = []
        for e2 in e2s:
            e2Ids.append(e2.get("id"))
        t1 = tokenMap[sentenceGraph.entityHeadTokenByEntity[sentenceGraph.entitiesById[interactionFrom.get("e1")]]]
        t2 = tokenMap[sentenceGraph.entityHeadTokenByEntity[sentenceGraph.entitiesById[interactionFrom.get("e2")]]]
        iFromType = interactionFrom.get("type")
        found = False
        for interactionTo in goldGraph.interactions:
            if interactionTo.get("e1") in e1Ids and interactionTo.get("e2") in e2Ids:
                toInteractionsWithPredictions.add(interactionTo)
                iToType = interactionTo.get("type")
                exampleGraph.add_edge(t1, t2, element=interactionFrom)
                #edge = exampleGraph.get_edge(t1, t2, data=True)
                edge = self.getNXEdge(exampleGraph, t1, t2, interactionFrom)
                if t1 != t2: # self-loops are not drawn
                    if iToType == iFromType:
                        edge[2]["arcStyles"] = {"stroke":"green"}
                        edge[2]["labelStyles"] = {"fill":"green"}
                        stats["tp"] += 1
                    else:
                        edge[2]["arcStyles"] = {"stroke":"red"}
                        edge[2]["labelStyles"] = {"fill":"red"}
                        stats["fp"] += 1
                found = True
        if not found: # false positive prediction
            if t1 != t2:
                exampleGraph.add_edge(t1, t2, element=interactionFrom)
                edge = self.getNXEdge(exampleGraph, t1, t2, interactionFrom)
                edge[2]["arcStyles"] = {"stroke":"red"}
                edge[2]["labelStyles"] = {"fill":"red"}
                stats["fp"] += 1
    for interactionTo in goldGraph.interactions:
        if interactionTo not in toInteractionsWithPredictions: # false negative gold
            t1 = goldGraph.entityHeadTokenByEntity[goldGraph.entitiesById[interactionTo.get("e1")]]
            t2 = goldGraph.entityHeadTokenByEntity[goldGraph.entitiesById[interactionTo.get("e2")]]
            if t1 != t2:
                exampleGraph.add_edge(t1, t2, element=interactionTo)
                edge = self.getNXEdge(exampleGraph, t1, t2, interactionTo)
                edge[2]["arcStyles"] = {"stroke":"#79BAEC"}
                edge[2]["labelStyles"] = {"fill":"#79BAEC"}
                stats["fn"] += 1
    # Render the annotated graph as SVG and embed it in the page
    builder.header("Classification",4)
    svgTokens = GraphToSVG.tokensToSVG(goldGraph.tokens,False,None,extraByToken)
    #arcStyles, labelStyles = self.getMatchingEdgeStyles(exampleGraph, sentenceGraph.interactionGraph, "green", "red" )
    svgEdges = GraphToSVG.edgesToSVG(svgTokens, exampleGraph, "type", None)
    sentenceId = sentenceGraph.getSentenceId()
    svgElement = GraphToSVG.writeSVG(svgTokens, svgEdges, self.outDir+"/svg/"+sentenceId+"-"+str(sentenceIndex)+"_learned.svg")
    builder.svg("../svg/" + sentenceId + "-"+str(sentenceIndex)+"_learned.svg",svgElement.attrib["width"],svgElement.attrib["height"],id="learned_graph")
    builder.lineBreak()
    return stats
def buildExamplesFromGraph(self, sentenceGraph, outfile, goldGraph=None):
    """
    Build examples for a single sentence.

    Examples are appended to *outfile* via ExampleUtils.appendExamples; the
    return value is the number of examples generated (exampleIndex), not a
    list. See Core/ExampleUtils for example format.

    sentenceGraph -- graph of one sentence (entities, tokens, dependencies)
    outfile       -- open example output file
    goldGraph     -- optional gold-standard graph; when given, example
                     categories are taken from the gold annotation
    """
    #examples = []
    exampleIndex = 0
    # Optional per-sentence initialization for extra feature builders
    if self.styles["trigger_features"]:
        self.triggerFeatureBuilder.initSentence(sentenceGraph)
    if self.styles["evex"]:
        self.evexFeatureBuilder.initSentence(sentenceGraph)

    # Filter entities, if needed: duplicates (same span/type) are merged so
    # each example pair is generated only once
    #mergedIds = None
    #duplicateEntities = None
    #entities = sentenceGraph.entities
    #entities, mergedIds, duplicateEntities = self.mergeEntities(sentenceGraph, False) # "no_duplicates" in self.styles)
    sentenceGraph.mergeInteractionGraph(True)
    entities = sentenceGraph.mergedEntities
    entityToDuplicates = sentenceGraph.mergedEntityToDuplicates
    self.exampleStats.addValue("Duplicate entities skipped", len(sentenceGraph.entities) - len(entities))

    # Connect to optional gold graph (maps predicted entities to gold ones)
    if goldGraph != None:
        entityToGold = EvaluateInteractionXML.mapEntities(entities, goldGraph.entities)

    # Shortest-path source: the undirected dependency graph (used as "paths")
    paths = None
    if not self.styles["no_path"]:
        ##undirected = sentenceGraph.getUndirectedDependencyGraph()
        #undirected = self.nxMultiDiGraphToUndirected(sentenceGraph.dependencyGraph)
        ###undirected = sentenceGraph.dependencyGraph.to_undirected()
        ####undirected = NX10.MultiGraph(sentenceGraph.dependencyGraph) This didn't work
        undirected = sentenceGraph.dependencyGraph.toUndirected()
        #paths = NX10.all_pairs_shortest_path(undirected, cutoff=999)
        paths = undirected
    #for edge in sentenceGraph.dependencyGraph.edges:
    #    assert edge[2] != None
    #for edge in undirected.edges:
    #    assert edge[2] != None
    #if sentenceGraph.sentenceElement.get("id") == "GENIA.d70.s5":
    #    print [(x[0].get("id"), x[1].get("id"), x[2].get("id")) for x in sentenceGraph.dependencyGraph.edges]

    # Generate examples based on interactions between entities or interactions between tokens
    if self.styles["entities"]:
        loopRange = len(entities)
    else:
        loopRange = len(sentenceGraph.tokens)
    # All unordered pairs (i, j), i < j; direction handled per-style below
    for i in range(loopRange - 1):
        for j in range(i + 1, loopRange):
            eI = None
            eJ = None
            if self.styles["entities"]:
                eI = entities[i]
                eJ = entities[j]
                tI = sentenceGraph.entityHeadTokenByEntity[eI]
                tJ = sentenceGraph.entityHeadTokenByEntity[eJ]
                #if "no_ne_interactions" in self.styles and eI.get("isName") == "True" and eJ.get("isName") == "True":
                #    continue
                # "neg" placeholder entities never take part in examples
                if eI.get("type") == "neg" or eJ.get("type") == "neg":
                    continue
                if self.styles["skip_extra_triggers"]:
                    if eI.get("source") != None or eJ.get("source") != None:
                        continue
            else:
                tI = sentenceGraph.tokens[i]
                tJ = sentenceGraph.tokens[j]
            # only consider paths between entities (NOTE! entities, not only named entities)
            if self.styles["headsOnly"]:
                if (len(sentenceGraph.tokenIsEntityHead[tI]) == 0) or (len(sentenceGraph.tokenIsEntityHead[tJ]) == 0):
                    continue
            if self.styles["directed"]:
                # Directed mode: build one example per direction (I->J and J->I)
                # define forward
                if self.styles["entities"]:
                    categoryName = self.getCategoryName(sentenceGraph, eI, eJ, True)
                    if goldGraph != None:
                        categoryName = self.getGoldCategoryName(goldGraph, entityToGold, eI, eJ, True)
                else:
                    categoryName = self.getCategoryNameFromTokens(sentenceGraph, tI, tJ, True)
                # make forward
                self.exampleStats.beginExample(categoryName)
                makeExample = True
                # Task-specific filters: each can veto the example and record
                # the reason in exampleStats
                if self.styles["genia_limits"] and not self.isPotentialGeniaInteraction(eI, eJ):
                    makeExample = False
                    self.exampleStats.filter("genia_limits")
                if self.styles["genia_task1"] and (eI.get("type") == "Entity" or eJ.get("type") == "Entity"):
                    makeExample = False
                    self.exampleStats.filter("genia_task1")
                if self.styles["rel_limits"] and not self.isPotentialRELInteraction(eI, eJ):
                    makeExample = False
                    self.exampleStats.filter("rel_limits")
                if self.styles["co_limits"] and not self.isPotentialCOInteraction(eI, eJ, sentenceGraph):
                    makeExample = False
                    self.exampleStats.filter("co_limits")
                if self.styles["bb_limits"] and not self.isPotentialBBInteraction(eI, eJ, sentenceGraph):
                    makeExample = False
                    self.exampleStats.filter("bb_limits")
                    # Also record which positive category/type pair was filtered
                    if categoryName != "neg":
                        self.exampleStats.filter("bb_limits(" + categoryName + ":" + eI.get("type") + "/" + eJ.get("type") + ")")
                if self.styles["bi_limits"] and not self.isPotentialBIInteraction(eI, eJ, sentenceGraph, self.exampleStats):
                    makeExample = False
                    #self.exampleStats.filter("bi_limits")
                if self.styles["epi_limits"] and not self.isPotentialEPIInteraction(eI, eJ, sentenceGraph):
                    makeExample = False
                    self.exampleStats.filter("epi_limits")
                if self.styles["id_limits"] and not self.isPotentialIDInteraction(eI, eJ, sentenceGraph):
                    makeExample = False
                    self.exampleStats.filter("id_limits")
                #if self.styles["selftrain_limits"] and (eI.get("selftrain") == "False" or eJ.get("selftrain") == "False"):
                #    makeExample = False
                #    self.exampleStats.filter("selftrain_limits")
                #if self.styles["selftrain_group"] and (eI.get("selftraingroup") not in self.selfTrainGroups or eJ.get("selftraingroup") not in self.selfTrainGroups):
                #    makeExample = False
                #    self.exampleStats.filter("selftrain_group")
                if self.styles["pos_only"] and categoryName == "neg":
                    makeExample = False
                    self.exampleStats.filter("pos_only")
                if makeExample:
                    #examples.append( self.buildExample(tI, tJ, paths, sentenceGraph, categoryName, exampleIndex, eI, eJ) )
                    ExampleUtils.appendExamples([self.buildExample(tI, tJ, paths, sentenceGraph, categoryName, exampleIndex, eI, eJ)], outfile)
                    exampleIndex += 1
                self.exampleStats.endExample()

                # define reverse (same filters applied with arguments swapped)
                if self.styles["entities"]:
                    categoryName = self.getCategoryName(sentenceGraph, eJ, eI, True)
                    if goldGraph != None:
                        categoryName = self.getGoldCategoryName(goldGraph, entityToGold, eJ, eI, True)
                else:
                    categoryName = self.getCategoryNameFromTokens(sentenceGraph, tJ, tI, True)
                # make reverse
                self.exampleStats.beginExample(categoryName)
                makeExample = True
                if self.styles["genia_limits"] and not self.isPotentialGeniaInteraction(eJ, eI):
                    makeExample = False
                    self.exampleStats.filter("genia_limits")
                if self.styles["genia_task1"] and (eI.get("type") == "Entity" or eJ.get("type") == "Entity"):
                    makeExample = False
                    self.exampleStats.filter("genia_task1")
                if self.styles["rel_limits"] and not self.isPotentialRELInteraction(eJ, eI):
                    makeExample = False
                    self.exampleStats.filter("rel_limits")
                if self.styles["co_limits"] and not self.isPotentialCOInteraction(eJ, eI, sentenceGraph):
                    makeExample = False
                    self.exampleStats.filter("co_limits")
                if self.styles["bb_limits"] and not self.isPotentialBBInteraction(eJ, eI, sentenceGraph):
                    makeExample = False
                    self.exampleStats.filter("bb_limits")
                    if categoryName != "neg":
                        self.exampleStats.filter("bb_limits(" + categoryName + ":" + eJ.get("type") + "/" + eI.get("type") + ")")
                if self.styles["bi_limits"] and not self.isPotentialBIInteraction(eJ, eI, sentenceGraph, self.exampleStats):
                    makeExample = False
                    #self.exampleStats.filter("bi_limits")
                if self.styles["epi_limits"] and not self.isPotentialEPIInteraction(eJ, eI, sentenceGraph):
                    makeExample = False
                    self.exampleStats.filter("epi_limits")
                if self.styles["id_limits"] and not self.isPotentialIDInteraction(eJ, eI, sentenceGraph):
                    makeExample = False
                    self.exampleStats.filter("id_limits")
                #if self.styles["selftrain_limits"] and (eI.get("selftrain") == "False" or eJ.get("selftrain") == "False"):
                #    makeExample = False
                #    self.exampleStats.filter("selftrain_limits")
                #if self.styles["selftrain_group"] and (eI.get("selftraingroup") not in self.selfTrainGroups or eJ.get("selftraingroup") not in self.selfTrainGroups):
                #    makeExample = False
                #    self.exampleStats.filter("selftrain_group")
                if self.styles["pos_only"] and categoryName == "neg":
                    makeExample = False
                    self.exampleStats.filter("pos_only")
                if makeExample:
                    #examples.append( self.buildExample(tJ, tI, paths, sentenceGraph, categoryName, exampleIndex, eJ, eI) )
                    ExampleUtils.appendExamples([self.buildExample(tJ, tI, paths, sentenceGraph, categoryName, exampleIndex, eJ, eI)], outfile)
                    exampleIndex += 1
                self.exampleStats.endExample()
            else:
                # Undirected mode: one example per pair; forward and reverse
                # feature sets are merged into a single example
                if self.styles["entities"]:
                    categoryName = self.getCategoryName(sentenceGraph, eI, eJ, False)
                else:
                    categoryName = self.getCategoryNameFromTokens(sentenceGraph, tI, tJ, False)
                self.exampleStats.beginExample(categoryName)
                forwardExample = self.buildExample(tI, tJ, paths, sentenceGraph, categoryName, exampleIndex, eI, eJ)
                if not self.styles["graph_kernel"]:
                    reverseExample = self.buildExample(tJ, tI, paths, sentenceGraph, categoryName, exampleIndex, eJ, eI)
                    # example[2] is the feature dict; union the two directions
                    forwardExample[2].update(reverseExample[2])
                #examples.append(forwardExample)
                ExampleUtils.appendExamples([forwardExample], outfile)
                exampleIndex += 1
                self.exampleStats.endExample()

    #return examples
    return exampleIndex
def classify(self, data, model, output, parse=None, task=None, goldData=None, fromStep=None, toStep=None, omitSteps=None, workDir=None):
    """
    Classify *data* with a trained event-extraction model, running the
    pipeline steps TRIGGERS -> EDGES -> UNMERGING -> MODIFIERS -> ST-CONVERT.

    data      -- corpus to classify (interaction XML or path)
    model     -- trained model (object or path; opened read-only)
    output    -- output file stem; final result is written to
                 output + "-pred.xml.gz" (and optionally output + "-events.tar.gz")
    parse     -- parse name; if None, detectors read it from the model
    task      -- shared-task id; if None, read from the model when evaluating
    goldData  -- optional gold corpus used for example categories and evaluation
    fromStep/toStep/omitSteps -- step control for enterState/checkStep
    workDir   -- directory for intermediate files; a temporary one is created
                 (and later deleted) when None
    """
    #BINARY_RECALL_MODE = False # TODO: make a parameter
    xml = None
    model = self.openModel(model, "r")
    self.initVariables(classifyData=data, model=model, xml=None, task=task, parse=parse)
    self.enterState(self.STATE_CLASSIFY, ["TRIGGERS", "EDGES", "UNMERGING", "MODIFIERS", "ST-CONVERT"], fromStep, toStep, omitSteps)
    #self.enterState(self.STATE_CLASSIFY, ["TRIGGERS", "RECALL-ADJUST", "EDGES", "UNMERGING", "MODIFIERS", "ST-CONVERT"], fromStep, toStep)
    self.setWorkDir(workDir)
    if workDir == None:
        self.setTempWorkDir()
    # All intermediate files share this prefix inside the work directory
    workOutputTag = os.path.join(self.workDir, os.path.basename(output) + "-")
    # NOTE(review): model was already opened above before initVariables; this
    # second openModel on self.model looks redundant — confirm openModel is a
    # no-op on an already-open model before removing
    self.model = self.openModel(self.model, "r")
    stParams = self.getBioNLPSharedTaskParams(self.bioNLPSTParams, model)
    if self.checkStep("TRIGGERS"):
        xml = self.triggerDetector.classifyToXML(self.classifyData, self.model, None, workOutputTag, goldData=goldData, parse=self.parse, recallAdjust=float(self.getStr("recallAdjustParameter", self.model)))
    if self.checkStep("EDGES"):
        # If TRIGGERS was skipped, pick up its output file from a previous run
        xml = self.getWorkFile(xml, workOutputTag + "trigger-pred.xml.gz")
        xml = self.edgeDetector.classifyToXML(xml, self.model, None, workOutputTag, goldData=goldData, parse=self.parse)
        assert xml != None
        if self.parse == None:
            edgeParse = self.getStr(self.edgeDetector.tag + "parse", self.model)
        else:
            edgeParse = self.parse
        #EvaluateInteractionXML.run(self.edgeDetector.evaluator, xml, self.classifyData, edgeParse)
        # Evaluate against gold when available, otherwise against the input corpus
        if goldData != None:
            EvaluateInteractionXML.run(self.edgeDetector.evaluator, xml, goldData, edgeParse)
        else:
            EvaluateInteractionXML.run(self.edgeDetector.evaluator, xml, self.classifyData, edgeParse)
    if self.checkStep("UNMERGING"):
        if self.model.getStr("unmerging-classifier-parameter", None) != None: #self.model.hasMember("unmerging-classifier-model"):
            #xml = self.getWorkFile(xml, output + "-edge-pred.xml.gz")
            # To avoid running out of memory, always use file on disk
            xml = self.getWorkFile(None, workOutputTag + "edge-pred.xml.gz")
            #goldData = None
            #if type(self.classifyData) in types.StringTypes:
            #    if os.path.exists(self.classifyData.replace("-nodup", "")):
            #        goldData = self.classifyData.replace("-nodup", "")
            xml = self.unmergingDetector.classifyToXML(xml, self.model, None, workOutputTag, goldData=goldData, parse=self.parse)
            # Evaluate after unmerging
            if self.parse == None:
                edgeParse = self.getStr(self.edgeDetector.tag + "parse", self.model)
            else:
                edgeParse = self.parse
            if goldData != None:
                EvaluateInteractionXML.run(self.edgeDetector.evaluator, xml, goldData, edgeParse)
            else:
                EvaluateInteractionXML.run(self.edgeDetector.evaluator, xml, self.classifyData, edgeParse)
        else:
            print >> sys.stderr, "No model for unmerging"
    if self.checkStep("MODIFIERS"):
        if self.model.hasMember("modifier-classifier-model"):
            # Prefer unmerged output; fall back to edge output
            xml = self.getWorkFile(xml, [workOutputTag + "unmerging-pred.xml.gz", workOutputTag + "edge-pred.xml.gz"])
            xml = self.modifierDetector.classifyToXML(xml, self.model, None, workOutputTag, goldData=goldData, parse=self.parse)
        else:
            print >> sys.stderr, "No model for modifier detection"
    #if self.checkStep("VALIDATE"):
    #    xml = self.getWorkFile(xml, [workOutputTag + "modifier-pred.xml.gz", workOutputTag + "unmerging-pred.xml.gz", workOutputTag + "edge-pred.xml.gz"])
    #    self.structureAnalyzer.load(model)
    #    self.structureAnalyzer.validate(xml)
    #    ETUtils.write(xml, workOutputTag + "validate-pred.xml.gz")
    if self.checkStep("ST-CONVERT"):
        if stParams["convert"]:
            #xml = self.getWorkFile(xml, [workOutputTag + "validate-pred.xml.gz", workOutputTag + "modifier-pred.xml.gz", workOutputTag + "unmerging-pred.xml.gz", workOutputTag + "edge-pred.xml.gz"])
            xml = self.getWorkFile(xml, [workOutputTag + "modifier-pred.xml.gz", workOutputTag + "unmerging-pred.xml.gz", workOutputTag + "edge-pred.xml.gz"])
            Utils.STFormat.ConvertXML.toSTFormat(xml, output + "-events.tar.gz", outputTag=stParams["a2Tag"], writeExtra=(stParams["scores"] == True))
            if stParams["evaluate"]: #self.stEvaluator != None:
                task = self.task
                if task == None:
                    task = self.getStr(self.edgeDetector.tag + "task", self.model)
                self.stEvaluator.evaluate(output + "-events.tar.gz", task)
        else:
            print >> sys.stderr, "No BioNLP shared task format conversion"
    # Copy the most advanced intermediate result to the final output location
    finalXMLFile = self.getWorkFile(None, [workOutputTag + "modifier-pred.xml.gz", workOutputTag + "unmerging-pred.xml.gz", workOutputTag + "edge-pred.xml.gz"])
    if finalXMLFile != None:
        shutil.copy2(finalXMLFile, output + "-pred.xml.gz")
    self.deleteTempWorkDir()
    self.exitState()
def trainUnmergingDetector(self):
    """
    Train the unmerging-step classifier, driven by the step machinery
    (checkStep). Steps: SELF-TRAIN-EXAMPLES-FOR-UNMERGING ->
    UNMERGING-EXAMPLES -> BEGIN-UNMERGING-MODEL -> END-UNMERGING-MODEL.
    All steps are no-ops when self.unmerging is falsy.
    """
    xml = None
    if not self.unmerging:
        print >> sys.stderr, "No unmerging"
    if self.checkStep("SELF-TRAIN-EXAMPLES-FOR-UNMERGING", self.unmerging) and self.unmerging:
        # Self-classified train data for unmerging
        if self.doUnmergingSelfTraining:
            # This allows limiting to a subcorpus
            triggerStyle = copy.copy(Parameters.get(self.triggerExampleStyle))
            edgeStyle = copy.copy(Parameters.get(self.edgeExampleStyle))
            unmergingStyle = Parameters.get(self.unmergingExampleStyle)
            # Propagate an optional sentence limit to trigger/edge styles
            if "sentenceLimit" in unmergingStyle and unmergingStyle["sentenceLimit"]:
                triggerStyle["sentenceLimit"] = unmergingStyle["sentenceLimit"]
                edgeStyle["sentenceLimit"] = unmergingStyle["sentenceLimit"]
            # Build the examples: classify the training data with the current
            # trigger and edge models to get self-predicted input for unmerging
            xml = self.triggerDetector.classifyToXML(self.trainData, self.model, None, self.workDir + "unmerging-extra-", exampleStyle=triggerStyle) #, recallAdjust=0.5)
            xml = self.edgeDetector.classifyToXML(xml, self.model, None, self.workDir + "unmerging-extra-", exampleStyle=edgeStyle) #, recallAdjust=0.5)
            assert xml != None
            EvaluateInteractionXML.run(self.edgeDetector.evaluator, xml, self.trainData, self.parse)
        else:
            print >> sys.stderr, "No self-training for unmerging"
    if self.checkStep("UNMERGING-EXAMPLES", self.unmerging) and self.unmerging:
        # Unmerging example generation
        # Gold files are the corresponding corpora with duplicates kept
        # ("-nodup" suffix stripped from the path)
        GOLD_TEST_FILE = self.optData.replace("-nodup", "")
        GOLD_TRAIN_FILE = self.trainData.replace("-nodup", "")
        if self.doUnmergingSelfTraining:
            # If the self-training step was skipped this run, reuse its
            # on-disk output from a previous run
            if xml == None:
                xml = self.workDir + "unmerging-extra-edge-pred.xml.gz"
            # Train examples come from both the gold corpus and the
            # self-classified corpus (nested list), gold is duplicated to match
            self.unmergingDetector.buildExamples(self.model, [self.optData.replace("-nodup", ""), [self.trainData.replace("-nodup", ""), xml]], [self.workDir + "unmerging-opt-examples.gz", self.workDir + "unmerging-train-examples.gz"], [GOLD_TEST_FILE, [GOLD_TRAIN_FILE, GOLD_TRAIN_FILE]], exampleStyle=self.unmergingExampleStyle, saveIdsToModel=True)
            xml = None
        else:
            self.unmergingDetector.buildExamples(self.model, [self.optData.replace("-nodup", ""), self.trainData.replace("-nodup", "")], [self.workDir + "unmerging-opt-examples.gz", self.workDir + "unmerging-train-examples.gz"], [GOLD_TEST_FILE, GOLD_TRAIN_FILE], exampleStyle=self.unmergingExampleStyle, saveIdsToModel=True)
            xml = None
        #UnmergingExampleBuilder.run("/home/jari/biotext/EventExtension/TrainSelfClassify/test-predicted-edges.xml", GOLD_TRAIN_FILE, UNMERGING_TRAIN_EXAMPLE_FILE, PARSE, TOK, UNMERGING_FEATURE_PARAMS, UNMERGING_IDS, append=True)
    if self.checkStep("BEGIN-UNMERGING-MODEL", self.unmerging) and self.unmerging:
        self.unmergingDetector.beginModel(None, self.model, self.workDir + "unmerging-train-examples.gz", self.workDir + "unmerging-opt-examples.gz")
    if self.checkStep("END-UNMERGING-MODEL", self.unmerging) and self.unmerging:
        self.unmergingDetector.endModel(None, self.model, self.workDir + "unmerging-opt-examples.gz")
        print >> sys.stderr, "Adding unmerging classifier model to test-set event model"
        # Copy the trained unmerging artifacts into the combined (test-set) model
        if self.combinedModel != None:
            self.combinedModel.addStr("unmerging-example-style", self.model.getStr("unmerging-example-style"))
            self.combinedModel.insert(self.model.get("unmerging-ids.classes"), "unmerging-ids.classes")
            self.combinedModel.insert(self.model.get("unmerging-ids.features"), "unmerging-ids.features")
            self.unmergingDetector.addClassifierModel(self.combinedModel, self.model.get("unmerging-classifier-model", True), self.model.getStr("unmerging-classifier-parameter"))
            self.combinedModel.save()
def classify(self, data, model, output, parse=None, task=None, fromStep=None, toStep=None):
    """
    Classify *data* through the step pipeline TRIGGERS -> EDGES -> UNMERGING
    -> MODIFIERS -> ST-CONVERT, writing intermediates with the prefix
    output + "-" and the final events to output + "-events.tar.gz".

    data   -- corpus to classify (interaction XML or path)
    model  -- trained model (object or path; opened read-only)
    output -- output file stem for all intermediate and final files
    parse  -- parse name; if None, read from the model per detector
    task   -- shared-task id; if None, read from the model when evaluating
    fromStep/toStep -- step range control for enterState/checkStep
    """
    BINARY_RECALL_MODE = False # TODO: make a parameter
    xml = None
    self.initVariables(classifyData=data, model=model, xml=None, task=task, parse=parse)
    self.enterState(self.STATE_CLASSIFY, ["TRIGGERS", "EDGES", "UNMERGING", "MODIFIERS", "ST-CONVERT"], fromStep, toStep)
    #self.enterState(self.STATE_CLASSIFY, ["TRIGGERS", "RECALL-ADJUST", "EDGES", "UNMERGING", "MODIFIERS", "ST-CONVERT"], fromStep, toStep)
    self.model = self.openModel(self.model, "r")
    if self.checkStep("TRIGGERS"):
        xml = self.triggerDetector.classifyToXML(self.classifyData, self.model, None, output + "-", parse=self.parse, recallAdjust=float(self.getStr("recallAdjustParameter", self.model)))
    if self.checkStep("EDGES"):
        # NOTE(review): falls back to output + "-recall-adjusted.xml.gz" when
        # TRIGGERS was skipped — verify the trigger step actually writes a
        # file with this name (other variants use "trigger-pred.xml.gz")
        xml = self.getWorkFile(xml, output + "-recall-adjusted.xml.gz")
        xml = self.edgeDetector.classifyToXML(xml, self.model, None, output + "-", parse=self.parse)
        assert xml != None
        if self.parse == None:
            edgeParse = self.getStr(self.edgeDetector.tag + "parse", self.model)
        else:
            edgeParse = self.parse
        #EvaluateInteractionXML.run(self.edgeDetector.evaluator, xml, self.classifyData, edgeParse)
        # NOTE(review): gold argument is None here, unlike the other classify
        # variants which pass the input corpus — confirm this is intentional
        EvaluateInteractionXML.run(self.edgeDetector.evaluator, xml, None, edgeParse)
    if self.checkStep("UNMERGING"):
        if self.model.hasMember("unmerging-classifier-model"):
            #xml = self.getWorkFile(xml, output + "-edge-pred.xml.gz")
            # To avoid running out of memory, always use file on disk
            xml = self.getWorkFile(None, output + "-edge-pred.xml.gz")
            # Use the duplicate-preserving corpus as gold if it exists on disk
            goldData = None
            if type(self.classifyData) in types.StringTypes:
                if os.path.exists(self.classifyData.replace("-nodup", "")):
                    goldData = self.classifyData.replace("-nodup", "")
            xml = self.unmergingDetector.classifyToXML(xml, self.model, None, output + "-", goldData=goldData, parse=self.parse)
        else:
            print >> sys.stderr, "No model for unmerging"
    if self.checkStep("MODIFIERS"):
        if self.model.hasMember("modifier-classifier-model"):
            # Prefer unmerged output; fall back to edge output
            xml = self.getWorkFile(xml, [output + "-unmerging-pred.xml.gz", output + "-edge-pred.xml.gz"])
            xml = self.modifierDetector.classifyToXML(xml, self.model, None, output + "-", parse=self.parse)
        else:
            print >> sys.stderr, "No model for modifier detection"
    if self.checkStep("ST-CONVERT"):
        # Convert the most advanced intermediate result to shared-task format
        xml = self.getWorkFile(xml, [output + "-modifier-pred.xml.gz", output + "-unmerging-pred.xml.gz", output + "-edge-pred.xml.gz"])
        STFormat.ConvertXML.toSTFormat(xml, output + "-events.tar.gz", outputTag="a2", writeScores=self.stWriteScores)
        if self.stEvaluator != None:
            task = self.task
            if task == None:
                task = self.getStr(self.edgeDetector.tag + "task", self.model)
            self.stEvaluator.evaluate(output + "-events.tar.gz", task)
    self.exitState()
def buildExamplesFromGraph(self, sentenceGraph, examples, goldGraph=None):
    """
    Build edge examples for a single sentence and append them to *examples*.

    sentenceGraph -- graph of one sentence (entities, tokens, dependencies)
    examples      -- collection that self.buildExample appends to
    goldGraph     -- optional gold-standard graph for example categories
    """
    # example directionality: explicit "directed"/"undirected" style wins,
    # otherwise it is determined from the corpus structure.
    # Fix: the original if/elif chain left examplesAreDirected unbound when
    # both styles were present but falsy (e.g. explicitly False); the else
    # branch now covers every remaining case.
    if self.styles.get("directed"):
        assert self.styles.get("undirected") in [None, False]
        examplesAreDirected = True
    elif self.styles.get("undirected"):
        assert self.styles.get("directed") in [None, False]
        examplesAreDirected = False
    else:
        # determine directedness from corpus
        examplesAreDirected = self.structureAnalyzer.hasDirectedTargets() if self.structureAnalyzer is not None else True

    # Filter entities, if needed: duplicates are merged so each pair is
    # generated only once
    sentenceGraph.mergeInteractionGraph(True)
    entities = sentenceGraph.mergedEntities
    #entityToDuplicates = sentenceGraph.mergedEntityToDuplicates
    self.exampleStats.addValue("Duplicate entities skipped", len(sentenceGraph.entities) - len(entities))

    # Connect to optional gold graph (maps predicted entities to gold ones)
    entityToGold = None
    if goldGraph is not None:
        entityToGold = EvaluateInteractionXML.mapEntities(entities, goldGraph.entities)

    # paths = None
    # if not self.styles.get("no_path"):
    #     undirected = sentenceGraph.dependencyGraph.toUndirected()
    #     paths = undirected
    #     if self.styles.get("filter_shortest_path") != None: # For DDI use filter_shortest_path=conj_and
    #         paths.resetAnalyses() # just in case
    #         paths.FloydWarshall(self.filterEdge, {"edgeTypes":self.styles["filter_shortest_path"]})

    # Precompute the undirected dependency graph and per-token degree counts
    dg = sentenceGraph.dependencyGraph
    undirected = dg.toUndirected()
    edgeCounts = {x:len(dg.getInEdges(x) + dg.getOutEdges(x)) for x in sentenceGraph.tokens}
    tokens, tokenMap = self.getTokenFeatures(sentenceGraph)

    # Generate examples based on interactions between entities or interactions between tokens
    if self.styles.get("token_nodes"):
        loopRange = len(tokens)
    else:
        loopRange = len(entities)
    # All unordered pairs (i, j), i < j
    for i in range(loopRange-1):
        for j in range(i+1, loopRange):
            eI = None
            eJ = None
            if self.styles.get("token_nodes"):
                tI = tokens[i]["element"]
                tJ = tokens[j]["element"]
            else:
                eI = entities[i]
                eJ = entities[j]
                tI = sentenceGraph.entityHeadTokenByEntity[eI]
                tJ = sentenceGraph.entityHeadTokenByEntity[eJ]
                # "neg" placeholder entities never take part in examples
                if eI.get("type") == "neg" or eJ.get("type") == "neg":
                    continue
                if self.styles.get("skip_extra_triggers"):
                    if eI.get("source") is not None or eJ.get("source") is not None:
                        continue
            # only consider paths between entities (NOTE! entities, not only named entities)
            if self.styles.get("headsOnly"):
                if (len(sentenceGraph.tokenIsEntityHead[tI]) == 0) or (len(sentenceGraph.tokenIsEntityHead[tJ]) == 0):
                    continue
            if examplesAreDirected:
                # one example per direction
                self.buildExample(examples, tI, tJ, eI, eJ, tokens, tokenMap, sentenceGraph, goldGraph, entityToGold, undirected, edgeCounts)
                self.buildExample(examples, tJ, tI, eJ, eI, tokens, tokenMap, sentenceGraph, goldGraph, entityToGold, undirected, edgeCounts)
            else:
                # undirected: normalize pair order by token index, one example
                if tokenMap[tJ]["index"] < tokenMap[tI]["index"]:
                    tI, tJ = tJ, tI
                    eI, eJ = eJ, eI
                self.buildExample(examples, tI, tJ, eI, eJ, tokens, tokenMap, sentenceGraph, goldGraph, entityToGold, undirected, edgeCounts, False)
#Gazetteer.run(TRAIN_FILE, "gazetteer-train") #GeneralEntityTypeRecognizer.run(TRAIN_FILE, "trigger-train-examples", PARSE_TOK, PARSE_TOK, "style:typed", "trigger-ids") GeneralEntityTypeRecognizer.run(TEST_FILE, "trigger-test-examples", PARSE_TOK, PARSE_TOK, "style:typed", "trigger-ids") Cls.test("trigger-test-examples", "trigger-param-opt/model-c_75000", "trigger-test-classifications") evaluator = Ev.evaluate("trigger-test-examples", "trigger-test-classifications", "trigger-ids.class_names") #evaluator = optimize(Cls, Ev, "trigger-train-examples", "trigger-test-examples",\ # "trigger-ids.class_names", CLASSIFIER_PARAMS, "trigger-param-opt")[0] ExampleUtils.writeToInteractionXML(evaluator.classifications, TEST_FILE, "test-predicted-triggers.xml", "trigger-ids.class_names", PARSE_TOK, PARSE_TOK) # RecallAdjust.run("test-predicted-triggers.xml",1.0,"test-predicted-triggers-adj.xml") # ix.splitMergedElements("test-predicted-triggers-adj.xml", "test-predicted-triggers-adj-split.xml") # ix.recalculateIds("test-predicted-triggers-adj-split.xml", "test-predicted-triggers-adj-split-recids.xml", True) # EvaluateInteractionXML.run(Ev, "test-predicted-triggers-adj-split-recids.xml", GOLD_TEST_FILE, PARSE_TOK, PARSE_TOK) ix.splitMergedElements("test-predicted-triggers.xml", "test-predicted-triggers-split.xml") ix.recalculateIds("test-predicted-triggers-split.xml", "test-predicted-triggers-split-recids.xml", True) EvaluateInteractionXML.run(Ev, "test-predicted-triggers-split-recids.xml", GOLD_TEST_FILE, PARSE_TOK, PARSE_TOK)