Example #1
0
 def classify(self, data, model, output, parse=None, task=None, goldData=None, workDir=None, fromStep=None, omitSteps=None, validate=False):
     """
     Classify a dataset with a trained model and save the predictions.

     @param data: input corpus to classify
     @param model: model path or object; opened here in read-only mode
     @param output: output file stem ("-pred.xml.gz" etc. are appended)
     @param parse: parse name; read from the model if None
     @param task: BioNLP ST task id; read from the model if None
     @param goldData: optional gold corpus used when building examples
     @param workDir: working directory; a temporary one is made if None
     @param validate: if True, validate predicted XML against the learned
         structure before writing it, instead of copying the work file
     """
     self.enterState(self.STATE_CLASSIFY)
     self.setWorkDir(workDir)
     if workDir == None:
         self.setTempWorkDir()
     # Open the model once for reading (the original code opened it twice,
     # once before enterState and once here; the first call was redundant).
     model = self.openModel(model, "r")
     if parse == None: parse = self.getStr(self.tag+"parse", model)
     workOutputTag = os.path.join(self.workDir, os.path.basename(output) + "-")
     xml = self.classifyToXML(data, model, None, workOutputTag, 
         model.get(self.tag+"classifier-model", defaultIfNotExist=None), goldData, parse, float(model.getStr("recallAdjustParameter", defaultIfNotExist=1.0)))
     if (validate):
         self.structureAnalyzer.load(model)
         self.structureAnalyzer.validate(xml)
         ETUtils.write(xml, output+"-pred.xml.gz")
     else:
         # No validation requested: the work file is the final prediction
         shutil.copy2(workOutputTag+self.tag+"pred.xml.gz", output+"-pred.xml.gz")
     EvaluateInteractionXML.run(self.evaluator, xml, data, parse)
     stParams = self.getBioNLPSharedTaskParams(self.bioNLPSTParams, model)
     if stParams["convert"]: # export events in the BioNLP ST format
         extension = ".zip" if (stParams["convert"] == "zip") else ".tar.gz" 
         Utils.STFormat.ConvertXML.toSTFormat(xml, output+"-events" + extension, outputTag=stParams["a2Tag"], writeExtra=(stParams["scores"] == True))
         if stParams["evaluate"]: # run the shared task evaluator on the package
             if task == None: 
                 task = self.getStr(self.tag+"task", model)
             self.stEvaluator.evaluate(output+"-events" + extension, task)
     self.deleteTempWorkDir()
     self.exitState()
Example #2
0
def evaluateChemProt(xml, gold):
    """
    Evaluate ChemProt predictions internally and with the official evaluator.

    @param xml: predicted interaction XML (path or element)
    @param gold: gold standard corpus for the internal evaluation
    """
    # Internal interaction-XML evaluation against the gold corpus
    EvaluateIXML.run(AveragingMultiClassEvaluator, xml, gold, "McCC")
    # The official evaluator needs the predictions in ChemProt TSV format
    preprocessor = Preprocessor(steps=["EXPORT_CHEMPROT"])
    tempDir = tempfile.mkdtemp()
    print >> sys.stderr, "Using temporary evaluation directory", tempDir
    try:
        tsvPath = os.path.join(tempDir, "predictions.tsv")
        preprocessor.process(xml, tsvPath)
        ChemProtEvaluator().evaluateTSV(tsvPath, tempDir)
    finally:
        # Always remove the temporary directory, even if evaluation fails
        print >> sys.stderr, "Removing temporary evaluation directory", tempDir
        shutil.rmtree(tempDir)
Example #3
0
def evaluateChemProt(xml, gold):
    """Run both the internal and the official ChemProt evaluations."""
    # Internal evaluation of the predicted interaction XML
    EvaluateIXML.run(AveragingMultiClassEvaluator, xml, gold, "McCC")
    # Export the predictions as a ChemProt TSV file for the official evaluator
    exporter = Preprocessor(steps=["EXPORT_CHEMPROT"])
    evalDir = tempfile.mkdtemp()
    print >> sys.stderr, "Using temporary evaluation directory", evalDir
    predictionsFile = os.path.join(evalDir, "predictions.tsv")
    exporter.process(xml, predictionsFile)
    ChemProtEvaluator().evaluateTSV(predictionsFile, evalDir)
    print >> sys.stderr, "Removing temporary evaluation directory", evalDir
    shutil.rmtree(evalDir)
Example #4
0
 def classify(self, data, model, output, parse=None, task=None):
     """
     Classify a dataset, evaluate the result and convert it to ST format.

     @param parse: parse name; read from the model when not given
     @param task: task id; read from the model when not given
     """
     self.enterState(self.STATE_CLASSIFY)
     model = self.openModel(model, "r")
     # Defaults stored inside the model are used for missing arguments
     if parse == None:
         parse = self.getStr(self.tag+"parse", model)
     if task == None:
         task = self.getStr(self.tag+"task", model)
     classifierModel = model.get(self.tag+"classifier-model")
     recallAdjust = float(model.get("recallAdjustParameter"))
     predXML = self.classifyToXML(data, model, None, output + "-", classifierModel, None, parse, recallAdjust)
     EvaluateInteractionXML.run(self.evaluator, predXML, data, parse)
     archive = output+".tar.gz"
     STFormat.ConvertXML.toSTFormat(predXML, archive, outputTag="a2")
     if self.stEvaluator != None:
         self.stEvaluator.evaluate(archive, task)
     self.exitState()
Example #5
0
 def trainUnmergingDetector(self):
     """
     Train the unmerging classifier through a sequence of optional steps,
     each guarded by self.checkStep. Optionally self-classifies the train
     data first (trigger + edge detection) to build realistic examples.
     """
     xml = None
     if not self.unmerging:
         print >> sys.stderr, "No unmerging"
     if self.checkStep("SELF-TRAIN-EXAMPLES-FOR-UNMERGING", self.unmerging) and self.unmerging:
         # Self-classified train data for unmerging
         if self.doUnmergingSelfTraining:
             # This allows limiting to a subcorpus
             triggerStyle = copy.copy(Parameters.get(self.triggerExampleStyle))
             edgeStyle = copy.copy(Parameters.get(self.edgeExampleStyle))
             unmergingStyle = Parameters.get(self.unmergingExampleStyle)
             # Propagate an unmerging sentence limit to trigger/edge detection
             if "sentenceLimit" in unmergingStyle and unmergingStyle["sentenceLimit"]:
                 triggerStyle["sentenceLimit"] = unmergingStyle["sentenceLimit"]
                 edgeStyle["sentenceLimit"] = unmergingStyle["sentenceLimit"]
             # Build the examples
             # Run trigger then edge detection on the train data; the edge
             # detector consumes the trigger detector's output XML.
             xml = self.triggerDetector.classifyToXML(self.trainData, self.model, None, self.workDir+"unmerging-extra-", exampleStyle=triggerStyle)#, recallAdjust=0.5)
             xml = self.edgeDetector.classifyToXML(xml, self.model, None, self.workDir+"unmerging-extra-", exampleStyle=edgeStyle)#, recallAdjust=0.5)
             assert xml != None
             EvaluateInteractionXML.run(self.edgeDetector.evaluator, xml, self.trainData, self.parse)
         else:
             print >> sys.stderr, "No self-training for unmerging"
     if self.checkStep("UNMERGING-EXAMPLES", self.unmerging) and self.unmerging:
         # Unmerging example generation
         # Gold files are the corresponding corpora with duplicates kept
         # (the "-nodup" suffix stripped from the paths)
         GOLD_TEST_FILE = self.optData.replace("-nodup", "")
         GOLD_TRAIN_FILE = self.trainData.replace("-nodup", "")
         if self.doUnmergingSelfTraining:
             if xml == None: 
                 # Previous step was skipped this run; use its file on disk
                 xml = self.workDir+"unmerging-extra-edge-pred.xml.gz"
             # For train data, both the original corpus and the self-classified
             # predictions are used as input (nested list), with the same gold
             self.unmergingDetector.buildExamples(self.model, [self.optData.replace("-nodup", ""), [self.trainData.replace("-nodup", ""), xml]], 
                                                  [self.workDir+"unmerging-opt-examples.gz", self.workDir+"unmerging-train-examples.gz"], 
                                                  [GOLD_TEST_FILE, [GOLD_TRAIN_FILE, GOLD_TRAIN_FILE]], 
                                                  exampleStyle=self.unmergingExampleStyle, saveIdsToModel=True)
             xml = None
         else:
             self.unmergingDetector.buildExamples(self.model, [self.optData.replace("-nodup", ""), self.trainData.replace("-nodup", "")], 
                                                  [self.workDir+"unmerging-opt-examples.gz", self.workDir+"unmerging-train-examples.gz"], 
                                                  [GOLD_TEST_FILE, GOLD_TRAIN_FILE], 
                                                  exampleStyle=self.unmergingExampleStyle, saveIdsToModel=True)
             xml = None
         #UnmergingExampleBuilder.run("/home/jari/biotext/EventExtension/TrainSelfClassify/test-predicted-edges.xml", GOLD_TRAIN_FILE, UNMERGING_TRAIN_EXAMPLE_FILE, PARSE, TOK, UNMERGING_FEATURE_PARAMS, UNMERGING_IDS, append=True)
     if self.checkStep("BEGIN-UNMERGING-MODEL", self.unmerging) and self.unmerging:
         # Start classifier training on the generated example files
         self.unmergingDetector.beginModel(None, self.model, self.workDir+"unmerging-train-examples.gz", self.workDir+"unmerging-opt-examples.gz")
     if self.checkStep("END-UNMERGING-MODEL", self.unmerging) and self.unmerging:
         # Finish training, selecting against the optimization examples
         self.unmergingDetector.endModel(None, self.model, self.workDir+"unmerging-opt-examples.gz")
         print >> sys.stderr, "Adding unmerging classifier model to test-set event model"
         if self.combinedModel != None:
             # Copy the unmerging style, id mappings and classifier into the
             # combined (test-set) model so it can also perform unmerging
             self.combinedModel.addStr("unmerging-example-style", self.model.getStr("unmerging-example-style"))
             self.combinedModel.insert(self.model.get("unmerging-ids.classes"), "unmerging-ids.classes")
             self.combinedModel.insert(self.model.get("unmerging-ids.features"), "unmerging-ids.features")
             self.unmergingDetector.addClassifierModel(self.combinedModel, self.model.get("unmerging-classifier-model", True), 
                                                       self.model.getStr("unmerging-classifier-parameter"))
             self.combinedModel.save()
Example #6
0
 def classify(self, data, model, output, parse=None, task=None):
     """
     Classify a dataset, run the internal evaluation, and export the
     predictions as a BioNLP ST package.
     """
     self.enterState(self.STATE_CLASSIFY)
     model = self.openModel(model, "r")
     # Settings not given explicitly are read from the model
     if parse == None:
         parse = self.getStr(self.tag + "parse", model)
     if task == None:
         task = self.getStr(self.tag + "task", model)
     pred = self.classifyToXML(data, model, None, output + "-",
                               model.get(self.tag + "classifier-model"), None,
                               parse, float(model.get("recallAdjustParameter")))
     EvaluateInteractionXML.run(self.evaluator, pred, data, parse)
     archive = output + ".tar.gz"
     STFormat.ConvertXML.toSTFormat(pred, archive, outputTag="a2")
     if self.stEvaluator != None:
         self.stEvaluator.evaluate(archive, task)
     self.exitState()
Example #7
0
 def classify(self,
              data,
              model,
              output,
              parse=None,
              task=None,
              goldData=None,
              workDir=None,
              fromStep=None,
              omitSteps=None,
              validate=False):
     """
     Classify a dataset with a trained model and save the predictions.

     @param data: input corpus to classify
     @param model: model path or object; opened here in read-only mode
     @param output: output file stem ("-pred.xml.gz" etc. are appended)
     @param parse: parse name; read from the model if None
     @param task: BioNLP ST task id; read from the model if None
     @param goldData: optional gold corpus used when building examples
     @param workDir: working directory; a temporary one is made if None
     @param validate: if True, validate predicted XML against the learned
         structure before writing it, instead of copying the work file
     """
     self.enterState(self.STATE_CLASSIFY)
     self.setWorkDir(workDir)
     if workDir == None:
         self.setTempWorkDir()
     # Open the model once for reading (the original code opened it twice,
     # once before enterState and once here; the first call was redundant).
     model = self.openModel(model, "r")
     if parse == None: parse = self.getStr(self.tag + "parse", model)
     workOutputTag = os.path.join(self.workDir,
                                  os.path.basename(output) + "-")
     xml = self.classifyToXML(
         data, model, None, workOutputTag,
         model.get(self.tag + "classifier-model", defaultIfNotExist=None),
         goldData, parse,
         float(model.getStr("recallAdjustParameter",
                            defaultIfNotExist=1.0)))
     if (validate):
         self.structureAnalyzer.load(model)
         self.structureAnalyzer.validate(xml)
         ETUtils.write(xml, output + "-pred.xml.gz")
     else:
         # No validation requested: the work file is the final prediction
         shutil.copy2(workOutputTag + self.tag + "pred.xml.gz",
                      output + "-pred.xml.gz")
     EvaluateInteractionXML.run(self.evaluator, xml, data, parse)
     stParams = self.getBioNLPSharedTaskParams(self.bioNLPSTParams, model)
     if stParams["convert"]:  # export events in the BioNLP ST format
         extension = ".zip" if (stParams["convert"] == "zip") else ".tar.gz"
         Utils.STFormat.ConvertXML.toSTFormat(
             xml,
             output + "-events" + extension,
             outputTag=stParams["a2Tag"],
             writeExtra=(stParams["scores"] == True))
         if stParams["evaluate"]:  # run the shared task evaluator
             if task == None:
                 task = self.getStr(self.tag + "task", model)
             self.stEvaluator.evaluate(output + "-events" + extension, task)
     self.deleteTempWorkDir()
     self.exitState()
Example #8
0
 def evaluateGrid(self, xml, params, bestResults):
     """
     Evaluate one grid-search parameter combination and update the best
     result seen so far.

     @param xml: predicted interaction XML for this combination, or None
         if no edges were predicted
     @param params: the parameter combination being evaluated
     @param bestResults: current best (params, result, score) tuple or None
     @return: the (possibly updated) best (params, result, score) tuple
     """
     if xml != None:                
         # TODO: Where should the EvaluateInteractionXML evaluator come from?
         EIXMLResult = EvaluateInteractionXML.run(self.edgeDetector.evaluator, xml, self.optData, self.parse)
         # Convert to ST-format
         STFormat.ConvertXML.toSTFormat(xml, self.workDir+"grid-flat-geniaformat", "a2") #getA2FileTag(options.task, subTask))
         stFormatDir = self.workDir+"grid-flat-geniaformat"
         
         if self.unmerging:
             # Run unmerging on the predictions and evaluate its output
             # instead of the flat conversion
             xml = self.unmergingDetector.classifyToXML(xml, self.model, None, self.workDir+"grid-", goldData=self.optData.replace("-nodup", ""))
             STFormat.ConvertXML.toSTFormat(xml, self.workDir+"grid-unmerging-geniaformat", "a2")
             stFormatDir = self.workDir+"grid-unmerging-geniaformat"
         stEvaluation = self.stEvaluator.evaluate(stFormatDir, self.task)
         if stEvaluation != None:
             # Prefer the shared task evaluation score when available
             if bestResults == None or stEvaluation[0] > bestResults[1][0]:
                 bestResults = (params, stEvaluation, stEvaluation[0])
         else:
             # Fall back to the internal interaction-XML fscore
             if bestResults == None or EIXMLResult.getData().fscore > bestResults[1].getData().fscore:
                 bestResults = (params, EIXMLResult, EIXMLResult.getData().fscore)
         # Clean up the temporary ST-format directories
         shutil.rmtree(self.workDir+"grid-flat-geniaformat")
         if os.path.exists(self.workDir+"grid-unmerging-geniaformat"):
             shutil.rmtree(self.workDir+"grid-unmerging-geniaformat")
     else:
         print >> sys.stderr, "No predicted edges"
     return bestResults
Example #9
0
 def evaluateGrid(self, xml, params, bestResults):
     """
     Evaluate one grid-search parameter combination and update the best
     result seen so far. Uses the shared task evaluation when requested
     and available, otherwise the internal interaction-XML fscore.

     @param xml: predicted interaction XML, or None if nothing was predicted
     @param params: the parameter combination being evaluated
     @param bestResults: current best (params, result, score) tuple or None
     @return: the (possibly updated) best (params, result, score) tuple
     """
     if xml == None:
         print >> sys.stderr, "No predicted edges"
         return bestResults
     # TODO: Where should the EvaluateInteractionXML evaluator come from?
     internalResult = EvaluateInteractionXML.run(self.edgeDetector.evaluator, xml, self.optData, self.parse)
     doSTEvaluation = self.bioNLPSTParams["evaluate"]
     # Convert to ST-format (only needed for the shared task evaluator)
     stFormatDir = None
     if self.unmerging:
         xml = self.unmergingDetector.classifyToXML(xml, self.model, None, self.workDir+"grid-", goldData=self.optData)
         #self.structureAnalyzer.validate(xml)
         if doSTEvaluation:
             stFormatDir = self.workDir+"grid-unmerging-geniaformat"
             Utils.STFormat.ConvertXML.toSTFormat(xml, stFormatDir, "a2")
     elif doSTEvaluation:
         #self.structureAnalyzer.validate(xml)
         stFormatDir = self.workDir+"grid-flat-geniaformat"
         Utils.STFormat.ConvertXML.toSTFormat(xml, stFormatDir, "a2") #getA2FileTag(options.task, subTask))
     # Attempt shared task evaluation
     stEvaluation = None
     if doSTEvaluation:
         stEvaluation = self.stEvaluator.evaluate(stFormatDir, self.task)
     if stEvaluation != None:
         if bestResults == None or stEvaluation[0] > bestResults[1][0]:
             bestResults = (params, stEvaluation, stEvaluation[0])
     else:
         # Shared task evaluation failed or was not requested: fall back
         # to the internal evaluation
         if bestResults == None or internalResult.getData().fscore > bestResults[1].getData().fscore:
             bestResults = (params, internalResult, internalResult.getData().fscore)
     # Remove any ST-format files left on disk
     for stDir in (self.workDir+"grid-flat-geniaformat", self.workDir+"grid-unmerging-geniaformat"):
         if os.path.exists(stDir):
             shutil.rmtree(stDir)
     return bestResults
Example #10
0
 def classify(self, data, model, output, parse=None, task=None, fromStep=None, toStep=None):
     """
     Run the full classification pipeline as a sequence of steps
     (TRIGGERS, EDGES, UNMERGING, MODIFIERS, ST-CONVERT), optionally
     restricted by fromStep/toStep. Each step reads the previous step's
     output, falling back to the prediction files on disk when a step
     was skipped in this run.
     """
     BINARY_RECALL_MODE = False # TODO: make a parameter
     xml = None
     self.initVariables(classifyData=data, model=model, xml=None, task=task, parse=parse)
     self.enterState(self.STATE_CLASSIFY, ["TRIGGERS", "EDGES", "UNMERGING", "MODIFIERS", "ST-CONVERT"], fromStep, toStep)
     #self.enterState(self.STATE_CLASSIFY, ["TRIGGERS", "RECALL-ADJUST", "EDGES", "UNMERGING", "MODIFIERS", "ST-CONVERT"], fromStep, toStep)
     self.model = self.openModel(self.model, "r")
     if self.checkStep("TRIGGERS"):
         # Trigger detection with the model's stored recall adjustment
         xml = self.triggerDetector.classifyToXML(self.classifyData, self.model, None, output + "-", parse=self.parse, recallAdjust=float(self.getStr("recallAdjustParameter", self.model)))
     if self.checkStep("EDGES"):
         xml = self.getWorkFile(xml, output + "-recall-adjusted.xml.gz")
         xml = self.edgeDetector.classifyToXML(xml, self.model, None, output + "-", parse=self.parse)
         assert xml != None
         # The edge detector may use its own parse name from the model
         if self.parse == None:
             edgeParse = self.getStr(self.edgeDetector.tag+"parse", self.model)
         else:
             edgeParse = self.parse
         #EvaluateInteractionXML.run(self.edgeDetector.evaluator, xml, self.classifyData, edgeParse)
         EvaluateInteractionXML.run(self.edgeDetector.evaluator, xml, None, edgeParse)
     if self.checkStep("UNMERGING"):
         if self.model.hasMember("unmerging-classifier-model"):
             #xml = self.getWorkFile(xml, output + "-edge-pred.xml.gz")
             # To avoid running out of memory, always use file on disk
             xml = self.getWorkFile(None, output + "-edge-pred.xml.gz")
             # Use the duplicate-keeping corpus as gold data if it exists
             goldData = None
             if type(self.classifyData) in types.StringTypes:
                 if os.path.exists(self.classifyData.replace("-nodup", "")):
                     goldData = self.classifyData.replace("-nodup", "")
             xml = self.unmergingDetector.classifyToXML(xml, self.model, None, output + "-", goldData=goldData, parse=self.parse)
         else:
             print >> sys.stderr, "No model for unmerging"
     if self.checkStep("MODIFIERS"):
         if self.model.hasMember("modifier-classifier-model"):
             # Prefer the unmerging output, then the edge output
             xml = self.getWorkFile(xml, [output + "-unmerging-pred.xml.gz", output + "-edge-pred.xml.gz"])
             xml = self.modifierDetector.classifyToXML(xml, self.model, None, output + "-", parse=self.parse)
         else:
             print >> sys.stderr, "No model for modifier detection"
     if self.checkStep("ST-CONVERT"):
         # Use the latest available prediction file as the conversion input
         xml = self.getWorkFile(xml, [output + "-modifier-pred.xml.gz", output + "-unmerging-pred.xml.gz", output + "-edge-pred.xml.gz"])
         STFormat.ConvertXML.toSTFormat(xml, output+"-events.tar.gz", outputTag="a2", writeScores=self.stWriteScores)
         if self.stEvaluator != None:
             task = self.task
             if task == None:
                 task = self.getStr(self.edgeDetector.tag+"task", self.model)
             self.stEvaluator.evaluate(output + "-events.tar.gz", task)
     self.exitState()
Example #11
0
 def classify(self, data, model, output, parse=None, task=None, goldData=None, workDir=None, fromStep=None, omitSteps=None):
     """
     Classify a dataset with a trained model and save the predictions.

     @param data: input corpus to classify
     @param model: model path or object; opened here in read-only mode
     @param output: output file stem ("-pred.xml.gz" etc. are appended)
     @param parse: parse name; read from the model if None
     @param task: BioNLP ST task id; read from the model if None
     @param goldData: optional gold corpus used when building examples
     @param workDir: working directory; a temporary one is made if None
     """
     self.enterState(self.STATE_CLASSIFY)
     self.setWorkDir(workDir)
     if workDir == None:
         self.setTempWorkDir()
     # Open the model once for reading (the original code opened it twice,
     # once before enterState and once here; the first call was redundant).
     model = self.openModel(model, "r")
     if parse == None: parse = self.getStr(self.tag+"parse", model)
     if task == None: task = self.getStr(self.tag+"task", model)
     workOutputTag = os.path.join(self.workDir, os.path.basename(output) + "-")
     xml = self.classifyToXML(data, model, None, workOutputTag, 
         model.get(self.tag+"classifier-model"), goldData, parse, float(model.getStr("recallAdjustParameter", defaultIfNotExist=1.0)))
     shutil.copy2(workOutputTag+self.tag+"pred.xml.gz", output+"-pred.xml.gz")
     EvaluateInteractionXML.run(self.evaluator, xml, data, parse)
     stParams = self.getBioNLPSharedTaskParams(self.bioNLPSTParams, model)
     if stParams["convert"]: # export events in the BioNLP ST format
         Utils.STFormat.ConvertXML.toSTFormat(xml, output+"-events.tar.gz", outputTag="a2")
         if stParams["evaluate"]: # run the shared task evaluator on them
             self.stEvaluator.evaluate(output+"-events.tar.gz", task)
     self.deleteTempWorkDir()
     self.exitState()
 def evaluateGrid(self, xml, params, bestResults):
     """
     Evaluate one grid-search parameter combination and update the best
     result seen so far. Uses the shared task evaluation when requested
     and available, otherwise the internal interaction-XML fscore.

     @param xml: predicted interaction XML, or None if nothing was predicted
     @param params: the parameter combination being evaluated
     @param bestResults: current best (params, result, score) tuple or None
     @return: the (possibly updated) best (params, result, score) tuple
     """
     #traceback.print_stack()
     #pdb.set_trace()
     if xml != None:
         # TODO: Where should the EvaluateInteractionXML evaluator come from?
         EIXMLResult = EvaluateInteractionXML.run(
             self.edgeDetector.evaluator, xml, self.optData, self.parse)
         # Convert to ST-format
         if self.unmerging:
             xml = self.unmergingDetector.classifyToXML(
                 xml,
                 self.model,
                 None,
                 self.workDir + "grid-",
                 goldData=self.optData)
             #self.structureAnalyzer.validate(xml)
             if self.bioNLPSTParams["evaluate"]:
                 Utils.STFormat.ConvertXML.toSTFormat(
                     xml, self.workDir + "grid-unmerging-geniaformat", "a2")
                 stFormatDir = self.workDir + "grid-unmerging-geniaformat"
         elif self.bioNLPSTParams["evaluate"]:
             #self.structureAnalyzer.validate(xml)
             Utils.STFormat.ConvertXML.toSTFormat(
                 xml, self.workDir + "grid-flat-geniaformat",
                 "a2")  #getA2FileTag(options.task, subTask))
             stFormatDir = self.workDir + "grid-flat-geniaformat"
         # Evaluation
         # Attempt shared task evaluation
         stEvaluation = None
         if self.bioNLPSTParams["evaluate"]:
             stEvaluation = self.stEvaluator.evaluate(
                 stFormatDir, self.task)
         if stEvaluation != None:
             # Prefer the shared task evaluation score when available
             if bestResults == None or stEvaluation[0] > bestResults[1][0]:
                 bestResults = (params, stEvaluation, stEvaluation[0])
         else:  # If shared task evaluation was not done (failed or not requested) fall back to internal evaluation
             if bestResults == None or EIXMLResult.getData(
             ).fscore > bestResults[1].getData().fscore:
                 bestResults = (params, EIXMLResult,
                                EIXMLResult.getData().fscore)
         # Remove ST-format files
         if os.path.exists(self.workDir + "grid-flat-geniaformat"):
             shutil.rmtree(self.workDir + "grid-flat-geniaformat")
         if os.path.exists(self.workDir + "grid-unmerging-geniaformat"):
             shutil.rmtree(self.workDir + "grid-unmerging-geniaformat")
     else:
         print >> sys.stderr, "No predicted edges"
     return bestResults
Example #13
0
    def evaluateGrid(self, xml, params, bestResults):
        """
        Evaluate one grid-search parameter combination and update the best
        result seen so far.

        @param xml: predicted interaction XML for this combination, or None
            if no edges were predicted
        @param params: the parameter combination being evaluated
        @param bestResults: current best (params, result, score) tuple or None
        @return: the (possibly updated) best (params, result, score) tuple
        """
        if xml != None:
            # TODO: Where should the EvaluateInteractionXML evaluator come from?
            EIXMLResult = EvaluateInteractionXML.run(
                self.edgeDetector.evaluator, xml, self.optData, self.parse)
            # Convert to ST-format
            STFormat.ConvertXML.toSTFormat(
                xml, self.workDir + "grid-flat-geniaformat",
                "a2")  #getA2FileTag(options.task, subTask))
            stFormatDir = self.workDir + "grid-flat-geniaformat"

            if self.unmerging:
                # Run unmerging on the predictions and evaluate its output
                # instead of the flat conversion
                xml = self.unmergingDetector.classifyToXML(
                    xml,
                    self.model,
                    None,
                    self.workDir + "grid-",
                    goldData=self.optData.replace("-nodup", ""))
                STFormat.ConvertXML.toSTFormat(
                    xml, self.workDir + "grid-unmerging-geniaformat", "a2")
                stFormatDir = self.workDir + "grid-unmerging-geniaformat"
            stEvaluation = self.stEvaluator.evaluate(stFormatDir, self.task)
            if stEvaluation != None:
                # Prefer the shared task evaluation score when available
                if bestResults == None or stEvaluation[0] > bestResults[1][0]:
                    bestResults = (params, stEvaluation, stEvaluation[0])
            else:
                # Fall back to the internal interaction-XML fscore
                if bestResults == None or EIXMLResult.getData(
                ).fscore > bestResults[1].getData().fscore:
                    bestResults = (params, EIXMLResult,
                                   EIXMLResult.getData().fscore)
            # Clean up the temporary ST-format directories
            shutil.rmtree(self.workDir + "grid-flat-geniaformat")
            if os.path.exists(self.workDir + "grid-unmerging-geniaformat"):
                shutil.rmtree(self.workDir + "grid-unmerging-geniaformat")
        else:
            print >> sys.stderr, "No predicted edges"
        return bestResults
Example #14
0
    def buildExamplesFromGraph(self, sentenceGraph, outfile, goldGraph=None):
        """
        Build one example for each token of the sentence
        """
        examples = []
        exampleIndex = 0

        self.tokenFeatures = {}

        if goldGraph != None:
            entityToGold = EvaluateInteractionXML.mapEntities(
                sentenceGraph.entities, goldGraph.entities)

        namedEntityCount = 0
        entityCount = 0
        for entity in sentenceGraph.entities:
            if entity.get(
                    "isName"
            ) == "True":  # known data which can be used for features
                namedEntityCount += 1
            else:  # known data which can be used for features
                entityCount += 1
        namedEntityCountFeature = "nameCount_" + str(namedEntityCount)
        entityCountFeature = "entityCount_" + str(entityCount)

        bagOfWords = {}
        for token in sentenceGraph.tokens:
            text = "bow_" + token.get("text")
            if not bagOfWords.has_key(text):
                bagOfWords[text] = 0
            bagOfWords[text] += 1
            if sentenceGraph.tokenIsName[token]:
                text = "ne_" + text
                if not bagOfWords.has_key(text):
                    bagOfWords[text] = 0
                bagOfWords[text] += 1
            if len(sentenceGraph.tokenIsEntityHead) > 0:
                text = "ge_" + text
                if not bagOfWords.has_key(text):
                    bagOfWords[text] = 0
                bagOfWords[text] += 1

            text = token.get("text")
            if self.styles["speculation_words"] and text in self.specWords:
                if not bagOfWords.has_key("spec_bow_" + text):
                    bagOfWords["spec_bow_" + text] = 0
                bagOfWords["spec_bow_" + text] += 1
                bagOfWords["spec_sentence"] = 1

        bowFeatures = {}
        for k, v in bagOfWords.iteritems():
            bowFeatures[self.featureSet.getId(k)] = v

        self.inEdgesByToken = {}
        self.outEdgesByToken = {}
        self.edgeSetByToken = {}
        for token in sentenceGraph.tokens:
            inEdges = sentenceGraph.dependencyGraph.getInEdges(token)
            self.inEdgesByToken[token] = inEdges
            outEdges = sentenceGraph.dependencyGraph.getOutEdges(token)
            self.outEdgesByToken[token] = outEdges
            self.edgeSetByToken[token] = set(inEdges + outEdges)

        for entity in sentenceGraph.entities:
            #token = sentenceGraph.tokens[i]
            token = sentenceGraph.entityHeadTokenByEntity[entity]
            # Recognize only non-named entities (i.e. interaction words)
            if entity.get("isName") == "True":
                continue

            # CLASS
            if self.styles["classification"] == "multiclass":
                task3Type = "multiclass"
                categoryName = ""
                if entity.get("negation") == "True":
                    categoryName += "negation"
                if entity.get("speculation") == "True":
                    if categoryName != "":
                        categoryName += "---"
                    categoryName += "speculation"
                if categoryName == "":
                    categoryName = "neg"
                category = self.classSet.getId(categoryName)
            elif self.styles["classification"] == "speculation":
                task3Type = "speculation"
                if entity.get("speculation") == "True":
                    category = self.classSet.getId("speculation")
                else:
                    category = 1
                if goldGraph != None:
                    if len(entityToGold[entity]) > 0 and entityToGold[entity][
                            0].get("speculation") == "True":
                        category = self.classSet.getId("speculation")
                    else:
                        category = 1
                categoryName = self.classSet.getName(category)
            elif self.styles["classification"] == "negation":
                task3Type = "negation"
                if entity.get("negation") == "True":
                    category = self.classSet.getId("negation")
                else:
                    category = 1
                if goldGraph != None:
                    if len(entityToGold[entity]) > 0 and entityToGold[entity][
                            0].get("negation") == "True":
                        category = self.classSet.getId("negation")
                    else:
                        category = 1
                categoryName = self.classSet.getName(category)
            self.exampleStats.beginExample(categoryName)

            # FEATURES
            features = {}

            # ENTITY TYPE
            #entityType = self.classSet.getId(self.getMergedEntityType(entity))
            #del self.classSet.Ids[self.getMergedEntityType(entity)]
            #IF LOCAL
            # There's a mistake here. The entityType should be the string, not
            # the id of the type. But there's also another issue. getMergedEntityType
            # expects a list, not an item. Therefore the type is always empty ->
            # types don't get used in classification. But this is the code used in
            # the publication, so it will now be published as is, and fixed in a later
            # release.
            #
            # Besides, using the classSet here generates an unneeded
            # additional class, that shows up in evaluations etc. However, to be
            # able to publish the exact models used for the publication experiments,
            # this can't be fixed so it breaks feature id consistency. Therefore I'll
            # now just remove the redundant class id from the classSet.
            #ENDIF
            #features[self.featureSet.getId(entityType)] = 1

            features[self.featureSet.getId(namedEntityCountFeature)] = 1
            features[self.featureSet.getId(entityCountFeature)] = 1
            #for k,v in bagOfWords.iteritems():
            #    features[self.featureSet.getId(k)] = v
            # pre-calculate bow _features_
            features.update(bowFeatures)

            #            for j in range(len(sentenceGraph.tokens)):
            #                text = "bow_" + sentenceGraph.tokens[j].get("text")
            #                if j < i:
            #                    features[self.featureSet.getId("bf_" + text)] = 1
            #                elif j > i:
            #                    features[self.featureSet.getId("af_" + text)] = 1

            # Main features
            text = token.get("text")
            features[self.featureSet.getId("txt_" + text)] = 1
            features[self.featureSet.getId("POS_" + token.get("POS"))] = 1
            stem = PorterStemmer.stem(text)
            features[self.featureSet.getId("stem_" + stem)] = 1
            features[self.featureSet.getId("nonstem_" + text[len(stem):])] = 1

            if self.styles["speculation_words"]:
                if text in self.specWords:
                    features[self.featureSet.getId("ent_spec")] = 1
                if stem in self.specWordStems:
                    features[self.featureSet.getId("ent_spec_stem")] = 1

            # Linear order features
            for i in range(len(sentenceGraph.tokens)):
                if token == sentenceGraph.tokens[i]:
                    break
            for index in [-3, -2, -1, 1, 2, 3]:
                if i + index > 0 and i + index < len(sentenceGraph.tokens):
                    self.buildLinearOrderFeatures(sentenceGraph, i + index,
                                                  str(index), features)

            # Content
            if i > 0 and text[0].isalpha() and text[0].isupper():
                features[self.featureSet.getId("upper_case_start")] = 1
            for j in range(len(text)):
                if j > 0 and text[j].isalpha() and text[j].isupper():
                    features[self.featureSet.getId("upper_case_middle")] = 1
                # numbers and special characters
                if text[j].isdigit():
                    features[self.featureSet.getId("has_digits")] = 1
                    if j > 0 and text[j - 1] == "-":
                        features[self.featureSet.getId(
                            "has_hyphenated_digit")] = 1
                elif text[j] == "-":
                    features[self.featureSet.getId("has_hyphen")] = 1
                elif text[j] == "/":
                    features[self.featureSet.getId("has_fslash")] = 1
                elif text[j] == "\\":
                    features[self.featureSet.getId("has_bslash")] = 1
                # duplets
                if j > 0:
                    features[self.featureSet.getId("dt_" +
                                                   text[j - 1:j +
                                                        1].lower())] = 1
                # triplets
                if j > 1:
                    features[self.featureSet.getId("tt_" +
                                                   text[j - 2:j +
                                                        1].lower())] = 1

            # Attached edges (Hanging in and out edges)
            t1InEdges = self.inEdgesByToken[token]
            for edge in t1InEdges:
                edgeType = edge[2].get("type")
                features[self.featureSet.getId("t1HIn_" + edgeType)] = 1
                features[self.featureSet.getId("t1HIn_" +
                                               edge[0].get("POS"))] = 1
                features[self.featureSet.getId("t1HIn_" + edgeType + "_" +
                                               edge[0].get("POS"))] = 1
                tokenText = sentenceGraph.getTokenText(edge[0])
                features[self.featureSet.getId("t1HIn_" + tokenText)] = 1
                features[self.featureSet.getId("t1HIn_" + edgeType + "_" +
                                               tokenText)] = 1
            t1OutEdges = self.outEdgesByToken[token]
            for edge in t1OutEdges:
                edgeType = edge[2].get("type")
                features[self.featureSet.getId("t1HOut_" + edgeType)] = 1
                features[self.featureSet.getId("t1HOut_" +
                                               edge[1].get("POS"))] = 1
                features[self.featureSet.getId("t1HOut_" + edgeType + "_" +
                                               edge[1].get("POS"))] = 1
                tokenText = sentenceGraph.getTokenText(edge[1])
                features[self.featureSet.getId("t1HOut_" + tokenText)] = 1
                features[self.featureSet.getId("t1HOut_" + edgeType + "_" +
                                               tokenText)] = 1

            self.buildChains(token, sentenceGraph, features)

            extra = {
                "xtype": "task3",
                "t3type": task3Type,
                "t": token.get("id"),
                "entity": entity.get("id")
            }
            #examples.append( (sentenceGraph.getSentenceId()+".x"+str(exampleIndex),category,features,extra) )
            example = (sentenceGraph.getSentenceId() + ".x" +
                       str(exampleIndex), category, features, extra)
            ExampleUtils.appendExamples([example], outfile)
            exampleIndex += 1
            self.exampleStats.endExample()
        #return examples
        return exampleIndex
Example #15
0
    def buildExamplesFromGraph(self, sentenceGraph, outfile, goldGraph = None):
        """
        Build candidate interaction examples for a single sentence.

        One example is generated per ordered pair of merged entities (or
        tokens, when the "entities" style is off) when the "directed" style
        is set; otherwise one combined example per unordered pair. Examples
        are streamed to outfile via ExampleUtils.appendExamples instead of
        being collected in memory. See Core/ExampleUtils for example format.

        @param sentenceGraph: analysis graph of one sentence
        @param outfile: open output file that examples are appended to
        @param goldGraph: optional gold-standard graph; when given, example
            category names are taken from the mapped gold entities
        @return: the number of examples generated
        """
        #examples = []
        exampleIndex = 0
        
        # Per-sentence initialization of optional auxiliary feature builders
        if self.styles["trigger_features"]: 
            self.triggerFeatureBuilder.initSentence(sentenceGraph)
        if self.styles["evex"]: 
            self.evexFeatureBuilder.initSentence(sentenceGraph)
            
        # Filter entities, if needed
        #mergedIds = None
        #duplicateEntities = None
        #entities = sentenceGraph.entities
        #entities, mergedIds, duplicateEntities = self.mergeEntities(sentenceGraph, False) # "no_duplicates" in self.styles)
        # Merge duplicate entities and build examples from the merged set
        sentenceGraph.mergeInteractionGraph(True)
        entities = sentenceGraph.mergedEntities
        entityToDuplicates = sentenceGraph.mergedEntityToDuplicates
        self.exampleStats.addValue("Duplicate entities skipped", len(sentenceGraph.entities) - len(entities))
        
        # Connect to optional gold graph
        if goldGraph != None:
            entityToGold = EvaluateInteractionXML.mapEntities(entities, goldGraph.entities)
        
        paths = None
        if not self.styles["no_path"]:
            ##undirected = sentenceGraph.getUndirectedDependencyGraph()
            #undirected = self.nxMultiDiGraphToUndirected(sentenceGraph.dependencyGraph)
            ###undirected = sentenceGraph.dependencyGraph.to_undirected()
            ####undirected = NX10.MultiGraph(sentenceGraph.dependencyGraph) This didn't work
            undirected = sentenceGraph.dependencyGraph.toUndirected()
            #paths = NX10.all_pairs_shortest_path(undirected, cutoff=999)
            # NOTE: "paths" is the undirected dependency graph itself; path
            # lookups are done against it later rather than precomputed here.
            paths = undirected
        
        #for edge in sentenceGraph.dependencyGraph.edges:
        #    assert edge[2] != None
        #for edge in undirected.edges:
        #    assert edge[2] != None
        #if sentenceGraph.sentenceElement.get("id") == "GENIA.d70.s5":
        #    print [(x[0].get("id"), x[1].get("id"), x[2].get("id")) for x in sentenceGraph.dependencyGraph.edges]
        
        # Generate examples based on interactions between entities or interactions between tokens
        if self.styles["entities"]:
            loopRange = len(entities)
        else:
            loopRange = len(sentenceGraph.tokens)
        # Iterate over all unordered index pairs (i, j), i < j
        for i in range(loopRange-1):
            for j in range(i+1,loopRange):
                eI = None
                eJ = None
                if self.styles["entities"]:
                    eI = entities[i]
                    eJ = entities[j]
                    tI = sentenceGraph.entityHeadTokenByEntity[eI]
                    tJ = sentenceGraph.entityHeadTokenByEntity[eJ]
                    #if "no_ne_interactions" in self.styles and eI.get("isName") == "True" and eJ.get("isName") == "True":
                    #    continue
                    if eI.get("type") == "neg" or eJ.get("type") == "neg":
                        continue
                    if self.styles["skip_extra_triggers"]:
                        if eI.get("source") != None or eJ.get("source") != None:
                            continue
                else:
                    tI = sentenceGraph.tokens[i]
                    tJ = sentenceGraph.tokens[j]
                # only consider paths between entities (NOTE! entities, not only named entities)
                if self.styles["headsOnly"]:
                    if (len(sentenceGraph.tokenIsEntityHead[tI]) == 0) or (len(sentenceGraph.tokenIsEntityHead[tJ]) == 0):
                        continue
                
                if self.styles["directed"]:
                    # define forward
                    if self.styles["entities"]:
                        categoryName = self.getCategoryName(sentenceGraph, eI, eJ, True)
                        if goldGraph != None:
                            categoryName = self.getGoldCategoryName(goldGraph, entityToGold, eI, eJ, True)
                    else:
                        categoryName = self.getCategoryNameFromTokens(sentenceGraph, tI, tJ, True)
                    # make forward
                    self.exampleStats.beginExample(categoryName)
                    makeExample = True
                    # Task-specific filters: each can veto the example and is
                    # recorded in exampleStats for later reporting.
                    if self.styles["genia_limits"] and not self.isPotentialGeniaInteraction(eI, eJ):
                        makeExample = False
                        self.exampleStats.filter("genia_limits")
                    if self.styles["genia_task1"] and (eI.get("type") == "Entity" or eJ.get("type") == "Entity"):
                        makeExample = False
                        self.exampleStats.filter("genia_task1")
                    if self.styles["rel_limits"] and not self.isPotentialRELInteraction(eI, eJ):
                        makeExample = False
                        self.exampleStats.filter("rel_limits")
                    if self.styles["co_limits"] and not self.isPotentialCOInteraction(eI, eJ, sentenceGraph):
                        makeExample = False
                        self.exampleStats.filter("co_limits")
                    if self.styles["bb_limits"] and not self.isPotentialBBInteraction(eI, eJ, sentenceGraph):
                        makeExample = False
                        self.exampleStats.filter("bb_limits")
                        if categoryName != "neg":
                            self.exampleStats.filter("bb_limits(" + categoryName + ":" + eI.get("type") + "/" + eJ.get("type") + ")")
                    if self.styles["bi_limits"] and not self.isPotentialBIInteraction(eI, eJ, sentenceGraph, self.exampleStats):
                        makeExample = False
                        #self.exampleStats.filter("bi_limits")
                    if self.styles["epi_limits"] and not self.isPotentialEPIInteraction(eI, eJ, sentenceGraph):
                        makeExample = False
                        self.exampleStats.filter("epi_limits")
                    if self.styles["id_limits"] and not self.isPotentialIDInteraction(eI, eJ, sentenceGraph):
                        makeExample = False
                        self.exampleStats.filter("id_limits")
#                    if self.styles["selftrain_limits"] and (eI.get("selftrain") == "False" or eJ.get("selftrain") == "False"):
#                        makeExample = False
#                        self.exampleStats.filter("selftrain_limits")
#                    if self.styles["selftrain_group"] and (eI.get("selftraingroup") not in self.selfTrainGroups or eJ.get("selftraingroup") not in self.selfTrainGroups):
#                        makeExample = False
#                        self.exampleStats.filter("selftrain_group")
                    if self.styles["pos_only"] and categoryName == "neg":
                        makeExample = False
                        self.exampleStats.filter("pos_only")
                    if makeExample:
                        #examples.append( self.buildExample(tI, tJ, paths, sentenceGraph, categoryName, exampleIndex, eI, eJ) )
                        ExampleUtils.appendExamples([self.buildExample(tI, tJ, paths, sentenceGraph, categoryName, exampleIndex, eI, eJ)], outfile)
                        exampleIndex += 1
                    self.exampleStats.endExample()
                    
                    # define reverse
                    if self.styles["entities"]:
                        categoryName = self.getCategoryName(sentenceGraph, eJ, eI, True)
                        if goldGraph != None:
                            categoryName = self.getGoldCategoryName(goldGraph, entityToGold, eJ, eI, True)
                    else:
                        categoryName = self.getCategoryNameFromTokens(sentenceGraph, tJ, tI, True)
                    # make reverse
                    self.exampleStats.beginExample(categoryName)
                    makeExample = True
                    # Same filter cascade as the forward direction, with the
                    # entity arguments swapped (eJ, eI).
                    if self.styles["genia_limits"] and not self.isPotentialGeniaInteraction(eJ, eI):
                        makeExample = False
                        self.exampleStats.filter("genia_limits")
                    if self.styles["genia_task1"] and (eI.get("type") == "Entity" or eJ.get("type") == "Entity"):
                        makeExample = False
                        self.exampleStats.filter("genia_task1")
                    if self.styles["rel_limits"] and not self.isPotentialRELInteraction(eJ, eI):
                        makeExample = False
                        self.exampleStats.filter("rel_limits")
                    if self.styles["co_limits"] and not self.isPotentialCOInteraction(eJ, eI, sentenceGraph):
                        makeExample = False
                        self.exampleStats.filter("co_limits")
                    if self.styles["bb_limits"] and not self.isPotentialBBInteraction(eJ, eI, sentenceGraph):
                        makeExample = False
                        self.exampleStats.filter("bb_limits")
                        if categoryName != "neg":
                            self.exampleStats.filter("bb_limits(" + categoryName + ":" + eJ.get("type") + "/" + eI.get("type") + ")")
                    if self.styles["bi_limits"] and not self.isPotentialBIInteraction(eJ, eI, sentenceGraph, self.exampleStats):
                        makeExample = False
                        #self.exampleStats.filter("bi_limits")
                    if self.styles["epi_limits"] and not self.isPotentialEPIInteraction(eJ, eI, sentenceGraph):
                        makeExample = False
                        self.exampleStats.filter("epi_limits")
                    if self.styles["id_limits"] and not self.isPotentialIDInteraction(eJ, eI, sentenceGraph):
                        makeExample = False
                        self.exampleStats.filter("id_limits")
#                    if self.styles["selftrain_limits"] and (eI.get("selftrain") == "False" or eJ.get("selftrain") == "False"):
#                        makeExample = False
#                        self.exampleStats.filter("selftrain_limits")
#                    if self.styles["selftrain_group"] and (eI.get("selftraingroup") not in self.selfTrainGroups or eJ.get("selftraingroup") not in self.selfTrainGroups):
#                        makeExample = False
#                        self.exampleStats.filter("selftrain_group")
                    if self.styles["pos_only"] and categoryName == "neg":
                        makeExample = False
                        self.exampleStats.filter("pos_only")
                    if makeExample:
                        #examples.append( self.buildExample(tJ, tI, paths, sentenceGraph, categoryName, exampleIndex, eJ, eI) )
                        ExampleUtils.appendExamples([self.buildExample(tJ, tI, paths, sentenceGraph, categoryName, exampleIndex, eJ, eI)], outfile)
                        exampleIndex += 1
                    self.exampleStats.endExample()
                else:
                    # Undirected case: one example per pair, whose feature set
                    # is the union of the forward and reverse feature sets.
                    if self.styles["entities"]:
                        categoryName = self.getCategoryName(sentenceGraph, eI, eJ, False)
                    else:
                        categoryName = self.getCategoryNameFromTokens(sentenceGraph, tI, tJ, False)
                    self.exampleStats.beginExample(categoryName)
                    forwardExample = self.buildExample(tI, tJ, paths, sentenceGraph, categoryName, exampleIndex, eI, eJ)
                    if not self.styles["graph_kernel"]:
                        reverseExample = self.buildExample(tJ, tI, paths, sentenceGraph, categoryName, exampleIndex, eJ, eI)
                        forwardExample[2].update(reverseExample[2])
                    #examples.append(forwardExample)
                    ExampleUtils.appendExamples([forwardExample], outfile)
                    exampleIndex += 1
                    self.exampleStats.endExample()
        
        #return examples
        return exampleIndex
    def buildExamplesFromGraph(self, sentenceGraph, outfile, goldGraph=None, structureAnalyzer=None):
        """
        Build one example for each token of the sentence
        """
        examples = []
        exampleIndex = 0
        
        self.tokenFeatures = {}

        if goldGraph != None:
            entityToGold = EvaluateInteractionXML.mapEntities(sentenceGraph.entities, goldGraph.entities)
        
        namedEntityCount = 0
        entityCount = 0
        for entity in sentenceGraph.entities:
            if entity.get("given") == "True": # known data which can be used for features
                namedEntityCount += 1
            else: # known data which can be used for features
                entityCount += 1
        namedEntityCountFeature = "nameCount_" + str(namedEntityCount)
        entityCountFeature = "entityCount_" + str(entityCount)
        
        bagOfWords = {}
        for token in sentenceGraph.tokens:
            text = "bow_" + token.get("text")
            if not bagOfWords.has_key(text):
                bagOfWords[text] = 0
            bagOfWords[text] += 1
            if sentenceGraph.tokenIsName[token]:
                text = "ne_" + text
                if not bagOfWords.has_key(text):
                    bagOfWords[text] = 0
                bagOfWords[text] += 1
            if len(sentenceGraph.tokenIsEntityHead) > 0:
                text = "ge_" + text
                if not bagOfWords.has_key(text):
                    bagOfWords[text] = 0
                bagOfWords[text] += 1
            
            text = token.get("text")
            if self.styles["speculation_words"] and text in self.specWords:
                if not bagOfWords.has_key("spec_bow_"+text):
                    bagOfWords["spec_bow_"+text] = 0
                bagOfWords["spec_bow_"+text] += 1
                bagOfWords["spec_sentence"] = 1
        
        bowFeatures = {}
        for k,v in bagOfWords.iteritems():
            bowFeatures[self.featureSet.getId(k)] = v
        
        self.inEdgesByToken = {}
        self.outEdgesByToken = {}
        self.edgeSetByToken = {}
        for token in sentenceGraph.tokens:
            inEdges = sentenceGraph.dependencyGraph.getInEdges(token)
            self.inEdgesByToken[token] = inEdges
            outEdges = sentenceGraph.dependencyGraph.getOutEdges(token)
            self.outEdgesByToken[token] = outEdges
            self.edgeSetByToken[token] = set(inEdges + outEdges)
        
        for entity in sentenceGraph.entities:
            #token = sentenceGraph.tokens[i]
            token = sentenceGraph.entityHeadTokenByEntity[entity]
            # Recognize only non-named entities (i.e. interaction words)
            if entity.get("given") == "True":
                continue
            
            # CLASS
            if self.styles["classification"] == "multiclass":
                task3Type = "multiclass"
                categoryName = ""
                if entity.get("negation") == "True":
                    categoryName += "negation"
                if entity.get("speculation") == "True":
                    if categoryName != "":
                        categoryName += "---"
                    categoryName += "speculation"
                if categoryName == "":
                    categoryName = "neg"
                category = self.classSet.getId(categoryName)  
            elif self.styles["classification"] == "speculation":
                task3Type = "speculation"
                if entity.get("speculation") == "True":
                    category = self.classSet.getId("speculation")
                else:
                    category = 1
                if goldGraph != None:
                    if len(entityToGold[entity]) > 0 and entityToGold[entity][0].get("speculation") == "True":
                        category = self.classSet.getId("speculation")
                    else:
                        category = 1
                categoryName = self.classSet.getName(category)
            elif self.styles["classification"] == "negation":
                task3Type = "negation"
                if entity.get("negation") == "True":
                    category = self.classSet.getId("negation")
                else:
                    category = 1
                if goldGraph != None:
                    if len(entityToGold[entity]) > 0 and entityToGold[entity][0].get("negation") == "True":
                        category = self.classSet.getId("negation")
                    else:
                        category = 1
                categoryName = self.classSet.getName(category)
            self.exampleStats.beginExample(categoryName)

            # FEATURES
            features = {}

            # ENTITY TYPE
            #entityType = self.classSet.getId(self.getMergedEntityType(entity))
            #del self.classSet.Ids[self.getMergedEntityType(entity)]
#IF LOCAL
            # There's a mistake here. The entityType should be the string, not
            # the id of the type. But there's also another issue. getMergedEntityType
            # expects a list, not an item. Therefore the type is always empty ->
            # types don't get used in classification. But this is the code used in
            # the publication, so it will now be published as is, and fixed in a later
            # release.
            #
            # Besides, using the classSet here generates an unneeded
            # additional class, that shows up in evaluations etc. However, to be 
            # able to publish the exact models used for the publication experiments,
            # this can't be fixed so it breaks feature id consistency. Therefore I'll
            # now just remove the redundant class id from the classSet.
#ENDIF            
            #features[self.featureSet.getId(entityType)] = 1
            
            features[self.featureSet.getId(namedEntityCountFeature)] = 1
            features[self.featureSet.getId(entityCountFeature)] = 1
            #for k,v in bagOfWords.iteritems():
            #    features[self.featureSet.getId(k)] = v
            # pre-calculate bow _features_
            features.update(bowFeatures)
            
#            for j in range(len(sentenceGraph.tokens)):
#                text = "bow_" + sentenceGraph.tokens[j].get("text")
#                if j < i:
#                    features[self.featureSet.getId("bf_" + text)] = 1
#                elif j > i:
#                    features[self.featureSet.getId("af_" + text)] = 1
        
            # Main features
            text = token.get("text")
            features[self.featureSet.getId("txt_"+text)] = 1
            features[self.featureSet.getId("POS_"+token.get("POS"))] = 1
            stem = PorterStemmer.stem(text)
            features[self.featureSet.getId("stem_"+stem)] = 1
            features[self.featureSet.getId("nonstem_"+text[len(stem):])] = 1
            
            if self.styles["speculation_words"]:
                if text in self.specWords:
                    features[self.featureSet.getId("ent_spec")] = 1
                if stem in self.specWordStems:
                    features[self.featureSet.getId("ent_spec_stem")] = 1
            
            # Linear order features
            for i in range(len(sentenceGraph.tokens)):
                if token == sentenceGraph.tokens[i]:
                    break
            for index in [-3,-2,-1,1,2,3]:
                if i + index > 0 and i + index < len(sentenceGraph.tokens):
                    self.buildLinearOrderFeatures(sentenceGraph, i + index, str(index), features)
            
            # Content
            if i > 0 and text[0].isalpha() and text[0].isupper():
                features[self.featureSet.getId("upper_case_start")] = 1
            for j in range(len(text)):
                if j > 0 and text[j].isalpha() and text[j].isupper():
                    features[self.featureSet.getId("upper_case_middle")] = 1
                # numbers and special characters
                if text[j].isdigit():
                    features[self.featureSet.getId("has_digits")] = 1
                    if j > 0 and text[j-1] == "-":
                        features[self.featureSet.getId("has_hyphenated_digit")] = 1
                elif text[j] == "-":
                    features[self.featureSet.getId("has_hyphen")] = 1
                elif text[j] == "/":
                    features[self.featureSet.getId("has_fslash")] = 1
                elif text[j] == "\\":
                    features[self.featureSet.getId("has_bslash")] = 1
                # duplets
                if j > 0:
                    features[self.featureSet.getId("dt_"+text[j-1:j+1].lower())] = 1
                # triplets
                if j > 1:
                    features[self.featureSet.getId("tt_"+text[j-2:j+1].lower())] = 1
            
            # Attached edges (Hanging in and out edges)
            t1InEdges = self.inEdgesByToken[token]
            for edge in t1InEdges:
                edgeType = edge[2].get("type")
                features[self.featureSet.getId("t1HIn_"+edgeType)] = 1
                features[self.featureSet.getId("t1HIn_"+edge[0].get("POS"))] = 1
                features[self.featureSet.getId("t1HIn_"+edgeType+"_"+edge[0].get("POS"))] = 1
                tokenText = sentenceGraph.getTokenText(edge[0])
                features[self.featureSet.getId("t1HIn_"+tokenText)] = 1
                features[self.featureSet.getId("t1HIn_"+edgeType+"_"+tokenText)] = 1
            t1OutEdges = self.outEdgesByToken[token]
            for edge in t1OutEdges:
                edgeType = edge[2].get("type")
                features[self.featureSet.getId("t1HOut_"+edgeType)] = 1
                features[self.featureSet.getId("t1HOut_"+edge[1].get("POS"))] = 1
                features[self.featureSet.getId("t1HOut_"+edgeType+"_"+edge[1].get("POS"))] = 1
                tokenText = sentenceGraph.getTokenText(edge[1])
                features[self.featureSet.getId("t1HOut_"+tokenText)] = 1
                features[self.featureSet.getId("t1HOut_"+edgeType+"_"+tokenText)] = 1

            self.buildChains(token, sentenceGraph, features)
             
            extra = {"xtype":"task3","t3type":task3Type,"t":token.get("id"),"entity":entity.get("id")}
            #examples.append( (sentenceGraph.getSentenceId()+".x"+str(exampleIndex),category,features,extra) )
            example = (sentenceGraph.getSentenceId()+".x"+str(exampleIndex),category,features,extra)
            ExampleUtils.appendExamples([example], outfile)
            exampleIndex += 1            
            self.exampleStats.endExample()
        #return examples
        return exampleIndex
Example #17
0
def train(output, task=None, detector=None, inputFiles=None, models=None, parse=None,
          processUnmerging=None, processModifiers=None, 
          bioNLPSTParams=None, preprocessorParams=None, exampleStyles=None, 
          classifierParams=None,  doFullGrid=False, deleteOutput=False, copyFrom=None, 
          log="log.txt", step=None, omitSteps=None, debug=False, connection=None, subset=None, 
          folds=None, corpusDir=None, corpusPreprocessing=None, evaluator=None):
    """
    Train a new model for event or relation detection.
    
    @param output: A directory where output files will appear.
    @param task: If defined, overridable default settings are used for many of the training parameters. Must be one of the supported TEES tasks.
    @param detector: a Detector object, or a string defining one to be imported
    @param inputFiles: A dictionary of file names, with keys "train", "devel" and, "test"
    @param models: A dictionary of file names defining the place for the new models, with keys "devel" and, "test"
    @param parse: The parse element name in the training interaction XML
    @param processUnmerging: Use the unmerging step of EventDetector. True, False or None for task default.
    @param processModifiers: Use the modifier detection step of EventDetector. True, False or None for task default.
    @param bioNLPSTParams: Parameters controlling BioNLP ST format output.
    @param preprocessorParams: Parameters controlling the preprocessor. Not used for training, but saved to the model for use when classifying.
    @param exampleStyles: A parameter set for controlling example builders.
    @param classifierParams: A parameter set for controlling classifiers.
    @param doFullGrid: Whether all parameters, as opposed to just recall adjustment, are tested in the EventDetector grid search.
    @param deleteOutput: Remove an existing output directory
    @param copyFrom: Copy an existing output directory for use as a template
    @param log: An optional alternative name for the log file. None is for no logging.
    @param step: A step=substep pair, where the steps are "TRAIN", "DEVEL", "EMPTY" and "TEST"
    @param omitSteps: step=substep parameters, where multiple substeps can be defined.
    @param debug: In debug mode, more output is shown, and some temporary intermediate files are saved
    @param connection: A parameter set defining a local or remote connection for training the classifier
    @param subset: A parameter set for making subsets of input files
    @param folds: A dictionary with keys "train", "devel" and "test" for splitting the input files into folds
    @param corpusDir: An optional alternative directory from which the task corpus files are read (passed to getTaskSettings)
    @param corpusPreprocessing: Preprocessor step names for preprocessing the corpus before training
    @param evaluator: An evaluator class (or importable name) overriding the detector's default evaluator
    """
    # Insert default arguments where needed
    inputFiles = setDictDefaults(inputFiles, {"train":None, "devel":None, "test":None})
    models = setDictDefaults(models, {"devel":"model-devel", "test":"model-test"})
    exampleStyles = setDictDefaults(exampleStyles, {"examples":None, "trigger":None, "edge":None, "unmerging":None, "modifiers":None})
    classifierParams = setDictDefaults(classifierParams, {"examples":None, "trigger":None, "recall":None, "edge":None, "unmerging":None, "modifiers":None})
    subset = setDictDefaults(Parameters.get(subset), {"train":None, "devel":None, "test":None, "seed":0, "all":None})
    folds = setDictDefaults(folds, {"train":None, "devel":None, "test":None})
    processUnmerging = getDefinedBool(processUnmerging)
    processModifiers = getDefinedBool(processModifiers)
    # Initialize working directory
    workdir(output, deleteOutput, copyFrom, log)
    # Get task specific parameters
    useKerasDetector = False
    if detector != None and "keras" in detector.lower():
        print >> sys.stderr, "Using a Keras Detector"
        useKerasDetector = True
        if detector.lower() == "keras":
            detector = None
    detector, bioNLPSTParams, preprocessorParams, folds = getTaskSettings(task, detector, 
        bioNLPSTParams, preprocessorParams, inputFiles, exampleStyles, classifierParams, folds, corpusDir=corpusDir, useKerasDetector=useKerasDetector)
    # Learn training settings from input files
    detector = learnSettings(inputFiles, detector, classifierParams, task, exampleStyles, useKerasDetector=useKerasDetector)   
    # Get corpus subsets   
    getFolds(inputFiles, folds)
    getSubsets(inputFiles, subset)
    if task != None: 
        task = task.replace("-FULL", "")
        # BUGFIX: the subtask check must only run when a task is defined; the
        # original tested '"." in task' unconditionally, raising a TypeError
        # whenever task == None.
        if "." in task:
            _, subTask = getSubTask(task)
            if subTask != 3:
                processModifiers = False
    # Preprocess the corpus if required
    if corpusPreprocessing != None:
        preprocessor = Preprocessor(steps=corpusPreprocessing)
        # The preprocessing pipeline must merge the sets first and divide them back at the end
        assert preprocessor.steps[0].name == "MERGE_SETS"
        assert preprocessor.steps[-1].name == "DIVIDE_SETS"
        preprocessedCorpusDir = os.path.join(output, "corpus")
        #outputFiles = {x:os.path.join(preprocessedCorpusDir, os.path.basename(inputFiles[x])) for x in inputFiles}
        # NOTE(review): this path assumes task != None whenever corpusPreprocessing is used — confirm with callers
        preprocessor.process(inputFiles, os.path.join(preprocessedCorpusDir, task))
        #inputFiles = outputFiles
        for setName in inputFiles.keys():
            if inputFiles[setName] != None:
                inputFiles[setName] = os.path.join(preprocessedCorpusDir, task + "-" + setName + ".xml")
    # Define processing steps
    selector, detectorSteps, omitDetectorSteps = getSteps(step, omitSteps, ["TRAIN", "DEVEL", "EMPTY", "TEST"])
    
    # Initialize the detector
    detector, detectorName = getDetector(detector, evaluator=evaluator)
    evaluator, evaluatorName = importClass(evaluator, "evaluator")
    detector = detector() # initialize object
    if evaluator != None:
        print >> sys.stderr, "Using evaluator", evaluator.__name__
        detector.evaluator = evaluator
    detector.debug = debug
    detector.bioNLPSTParams = detector.getBioNLPSharedTaskParams(bioNLPSTParams)
    #detector.useBioNLPSTFormat = useBioNLPSTFormat # classify-output and grid evaluation in ST-format
    #detector.stWriteScores = True # write confidence scores into additional st-format files
    connection = getConnection(connection)
    detector.setConnection(connection)
    connection.debug = debug
    if deleteOutput:
        connection.clearWorkDir()
    
    # Train
    if selector.check("TRAIN"):
        print >> sys.stderr, "----------------------------------------------------"
        print >> sys.stderr, "------------------ Train Detector ------------------"
        print >> sys.stderr, "----------------------------------------------------"
        if not isinstance(detector, EventDetector):
            detector.train(inputFiles["train"], inputFiles["devel"], models["devel"], models["test"],
                           exampleStyles["examples"], classifierParams["examples"], parse, None, task,
                           fromStep=detectorSteps["TRAIN"], workDir="training", testData=inputFiles["test"])
        else:
            detector.train(inputFiles["train"], inputFiles["devel"], models["devel"], models["test"],
                           exampleStyles["trigger"], exampleStyles["edge"], exampleStyles["unmerging"], exampleStyles["modifiers"],
                           classifierParams["trigger"], classifierParams["edge"], classifierParams["unmerging"], classifierParams["modifiers"],
                           classifierParams["recall"], processUnmerging, processModifiers, 
                           doFullGrid, task, parse, None,
                           fromStep=detectorSteps["TRAIN"], workDir="training", testData=inputFiles["test"])
        # Save the detector type
        for model in [models["devel"], models["test"]]:
            if model != None and os.path.exists(model):
                model = Model(model, "a")
                model.addStr("detector", detectorName)
                if evaluatorName != None:
                    # BUGFIX: save the evaluator under its own key; the original
                    # stored it under "detector", overwriting the detector class
                    # name saved on the previous line.
                    model.addStr("evaluator", evaluatorName)
                if preprocessorParams != None:
                    preprocessor = Preprocessor()
                    model.addStr("preprocessorParams", Parameters.toString(preprocessor.getParameters(preprocessorParams)))
                model.save()
                model.close()
    if selector.check("DEVEL"):
        print >> sys.stderr, "----------------------------------------------------"
        print >> sys.stderr, "------------ Check devel classification ------------"
        print >> sys.stderr, "----------------------------------------------------"
        #detector.bioNLPSTParams["scores"] = False # the evaluation server doesn't like additional files
        detector.classify(inputFiles["devel"], models["devel"], "classification-devel/devel", goldData=inputFiles["devel"], fromStep=detectorSteps["DEVEL"], workDir="classification-devel")
    if selector.check("EMPTY"):
        # By passing an emptied devel set through the prediction system, we can check that we get the same predictions
        # as in the DEVEL step, ensuring the model does not use leaked information.
        print >> sys.stderr, "----------------------------------------------------"
        print >> sys.stderr, "------------ Empty devel classification ------------"
        print >> sys.stderr, "----------------------------------------------------"
        #detector.bioNLPSTParams["scores"] = False # the evaluation server doesn't like additional files
        removalScope = "non-given"
        if "names" in str(exampleStyles["examples"]) or "names" in str(exampleStyles["trigger"]):
            removalScope = "all"
        elif "Edge" in detector.__class__.__name__:
            removalScope = "interactions"
        detector.classify(getEmptyCorpus(inputFiles["devel"], scope=removalScope), models["devel"], "classification-empty/devel-empty", fromStep=detectorSteps["EMPTY"], workDir="classification-empty")
        print >> sys.stderr, "*** Evaluate empty devel classification ***"
        if os.path.exists("classification-empty/devel-empty-pred.xml.gz"):
            EvaluateInteractionXML.run(detector.evaluator, "classification-empty/devel-empty-pred.xml.gz", inputFiles["devel"], parse)
        else:
            print >> sys.stderr, "No output file for evaluation"
    if selector.check("TEST"):
        print >> sys.stderr, "----------------------------------------------------"
        print >> sys.stderr, "------------- Test set classification --------------"
        print >> sys.stderr, "----------------------------------------------------"
        if inputFiles["test"] == None or not os.path.exists(inputFiles["test"]):
            print >> sys.stderr, "Skipping, test file", inputFiles["test"], "does not exist"
        else:
            #detector.bioNLPSTParams["scores"] = False # the evaluation server doesn't like additional files
            detector.classify(inputFiles["test"], models["test"] if models["test"] != None else models["devel"], "classification-test/test", fromStep=detectorSteps["TEST"], workDir="classification-test")
            if detector.bioNLPSTParams["convert"]:
                extension = ".zip" if (detector.bioNLPSTParams["convert"] == "zip") else ".tar.gz" 
                Utils.STFormat.Compare.compare("classification-test/test-events" + extension, "classification-devel/devel-events" + extension, "a2")
    # Stop logging
    if log != None:
        Stream.closeLog(log)
Example #18
0
    def classify(self, data, model, output, parse=None, task=None, goldData=None, fromStep=None, toStep=None, omitSteps=None, workDir=None):
        """
        Classify data with a trained event detection model.
        
        Runs the pipeline steps TRIGGERS, EDGES, UNMERGING, MODIFIERS and ST-CONVERT
        (restricted by fromStep/toStep/omitSteps). Each step reads the previous step's
        prediction file from the work directory and writes its own, and the last
        available prediction is finally copied to output+"-pred.xml.gz".
        
        @param data: The corpus (interaction XML) to classify
        @param model: The model (path or Model object) to classify with
        @param output: Output file stem; files such as output+"-pred.xml.gz" are created
        @param parse: Parse element name; if None, read from the model per sub-detector
        @param task: BioNLP ST task identifier; if None, read from the model when needed
        @param goldData: Optional gold corpus used for evaluating intermediate predictions
        @param fromStep: First pipeline step to run
        @param toStep: Last pipeline step to run
        @param omitSteps: Pipeline steps to skip
        @param workDir: Work directory; a temporary one is created (and deleted) if None
        """
        #BINARY_RECALL_MODE = False # TODO: make a parameter
        xml = None
        model = self.openModel(model, "r")
        self.initVariables(classifyData=data, model=model, xml=None, task=task, parse=parse)
        self.enterState(self.STATE_CLASSIFY, ["TRIGGERS", "EDGES", "UNMERGING", "MODIFIERS", "ST-CONVERT"], fromStep, toStep, omitSteps)
        #self.enterState(self.STATE_CLASSIFY, ["TRIGGERS", "RECALL-ADJUST", "EDGES", "UNMERGING", "MODIFIERS", "ST-CONVERT"], fromStep, toStep)
        self.setWorkDir(workDir)
        if workDir == None:
            self.setTempWorkDir()
        # All intermediate files are named workOutputTag + <step tag> + "pred.xml.gz"
        workOutputTag = os.path.join(self.workDir, os.path.basename(output) + "-")
        self.model = self.openModel(self.model, "r")
        stParams = self.getBioNLPSharedTaskParams(self.bioNLPSTParams, model)
        # Step 1: trigger detection, with recall adjustment read from the model
        if self.checkStep("TRIGGERS"):
            xml = self.triggerDetector.classifyToXML(self.classifyData, self.model, None, workOutputTag, goldData=goldData, parse=self.parse, recallAdjust=float(self.getStr("recallAdjustParameter", self.model)))
        # Step 2: edge detection on top of the predicted triggers
        if self.checkStep("EDGES"):
            xml = self.getWorkFile(xml, workOutputTag + "trigger-pred.xml.gz")
            xml = self.edgeDetector.classifyToXML(xml, self.model, None, workOutputTag, goldData=goldData, parse=self.parse)
            assert xml != None
            if self.parse == None:
                edgeParse = self.getStr(self.edgeDetector.tag+"parse", self.model)
            else:
                edgeParse = self.parse
            #EvaluateInteractionXML.run(self.edgeDetector.evaluator, xml, self.classifyData, edgeParse)
            # Evaluate against the gold data when available, otherwise against the input corpus
            if goldData != None:
                EvaluateInteractionXML.run(self.edgeDetector.evaluator, xml, goldData, edgeParse)
            else:
                EvaluateInteractionXML.run(self.edgeDetector.evaluator, xml, self.classifyData, edgeParse)
        # Step 3: unmerging (splitting shared triggers into separate events), if the model has one
        if self.checkStep("UNMERGING"):
            if self.model.getStr("unmerging-classifier-parameter", None) != None: #self.model.hasMember("unmerging-classifier-model"):
                #xml = self.getWorkFile(xml, output + "-edge-pred.xml.gz")
                # To avoid running out of memory, always use file on disk
                xml = self.getWorkFile(None, workOutputTag + "edge-pred.xml.gz")
                #goldData = None
                #if type(self.classifyData) in types.StringTypes:
                #    if os.path.exists(self.classifyData.replace("-nodup", "")):
                #        goldData = self.classifyData.replace("-nodup", "")
                xml = self.unmergingDetector.classifyToXML(xml, self.model, None, workOutputTag, goldData=goldData, parse=self.parse)
                # Evaluate after unmerging
                if self.parse == None:
                    edgeParse = self.getStr(self.edgeDetector.tag+"parse", self.model)
                else:
                    edgeParse = self.parse
                if goldData != None:
                    EvaluateInteractionXML.run(self.edgeDetector.evaluator, xml, goldData, edgeParse)
                else:
                    EvaluateInteractionXML.run(self.edgeDetector.evaluator, xml, self.classifyData, edgeParse)
            else:
                print >> sys.stderr, "No model for unmerging"
        # Step 4: speculation/negation modifier detection, if the model has one
        if self.checkStep("MODIFIERS"):
            if self.model.hasMember("modifier-classifier-model"):
                xml = self.getWorkFile(xml, [workOutputTag + "unmerging-pred.xml.gz", workOutputTag + "edge-pred.xml.gz"])
                xml = self.modifierDetector.classifyToXML(xml, self.model, None, workOutputTag, goldData=goldData, parse=self.parse)
            else:
                print >> sys.stderr, "No model for modifier detection"
#        if self.checkStep("VALIDATE"):
#            xml = self.getWorkFile(xml, [workOutputTag + "modifier-pred.xml.gz", workOutputTag + "unmerging-pred.xml.gz", workOutputTag + "edge-pred.xml.gz"])
#            self.structureAnalyzer.load(model)
#            self.structureAnalyzer.validate(xml)
#            ETUtils.write(xml, workOutputTag + "validate-pred.xml.gz")
        # Step 5: optional conversion to BioNLP shared task format (and its evaluation)
        if self.checkStep("ST-CONVERT"):
            if stParams["convert"]:
                #xml = self.getWorkFile(xml, [workOutputTag + "validate-pred.xml.gz", workOutputTag + "modifier-pred.xml.gz", workOutputTag + "unmerging-pred.xml.gz", workOutputTag + "edge-pred.xml.gz"])
                xml = self.getWorkFile(xml, [workOutputTag + "modifier-pred.xml.gz", workOutputTag + "unmerging-pred.xml.gz", workOutputTag + "edge-pred.xml.gz"])
                Utils.STFormat.ConvertXML.toSTFormat(xml, output+"-events.tar.gz", outputTag=stParams["a2Tag"], writeExtra=(stParams["scores"] == True))
                if stParams["evaluate"]: #self.stEvaluator != None:
                    task = self.task
                    if task == None:
                        task = self.getStr(self.edgeDetector.tag+"task", self.model)
                    self.stEvaluator.evaluate(output + "-events.tar.gz", task)
            else:
                print >> sys.stderr, "No BioNLP shared task format conversion"
        # Copy the most processed prediction file available to the final output location
        finalXMLFile = self.getWorkFile(None, [workOutputTag + "modifier-pred.xml.gz", workOutputTag + "unmerging-pred.xml.gz", workOutputTag + "edge-pred.xml.gz"])
        if finalXMLFile != None:
            shutil.copy2(finalXMLFile, output+"-pred.xml.gz")
        self.deleteTempWorkDir()
        self.exitState()
Example #19
0
# Trigger-detection test-set experiment script.
# NOTE(review): TEST_FILE, GOLD_TEST_FILE, GeneralEntityTypeRecognizer, Cls, Ev,
# ExampleUtils, ix, EvaluateInteractionXML, workdir and log are defined/imported
# elsewhere in this file.
CLASSIFIER_PARAMS="c:25000,50000,87500"
WORKDIR="/usr/share/biotext/GeniaChallenge/SharedTaskTriggerTest"
PARSE_TOK="split-Charniak-Lease"

# Move to the working directory (without clearing it) and start logging
workdir(WORKDIR, False)
log()


# Trigger detection

#Gazetteer.run(TRAIN_FILE, "gazetteer-train")
#GeneralEntityTypeRecognizer.run(TRAIN_FILE, "trigger-train-examples", PARSE_TOK, PARSE_TOK, "style:typed", "trigger-ids")

# Build trigger examples for the test set, classify them with a previously
# optimized model (c=75000) and evaluate the example-level classifications
GeneralEntityTypeRecognizer.run(TEST_FILE, "trigger-test-examples", PARSE_TOK, PARSE_TOK, "style:typed", "trigger-ids")
Cls.test("trigger-test-examples", "trigger-param-opt/model-c_75000", "trigger-test-classifications")
evaluator = Ev.evaluate("trigger-test-examples", "trigger-test-classifications", "trigger-ids.class_names")

#evaluator = optimize(Cls, Ev, "trigger-train-examples", "trigger-test-examples",\
#    "trigger-ids.class_names", CLASSIFIER_PARAMS, "trigger-param-opt")[0]

# Write the predicted triggers back into interaction XML
ExampleUtils.writeToInteractionXML(evaluator.classifications, TEST_FILE, "test-predicted-triggers.xml", "trigger-ids.class_names", PARSE_TOK, PARSE_TOK)

# RecallAdjust.run("test-predicted-triggers.xml",1.0,"test-predicted-triggers-adj.xml")
# ix.splitMergedElements("test-predicted-triggers-adj.xml", "test-predicted-triggers-adj-split.xml")
# ix.recalculateIds("test-predicted-triggers-adj-split.xml", "test-predicted-triggers-adj-split-recids.xml", True)
# EvaluateInteractionXML.run(Ev, "test-predicted-triggers-adj-split-recids.xml", GOLD_TEST_FILE, PARSE_TOK, PARSE_TOK)

# Normalize the XML (split merged elements, recalculate ids) and evaluate
# against the gold test file at the interaction-XML level
ix.splitMergedElements("test-predicted-triggers.xml", "test-predicted-triggers-split.xml")
ix.recalculateIds("test-predicted-triggers-split.xml", "test-predicted-triggers-split-recids.xml", True)
EvaluateInteractionXML.run(Ev, "test-predicted-triggers-split-recids.xml", GOLD_TEST_FILE, PARSE_TOK, PARSE_TOK)
Example #20
0
    def buildExamplesFromGraph(self, sentenceGraph, outfile, goldGraph = None, structureAnalyzer=None):
        """
        Build examples for a single sentence. Returns a list of examples.
        See Core/ExampleUtils for example format.
        """
        #examples = []
        exampleIndex = 0
        # example directionality
        if self.styles["directed"] == None and self.styles["undirected"] == None: # determine directedness from corpus
            examplesAreDirected = structureAnalyzer.hasDirectedTargets() if structureAnalyzer != None else True
        elif self.styles["directed"]:
            assert self.styles["undirected"] in [None, False]
            examplesAreDirected = True
        elif self.styles["undirected"]:
            assert self.styles["directed"] in [None, False]
            examplesAreDirected = False
        
        if not self.styles["no_trigger_features"]: 
            self.triggerFeatureBuilder.initSentence(sentenceGraph)
        if self.styles["evex"]: 
            self.evexFeatureBuilder.initSentence(sentenceGraph)
#         if self.styles["sdb_merge"]:
#             self.determineNonOverlappingTypes(structureAnalyzer)
            
        # Filter entities, if needed
        sentenceGraph.mergeInteractionGraph(True)
        entities = sentenceGraph.mergedEntities
        entityToDuplicates = sentenceGraph.mergedEntityToDuplicates
        self.exampleStats.addValue("Duplicate entities skipped", len(sentenceGraph.entities) - len(entities))
        
        # Connect to optional gold graph
        entityToGold = None
        if goldGraph != None:
            entityToGold = EvaluateInteractionXML.mapEntities(entities, goldGraph.entities)
        
        paths = None
        if not self.styles["no_path"]:
            undirected = sentenceGraph.dependencyGraph.toUndirected()
            paths = undirected
            if self.styles["filter_shortest_path"] != None: # For DDI use filter_shortest_path=conj_and
                paths.resetAnalyses() # just in case
                paths.FloydWarshall(self.filterEdge, {"edgeTypes":self.styles["filter_shortest_path"]})
        
        # Generate examples based on interactions between entities or interactions between tokens
        if self.styles["token_nodes"]:
            loopRange = len(sentenceGraph.tokens)
        else:
            loopRange = len(entities)
        for i in range(loopRange-1):
            for j in range(i+1,loopRange):
                eI = None
                eJ = None
                if self.styles["token_nodes"]:
                    tI = sentenceGraph.tokens[i]
                    tJ = sentenceGraph.tokens[j]
                else:
                    eI = entities[i]
                    eJ = entities[j]
                    tI = sentenceGraph.entityHeadTokenByEntity[eI]
                    tJ = sentenceGraph.entityHeadTokenByEntity[eJ]
                    if eI.get("type") == "neg" or eJ.get("type") == "neg":
                        continue
                    if self.styles["skip_extra_triggers"]:
                        if eI.get("source") != None or eJ.get("source") != None:
                            continue
                # only consider paths between entities (NOTE! entities, not only named entities)
                if self.styles["headsOnly"]:
                    if (len(sentenceGraph.tokenIsEntityHead[tI]) == 0) or (len(sentenceGraph.tokenIsEntityHead[tJ]) == 0):
                        continue
                
                examples = self.buildExamplesForPair(tI, tJ, paths, sentenceGraph, goldGraph, entityToGold, eI, eJ, structureAnalyzer, examplesAreDirected)
                for categoryName, features, extra in examples:
                    # make example
                    if self.styles["binary"]:
                        if categoryName != "neg":
                            category = 1
                        else:
                            category = -1
                        extra["categoryName"] = "i"
                    else:
                        category = self.classSet.getId(categoryName)
                    example = [sentenceGraph.getSentenceId()+".x"+str(exampleIndex), category, features, extra]
                    ExampleUtils.appendExamples([example], outfile)
                    exampleIndex += 1

        return exampleIndex
Example #21
0
 def makeExampleGraphWithGold(self, builder, sentenceGraph, goldGraph, sentenceIndex):
     """
     Render a predicted sentence graph against its gold standard as an SVG
     and collect classification statistics.
     
     Entities and interactions are colored by outcome: green for a correct
     prediction (TP), red for an incorrect one (FP), "#79BAEC" (blue) for a
     missed gold item (FN), and brown for given/named entities.
     
     @param builder: An HTML builder the SVG reference and header are written into
     @param sentenceGraph: The predicted sentence graph
     @param goldGraph: The gold standard sentence graph
     @param sentenceIndex: Index of the sentence, used in the SVG file name
     @return: A dict of counts with keys "entities", "edges", "tp", "fp", "tn", "fn"
     """
     # The example graph is drawn over the gold tokens
     exampleGraph = NX10.MultiDiGraph()
     for token in goldGraph.tokens:
         exampleGraph.add_node(token)
     arcStyles = {}
     labelStyles = {}
     # extraByToken maps a gold head token to its [label, styleDict] pair
     extraByToken = {}
     edgeTypes = {}
     stats = {"entities":0,"edges":0,"tp":0,"fp":0,"tn":0,"fn":0}
     
     # Map predicted entities/tokens onto their gold counterparts
     entityMap = EvaluateInteractionXML.mapEntities(sentenceGraph.entities, goldGraph.entities, goldGraph.tokens)
     tokenMap = self.getTokenMap(sentenceGraph, goldGraph)
     toEntitiesWithPredictions = set()
     # Color each predicted entity by comparing its type to the mapped gold entities
     for entityFrom, entitiesTo in entityMap.iteritems():
         stats["entities"] += 1
         entityFromHeadToken = sentenceGraph.entityHeadTokenByEntity[entityFrom]
         for entityTo in entitiesTo:
             toEntitiesWithPredictions.add(entityTo)
             entityToHeadToken = goldGraph.entityHeadTokenByEntity[entityTo]
             style = None
             eFromType = entityFrom.get("type")
             eToType = entityTo.get("type")
             if extraByToken.has_key(entityToHeadToken):
                 style = extraByToken[entityToHeadToken]
             if eFromType == eToType:
                 if eToType != "neg":
                     # Correct prediction: green, unless a given (named) entity, which is brown
                     if style == None:
                         style = [entityTo.get("type"),{"fill":"green"}]
                     elif style[1]["fill"] == "#79BAEC":
                         # Upgrade an earlier false-negative (blue) marking to a true positive
                         style = [entityTo.get("type"),{"fill":"green"}]
                     if entityTo.get("isName") == "True":
                         style = [entityTo.get("type"),{"fill":"brown"}]
                     else:
                         stats["tp"] += 1
             else:
                 if eToType == "neg":
                     pass
             extraByToken[entityToHeadToken] = style
         # A predicted entity with no gold counterpart is a false positive (red)
         if len(entitiesTo) == 0:
             stats["fp"] += 1
             if extraByToken.has_key(tokenMap[entityFromHeadToken]):
                 style = extraByToken[tokenMap[entityFromHeadToken]]
                 # Do not overwrite an already-confirmed true positive (green)
                 if style[1]["fill"] != "green":
                     style = [entityFrom.get("type"),{"fill":"red"}]
                 extraByToken[tokenMap[entityFromHeadToken]] = style
             else:
                 extraByToken[tokenMap[entityFromHeadToken]] = [entityFrom.get("type"),{"fill":"red"}]
     # Gold entities nothing was mapped to are false negatives (blue)
     for entity in goldGraph.entities:
         if entity not in toEntitiesWithPredictions:
             stats["fn"] += 1
             extraByToken[goldGraph.entityHeadTokenByEntity[entity]] = [entity.get("type"),{"fill":"#79BAEC"}]
     
     # Compare predicted interactions against gold interactions
     toInteractionsWithPredictions = set()            
     for interactionFrom in sentenceGraph.interactions:
         if interactionFrom.get("type") == "neg":
             continue
         stats["edges"] += 1
         
         # Gold entity ids the predicted end points map to
         e1s = entityMap[sentenceGraph.entitiesById[interactionFrom.get("e1")]]
         e1Ids = []
         for e1 in e1s:
             e1Ids.append(e1.get("id"))
         e2s = entityMap[sentenceGraph.entitiesById[interactionFrom.get("e2")]]
         e2Ids = []
         for e2 in e2s:
             e2Ids.append(e2.get("id"))
             
         # Head tokens of the predicted end points, mapped into the gold token space
         t1 = tokenMap[sentenceGraph.entityHeadTokenByEntity[sentenceGraph.entitiesById[interactionFrom.get("e1")]]]
         t2 = tokenMap[sentenceGraph.entityHeadTokenByEntity[sentenceGraph.entitiesById[interactionFrom.get("e2")]]]
         iFromType = interactionFrom.get("type")
         
         found = False
         for interactionTo in goldGraph.interactions:
             if interactionTo.get("e1") in e1Ids and interactionTo.get("e2") in e2Ids:
                 toInteractionsWithPredictions.add(interactionTo)
                 
                 iToType = interactionTo.get("type")
                 exampleGraph.add_edge(t1, t2, element=interactionFrom)
                 #edge = exampleGraph.get_edge(t1, t2, data=True)
                 edge = self.getNXEdge(exampleGraph, t1, t2, interactionFrom)
                 
                 if t1 != t2:
                     # Matching end points: green if the type also matches (TP), otherwise red (FP)
                     if iToType == iFromType:
                         edge[2]["arcStyles"] = {"stroke":"green"}
                         edge[2]["labelStyles"] = {"fill":"green"}
                         stats["tp"] += 1
                     else:
                         edge[2]["arcStyles"] = {"stroke":"red"}
                         edge[2]["labelStyles"] = {"fill":"red"}
                         stats["fp"] += 1
                 found = True
         if not found: # false positive prediction
             if t1 != t2:
                 exampleGraph.add_edge(t1, t2, element=interactionFrom)
                 edge = self.getNXEdge(exampleGraph, t1, t2, interactionFrom)
                 edge[2]["arcStyles"] = {"stroke":"red"}
                 edge[2]["labelStyles"] = {"fill":"red"}
                 stats["fp"] += 1
     # Gold interactions nothing was mapped to are false negatives (blue)
     for interactionTo in goldGraph.interactions:
         if interactionTo not in toInteractionsWithPredictions: # false negative gold
             t1 = goldGraph.entityHeadTokenByEntity[goldGraph.entitiesById[interactionTo.get("e1")]]
             t2 = goldGraph.entityHeadTokenByEntity[goldGraph.entitiesById[interactionTo.get("e2")]]                
             if t1 != t2:
                 exampleGraph.add_edge(t1, t2, element=interactionTo)
                 edge = self.getNXEdge(exampleGraph, t1, t2, interactionTo)
                 edge[2]["arcStyles"] = {"stroke":"#79BAEC"}
                 edge[2]["labelStyles"] = {"fill":"#79BAEC"}
                 stats["fn"] += 1
     
     # Render the colored tokens and edges to an SVG file and embed it in the page
     builder.header("Classification",4)
     svgTokens = GraphToSVG.tokensToSVG(goldGraph.tokens,False,None,extraByToken)
     #arcStyles, labelStyles = self.getMatchingEdgeStyles(exampleGraph, sentenceGraph.interactionGraph, "green", "red" )
     svgEdges = GraphToSVG.edgesToSVG(svgTokens, exampleGraph, "type", None)
     sentenceId = sentenceGraph.getSentenceId()
     svgElement = GraphToSVG.writeSVG(svgTokens, svgEdges, self.outDir+"/svg/"+sentenceId+"-"+str(sentenceIndex)+"_learned.svg")
     builder.svg("../svg/" + sentenceId + "-"+str(sentenceIndex)+"_learned.svg",svgElement.attrib["width"],svgElement.attrib["height"],id="learned_graph")
     builder.lineBreak()
     return stats
    def buildExamplesFromGraph(self, sentenceGraph, outfile, goldGraph=None):
        """
        Build examples for a single sentence. Returns a list of examples.
        See Core/ExampleUtils for example format.
        """
        #examples = []
        exampleIndex = 0

        if self.styles["trigger_features"]:
            self.triggerFeatureBuilder.initSentence(sentenceGraph)
        if self.styles["evex"]:
            self.evexFeatureBuilder.initSentence(sentenceGraph)

        # Filter entities, if needed
        #mergedIds = None
        #duplicateEntities = None
        #entities = sentenceGraph.entities
        #entities, mergedIds, duplicateEntities = self.mergeEntities(sentenceGraph, False) # "no_duplicates" in self.styles)
        sentenceGraph.mergeInteractionGraph(True)
        entities = sentenceGraph.mergedEntities
        entityToDuplicates = sentenceGraph.mergedEntityToDuplicates
        self.exampleStats.addValue("Duplicate entities skipped",
                                   len(sentenceGraph.entities) - len(entities))

        # Connect to optional gold graph
        if goldGraph != None:
            entityToGold = EvaluateInteractionXML.mapEntities(
                entities, goldGraph.entities)

        paths = None
        if not self.styles["no_path"]:
            ##undirected = sentenceGraph.getUndirectedDependencyGraph()
            #undirected = self.nxMultiDiGraphToUndirected(sentenceGraph.dependencyGraph)
            ###undirected = sentenceGraph.dependencyGraph.to_undirected()
            ####undirected = NX10.MultiGraph(sentenceGraph.dependencyGraph) This didn't work
            undirected = sentenceGraph.dependencyGraph.toUndirected()
            #paths = NX10.all_pairs_shortest_path(undirected, cutoff=999)
            paths = undirected

        #for edge in sentenceGraph.dependencyGraph.edges:
        #    assert edge[2] != None
        #for edge in undirected.edges:
        #    assert edge[2] != None
        #if sentenceGraph.sentenceElement.get("id") == "GENIA.d70.s5":
        #    print [(x[0].get("id"), x[1].get("id"), x[2].get("id")) for x in sentenceGraph.dependencyGraph.edges]

        # Generate examples based on interactions between entities or interactions between tokens
        if self.styles["entities"]:
            loopRange = len(entities)
        else:
            loopRange = len(sentenceGraph.tokens)
        for i in range(loopRange - 1):
            for j in range(i + 1, loopRange):
                eI = None
                eJ = None
                if self.styles["entities"]:
                    eI = entities[i]
                    eJ = entities[j]
                    tI = sentenceGraph.entityHeadTokenByEntity[eI]
                    tJ = sentenceGraph.entityHeadTokenByEntity[eJ]
                    #if "no_ne_interactions" in self.styles and eI.get("isName") == "True" and eJ.get("isName") == "True":
                    #    continue
                    if eI.get("type") == "neg" or eJ.get("type") == "neg":
                        continue
                    if self.styles["skip_extra_triggers"]:
                        if eI.get("source") != None or eJ.get(
                                "source") != None:
                            continue
                else:
                    tI = sentenceGraph.tokens[i]
                    tJ = sentenceGraph.tokens[j]
                # only consider paths between entities (NOTE! entities, not only named entities)
                if self.styles["headsOnly"]:
                    if (len(sentenceGraph.tokenIsEntityHead[tI]) == 0) or (len(
                            sentenceGraph.tokenIsEntityHead[tJ]) == 0):
                        continue

                if self.styles["directed"]:
                    # define forward
                    if self.styles["entities"]:
                        categoryName = self.getCategoryName(
                            sentenceGraph, eI, eJ, True)
                        if goldGraph != None:
                            categoryName = self.getGoldCategoryName(
                                goldGraph, entityToGold, eI, eJ, True)
                    else:
                        categoryName = self.getCategoryNameFromTokens(
                            sentenceGraph, tI, tJ, True)
                    # make forward
                    self.exampleStats.beginExample(categoryName)
                    makeExample = True
                    if self.styles[
                            "genia_limits"] and not self.isPotentialGeniaInteraction(
                                eI, eJ):
                        makeExample = False
                        self.exampleStats.filter("genia_limits")
                    if self.styles["genia_task1"] and (
                            eI.get("type") == "Entity"
                            or eJ.get("type") == "Entity"):
                        makeExample = False
                        self.exampleStats.filter("genia_task1")
                    if self.styles[
                            "rel_limits"] and not self.isPotentialRELInteraction(
                                eI, eJ):
                        makeExample = False
                        self.exampleStats.filter("rel_limits")
                    if self.styles[
                            "co_limits"] and not self.isPotentialCOInteraction(
                                eI, eJ, sentenceGraph):
                        makeExample = False
                        self.exampleStats.filter("co_limits")
                    if self.styles[
                            "bb_limits"] and not self.isPotentialBBInteraction(
                                eI, eJ, sentenceGraph):
                        makeExample = False
                        self.exampleStats.filter("bb_limits")
                        if categoryName != "neg":
                            self.exampleStats.filter("bb_limits(" +
                                                     categoryName + ":" +
                                                     eI.get("type") + "/" +
                                                     eJ.get("type") + ")")
                    if self.styles[
                            "bi_limits"] and not self.isPotentialBIInteraction(
                                eI, eJ, sentenceGraph, self.exampleStats):
                        makeExample = False
                        #self.exampleStats.filter("bi_limits")
                    if self.styles[
                            "epi_limits"] and not self.isPotentialEPIInteraction(
                                eI, eJ, sentenceGraph):
                        makeExample = False
                        self.exampleStats.filter("epi_limits")
                    if self.styles[
                            "id_limits"] and not self.isPotentialIDInteraction(
                                eI, eJ, sentenceGraph):
                        makeExample = False
                        self.exampleStats.filter("id_limits")
#                    if self.styles["selftrain_limits"] and (eI.get("selftrain") == "False" or eJ.get("selftrain") == "False"):
#                        makeExample = False
#                        self.exampleStats.filter("selftrain_limits")
#                    if self.styles["selftrain_group"] and (eI.get("selftraingroup") not in self.selfTrainGroups or eJ.get("selftraingroup") not in self.selfTrainGroups):
#                        makeExample = False
#                        self.exampleStats.filter("selftrain_group")
                    if self.styles["pos_only"] and categoryName == "neg":
                        makeExample = False
                        self.exampleStats.filter("pos_only")
                    if makeExample:
                        #examples.append( self.buildExample(tI, tJ, paths, sentenceGraph, categoryName, exampleIndex, eI, eJ) )
                        ExampleUtils.appendExamples([
                            self.buildExample(tI, tJ, paths, sentenceGraph,
                                              categoryName, exampleIndex, eI,
                                              eJ)
                        ], outfile)
                        exampleIndex += 1
                    self.exampleStats.endExample()

                    # define reverse
                    if self.styles["entities"]:
                        categoryName = self.getCategoryName(
                            sentenceGraph, eJ, eI, True)
                        if goldGraph != None:
                            categoryName = self.getGoldCategoryName(
                                goldGraph, entityToGold, eJ, eI, True)
                    else:
                        categoryName = self.getCategoryNameFromTokens(
                            sentenceGraph, tJ, tI, True)
                    # make reverse
                    self.exampleStats.beginExample(categoryName)
                    makeExample = True
                    if self.styles[
                            "genia_limits"] and not self.isPotentialGeniaInteraction(
                                eJ, eI):
                        makeExample = False
                        self.exampleStats.filter("genia_limits")
                    if self.styles["genia_task1"] and (
                            eI.get("type") == "Entity"
                            or eJ.get("type") == "Entity"):
                        makeExample = False
                        self.exampleStats.filter("genia_task1")
                    if self.styles[
                            "rel_limits"] and not self.isPotentialRELInteraction(
                                eJ, eI):
                        makeExample = False
                        self.exampleStats.filter("rel_limits")
                    if self.styles[
                            "co_limits"] and not self.isPotentialCOInteraction(
                                eJ, eI, sentenceGraph):
                        makeExample = False
                        self.exampleStats.filter("co_limits")
                    if self.styles[
                            "bb_limits"] and not self.isPotentialBBInteraction(
                                eJ, eI, sentenceGraph):
                        makeExample = False
                        self.exampleStats.filter("bb_limits")
                        if categoryName != "neg":
                            self.exampleStats.filter("bb_limits(" +
                                                     categoryName + ":" +
                                                     eJ.get("type") + "/" +
                                                     eI.get("type") + ")")
                    if self.styles[
                            "bi_limits"] and not self.isPotentialBIInteraction(
                                eJ, eI, sentenceGraph, self.exampleStats):
                        makeExample = False
                        #self.exampleStats.filter("bi_limits")
                    if self.styles[
                            "epi_limits"] and not self.isPotentialEPIInteraction(
                                eJ, eI, sentenceGraph):
                        makeExample = False
                        self.exampleStats.filter("epi_limits")
                    if self.styles[
                            "id_limits"] and not self.isPotentialIDInteraction(
                                eJ, eI, sentenceGraph):
                        makeExample = False
                        self.exampleStats.filter("id_limits")
#                    if self.styles["selftrain_limits"] and (eI.get("selftrain") == "False" or eJ.get("selftrain") == "False"):
#                        makeExample = False
#                        self.exampleStats.filter("selftrain_limits")
#                    if self.styles["selftrain_group"] and (eI.get("selftraingroup") not in self.selfTrainGroups or eJ.get("selftraingroup") not in self.selfTrainGroups):
#                        makeExample = False
#                        self.exampleStats.filter("selftrain_group")
                    if self.styles["pos_only"] and categoryName == "neg":
                        makeExample = False
                        self.exampleStats.filter("pos_only")
                    if makeExample:
                        #examples.append( self.buildExample(tJ, tI, paths, sentenceGraph, categoryName, exampleIndex, eJ, eI) )
                        ExampleUtils.appendExamples([
                            self.buildExample(tJ, tI, paths, sentenceGraph,
                                              categoryName, exampleIndex, eJ,
                                              eI)
                        ], outfile)
                        exampleIndex += 1
                    self.exampleStats.endExample()
                else:
                    if self.styles["entities"]:
                        categoryName = self.getCategoryName(
                            sentenceGraph, eI, eJ, False)
                    else:
                        categoryName = self.getCategoryNameFromTokens(
                            sentenceGraph, tI, tJ, False)
                    self.exampleStats.beginExample(categoryName)
                    forwardExample = self.buildExample(tI, tJ, paths,
                                                       sentenceGraph,
                                                       categoryName,
                                                       exampleIndex, eI, eJ)
                    if not self.styles["graph_kernel"]:
                        reverseExample = self.buildExample(
                            tJ, tI, paths, sentenceGraph, categoryName,
                            exampleIndex, eJ, eI)
                        forwardExample[2].update(reverseExample[2])
                    #examples.append(forwardExample)
                    ExampleUtils.appendExamples([forwardExample], outfile)
                    exampleIndex += 1
                    self.exampleStats.endExample()

        #return examples
        return exampleIndex
Example #23
0
    def classify(self,
                 data,
                 model,
                 output,
                 parse=None,
                 task=None,
                 goldData=None,
                 fromStep=None,
                 toStep=None,
                 omitSteps=None,
                 workDir=None):
        """Run the event-extraction classification pipeline on a corpus.

        The pipeline is the step sequence TRIGGERS -> EDGES -> UNMERGING ->
        MODIFIERS -> ST-CONVERT. Each step reads the previous step's
        predicted Interaction XML work file and writes its own; the final
        prediction file is copied to output + "-pred.xml.gz".

        data -- input corpus (path or Interaction XML) to classify
        model -- model path or Model object; opened read-only
        output -- file name stem for prediction and event output files
        parse -- parse name; if None, the value stored in the model is used
        task -- BioNLP Shared Task id; if None, read from the model
        goldData -- optional gold corpus used for evaluation after each step
        fromStep, toStep, omitSteps -- step range control passed to enterState
        workDir -- directory for intermediate files; if None, a temporary
                   directory is created and deleted at the end
        """
        #BINARY_RECALL_MODE = False # TODO: make a parameter
        xml = None
        model = self.openModel(model, "r")
        self.initVariables(classifyData=data,
                           model=model,
                           xml=None,
                           task=task,
                           parse=parse)
        self.enterState(
            self.STATE_CLASSIFY,
            ["TRIGGERS", "EDGES", "UNMERGING", "MODIFIERS", "ST-CONVERT"],
            fromStep, toStep, omitSteps)
        #self.enterState(self.STATE_CLASSIFY, ["TRIGGERS", "RECALL-ADJUST", "EDGES", "UNMERGING", "MODIFIERS", "ST-CONVERT"], fromStep, toStep)
        self.setWorkDir(workDir)
        if workDir == None:
            self.setTempWorkDir()
        # All intermediate files share this prefix inside the work directory.
        workOutputTag = os.path.join(self.workDir,
                                     os.path.basename(output) + "-")
        self.model = self.openModel(self.model, "r")
        stParams = self.getBioNLPSharedTaskParams(self.bioNLPSTParams, model)
        # Step 1: detect trigger entities, with recall adjustment taken from
        # the model's stored parameter.
        if self.checkStep("TRIGGERS"):
            xml = self.triggerDetector.classifyToXML(
                self.classifyData,
                self.model,
                None,
                workOutputTag,
                goldData=goldData,
                parse=self.parse,
                recallAdjust=float(
                    self.getStr("recallAdjustParameter", self.model)))
        # Step 2: detect edges (interactions) between the predicted triggers,
        # then evaluate against gold data when available.
        if self.checkStep("EDGES"):
            xml = self.getWorkFile(xml, workOutputTag + "trigger-pred.xml.gz")
            xml = self.edgeDetector.classifyToXML(xml,
                                                  self.model,
                                                  None,
                                                  workOutputTag,
                                                  goldData=goldData,
                                                  parse=self.parse)
            assert xml != None
            if self.parse == None:
                edgeParse = self.getStr(self.edgeDetector.tag + "parse",
                                        self.model)
            else:
                edgeParse = self.parse
            #EvaluateInteractionXML.run(self.edgeDetector.evaluator, xml, self.classifyData, edgeParse)
            if goldData != None:
                EvaluateInteractionXML.run(self.edgeDetector.evaluator, xml,
                                           goldData, edgeParse)
            else:
                EvaluateInteractionXML.run(self.edgeDetector.evaluator, xml,
                                           self.classifyData, edgeParse)
        # Step 3: split merged events (unmerging), only if the model contains
        # a trained unmerging classifier.
        if self.checkStep("UNMERGING"):
            if self.model.getStr(
                    "unmerging-classifier-parameter", None
            ) != None:  #self.model.hasMember("unmerging-classifier-model"):
                #xml = self.getWorkFile(xml, output + "-edge-pred.xml.gz")
                # To avoid running out of memory, always use file on disk
                xml = self.getWorkFile(None,
                                       workOutputTag + "edge-pred.xml.gz")
                #goldData = None
                #if type(self.classifyData) in types.StringTypes:
                #    if os.path.exists(self.classifyData.replace("-nodup", "")):
                #        goldData = self.classifyData.replace("-nodup", "")
                xml = self.unmergingDetector.classifyToXML(xml,
                                                           self.model,
                                                           None,
                                                           workOutputTag,
                                                           goldData=goldData,
                                                           parse=self.parse)
                # Evaluate after unmerging
                if self.parse == None:
                    edgeParse = self.getStr(self.edgeDetector.tag + "parse",
                                            self.model)
                else:
                    edgeParse = self.parse
                if goldData != None:
                    EvaluateInteractionXML.run(self.edgeDetector.evaluator,
                                               xml, goldData, edgeParse)
                else:
                    EvaluateInteractionXML.run(self.edgeDetector.evaluator,
                                               xml, self.classifyData,
                                               edgeParse)
            else:
                print >> sys.stderr, "No model for unmerging"
        # Step 4: predict event modifiers (e.g. speculation/negation), only
        # if the model contains a modifier classifier.
        if self.checkStep("MODIFIERS"):
            if self.model.hasMember("modifier-classifier-model"):
                xml = self.getWorkFile(xml, [
                    workOutputTag + "unmerging-pred.xml.gz",
                    workOutputTag + "edge-pred.xml.gz"
                ])
                xml = self.modifierDetector.classifyToXML(xml,
                                                          self.model,
                                                          None,
                                                          workOutputTag,
                                                          goldData=goldData,
                                                          parse=self.parse)
            else:
                print >> sys.stderr, "No model for modifier detection"
#        if self.checkStep("VALIDATE"):
#            xml = self.getWorkFile(xml, [workOutputTag + "modifier-pred.xml.gz", workOutputTag + "unmerging-pred.xml.gz", workOutputTag + "edge-pred.xml.gz"])
#            self.structureAnalyzer.load(model)
#            self.structureAnalyzer.validate(xml)
#            ETUtils.write(xml, workOutputTag + "validate-pred.xml.gz")
        # Step 5: optionally convert the predictions to BioNLP Shared Task
        # format and run the external ST evaluator.
        if self.checkStep("ST-CONVERT"):
            if stParams["convert"]:
                #xml = self.getWorkFile(xml, [workOutputTag + "validate-pred.xml.gz", workOutputTag + "modifier-pred.xml.gz", workOutputTag + "unmerging-pred.xml.gz", workOutputTag + "edge-pred.xml.gz"])
                xml = self.getWorkFile(xml, [
                    workOutputTag + "modifier-pred.xml.gz", workOutputTag +
                    "unmerging-pred.xml.gz", workOutputTag + "edge-pred.xml.gz"
                ])
                Utils.STFormat.ConvertXML.toSTFormat(
                    xml,
                    output + "-events.tar.gz",
                    outputTag=stParams["a2Tag"],
                    writeExtra=(stParams["scores"] == True))
                if stParams["evaluate"]:  #self.stEvaluator != None:
                    task = self.task
                    if task == None:
                        task = self.getStr(self.edgeDetector.tag + "task",
                                           self.model)
                    self.stEvaluator.evaluate(output + "-events.tar.gz", task)
            else:
                print >> sys.stderr, "No BioNLP shared task format conversion"
        # Copy the most advanced prediction file produced (first match wins)
        # to the final output location.
        finalXMLFile = self.getWorkFile(None, [
            workOutputTag + "modifier-pred.xml.gz", workOutputTag +
            "unmerging-pred.xml.gz", workOutputTag + "edge-pred.xml.gz"
        ])
        if finalXMLFile != None:
            shutil.copy2(finalXMLFile, output + "-pred.xml.gz")
        self.deleteTempWorkDir()
        self.exitState()
Example #24
0
 def trainUnmergingDetector(self):
     """Train the unmerging classifier (splitting of merged events).

     Runs as a sequence of checkpointed steps: optionally self-classify the
     training data with the trigger and edge detectors to produce extra
     unmerging training material, build unmerging examples for the
     optimization and training sets, then begin and end the classifier
     model. Finally the trained unmerging classifier is copied into the
     combined (test-set) model, if one exists. All steps are skipped when
     self.unmerging is falsy.
     """
     xml = None
     if not self.unmerging:
         print >> sys.stderr, "No unmerging"
     if self.checkStep("SELF-TRAIN-EXAMPLES-FOR-UNMERGING",
                       self.unmerging) and self.unmerging:
         # Self-classified train data for unmerging
         if self.doUnmergingSelfTraining:
             # This allows limiting to a subcorpus
             triggerStyle = copy.copy(
                 Parameters.get(self.triggerExampleStyle))
             edgeStyle = copy.copy(Parameters.get(self.edgeExampleStyle))
             unmergingStyle = Parameters.get(self.unmergingExampleStyle)
             # Propagate the unmerging sentence limit to the trigger and
             # edge styles so all three stages see the same subcorpus.
             if "sentenceLimit" in unmergingStyle and unmergingStyle[
                     "sentenceLimit"]:
                 triggerStyle["sentenceLimit"] = unmergingStyle[
                     "sentenceLimit"]
                 edgeStyle["sentenceLimit"] = unmergingStyle[
                     "sentenceLimit"]
             # Build the examples
             xml = self.triggerDetector.classifyToXML(
                 self.trainData,
                 self.model,
                 None,
                 self.workDir + "unmerging-extra-",
                 exampleStyle=triggerStyle)  #, recallAdjust=0.5)
             xml = self.edgeDetector.classifyToXML(
                 xml,
                 self.model,
                 None,
                 self.workDir + "unmerging-extra-",
                 exampleStyle=edgeStyle)  #, recallAdjust=0.5)
             assert xml != None
             EvaluateInteractionXML.run(self.edgeDetector.evaluator, xml,
                                        self.trainData, self.parse)
         else:
             print >> sys.stderr, "No self-training for unmerging"
     if self.checkStep("UNMERGING-EXAMPLES",
                       self.unmerging) and self.unmerging:
         # Unmerging example generation
         # Gold files are the corpora with duplicates retained ("-nodup"
         # stripped from the file names).
         GOLD_TEST_FILE = self.optData.replace("-nodup", "")
         GOLD_TRAIN_FILE = self.trainData.replace("-nodup", "")
         if self.doUnmergingSelfTraining:
             if xml == None:
                 xml = self.workDir + "unmerging-extra-edge-pred.xml.gz"
             # Train examples come from both the gold training data and the
             # self-classified XML produced above.
             self.unmergingDetector.buildExamples(
                 self.model, [
                     self.optData.replace("-nodup", ""),
                     [self.trainData.replace("-nodup", ""), xml]
                 ], [
                     self.workDir + "unmerging-opt-examples.gz",
                     self.workDir + "unmerging-train-examples.gz"
                 ], [GOLD_TEST_FILE, [GOLD_TRAIN_FILE, GOLD_TRAIN_FILE]],
                 exampleStyle=self.unmergingExampleStyle,
                 saveIdsToModel=True)
             xml = None
         else:
             self.unmergingDetector.buildExamples(
                 self.model, [
                     self.optData.replace("-nodup", ""),
                     self.trainData.replace("-nodup", "")
                 ], [
                     self.workDir + "unmerging-opt-examples.gz",
                     self.workDir + "unmerging-train-examples.gz"
                 ], [GOLD_TEST_FILE, GOLD_TRAIN_FILE],
                 exampleStyle=self.unmergingExampleStyle,
                 saveIdsToModel=True)
             xml = None
         #UnmergingExampleBuilder.run("/home/jari/biotext/EventExtension/TrainSelfClassify/test-predicted-edges.xml", GOLD_TRAIN_FILE, UNMERGING_TRAIN_EXAMPLE_FILE, PARSE, TOK, UNMERGING_FEATURE_PARAMS, UNMERGING_IDS, append=True)
     if self.checkStep("BEGIN-UNMERGING-MODEL",
                       self.unmerging) and self.unmerging:
         self.unmergingDetector.beginModel(
             None, self.model, self.workDir + "unmerging-train-examples.gz",
             self.workDir + "unmerging-opt-examples.gz")
     if self.checkStep("END-UNMERGING-MODEL",
                       self.unmerging) and self.unmerging:
         self.unmergingDetector.endModel(
             None, self.model, self.workDir + "unmerging-opt-examples.gz")
         print >> sys.stderr, "Adding unmerging classifier model to test-set event model"
         # Copy the trained unmerging classifier, its example style and its
         # class/feature id maps into the combined model used for test sets.
         if self.combinedModel != None:
             self.combinedModel.addStr(
                 "unmerging-example-style",
                 self.model.getStr("unmerging-example-style"))
             self.combinedModel.insert(
                 self.model.get("unmerging-ids.classes"),
                 "unmerging-ids.classes")
             self.combinedModel.insert(
                 self.model.get("unmerging-ids.features"),
                 "unmerging-ids.features")
             self.unmergingDetector.addClassifierModel(
                 self.combinedModel,
                 self.model.get("unmerging-classifier-model", True),
                 self.model.getStr("unmerging-classifier-parameter"))
             self.combinedModel.save()
Example #25
0
 def classify(self,
              data,
              model,
              output,
              parse=None,
              task=None,
              fromStep=None,
              toStep=None):
     """Run the classification pipeline, writing outputs next to 'output'.

     Step sequence: TRIGGERS -> EDGES -> UNMERGING -> MODIFIERS ->
     ST-CONVERT. Unlike the work-directory variant, intermediate files are
     written directly under the output file name stem.

     data -- input corpus (path or Interaction XML) to classify
     model -- model path or Model object; opened read-only
     output -- file name stem for all intermediate and final outputs
     parse -- parse name; if None, the value stored in the model is used
     task -- BioNLP Shared Task id; if None, read from the model
     fromStep, toStep -- step range control passed to enterState
     """
     BINARY_RECALL_MODE = False  # TODO: make a parameter
     xml = None
     self.initVariables(classifyData=data,
                        model=model,
                        xml=None,
                        task=task,
                        parse=parse)
     self.enterState(
         self.STATE_CLASSIFY,
         ["TRIGGERS", "EDGES", "UNMERGING", "MODIFIERS", "ST-CONVERT"],
         fromStep, toStep)
     #self.enterState(self.STATE_CLASSIFY, ["TRIGGERS", "RECALL-ADJUST", "EDGES", "UNMERGING", "MODIFIERS", "ST-CONVERT"], fromStep, toStep)
     self.model = self.openModel(self.model, "r")
     # Step 1: trigger detection with the model's stored recall adjustment.
     if self.checkStep("TRIGGERS"):
         xml = self.triggerDetector.classifyToXML(
             self.classifyData,
             self.model,
             None,
             output + "-",
             parse=self.parse,
             recallAdjust=float(
                 self.getStr("recallAdjustParameter", self.model)))
     # Step 2: edge detection on the recall-adjusted trigger predictions.
     if self.checkStep("EDGES"):
         xml = self.getWorkFile(xml, output + "-recall-adjusted.xml.gz")
         xml = self.edgeDetector.classifyToXML(xml,
                                               self.model,
                                               None,
                                               output + "-",
                                               parse=self.parse)
         assert xml != None
         if self.parse == None:
             edgeParse = self.getStr(self.edgeDetector.tag + "parse",
                                     self.model)
         else:
             edgeParse = self.parse
         #EvaluateInteractionXML.run(self.edgeDetector.evaluator, xml, self.classifyData, edgeParse)
         EvaluateInteractionXML.run(self.edgeDetector.evaluator, xml, None,
                                    edgeParse)
     # Step 3: unmerging, only if the model has an unmerging classifier.
     if self.checkStep("UNMERGING"):
         if self.model.hasMember("unmerging-classifier-model"):
             #xml = self.getWorkFile(xml, output + "-edge-pred.xml.gz")
             # To avoid running out of memory, always use file on disk
             xml = self.getWorkFile(None, output + "-edge-pred.xml.gz")
             # Use the duplicate-preserving corpus as gold data when it
             # exists alongside the "-nodup" input file.
             goldData = None
             if type(self.classifyData) in types.StringTypes:
                 if os.path.exists(self.classifyData.replace("-nodup", "")):
                     goldData = self.classifyData.replace("-nodup", "")
             xml = self.unmergingDetector.classifyToXML(xml,
                                                        self.model,
                                                        None,
                                                        output + "-",
                                                        goldData=goldData,
                                                        parse=self.parse)
         else:
             print >> sys.stderr, "No model for unmerging"
     # Step 4: modifier detection, only if the model has a modifier
     # classifier.
     if self.checkStep("MODIFIERS"):
         if self.model.hasMember("modifier-classifier-model"):
             xml = self.getWorkFile(xml, [
                 output + "-unmerging-pred.xml.gz",
                 output + "-edge-pred.xml.gz"
             ])
             xml = self.modifierDetector.classifyToXML(xml,
                                                       self.model,
                                                       None,
                                                       output + "-",
                                                       parse=self.parse)
         else:
             print >> sys.stderr, "No model for modifier detection"
     # Step 5: convert the most advanced prediction file to BioNLP Shared
     # Task format and evaluate it with the external ST evaluator.
     if self.checkStep("ST-CONVERT"):
         xml = self.getWorkFile(xml, [
             output + "-modifier-pred.xml.gz",
             output + "-unmerging-pred.xml.gz", output + "-edge-pred.xml.gz"
         ])
         STFormat.ConvertXML.toSTFormat(xml,
                                        output + "-events.tar.gz",
                                        outputTag="a2",
                                        writeScores=self.stWriteScores)
         if self.stEvaluator != None:
             task = self.task
             if task == None:
                 task = self.getStr(self.edgeDetector.tag + "task",
                                    self.model)
             self.stEvaluator.evaluate(output + "-events.tar.gz", task)
     self.exitState()
Example #26
0
    def buildExamplesFromGraph(self, sentenceGraph, examples, goldGraph=None):
        """Generate interaction (edge) examples for one sentence graph.

        An example is built for every pair of merged entities — or every
        pair of tokens when the "token_nodes" style is set. For directed
        corpora both orderings of each pair yield an example; for undirected
        corpora a single example is built with the pair normalized to
        document order.

        sentenceGraph -- the sentence graph to generate examples from
        examples -- output collection, filled in-place via self.buildExample
        goldGraph -- optional gold sentence graph; when given, entities are
                     mapped to their gold counterparts for class labeling
        """
        # Example directionality: an explicit "directed"/"undirected" style
        # wins; otherwise directedness is determined from the corpus
        # structure. NOTE: the previous elif-chain required both styles to be
        # exactly None to fall back to the corpus, leaving
        # examplesAreDirected unbound (NameError) when both were present but
        # False; the closing else-branch fixes that.
        if self.styles.get("directed"):
            assert self.styles.get("undirected") in [None, False]
            examplesAreDirected = True
        elif self.styles.get("undirected"):
            assert self.styles.get("directed") in [None, False]
            examplesAreDirected = False
        else: # determine directedness from corpus
            examplesAreDirected = self.structureAnalyzer.hasDirectedTargets() if self.structureAnalyzer != None else True

        # Filter entities, if needed
        sentenceGraph.mergeInteractionGraph(True)
        entities = sentenceGraph.mergedEntities
        self.exampleStats.addValue("Duplicate entities skipped", len(sentenceGraph.entities) - len(entities))

        # Connect to optional gold graph
        entityToGold = None
        if goldGraph != None:
            entityToGold = EvaluateInteractionXML.mapEntities(entities, goldGraph.entities)

        # Shared per-sentence structures: the undirected dependency graph and
        # the dependency degree of each token.
        dg = sentenceGraph.dependencyGraph
        undirected = dg.toUndirected()
        edgeCounts = {x:len(dg.getInEdges(x) + dg.getOutEdges(x)) for x in sentenceGraph.tokens}

        tokens, tokenMap = self.getTokenFeatures(sentenceGraph)

        # Generate examples based on interactions between entities or interactions between tokens
        if self.styles.get("token_nodes"):
            loopRange = len(tokens)
        else:
            loopRange = len(entities)
        for i in range(loopRange-1):
            for j in range(i+1,loopRange):
                eI = None
                eJ = None
                if self.styles.get("token_nodes"):
                    tI = tokens[i]["element"]
                    tJ = tokens[j]["element"]
                else:
                    eI = entities[i]
                    eJ = entities[j]
                    tI = sentenceGraph.entityHeadTokenByEntity[eI]
                    tJ = sentenceGraph.entityHeadTokenByEntity[eJ]
                    # Skip explicitly negative entities and, when requested,
                    # triggers imported from an external source.
                    if eI.get("type") == "neg" or eJ.get("type") == "neg":
                        continue
                    if self.styles.get("skip_extra_triggers"):
                        if eI.get("source") != None or eJ.get("source") != None:
                            continue
                # only consider paths between entities (NOTE! entities, not only named entities)
                if self.styles.get("headsOnly"):
                    if (len(sentenceGraph.tokenIsEntityHead[tI]) == 0) or (len(sentenceGraph.tokenIsEntityHead[tJ]) == 0):
                        continue

                if examplesAreDirected:
                    self.buildExample(examples, tI, tJ, eI, eJ, tokens, tokenMap, sentenceGraph, goldGraph, entityToGold, undirected, edgeCounts)
                    self.buildExample(examples, tJ, tI, eJ, eI, tokens, tokenMap, sentenceGraph, goldGraph, entityToGold, undirected, edgeCounts)
                else:
                    # Normalize the pair to document (token index) order so
                    # each undirected example is built exactly once.
                    if tokenMap[tJ]["index"] < tokenMap[tI]["index"]:
                        tI, tJ = tJ, tI
                        eI, eJ = eJ, eI
                    self.buildExample(examples, tI, tJ, eI, eJ, tokens, tokenMap, sentenceGraph, goldGraph, entityToGold, undirected, edgeCounts, False)
Example #27
0
#Gazetteer.run(TRAIN_FILE, "gazetteer-train")
#GeneralEntityTypeRecognizer.run(TRAIN_FILE, "trigger-train-examples", PARSE_TOK, PARSE_TOK, "style:typed", "trigger-ids")

# Build typed trigger examples from the test corpus and classify them with
# the previously optimized model.
GeneralEntityTypeRecognizer.run(TEST_FILE, "trigger-test-examples", PARSE_TOK,
                                PARSE_TOK, "style:typed", "trigger-ids")
Cls.test("trigger-test-examples", "trigger-param-opt/model-c_75000",
         "trigger-test-classifications")
# Evaluate the classifications at the example level.
evaluator = Ev.evaluate("trigger-test-examples",
                        "trigger-test-classifications",
                        "trigger-ids.class_names")

#evaluator = optimize(Cls, Ev, "trigger-train-examples", "trigger-test-examples",\
#    "trigger-ids.class_names", CLASSIFIER_PARAMS, "trigger-param-opt")[0]

# Write the predicted triggers back into Interaction XML.
ExampleUtils.writeToInteractionXML(evaluator.classifications, TEST_FILE,
                                   "test-predicted-triggers.xml",
                                   "trigger-ids.class_names", PARSE_TOK,
                                   PARSE_TOK)

# RecallAdjust.run("test-predicted-triggers.xml",1.0,"test-predicted-triggers-adj.xml")
# ix.splitMergedElements("test-predicted-triggers-adj.xml", "test-predicted-triggers-adj-split.xml")
# ix.recalculateIds("test-predicted-triggers-adj-split.xml", "test-predicted-triggers-adj-split-recids.xml", True)
# EvaluateInteractionXML.run(Ev, "test-predicted-triggers-adj-split-recids.xml", GOLD_TEST_FILE, PARSE_TOK, PARSE_TOK)

# Split multi-type entities, renumber ids, and evaluate the final XML
# against the gold test corpus.
ix.splitMergedElements("test-predicted-triggers.xml",
                       "test-predicted-triggers-split.xml")
ix.recalculateIds("test-predicted-triggers-split.xml",
                  "test-predicted-triggers-split-recids.xml", True)
EvaluateInteractionXML.run(Ev, "test-predicted-triggers-split-recids.xml",
                           GOLD_TEST_FILE, PARSE_TOK, PARSE_TOK)