Example 1
 def classify(self, examples, parameters=None):
     if type(examples) == types.StringType:
         testFilePath = examples
         predictions = []
         realClasses = []
         exampleFile = open(examples,"rt")
         for line in exampleFile.readlines():
             realClasses.append(int(line.split(" ",1)[0].strip()))
         exampleFile.close()
     elif type(examples) == types.ListType:
         examples, predictions = self.filterClassificationSet(examples, True)
         Example.writeExamples(examples, self.tempDir+"/test.dat")
         testFilePath = self.tempDir+"/test.dat"
     args = [self.classifyBin]
     if parameters != None:
         self.__addParametersToSubprocessCall(args, parameters)
     args += [testFilePath, self.tempDir+"/model", self.tempDir+"/predictions"]
     #print args
     subprocess.call(args, stdout = self.debugFile)
     os.remove(self.tempDir+"/model")
     predictionsFile = open(self.tempDir+"/predictions", "rt")
     lines = predictionsFile.readlines()
     predictionsFile.close()
     #predictions = []
     for i in range(len(lines)):
         if type(examples) == types.ListType:
             predictions.append( (examples[i],float(lines[i]),self.type,lines[i]) )
         else:
             predictions.append( ([None,realClasses[i]],float(lines[i]),self.type) )
     return predictions
Example 2
    def buildExamplesForSentences(self, sentences, goldSentences, output, idFileTag=None, append=False):
        examples = []
        counter = ProgressCounter(len(sentences), "Build examples")

        if append:
            outfile = open(output, "at")
        else:
            outfile = open(output, "wt")
        exampleCount = 0
        for i in range(len(sentences)):
            sentence = sentences[i]
            goldSentence = [None]
            if goldSentences != None:
                goldSentence = goldSentences[i]
            counter.update(1, "Building examples (" + sentence[0].getSentenceId() + "): ")
            examples = self.buildExamples(sentence[0], goldSentence[0], append=append)
            exampleCount += len(examples)
            examples = self.preProcessExamples(examples)
            ExampleUtils.appendExamples(examples, outfile)
        outfile.close()

        print >>sys.stderr, "Examples built:", exampleCount
        print >>sys.stderr, "Features:", len(self.featureSet.getNames())
        # IF LOCAL
        if self.exampleStats.getExampleCount() > 0:
            self.exampleStats.printStats()
        # ENDIF
        # Save Ids
        if idFileTag != None:
            print >>sys.stderr, "Saving class names to", idFileTag + ".class_names"
            self.classSet.write(idFileTag + ".class_names")
            print >>sys.stderr, "Saving feature names to", idFileTag + ".feature_names"
            self.featureSet.write(idFileTag + ".feature_names")
Example 3
    def __init__(self, examples, predictions=None, classSet=None):
        if type(classSet) == types.StringType:  # class names are in file
            classSet = IdSet(filename=classSet)
        if type(predictions) == types.StringType:  # predictions are in file
            predictions = ExampleUtils.loadPredictions(predictions)
        if type(examples) == types.StringType:  # examples are in file
            examples = ExampleUtils.readExamples(examples, False)

        # define class ids in alphabetical order
        self.classSet = classSet
        if classSet != None:
            classNames = sorted(classSet.Ids.keys())
        else:
            classNames = []
        # make an ordered list of class ids
        self.classes = []
        for className in classNames:
            self.classes.append(classSet.getId(className))
        # create data structures for per-class evaluation
        self.dataByClass = {}
        for cls in self.classes:
            self.dataByClass[cls] = EvaluationData()
        # hack for unnamed classes
        if len(self.dataByClass) == 0:
            self.dataByClass[1] = EvaluationData()
            self.dataByClass[2] = EvaluationData()

        #self.untypedUndirected = None
        self.untypedCurrentMajorId = None
        self.untypedPredictionQueue = []
        self.untypedUndirected = EvaluationData()
        #self.AUC = None
        if predictions != None:
            self._calculate(examples, predictions)
Example 4
    def __init__(self, examples, predictions=None, classSet=None):
        if type(classSet) == types.StringType: # class names are in file
            classSet = IdSet(filename=classSet)
        if type(predictions) == types.StringType: # predictions are in file
            predictions = ExampleUtils.loadPredictions(predictions)
        if type(examples) == types.StringType: # examples are in file
            examples = ExampleUtils.readExamples(examples, False)

        # define class ids in alphabetical order
        self.classSet = classSet
        if classSet != None:
            classNames = sorted(classSet.Ids.keys())
        else:
            classNames = []
        # make an ordered list of class ids
        self.classes = []
        for className in classNames:
            self.classes.append(classSet.getId(className))
        # create data structures for per-class evaluation
        self.dataByClass = {}
        for cls in self.classes:
            self.dataByClass[cls] = EvaluationData()
        # hack for unnamed classes
        if len(self.dataByClass) == 0:
            self.dataByClass[1] = EvaluationData()
            self.dataByClass[2] = EvaluationData()
        
        #self.untypedUndirected = None
        self.untypedCurrentMajorId = None
        self.untypedPredictionQueue = []
        self.untypedUndirected = EvaluationData()
        #self.AUC = None
        if predictions != None:
            self._calculate(examples, predictions)
Example 5
    def __init__(self, examples, predictions=None, classSet=None):
        if type(classSet) == types.StringType:  # class names are in file
            classSet = IdSet(filename=classSet)
        if type(predictions) == types.StringType:  # predictions are in file
            predictions = ExampleUtils.loadPredictions(predictions)
        if type(examples) == types.StringType:  # examples are in file
            examples = ExampleUtils.readExamples(examples, False)

        SharedTaskEvaluator.corpusElements = Core.SentenceGraph.loadCorpus(
            SharedTaskEvaluator.corpusFilename, SharedTaskEvaluator.parse,
            SharedTaskEvaluator.tokenization)
        # Build interaction xml
        xml = BioTextExampleWriter.write(
            examples, predictions, SharedTaskEvaluator.corpusElements, None,
            SharedTaskEvaluator.ids + ".class_names",
            SharedTaskEvaluator.parse, SharedTaskEvaluator.tokenization)
        #xml = ExampleUtils.writeToInteractionXML(examples, predictions, SharedTaskEvaluator.corpusElements, None, "genia-direct-event-ids.class_names", SharedTaskEvaluator.parse, SharedTaskEvaluator.tokenization)
        # Convert to GENIA format
        gifxmlToGenia(xml,
                      SharedTaskEvaluator.geniaDir,
                      task=SharedTaskEvaluator.task,
                      verbose=False)
        # Use GENIA evaluation tool
        self.results = evaluateSharedTask(SharedTaskEvaluator.geniaDir,
                                          task=SharedTaskEvaluator.task,
                                          evaluations=["approximate"],
                                          verbose=False)
Example 6
 def train(cls, examples, parameters, outputFile=None): #, timeout=None):
     """
     Train the SVM-multiclass classifier on a set of examples.
     
     @type examples: string (filename) or list (or iterator) of examples
     @param examples: a list or file containing examples in SVM-format
     @type parameters: a dictionary or string
     @param parameters: parameters for the classifier
     @type outputFile: string
     @param outputFile: the name of the model file to be written
     """
     timer = Timer()
     parameters = cls.getParams(parameters)
     
     # If examples are in a list, they will be written to a file for SVM-multiclass
     if type(examples) == types.ListType:
         print >> sys.stderr, "Training SVM-MultiClass on", len(examples), "examples"
         trainPath = cls.tempDir+"/train.dat" # classmethod: use cls, not self
         examples = cls.filterTrainingSet(examples)
         Example.writeExamples(examples, trainPath)
     else:
         print >> sys.stderr, "Training SVM-MultiClass on file", examples
         trainPath = cls.stripComments(examples)
     args = ["/home/jari/Programs/liblinear-1.5-poly2/train"]
     cls.__addParametersToSubprocessCall(args, parameters)
     if outputFile == None:
         args += [trainPath, "model"]
         logFile = open("svmmulticlass.log","at")
     else:
         args += [trainPath, outputFile]
         logFile = open(outputFile+".log","wt")
     rv = subprocess.call(args, stdout = logFile)
     logFile.close()
     print >> sys.stderr, timer.toString()
     return rv
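
The "SVM-format" in the docstring above is the SVM-light style one-example-per-line text format; the classify method in Example 1 reads the leading integer of each line as the class. A self-contained sketch of writing such a file (the file name is illustrative):

    # "<class> <featureId>:<value> ..." per line, feature ids in ascending order
    with open("train.dat", "wt") as f:
        f.write("2 1:0.5 7:1.0\n")  # class 2, features 1 and 7 active
        f.write("1 3:1.0\n")        # class 1 (negative), feature 3 active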
Example 7
 def train(self, examples, parameters=None):
     self.isBinary = self.isBinaryProblem(examples)
     examples = self.filterTrainingSet(examples)
     ExampleUtils.writeExamples(examples, self.tempDir + "/train.dat")
     #prepare parameters:
     if parameters == None:
         parameters = {}
     if parameters.has_key("c"):
         assert (not parameters.has_key("C"))
         parameters["C"] = parameters["c"]
         del parameters["c"]
     totalExamples = float(sum(self.classes.values()))
     weight_label = self.classes.keys()
     weight_label.sort()
     weight = []
     for k in weight_label:
         weight.append(1.0 - self.classes[k] / totalExamples)
     libSVMparam = svm.svm_parameter(nr_weight=len(self.classes),
                                     weight_label=weight_label,
                                     weight=weight,
                                     **parameters)
     labels = []
     samples = []
     for example in examples:
         labels.append(example[1])
         samples.append(example[2])
     problem = svm.svm_problem(labels, samples)
     self.model = svm.svm_model(problem, libSVMparam)
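
The class weights above are computed as 1.0 - count/total, so under-represented classes are weighted up. A small numeric illustration with made-up counts:

    classes = {1: 90, 2: 10}  # e.g. 90 negative and 10 positive examples
    total = float(sum(classes.values()))
    weights = [1.0 - classes[k] / total for k in sorted(classes.keys())]
    # -> [0.1, 0.9]: the rare class 2 is weighted nine times higher than class 1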
Example 8
    def __init__(self, examples=None, predictions=None, classSet=None):
        if type(classSet) == types.StringType:  # class names are in file
            classSet = IdSet(filename=classSet)
        if type(predictions) == types.StringType:  # predictions are in file
            predictions = ExampleUtils.loadPredictions(predictions)
        if type(examples) == types.StringType:  # examples are in file
            examples = ExampleUtils.readExamples(examples, False)

        self.keep = set(["CPR:3", "CPR:4", "CPR:5", "CPR:6", "CPR:9"])

        self.classSet = classSet
        self.results = None
        self.internal = None
        if predictions != None:
            for example in examples:
                if example[3] != None:
                    print >> sys.stderr, "ChemProt Evaluator:"
                    self._calculateExamples(examples, predictions)
                else:
                    print >> sys.stderr, "No example extra info, skipping ChemProt evaluation"
                break
            self.internal = AveragingMultiClassEvaluator(
                examples, predictions, classSet)
            print >> sys.stderr, "AveragingMultiClassEvaluator:"
            print >> sys.stderr, self.internal.toStringConcise()
Example 9
    def buildExamplesForDocuments(self,
                                  documentSentences,
                                  output,
                                  idFileTag=None):
        examples = []
        counter = ProgressCounter(len(documentSentences), "Build examples")

        #calculatePredictedRange(self, sentences)

        outfile = open(output, "wt")
        exampleCount = 0
        for document in documentSentences:
            counter.update(
                1,
                "Building examples (" + document[0].sentence.get("id") + "): ")
            examples = self.buildExamples(document)
            exampleCount += len(examples)
            #examples = self.preProcessExamples(examples)
            ExampleUtils.appendExamples(examples, outfile)
        outfile.close()

        print >> sys.stderr, "Examples built:", exampleCount
        print >> sys.stderr, "Features:", len(self.featureSet.getNames())
        #IF LOCAL
        if self.exampleStats.getExampleCount() > 0:
            self.exampleStats.printStats()
        #ENDIF
        # Save Ids
        if idFileTag != None:
            print >> sys.stderr, "Saving class names to", idFileTag + ".class_names"
            self.classSet.write(idFileTag + ".class_names")
            print >> sys.stderr, "Saving feature names to", idFileTag + ".feature_names"
            self.featureSet.write(idFileTag + ".feature_names")
Example 10
 def test(cls, examples, modelPath, output=None, parameters=None, timeout=None):
     if type(examples) == types.ListType:
         print >> sys.stderr, "Classifying", len(examples), "with All-True Classifier"
         examples, predictions = cls.filterClassificationSet(examples, False) # classmethod: cls, not self
         testPath = cls.tempDir+"/test.dat"
         Example.writeExamples(examples, testPath)
     else:
         print >> sys.stderr, "Classifying file", examples, "with All-True Classifier"
         testPath = examples
         examples = Example.readExamples(examples,False)
     print >> sys.stderr, "Note! Classification must be binary"
     #examples, predictions = self.filterClassificationSet(examples, True)
     predictions = []
     for example in examples:
         #predictions.append( (example, example[1]) )
         predictions.append( [2] ) #[example[1]] )
     
     if output == None:
         output = "predictions"
     f = open(output, "wt")
     for p in predictions:
         f.write(str(p[0])+"\n")
     f.close()
         
     return predictions
Example 11
    def buildGraphKernelFeatures(self, sentenceGraph, path):
        edgeList = []
        depGraph = sentenceGraph.dependencyGraph
        pt = path
        for i in range(1, len(path)):
            edgeList.extend(depGraph.getEdges(pt[i], pt[i - 1]))
            edgeList.extend(depGraph.getEdges(pt[i - 1], pt[i]))
        edges = edgeList
        adjacencyMatrix, labels = self._buildAdjacencyMatrix(
            sentenceGraph, path, edges)
        node_count = 2 * len(sentenceGraph.tokens) + len(
            sentenceGraph.dependencies)

        if sentenceGraph.sentenceElement.attrib["id"] == "LLL.d0.s0":
            adjacencyMatrixToHtml(adjacencyMatrix, labels,
                                  "LLL.d0.s0_adjacency_matrix.html")

        allPathsMatrix = self._prepareMatrix(adjacencyMatrix, node_count)
        self._matrixToFeatures(allPathsMatrix, labels)
        if sentenceGraph.sentenceElement.attrib["id"] == "LLL.d0.s0":
            adjacencyMatrixToHtml(allPathsMatrix, labels,
                                  "LLL.d0.s0_all_paths_matrix.html")
            commentLines = []
            commentLines.extend(self.featureSet.toStrings())
            example = [
                "example_" + self.entity1.attrib["id"] + "_" +
                self.entity2.attrib["id"], "unknown", self.features
            ]
            ExampleUtils.writeExamples([example], "LLL.d0.s0_example.txt",
                                       commentLines)
Example 12
    def test(cls,
             examples,
             modelPath,
             output=None,
             parameters=None,
             timeout=None):
        if type(examples) == types.ListType:
            print >> sys.stderr, "Classifying", len(
                examples), "with All-Correct Classifier"
            # classmethod: use cls rather than self
            examples, predictions = cls.filterClassificationSet(
                examples, False)
            testPath = cls.tempDir + "/test.dat"
            Example.writeExamples(examples, testPath)
        else:
            print >> sys.stderr, "Classifying file", examples, "with All-Correct Classifier"
            testPath = examples
            examples = Example.readExamples(examples, False)
        #examples, predictions = self.filterClassificationSet(examples, True)
        predictions = []
        for example in examples:
            #predictions.append( (example, example[1]) )
            predictions.append([example[1]])

        if output == None:
            output = "predictions"
        f = open(output, "wt")
        for p in predictions:
            f.write(str(p[0]) + "\n")
        f.close()

        return predictions
Example 13
 def loadExamples(self, examples, predictions):
     if type(predictions) == types.StringType:
         print >> sys.stderr, "Reading predictions from", predictions
         predictions = ExampleUtils.loadPredictions(predictions)
     if type(examples) == types.StringType:
         print >> sys.stderr, "Reading examples from", examples
         examples = ExampleUtils.readExamples(examples, False)
     return examples, predictions
Example 14
 def loadExamples(self, examples, predictions):
     if type(predictions) == types.StringType:
         print >> sys.stderr, "Reading predictions from", predictions
         predictions = ExampleUtils.loadPredictions(predictions)
     if type(examples) == types.StringType:
         print >> sys.stderr, "Reading examples from", examples
         examples = ExampleUtils.readExamples(examples, False)
     return examples, predictions
Example 15
 def test(cls, examples, modelPath, output=None, parameters=None, forceInternal=False): # , timeout=None):
     """
     Classify examples with a pre-trained model.
     
     @type examples: string (filename) or list (or iterator) of examples
     @param examples: a list or file containing examples in SVM-format
     @type modelPath: string
     @param modelPath: filename of the pre-trained model file
     @type parameters: a dictionary or string
     @param parameters: parameters for the classifier
     @type output: string
     @param output: the name of the predictions file to be written
     @type forceInternal: Boolean
     @param forceInternal: Use python classifier even if SVM Multiclass binary is defined in Settings.py
     """
     if forceInternal or Settings.SVMMultiClassDir == None:
         return cls.testInternal(examples, modelPath, output)
     timer = Timer()
     if type(examples) == types.ListType:
         print >> sys.stderr, "Classifying", len(examples), "with SVM-MultiClass model", modelPath
         examples, predictions = cls.filterClassificationSet(examples, False) # classmethod: cls, not self
         testPath = cls.tempDir+"/test.dat"
         Example.writeExamples(examples, testPath)
     else:
         print >> sys.stderr, "Classifying file", examples, "with SVM-MultiClass model", modelPath
         testPath = cls.stripComments(examples)
         examples = Example.readExamples(examples,False)
     args = ["/home/jari/Programs/liblinear-1.5-poly2/predict"]
     if modelPath == None:
         modelPath = "model"
     if parameters != None:
         parameters = copy.copy(parameters)
         if parameters.has_key("c"):
             del parameters["c"]
         if parameters.has_key("predefined"):
             parameters = copy.copy(parameters)
             modelPath = os.path.join(parameters["predefined"][0],"classifier/model")
             del parameters["predefined"]
         cls.__addParametersToSubprocessCall(args, parameters)
     if output == None:
         output = "predictions"
         logFile = open("svmmulticlass.log","at")
     else:
         logFile = open(output+".log","wt")
     args += [testPath, modelPath, output]
     #if timeout == None:
     #    timeout = -1
     #print args
     subprocess.call(args, stdout = logFile, stderr = logFile)
     predictionsFile = open(output, "rt")
     lines = predictionsFile.readlines()
     predictionsFile.close()
     predictions = []
     for i in range(len(lines)):
         predictions.append( [int(lines[i].split()[0])] + lines[i].split()[1:] )
         #predictions.append( (examples[i],int(lines[i].split()[0]),"multiclass",lines[i].split()[1:]) )
     print >> sys.stderr, timer.toString()
     return predictions
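
The parsing loop above also documents the predictions-file layout: each line holds the predicted class id followed by per-class scores. A standalone sketch of parsing one such (made-up) line, mirroring the loop body:

    line = "2 -0.91 0.42 -1.30"  # hypothetical predictions line
    parts = line.split()
    prediction = [int(parts[0])] + parts[1:]  # [2, '-0.91', '0.42', '-1.30']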
Example 16
    def preProcessExamples(self, allExamples):
        # Duplicates cannot be removed here, as they should only be removed from the training set. This is done
        # in the classifier.
#        if "no_duplicates" in self.styles:
#            count = len(allExamples)
#            print >> sys.stderr, " Removing duplicates,", 
#            allExamples = ExampleUtils.removeDuplicates(allExamples)
#            print >> sys.stderr, "removed", count - len(allExamples)
        if "normalize" in self.styles:
            print >> sys.stderr, " Normalizing feature vectors"
            ExampleUtils.normalizeFeatureVectors(allExamples)
        return allExamples   
Example 17
 def classify(self, examples, parameters=None):
     examples, predictions = self.filterClassificationSet(
         examples, self.isBinary)
     ExampleUtils.writeExamples(examples, self.tempDir + "/test.dat")
     for i in range(len(examples)):
         if self.isBinary:
             predictedClass = self.model.predict(examples[i][2])
             predictions.append((examples[i], predictedClass, "binary"))
         else:
             predictedClass = self.model.predict(examples[i][2])
             predictions.append((examples[i], predictedClass, "multiclass"))
     return predictions
Example 18
 def preProcessExamples(self, allExamples):
     # Duplicates cannot be removed here, as they should only be removed from the training set. This is done
     # in the classifier.
     #        if "no_duplicates" in self.styles:
     #            count = len(allExamples)
     #            print >> sys.stderr, " Removing duplicates,",
     #            allExamples = ExampleUtils.removeDuplicates(allExamples)
     #            print >> sys.stderr, "removed", count - len(allExamples)
     if "normalize" in self.styles:
         print >> sys.stderr, " Normalizing feature vectors"
         ExampleUtils.normalizeFeatureVectors(allExamples)
     return allExamples
Example 19
 def classify(self, examples, parameters=None):
     examples, predictions = self.filterClassificationSet(examples, self.isBinary)
     ExampleUtils.writeExamples(examples, self.tempDir+"/test.dat")
     for i in range(len(examples)):
         if self.isBinary:
             predictedClass = self.model.predict(examples[i][2])
             predictions.append( (examples[i],predictedClass,"binary") )
         else:
             predictedClass = self.model.predict(examples[i][2])
             predictions.append( (examples[i],predictedClass,"multiclass") )
     return predictions
         
         
Example 20
 def classify(self,
              examples,
              output,
              model=None,
              finishBeforeReturn=False,
              replaceRemoteFiles=True):
     output = os.path.abspath(output)
     # Get examples
     if type(examples) == types.ListType:
         print >> sys.stderr, "Classifying", len(
             examples), "with All-Correct Classifier"
     else:
         print >> sys.stderr, "Classifying file", examples, "with All-Correct Classifier"
         examples = self.getExampleFile(examples,
                                        upload=False,
                                        replaceRemote=False,
                                        dummy=False)
         examples = Example.readExamples(examples, False)
     # Return a new classifier instance for following the training process and using the model
     classifier = copy.copy(self)
     # Classify
     f = open(output, "wt")
     for example in examples:
         f.write(str(example[1]) + "\n")
     f.close()
     classifier.predictions = output
     return classifier
Example 21
    def classifyToXML(self, data, model, exampleFileName=None, tag="", classifierModel=None, goldData=None, parse=None, recallAdjust=None, compressExamples=True, exampleStyle=None):
        model = self.openModel(model, "r")
        if parse == None:
            parse = self.getStr(self.tag+"parse", model)
        if exampleFileName == None:
            exampleFileName = tag+self.tag+"examples"
            if compressExamples:
                exampleFileName += ".gz"
        self.buildExamples(model, [data], [exampleFileName], [goldData], parse=parse, exampleStyle=exampleStyle)
        if classifierModel == None:
            classifierModel = model.get(self.tag+"classifier-model", defaultIfNotExist=None)
        #else:
        #    assert os.path.exists(classifierModel), classifierModel
        classifier = self.getClassifier(model.getStr(self.tag+"classifier-parameter", defaultIfNotExist=None))()
        classifier.classify(exampleFileName, tag+self.tag+"classifications", classifierModel, finishBeforeReturn=True)
        threshold = model.getStr(self.tag+"threshold", defaultIfNotExist=None, asType=float)
        predictions = ExampleUtils.loadPredictions(tag+self.tag+"classifications", recallAdjust, threshold=threshold)
        evaluator = self.evaluator.evaluate(exampleFileName, predictions, model.get(self.tag+"ids.classes"))
        #outputFileName = tag+"-"+self.tag+"pred.xml.gz"
        #exampleStyle = self.exampleBuilder.getParameters(model.getStr(self.tag+"example-style"))
        if exampleStyle == None:
            exampleStyle = Parameters.get(model.getStr(self.tag+"example-style")) # no checking, but these should already have passed the ExampleBuilder
        self.structureAnalyzer.load(model)
        return self.exampleWriter.write(exampleFileName, predictions, data, tag+self.tag+"pred.xml.gz", model.get(self.tag+"ids.classes"), parse, exampleStyle=exampleStyle, structureAnalyzer=self.structureAnalyzer)
#        if evaluator.getData().getTP() + evaluator.getData().getFP() > 0:
#            return self.exampleWriter.write(exampleFileName, predictions, data, outputFileName, model.get(self.tag+"ids.classes"), parse)
#        else:
#            # TODO: e.g. interactions must be removed if task does unmerging
#            print >> sys.stderr, "No positive", self.tag + "predictions, XML file", outputFileName, "unchanged from input"
#            if type(data) in types.StringTypes: # assume its a file
#                shutil.copy(data, outputFileName)
#            else: # assume its an elementtree
#                ETUtils.write(data, outputFileName)
#            #print >> sys.stderr, "No positive predictions, XML file", tag+self.tag+"pred.xml", "not written"
#            return data #None
Example 22
 def __init__(self, examples, predictions=None, classSet=None):
     if type(classSet) == types.StringType: # class names are in file
         classSet = IdSet(filename=classSet)
     if type(predictions) == types.StringType: # predictions are in file
         predictions = ExampleUtils.loadPredictions(predictions)
     if type(examples) == types.StringType: # examples are in file
         examples = ExampleUtils.readExamples(examples, False)
     
     SharedTaskEvaluator.corpusElements = Core.SentenceGraph.loadCorpus(SharedTaskEvaluator.corpusFilename, SharedTaskEvaluator.parse, SharedTaskEvaluator.tokenization)
     # Build interaction xml
     xml = BioTextExampleWriter.write(examples, predictions, SharedTaskEvaluator.corpusElements, None, SharedTaskEvaluator.ids+".class_names", SharedTaskEvaluator.parse, SharedTaskEvaluator.tokenization)
     #xml = ExampleUtils.writeToInteractionXML(examples, predictions, SharedTaskEvaluator.corpusElements, None, "genia-direct-event-ids.class_names", SharedTaskEvaluator.parse, SharedTaskEvaluator.tokenization)
     # Convert to GENIA format
     gifxmlToGenia(xml, SharedTaskEvaluator.geniaDir, task=SharedTaskEvaluator.task, verbose=False)
     # Use GENIA evaluation tool
     self.results = evaluateSharedTask(SharedTaskEvaluator.geniaDir, task=SharedTaskEvaluator.task, evaluations=["approximate"], verbose=False)
Example 23
    def classifyToXML(self, data, model, exampleFileName=None, tag="", classifierModel=None, goldData=None, parse=None, recallAdjust=None, compressExamples=True):
        model = self.openModel(model, "r")
        if parse == None:
            parse = self.getStr(self.tag+"parse", model)
        if exampleFileName == None:
            exampleFileName = tag+self.tag+"examples"
            if compressExamples:
                exampleFileName += ".gz"
            self.buildExamples(model, [data], [exampleFileName], [goldData], parse=parse)
        if classifierModel == None:
            classifierModel = model.get(self.tag+"classifier-model")
        else:
            assert os.path.exists(classifierModel), classifierModel
        classifier = self.Classifier()
        classifier.classify(exampleFileName, tag+self.tag+"classifications", classifierModel, finishBeforeReturn=True)
        predictions = ExampleUtils.loadPredictions(tag+self.tag+"classifications", recallAdjust)
        evaluator = self.evaluator.evaluate(exampleFileName, predictions, model.get(self.tag+"ids.classes"))
        #outputFileName = tag+"-"+self.tag+"pred.xml.gz"
        return self.exampleWriter.write(exampleFileName, predictions, data, tag+self.tag+"pred.xml.gz", model.get(self.tag+"ids.classes"), parse)
#        if evaluator.getData().getTP() + evaluator.getData().getFP() > 0:
#            return self.exampleWriter.write(exampleFileName, predictions, data, outputFileName, model.get(self.tag+"ids.classes"), parse)
#        else:
#            # TODO: e.g. interactions must be removed if task does unmerging
#            print >> sys.stderr, "No positive", self.tag + "predictions, XML file", outputFileName, "unchanged from input"
#            if type(data) in types.StringTypes: # assume its a file
#                shutil.copy(data, outputFileName)
#            else: # assume its an elementtree
#                ETUtils.write(data, outputFileName)
#            #print >> sys.stderr, "No positive predictions, XML file", tag+self.tag+"pred.xml", "not written"
#            return data #None
Example 24
    def __init__(self, examples, predictions=None, classSet=None):
        if type(classSet) == types.StringType:  # class names are in file
            classSet = IdSet(filename=classSet)
        if type(predictions) == types.StringType:  # predictions are in file
            predictions = ExampleUtils.loadPredictions(predictions)
        if type(examples) == types.StringType:  # examples are in file
            examples = ExampleUtils.readExamples(examples, False)

        self.classSet = classSet
        self.dataByClass = defaultdict(EvaluationData)

        #self.untypedUndirected = None
        self.untypedCurrentMajorId = None
        self.untypedPredictionQueue = []
        self.untypedUndirected = EvaluationData()
        #self.AUC = None
        if predictions != None:
            self._calculate(examples, predictions)
Example 25
    def __init__(self, examples, predictions=None, classSet=None):
        if type(classSet) == types.StringType: # class names are in file
            classSet = IdSet(filename=classSet)
        if type(predictions) == types.StringType: # predictions are in file
            predictions = ExampleUtils.loadPredictions(predictions)
        if type(examples) == types.StringType: # examples are in file
            examples = ExampleUtils.readExamples(examples, False)

        self.classSet = classSet
        self.dataByClass = defaultdict(EvaluationData)
        
        #self.untypedUndirected = None
        self.untypedCurrentMajorId = None
        self.untypedPredictionQueue = []
        self.untypedUndirected = EvaluationData()
        #self.AUC = None
        if predictions != None:
            self._calculate(examples, predictions)
Example 26
 def optimize(self, examples, outDir, parameters, classifyExamples, classIds, step="BOTH", evaluator=None, determineThreshold=False, timeout=None, downloadAllModels=False):
     assert step in ["BOTH", "SUBMIT", "RESULTS"], step
     outDir = os.path.abspath(outDir)
     # Initialize training (or reconnect to existing jobs)
     combinations = Parameters.getCombinations(Parameters.get(parameters, valueListKey="c")) #Core.OptimizeParameters.getParameterCombinations(parameters)
     trained = []
     for combination in combinations:
         trained.append( self.train(examples, outDir, combination, classifyExamples, replaceRemoteExamples=(len(trained) == 0), dummy=(step == "RESULTS")) )
     if step == "SUBMIT": # Return already
         classifier = copy.copy(self)
         classifier.setState("OPTIMIZE")
         return classifier
     
     # Wait for the training to finish
     finalJobStatus = self.connection.waitForJobs([x.getJob() for x in trained])
     # Evaluate the results
     print >> sys.stderr, "Evaluating results"
     #Stream.setIndent(" ")
     bestResult = None
     if evaluator == None:
         evaluator = self.defaultEvaluator
     for i in range(len(combinations)):
         id = trained[i].parameterIdStr
         #Stream.setIndent(" ")
         # Get predictions
         predictions = None
         if trained[i].getStatus() == "FINISHED":
             predictions = trained[i].downloadPredictions()
         else:
             print >> sys.stderr, "No results for combination" + id
             continue
         if downloadAllModels:
             trained[i].downloadModel()
         # Compare to other results
         print >> sys.stderr, "*** Evaluating results for combination" + id + " ***"
         threshold = None
         if determineThreshold:
             print >> sys.stderr, "Thresholding, original micro =",
             evaluation = evaluator.evaluate(classifyExamples, predictions, classIds, os.path.join(outDir, "evaluation-before-threshold" + id + ".csv"), verbose=False)
             print >> sys.stderr, evaluation.microF.toStringConcise()
             threshold, bestF = evaluator.threshold(classifyExamples, predictions)
             print >> sys.stderr, "threshold =", threshold, "at binary fscore", str(bestF)[0:6]
         evaluation = evaluator.evaluate(classifyExamples, ExampleUtils.loadPredictions(predictions, threshold=threshold), classIds, os.path.join(outDir, "evaluation" + id + ".csv"))
         if bestResult == None or evaluation.compare(bestResult[0]) > 0: #: averageResult.fScore > bestResult[1].fScore:
             bestResult = [evaluation, trained[i], combinations[i], threshold]
         if not self.connection.isLocal():
             os.remove(predictions) # remove predictions to save space
     #Stream.setIndent()
     if bestResult == None:
         raise Exception("No results for any parameter combination")
     print >> sys.stderr, "*** Evaluation complete", finalJobStatus, "***"
     print >> sys.stderr, "Selected parameters", bestResult[2]
     classifier = copy.copy(bestResult[1])
     classifier.threshold = bestResult[3]
     classifier.downloadModel()
     return classifier
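
The grid search above trains one job per parameter combination. As a sketch of the expansion it relies on, here is a simplified stand-in for Parameters.getCombinations; its exact semantics are an assumption, not the actual implementation:

    import itertools

    def getCombinations(grid):
        # {"c": [0.1, 1.0], "e": [0.5]} -> [{"c": 0.1, "e": 0.5}, {"c": 1.0, "e": 0.5}]
        keys = sorted(grid.keys())
        return [dict(zip(keys, values))
                for values in itertools.product(*[grid[k] for k in keys])]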
Example 27
 def __init__(self, examples, predictions=None, classSet=None):
     if type(classSet) == types.StringType: # class names are in file
         classSet = IdSet(filename=classSet)
     if type(predictions) == types.StringType: # predictions are in file
         predictions = ExampleUtils.loadPredictions(predictions)
     if type(examples) == types.StringType: # examples are in file
         examples = ExampleUtils.readExamples(examples, False)
     
     corpusElements = Core.SentenceGraph.loadCorpus(BXEvaluator.corpusFilename, BXEvaluator.parse, BXEvaluator.tokenization)
     # Build interaction xml
     xml = BioTextExampleWriter.write(examples, predictions, corpusElements, None, BXEvaluator.ids+".class_names", BXEvaluator.parse, BXEvaluator.tokenization)
     xml = ix.splitMergedElements(xml, None)
     xml = ix.recalculateIds(xml, None, True)
     #xml = ExampleUtils.writeToInteractionXML(examples, predictions, SharedTaskEvaluator.corpusElements, None, "genia-direct-event-ids.class_names", SharedTaskEvaluator.parse, SharedTaskEvaluator.tokenization)
     # Convert to GENIA format
     STFormat.ConvertXML.toSTFormat(xml, BXEvaluator.geniaDir, outputTag="a2")
     #gifxmlToGenia(xml, BXEvaluator.geniaDir, task=SharedTaskEvaluator.task, verbose=False)
     # Use GENIA evaluation tool
     self.results = BioNLP11GeniaTools.evaluateBX(BXEvaluator.geniaDir, corpusName=BXEvaluator.corpusTag)
     corpusElements = None
Example 28
 def __init__(self, examples=None, predictions=None, classSet=None):
     if type(classSet) == types.StringType:  # class names are in file
         classSet = IdSet(filename=classSet)
     if type(predictions) == types.StringType:  # predictions are in file
         predictions = ExampleUtils.loadPredictions(predictions)
     if type(examples) == types.StringType:  # examples are in file
         examples = ExampleUtils.readExamples(examples, False)
     #self.examples = examples
     #self.predictions = predictions
     self.truePositives = 0
     self.falsePositives = 0
     self.trueNegatives = 0
     self.falseNegatives = 0
     self.precision = None
     self.recall = None
     self.fScore = None
     self.AUC = None
     self.type = "binary"
     if predictions != None:
         self._calculate(examples, predictions)
Example 29
 def __init__(self, examples=None, predictions=None, classSet=None):
     if type(classSet) == types.StringType:  # class names are in file
         classSet = IdSet(filename=classSet)
     if type(predictions) == types.StringType:  # predictions are in file
         predictions = ExampleUtils.loadPredictions(predictions)
     if type(examples) == types.StringType:  # examples are in file
         examples = ExampleUtils.readExamples(examples, False)
     # self.examples = examples
     # self.predictions = predictions
     self.truePositives = 0
     self.falsePositives = 0
     self.trueNegatives = 0
     self.falseNegatives = 0
     self.precision = None
     self.recall = None
     self.fScore = None
     self.AUC = None
     self.type = "binary"
     if predictions != None:
         self._calculate(examples, predictions)
Example 30
def polynomizeExamples(exampleFile, outFile, weightFeatures, idSet):
    outFile = open(outFile, "wt")
    addCount = 0

    f = open(exampleFile)
    numExamples = sum([1 for line in f])
    f.close()
    counter = ProgressCounter(numExamples, "Polynomize examples", step=0)

    weightFeatureIds = {}
    for weightFeature in weightFeatures:
        wId = idSet.getId(weightFeature, False)
        if wId == None:
            sys.exit("Weight vector feature", weightFeature, "not in id file")
        weightFeatureIds[weightFeature] = wId

    print "Polynomizing", exampleFile
    exampleCache = []
    for example in ExampleUtils.readExamples(exampleFile):
        counter.update(1, "Processing example (" + example[0] + "): ")
        features = example[2]
        for i in range(len(weightFeatures) - 1):
            wI = weightFeatures[i]
            wIid = weightFeatureIds[wI]
            if not features.has_key(wIid):
                continue
            for j in range(i + 1, len(weightFeatures)):
                wJ = weightFeatures[j]
                wJid = weightFeatureIds[wJ]
                if not features.has_key(wJid):
                    continue
                # Make polynomial feature
                features[idSet.getId(wI + "_AND_" + wJ)] = 1
                addCount += 1
        exampleCache.append(example)
        if len(exampleCache) > 50:
            ExampleUtils.appendExamples(exampleCache, outFile)
            exampleCache = []
    ExampleUtils.appendExamples(exampleCache, outFile)
    outFile.close()
    print "Added", addCount, "polynomial features"
Example 31
    def buildExamplesForSentences(self,
                                  sentences,
                                  goldSentences,
                                  output,
                                  idFileTag=None,
                                  append=False):
        examples = []
        counter = ProgressCounter(len(sentences), "Build examples")

        if append:
            outfile = open(output, "at")
        else:
            outfile = open(output, "wt")
        exampleCount = 0
        for i in range(len(sentences)):
            sentence = sentences[i]
            goldSentence = [None]
            if goldSentences != None:
                goldSentence = goldSentences[i]
            counter.update(
                1, "Building examples (" + sentence[0].getSentenceId() + "): ")
            examples = self.buildExamples(sentence[0],
                                          goldSentence[0],
                                          append=append)
            exampleCount += len(examples)
            examples = self.preProcessExamples(examples)
            ExampleUtils.appendExamples(examples, outfile)
        outfile.close()

        print >> sys.stderr, "Examples built:", exampleCount
        print >> sys.stderr, "Features:", len(self.featureSet.getNames())
        #IF LOCAL
        if self.exampleStats.getExampleCount() > 0:
            self.exampleStats.printStats()
        #ENDIF
        # Save Ids
        if idFileTag != None:
            print >> sys.stderr, "Saving class names to", idFileTag + ".class_names"
            self.classSet.write(idFileTag + ".class_names")
            print >> sys.stderr, "Saving feature names to", idFileTag + ".feature_names"
            self.featureSet.write(idFileTag + ".feature_names")
Example 32
def polynomizeExamples(exampleFile, outFile, weightFeatures, idSet):
    outFile = open(outFile, "wt")
    addCount = 0
    
    f = open(exampleFile)
    numExamples = sum([1 for line in f])
    f.close()
    counter = ProgressCounter(numExamples, "Polynomize examples", step=0)
    
    weightFeatureIds = {}
    for weightFeature in weightFeatures:
        wId = idSet.getId(weightFeature, False)
        if wId == None:
            sys.exit("Weight vector feature", weightFeature, "not in id file")
        weightFeatureIds[weightFeature] = wId
    
    print "Polynomizing", exampleFile
    exampleCache = []
    for example in ExampleUtils.readExamples(exampleFile):
        counter.update(1, "Processing example ("+example[0]+"): ")
        features = example[2]
        for i in range(len(weightFeatures)-1):
            wI = weightFeatures[i]
            wIid = weightFeatureIds[wI]
            if not features.has_key(wIid):
                continue
            for j in range(i + 1, len(weightFeatures)):
                wJ = weightFeatures[j]
                wJid = weightFeatureIds[wJ]
                if not features.has_key(wJid):
                    continue
                # Make polynomial feature
                features[idSet.getId(wI + "_AND_" + wJ)] = 1
                addCount += 1
        exampleCache.append(example)
        if len(exampleCache) > 50:
            ExampleUtils.appendExamples(exampleCache, outFile)
            exampleCache = []
    ExampleUtils.appendExamples(exampleCache, outFile)
    outFile.close()
    print "Added", addCount, "polynomial features"
Example 33
def addExamples(exampleFile, predictionFile, classFile, matrix):
    classSet = IdSet(filename=classFile)
    f = open(predictionFile, "rt")
    for example in ExampleUtils.readExamples(exampleFile, False):
        pred = int(f.readline().split()[0])
        predClasses = classSet.getName(pred)
        goldClasses = classSet.getName(example[1])
        for predClass in predClasses.split("---"):
            for goldClass in goldClasses.split("---"):
                matrix[predClass][goldClass] # bare access; presumably seeds the cell when matrix is a nested defaultdict
                matrix[goldClass][predClass] += 1
    f.close()
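
matrix is created by the caller, and the bare matrix[predClass][goldClass] access only makes sense if it auto-vivifies, so the function presumably expects a two-level defaultdict along these lines:

    from collections import defaultdict

    matrix = defaultdict(lambda: defaultdict(int))
    matrix["A"]["B"]       # bare access seeds the cell with 0
    matrix["B"]["A"] += 1  # the mirrored cell is incremented
    # matrix now holds {"A": {"B": 0}, "B": {"A": 1}}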
Example 34
def addExamples(exampleFile, predictionFile, classFile, matrix):
    classSet = IdSet(filename=classFile)
    f = open(predictionFile, "rt")
    for example in ExampleUtils.readExamples(exampleFile, False):
        pred = int(f.readline().split()[0])
        predClasses = classSet.getName(pred)
        goldClasses = classSet.getName(example[1])
        for predClass in predClasses.split("---"):
            for goldClass in goldClasses.split("---"):
                matrix[predClass][goldClass] # bare access; presumably seeds the cell when matrix is a nested defaultdict
                matrix[goldClass][predClass] += 1
    f.close()
Example 35
    def threshold(cls, examples, predictions):
        # Make negative confidence score / true class pairs
        if type(examples) in types.StringTypes:
            examples = ExampleUtils.readExamples(examples, False)
        if type(predictions) in types.StringTypes:
            predictions = ExampleUtils.loadPredictions(predictions)
        pairs = []
        realPositives = 0
        for example, prediction in itertools.izip(examples, predictions):
            trueClass = example[1]
            assert trueClass > 0  # multiclass classification uses positive integer class ids
            if trueClass > 1:
                realPositives += 1
            negClassValue = prediction[1]
            pairs.append((negClassValue, trueClass))
        pairs.sort(reverse=True)
        realNegatives = len(pairs) - realPositives

        # When starting thresholding, all examples are considered positive
        binaryF = EvaluationData()
        binaryF._tp = realPositives
        binaryF._fp = realNegatives
        binaryF._fn = 0
        binaryF.calculateFScore()
        fscore = binaryF.fscore
        threshold = pairs[0][0] - 1.

        # Turn one example negative at a time
        for pair in pairs:
            if pair[1] == 1:  # the real class is negative
                binaryF._fp -= 1  # false positive -> true negative
            else:  # the real class is a positive class
                binaryF._tp -= 1  # true positive -> ...
                binaryF._fn += 1  # ... false negative
            binaryF.calculateFScore()
            if binaryF.fscore > fscore:
                fscore = binaryF.fscore
                threshold = pair[0] + 0.00000001
        return threshold, fscore
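
A worked miniature of the sweep above, with made-up scores: pairs are sorted by descending negative-class score and flipped to negative one at a time while the best binary F-score is tracked.

    # (negClassValue, trueClass) pairs in descending score order;
    # class 1 is negative, classes > 1 are positive
    pairs = [(0.9, 1), (0.4, 2), (-0.2, 1), (-0.5, 2)]
    # start, all positive:    tp=2 fp=2 fn=0 -> F = 2*2/(2*2+2+0) = 0.67
    # flip (0.9, 1) negative: fp=1           -> F = 4/(4+1)       = 0.80  (new best)
    # flip (0.4, 2) negative: tp=1 fn=1      -> F = 2/(2+1+1)     = 0.50
    # so the returned threshold sits just above 0.9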
Example 36
    def train(self, examples, parameters=None, outputDir=None):
        timeout = -1
        if type(examples) == types.StringType:
            trainFilePath = examples
        elif type(examples) == types.ListType:
            examples = self.filterTrainingSet(examples)
            parameters = copy.copy(parameters)
            if parameters != None and parameters.has_key("style"):
                if "no_duplicates" in parameters["style"]:
                    examples = Example.removeDuplicates(examples)
                del parameters["style"]
            Example.writeExamples(examples, self.tempDir+"/train.dat")
            trainFilePath = self.tempDir+"/train.dat"

        if parameters.has_key("timeout"):
            timeout = parameters["timeout"]
            del parameters["timeout"]        
        args = [self.trainBin]
        if parameters != None:
            self.__addParametersToSubprocessCall(args, parameters)
        args += [trainFilePath, self.tempDir+"/model"]
        return killableprocess.call(args, stdout = self.debugFile, timeout = timeout)
Example 37
 def buildGraphKernelFeatures(self, sentenceGraph, path):
     edgeList = []
     depGraph = sentenceGraph.dependencyGraph
     pt = path
     for i in range(1, len(path)):
         edgeList.extend(depGraph.getEdges(pt[i], pt[i-1]))
         edgeList.extend(depGraph.getEdges(pt[i-1], pt[i]))
     edges = edgeList
     adjacencyMatrix, labels = self._buildAdjacencyMatrix(sentenceGraph, path, edges)
     node_count = 2*len(sentenceGraph.tokens) + len(sentenceGraph.dependencies)
     
     if sentenceGraph.sentenceElement.attrib["id"] == "LLL.d0.s0":
         adjacencyMatrixToHtml(adjacencyMatrix, labels, "LLL.d0.s0_adjacency_matrix.html")
     
     allPathsMatrix = self._prepareMatrix(adjacencyMatrix, node_count)
     self._matrixToFeatures(allPathsMatrix, labels)
     if sentenceGraph.sentenceElement.attrib["id"] == "LLL.d0.s0":
         adjacencyMatrixToHtml(allPathsMatrix, labels, "LLL.d0.s0_all_paths_matrix.html")
         commentLines = []
         commentLines.extend(self.featureSet.toStrings())
         example = ["example_"+self.entity1.attrib["id"]+"_"+self.entity2.attrib["id"],"unknown",self.features]
         ExampleUtils.writeExamples([example],"LLL.d0.s0_example.txt",commentLines)
Example 38
 def threshold(cls, examples, predictions):
     # Make negative confidence score / true class pairs
     if type(examples) in types.StringTypes:
         examples = ExampleUtils.readExamples(examples, False)
     if type(predictions) in types.StringTypes:
         predictions = ExampleUtils.loadPredictions(predictions)
     pairs = []
     realPositives = 0
     for example, prediction in itertools.izip(examples, predictions):
         trueClass = example[1]
         assert trueClass > 0 # multiclass classification uses positive integer class ids
         if trueClass > 1:
             realPositives += 1
         negClassValue = prediction[1]
         pairs.append( (negClassValue, trueClass) )
     pairs.sort(reverse=True)
     realNegatives = len(pairs) - realPositives
     
     # When starting thresholding, all examples are considered positive
     binaryF = EvaluationData()
     binaryF._tp = realPositives
     binaryF._fp = realNegatives
     binaryF._fn = 0
     binaryF.calculateFScore()
     fscore = binaryF.fscore
     threshold = pairs[0][0]-1.
     
     # Turn one example negative at a time
     for pair in pairs:
         if pair[1] == 1: # the real class is negative
             binaryF._fp -= 1 # false positive -> true negative
         else: # the real class is a positive class
             binaryF._tp -= 1 # true positive -> ...
             binaryF._fn += 1 # ... false negative
         binaryF.calculateFScore()
         if binaryF.fscore > fscore:
             fscore = binaryF.fscore
             threshold = pair[0]+0.00000001
     return threshold, fscore        
Example 39
    def classifyToXML(self,
                      data,
                      model,
                      exampleFileName=None,
                      tag="",
                      classifierModel=None,
                      goldData=None,
                      parse=None,
                      recallAdjust=None,
                      compressExamples=True):
        model = self.openModel(model, "r")
        if parse == None:
            parse = self.getStr(self.tag + "parse", model)
        if exampleFileName == None:
            exampleFileName = tag + self.tag + "examples"
            if compressExamples:
                exampleFileName += ".gz"
            self.buildExamples(model, [data], [exampleFileName], [goldData],
                               parse=parse)
        if classifierModel == None:
            classifierModel = model.get(self.tag + "classifier-model")
        else:
            assert os.path.exists(classifierModel), classifierModel
        classifier = self.Classifier()
        classifier.classify(exampleFileName,
                            tag + self.tag + "classifications",
                            classifierModel,
                            finishBeforeReturn=True)
        predictions = ExampleUtils.loadPredictions(
            tag + self.tag + "classifications", recallAdjust)
        evaluator = self.evaluator.evaluate(
            exampleFileName, predictions, model.get(self.tag + "ids.classes"))
        #outputFileName = tag+"-"+self.tag+"pred.xml.gz"
        return self.exampleWriter.write(exampleFileName, predictions, data,
                                        tag + self.tag + "pred.xml.gz",
                                        model.get(self.tag + "ids.classes"),
                                        parse)


#        if evaluator.getData().getTP() + evaluator.getData().getFP() > 0:
#            return self.exampleWriter.write(exampleFileName, predictions, data, outputFileName, model.get(self.tag+"ids.classes"), parse)
#        else:
#            # TODO: e.g. interactions must be removed if task does unmerging
#            print >> sys.stderr, "No positive", self.tag + "predictions, XML file", outputFileName, "unchanged from input"
#            if type(data) in types.StringTypes: # assume its a file
#                shutil.copy(data, outputFileName)
#            else: # assume its an elementtree
#                ETUtils.write(data, outputFileName)
#            #print >> sys.stderr, "No positive predictions, XML file", tag+self.tag+"pred.xml", "not written"
#            return data #None
Example 40
 def train(self, examples, parameters=None):
     self.isBinary = self.isBinaryProblem(examples)
     examples = self.filterTrainingSet(examples)
     ExampleUtils.writeExamples(examples, self.tempDir+"/train.dat")
     #prepare parameters:
     if parameters == None:
         parameters = {}
     if parameters.has_key("c"):
         assert(not parameters.has_key("C"))
         parameters["C"] = parameters["c"]
         del parameters["c"]
     totalExamples = float(sum(self.classes.values()))
     weight_label = self.classes.keys()
     weight_label.sort()
     weight = []
     for k in weight_label:
         weight.append(1.0-self.classes[k]/totalExamples)
     libSVMparam = svm.svm_parameter(nr_weight = len(self.classes), weight_label=weight_label, weight=weight, **parameters)
     labels = []
     samples = []
     for example in examples:
         labels.append(example[1])
         samples.append(example[2])
     problem = svm.svm_problem(labels, samples)
     self.model = svm.svm_model(problem, libSVMparam)
Example 41
def buildExamples(exampleBuilder, sentences, outfilename):
    timer = Timer()
    examples = []
    if "graph_kernel" in exampleBuilder.styles:
        counter = ProgressCounter(len(sentences), "Build examples", 0)
    else:
        counter = ProgressCounter(len(sentences), "Build examples")
    
    calculatePredictedRange(exampleBuilder, sentences)
    
    outfile = open(outfilename, "wt")
    exampleCount = 0
    for sentence in sentences:
        counter.update(1, "Building examples ("+sentence[0].getSentenceId()+"): ")
        examples = exampleBuilder.buildExamples(sentence[0])
        exampleCount += len(examples)
        examples = exampleBuilder.preProcessExamples(examples)
        Example.appendExamples(examples, outfile)
    outfile.close()

    print >> sys.stderr, "Examples built:", str(exampleCount)
    print >> sys.stderr, "Features:", len(exampleBuilder.featureSet.getNames())
    print >> sys.stderr, "Elapsed", timer.toString()
Example 42
    def write(
        cls,
        examples,
        predictions,
        corpus,
        outputFile,
        classSet=None,
        parse=None,
        tokenization=None,
        goldCorpus=None,
        insertWeights=False,
    ):
        if type(examples) == types.StringType:
            print >>sys.stderr, "Reading examples from", examples
            examples = ExampleUtils.readExamples(examples, False)

        # This looks a bit strange, but should work with the re-iterable
        # generators that readExamples returns
        xType = None
        for example in examples:
            assert example[3].has_key("xtype")
            xType = example[3]["xtype"]
            break

        if xType == "token":
            w = EntityExampleWriter()
            if insertWeights:
                w.insertWeights = True
        elif xType == "edge":
            w = EdgeExampleWriter()
        elif xType == "task3":
            w = ModifierExampleWriter()
        elif xType == "entRel":
            w = EntityRelationExampleWriter()
        elif xType == "phrase":
            w = PhraseTriggerExampleWriter()
        # IF LOCAL
        elif xType == "um":
            w = UnmergingExampleWriter()
        # elif xType == "ue":
        #    w = UnmergedEdgeExampleWriter()
        # elif xType == "asym":
        #    w = AsymmetricEventExampleWriter()
        # ENDIF
        else:
            assert False, ("Unknown entity type", xType)
        return w.writeXML(
            examples, predictions, corpus, outputFile, classSet, parse, tokenization, goldCorpus=goldCorpus
        )
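
The dispatch above reads the example's xtype from its extra-info slot. A sketch of the 4-tuple layout this assumes, consistent with the example[0]/example[1]/example[2] accesses elsewhere in this listing:

    # (id, class, feature dict, extra info)
    example = ("doc1.s0.x0", 2, {1: 1.0, 5: 0.5}, {"xtype": "edge"})
    assert example[3]["xtype"] in ("token", "edge", "task3", "entRel", "phrase", "um")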
Example 43
    def train(self, examples, parameters=None, outputDir=None):
        timeout = -1
        if type(examples) == types.StringType:
            trainFilePath = examples
        elif type(examples) == types.ListType:
            examples = self.filterTrainingSet(examples)
            parameters = copy.copy(parameters)
            if parameters != None and parameters.has_key("style"):
                if "no_duplicates" in parameters["style"]:
                    examples = Example.removeDuplicates(examples)
                del parameters["style"]
            Example.writeExamples(examples, self.tempDir + "/train.dat")
            trainFilePath = self.tempDir + "/train.dat"

        if parameters.has_key("timeout"):
            timeout = parameters["timeout"]
            del parameters["timeout"]
        args = [self.trainBin]
        if parameters != None:
            self.__addParametersToSubprocessCall(args, parameters)
        args += [trainFilePath, self.tempDir + "/model"]
        return killableprocess.call(args,
                                    stdout=self.debugFile,
                                    timeout=timeout)
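A minimal sketch of the parameter handling above: the dict is copied, control keys such as "style" and "timeout" are popped off, and the remainder becomes command-line arguments. The "-key value" flag convention is an assumption for illustration.

# Hedged sketch of splitting control keys from pass-through classifier
# parameters; the "-key value" argument convention is an assumption.
import copy

def buildArgs(binary, parameters):
    parameters = copy.copy(parameters)
    timeout = parameters.pop("timeout", -1)
    parameters.pop("style", None)           # consumed before training
    args = [binary]
    for key in sorted(parameters.keys()):
        args += ["-" + key, str(parameters[key])]
    return args, timeout

args, timeout = buildArgs("svm_learn", {"c": 0.5, "timeout": 60})
print args, timeout   # -> ['svm_learn', '-c', '0.5'] 60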
Esempio n. 44
0
    def train(cls, examples, parameters, outputFile=None):  #, timeout=None):
        """
        Train the SVM-multiclass classifier on a set of examples.
        
        @type examples: string (filename) or list (or iterator) of examples
        @param examples: a list or file containing examples in SVM-format
        @type parameters: a dictionary or string
        @param parameters: parameters for the classifier
        @type outputFile: string
        @param outputFile: the name of the model file to be written
        """
        timer = Timer()
        parameters = cls.getParams(parameters)

        # If examples are in a list, they will be written to a file for SVM-multiclass
        if type(examples) == types.ListType:
            print >> sys.stderr, "Training SVM-MultiClass on", len(
                examples), "examples"
            trainPath = cls.tempDir + "/train.dat"
            examples = cls.filterTrainingSet(examples)
            Example.writeExamples(examples, trainPath)
        else:
            print >> sys.stderr, "Training SVM-MultiClass on file", examples
            trainPath = cls.stripComments(examples)
        args = ["/home/jari/Programs/liblinear-1.5-poly2/train"]
        cls.__addParametersToSubprocessCall(args, parameters)
        if outputFile == None:
            args += [trainPath, "model"]
            logFile = open("svmmulticlass.log", "at")
        else:
            args += [trainPath, outputFile]
            logFile = open(outputFile + ".log", "wt")
        rv = subprocess.call(args, stdout=logFile)
        logFile.close()
        print >> sys.stderr, timer.toString()
        return rv
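The train-and-log pattern above reduces to running an external binary with stdout redirected to a log file and returning its exit status. A runnable toy version (echo stands in for the real trainer):

# Hedged sketch of the train-and-log pattern above; "echo" stands in
# for the external trainer binary.
import subprocess

logFile = open("train.log", "at")
rv = subprocess.call(["echo", "training..."], stdout=logFile)
logFile.close()
print "exit status:", rv   # -> 0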
Esempio n. 47
0
    def __init__(self, examples=None, predictions=None, classSet=None):
        if type(classSet) == types.StringType: # class names are in file
            classSet = IdSet(filename=classSet)
        if type(predictions) == types.StringType: # predictions are in file
            predictions = ExampleUtils.loadPredictions(predictions)
        if type(examples) == types.StringType: # examples are in file
            examples = ExampleUtils.readExamples(examples, False)
        
        self.keep = set(["CPR:3", "CPR:4", "CPR:5", "CPR:6", "CPR:9"])

        self.classSet = classSet
        self.results = None
        self.internal = None
        if predictions != None:
            for example in examples:
                if example[3] != None:
                    print >> sys.stderr, "ChemProt Evaluator:"
                    self._calculateExamples(examples, predictions)
                else:
                    print >> sys.stderr, "No example extra info, skipping ChemProt evaluation"
                break
            self.internal = AveragingMultiClassEvaluator(examples, predictions, classSet)
            print >> sys.stderr, "AveragingMultiClassEvaluator:"
            print >> sys.stderr, self.internal.toStringConcise()
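The self.keep set above suggests that only a subset of CPR classes is scored. A minimal sketch of that kind of filtering with toy predictions (the evaluation details are an assumption):

# Hedged sketch of restricting evaluation to the kept CPR classes.
keep = set(["CPR:3", "CPR:4", "CPR:5", "CPR:6", "CPR:9"])
predicted = [("d1", "CPR:3"), ("d2", "CPR:10"), ("d3", "CPR:4")]
evaluated = [p for p in predicted if p[1] in keep]
print len(evaluated), "of", len(predicted), "predictions evaluated"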
Esempio n. 48
0
 def classify(self, examples, output, model=None, finishBeforeReturn=False, replaceRemoteFiles=True):
     output = os.path.abspath(output)
     # Get examples
     if type(examples) == types.ListType:
         print >> sys.stderr, "Classifying", len(examples), "with All-Correct Classifier"
     else:
         print >> sys.stderr, "Classifying file", examples, "with All-Correct Classifier"
         examples = self.getExampleFile(examples, upload=False, replaceRemote=False, dummy=False)
         examples = Example.readExamples(examples, False)
     # Return a new classifier instance for following the training process and using the model
     classifier = copy.copy(self)
     # Classify
     f = open(output, "wt")
     for example in examples:
         f.write(str(example[1]) + "\n")
     f.close()
     classifier.predictions = output
     return classifier
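The All-Correct classifier above simply writes each example's gold class as its prediction, which yields an oracle prediction file for testing the rest of the pipeline. A runnable toy version:

# Hedged sketch of the oracle predictions written above: one line per
# example, containing its gold class id.
examples = [("s1.x0", 2, {}, {}), ("s1.x1", 1, {}, {})]
f = open("oracle-predictions.txt", "wt")
for example in examples:
    f.write(str(example[1]) + "\n")
f.close()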
Esempio n. 50
0
 def test(cls, examples, modelPath, output=None, parameters=None, forceInternal=False, classIds=None): # , timeout=None):
     """
     Classify examples with a pre-trained model.
     
     @type examples: string (filename) or list (or iterator) of examples
     @param examples: a list or file containing examples in SVM-format
     @type modelPath: string
     @param modelPath: filename of the pre-trained model file
     @type parameters: a dictionary or string
     @param parameters: parameters for the classifier
     @type output: string
     @param output: the name of the predictions file to be written
     @type forceInternal: Boolean
     @param forceInternal: Use python classifier even if SVM Multiclass binary is defined in Settings.py
     """
     if type(parameters) == types.StringType:
         parameters = splitParameters(parameters)
     timer = Timer()
     if type(examples) == types.ListType:
         print >> sys.stderr, "Classifying", len(examples), "with SVM-MultiClass model", modelPath
         examples, predictions = self.filterClassificationSet(examples, False)
         testPath = self.tempDir+"/test.dat"
         Example.writeExamples(examples, testPath)
     else:
         print >> sys.stderr, "Classifying file", examples, "with SVM-MultiClass model", modelPath
         testPath = examples
         examples = Example.readExamples(examples,False)
     if parameters != None:
         parameters = copy.copy(parameters)
         if parameters.has_key("c"):
             del parameters["c"]
         if parameters.has_key("predefined"):
             parameters = copy.copy(parameters)
             modelPath = os.path.join(parameters["predefined"][0],"classifier/model")
             del parameters["predefined"]
     # Read model
     if modelPath == None:
         modelPath = "model-multilabel"
     classModels = {}
     if modelPath.endswith(".gz"):
         f = gzip.open(modelPath, "rt")
     else:
         f = open(modelPath, "rt")
     thresholds = {}
     for line in f:
         key, value, threshold = line.split()
         classModels[key] = value
         if threshold != "None":
             thresholds[key] = float(threshold)
         else:
             thresholds[key] = 0.0
     f.close()
     mergedPredictions = []
     if type(classIds) == types.StringType:
         classIds = IdSet(filename=classIds)
     #print classModels
     print "Thresholds", thresholds
     classifierBin = Settings.SVMMultiClassDir+"/svm_multiclass_classify"
     print parameters
     if "classifier" in parameters and "svmperf" in parameters["classifier"]:
         classifierBin = Settings.SVMPerfDir+"/svm_perf_classify"
         parameters = copy.copy(parameters)
         del parameters["classifier"]
     for className in classIds.getNames():
         if className != "neg" and not "---" in className:
             classId = classIds.getId(className)
             if thresholds[str(className)] != 0.0:
                 print >> sys.stderr, "Classifying", className, "with threshold", thresholds[str(className)]
             else:
                 print >> sys.stderr, "Classifying", className
             args = [classifierBin]
             #self.__addParametersToSubprocessCall(args, parameters)
             classOutput = "predictions" + ".cls-" + className
             logFile = open("svmmulticlass" + ".cls-" + className + ".log","at")
             args += [testPath, classModels[str(className)], classOutput]
             print args
             subprocess.call(args, stdout = logFile, stderr = logFile)
             cls.addPredictions(classOutput, mergedPredictions, classId, len(classIds.Ids), threshold=thresholds[str(className)])
     print >> sys.stderr, timer.toString()
     
     predFileName = output
     f = open(predFileName, "wt")
     for mergedPred in mergedPredictions:
         if len(mergedPred[0]) > 1 and "1" in mergedPred[0]:
             mergedPred[0].remove("1")
         mergedPred[1] = str(mergedPred[1])
         mergedPred[0] = ",".join(sorted(list(mergedPred[0])))
         f.write(" ".join(mergedPred) + "\n")
     f.close()
     
     return mergedPredictions
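A self-contained sketch of the per-class merging done above: each class's score is compared against its threshold, and the names of every class that fires are joined into one multilabel prediction per example. Data and thresholds are toy values.

# Hedged sketch of merging thresholded per-class predictions into one
# multilabel string per example (toy scores and thresholds).
scores = {"Binding": [0.1, 0.4], "Phosphorylation": [0.7, -0.2]}
thresholds = {"Binding": 0.3, "Phosphorylation": 0.0}
merged = []
for i in range(2):
    classes = set()
    for className in sorted(scores.keys()):
        if scores[className][i] > thresholds[className]:
            classes.add(className)
    if len(classes) == 0:
        classes.add("neg")
    merged.append(",".join(sorted(classes)))
print merged   # -> ['Phosphorylation', 'Binding']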
Esempio n. 51
0
    # Import Psyco if available
    try:
        import psyco
        psyco.full()
        print >> sys.stderr, "Found Psyco, using"
    except ImportError:
        print >> sys.stderr, "Psyco not installed"
    
    defaultAnalysisFilename = "/usr/share/biotext/ComplexPPI/BioInferForComplexPPIVisible.xml"
    optparser = OptionParser(usage="%prog [options]\nMap the class ids of variant examples to an invariant class set.")
    optparser.add_option("-i", "--invariant", default=None, dest="invariant", help="Corpus in analysis format", metavar="FILE")
    optparser.add_option("-v", "--variant", default=None, dest="variant", help="Corpus in analysis format", metavar="FILE")
    (options, args) = optparser.parse_args()
    
    #invariantExamples = ExampleUtils.readExamples(os.path.join(options.invariant, "examples.txt"))
    variantExamples = ExampleUtils.readExamples(os.path.join(options.variant, "test-triggers.examples"))
    
    invariantFeatureSet = IdSet()
    invariantFeatureSet.load(os.path.join(options.invariant, "feature_names.txt"))
    invariantClassSet = IdSet()
    invariantClassSet.load(os.path.join(options.invariant, "class_names.txt"))

    variantFeatureSet = IdSet()
    variantFeatureSet.load(os.path.join(options.variant, "test-triggers.examples.feature_names"))
    variantClassSet = IdSet()
    variantClassSet.load(os.path.join(options.variant, "test-triggers.examples.class_names"))
    
    counter = ProgressCounter(len(variantExamples))
    for example in variantExamples:
        counter.update()
        example[1] = invariantClassSet.getId(variantClassSet.getName(example[1]))
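The remapping loop above translates each example's class id from the variant numbering to the invariant one via the shared class name. The same idea with plain dicts standing in for IdSet:

# Hedged sketch of the class-id remapping above, with plain dicts
# standing in for IdSet.
variantNames = {1: "neg", 2: "Phosphorylation"}   # variant id -> name
invariantIds = {"neg": 5, "Phosphorylation": 9}   # name -> invariant id
examples = [["s1.x0", 2, {}, {}]]
for example in examples:
    example[1] = invariantIds[variantNames[example[1]]]
print examples[0][1]   # -> 9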
Esempio n. 52
0
    def classifyToXML(self,
                      data,
                      model,
                      exampleFileName=None,
                      tag="",
                      classifierModel=None,
                      goldData=None,
                      parse=None,
                      recallAdjust=None,
                      compressExamples=True,
                      exampleStyle=None,
                      useExistingExamples=False):
        model = self.openModel(model, "r")
        if parse == None:
            parse = self.getStr(self.tag + "parse", model)
        if useExistingExamples:
            assert exampleFileName != None
            assert os.path.exists(exampleFileName)
        if exampleFileName == None:
            exampleFileName = tag + self.tag + "examples"
            if compressExamples:
                exampleFileName += ".gz"
        if not useExistingExamples:
            self.buildExamples(model, [data], [exampleFileName], [goldData],
                               parse=parse,
                               exampleStyle=exampleStyle)
        if classifierModel == None:
            classifierModel = model.get(self.tag + "classifier-model",
                                        defaultIfNotExist=None)
        #else:
        #    assert os.path.exists(classifierModel), classifierModel
        classifier = self.getClassifier(
            model.getStr(self.tag + "classifier-parameter",
                         defaultIfNotExist=None))()
        classifier.classify(exampleFileName,
                            tag + self.tag + "classifications",
                            classifierModel,
                            finishBeforeReturn=True)
        threshold = model.getStr(self.tag + "threshold",
                                 defaultIfNotExist=None,
                                 asType=float)
        predictions = ExampleUtils.loadPredictions(tag + self.tag +
                                                   "classifications",
                                                   recallAdjust,
                                                   threshold=threshold)
        evaluator = self.evaluator.evaluate(
            exampleFileName, predictions, model.get(self.tag + "ids.classes"))
        #outputFileName = tag+"-"+self.tag+"pred.xml.gz"
        #exampleStyle = self.exampleBuilder.getParameters(model.getStr(self.tag+"example-style"))
        if exampleStyle == None:
            exampleStyle = Parameters.get(
                model.getStr(self.tag + "example-style")
            )  # no checking, but these should already have passed the ExampleBuilder
        self.structureAnalyzer.load(model)
        return self.exampleWriter.write(
            exampleFileName,
            predictions,
            data,
            tag + self.tag + "pred.xml.gz",
            model.get(self.tag + "ids.classes"),
            parse,
            exampleStyle=exampleStyle,
            structureAnalyzer=self.structureAnalyzer)


#        if evaluator.getData().getTP() + evaluator.getData().getFP() > 0:
#            return self.exampleWriter.write(exampleFileName, predictions, data, outputFileName, model.get(self.tag+"ids.classes"), parse)
#        else:
#            # TODO: e.g. interactions must be removed if task does unmerging
#            print >> sys.stderr, "No positive", self.tag + "predictions, XML file", outputFileName, "unchanged from input"
#            if type(data) in types.StringTypes: # assume its a file
#                shutil.copy(data, outputFileName)
#            else: # assume its an elementtree
#                ETUtils.write(data, outputFileName)
#            #print >> sys.stderr, "No positive predictions, XML file", tag+self.tag+"pred.xml", "not written"
#            return data #None
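loadPredictions above accepts an optional threshold. A hedged sketch of what such thresholding could look like: a positive-class prediction is demoted to the negative class (id 1) when its best score stays under the threshold. The "class score score ..." line format is an assumption for illustration.

# Hedged sketch of threshold-based prediction loading; the line format
# is an assumption.
lines = ["2 0.4 0.8", "3 0.1 -0.5"]
threshold = 0.6
predictions = []
for line in lines:
    values = [float(v) for v in line.split()]
    predictedClass = int(values[0])
    if predictedClass != 1 and max(values[1:]) < threshold:
        predictedClass = 1   # demote to the negative class
    predictions.append([predictedClass] + values[1:])
print predictions   # -> [[2, 0.4, 0.8], [1, 0.1, -0.5]]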
Esempio n. 53
0
 def preProcessExamples(self, allExamples):
     if "normalize" in self.styles:
         print >> sys.stderr, " Normalizing feature vectors"
         ExampleUtils.normalizeFeatureVectors(allExamples)
     return allExamples
Esempio n. 54
0
    def buildExamplesFromGraph(self, sentenceGraph, outfile, goldGraph=None, structureAnalyzer=None):
        """
        Build examples for a single sentence. Returns a list of examples.
        See Core/ExampleUtils for example format.
        """
        self.multiEdgeFeatureBuilder.setFeatureVector(resetCache=True)
        self.triggerFeatureBuilder.initSentence(sentenceGraph)        
        
        exampleIndex = 0
        undirected = sentenceGraph.dependencyGraph.toUndirected()
        paths = undirected
        
        # Cache interaction edge lengths (used for ordering arguments)
        self.interactionLengths = self.getInteractionEdgeLengths(sentenceGraph, paths)
        
        # Map tokens to character offsets
        tokenByOffset = {}
        for i in range(len(sentenceGraph.tokens)):
            token = sentenceGraph.tokens[i]
            if goldGraph != None: # check that the tokenizations match
                goldToken = goldGraph.tokens[i]
                assert token.get("id") == goldToken.get("id") and token.get("charOffset") == goldToken.get("charOffset")
            tokenByOffset[token.get("charOffset")] = token.get("id")
        
        # Map gold entities to their head offsets
        goldEntitiesByOffset = {}
        if goldGraph != None:
            for entity in goldGraph.entities:
                offset = entity.get("headOffset")
                assert offset != None
                if not goldEntitiesByOffset.has_key(offset):
                    goldEntitiesByOffset[offset] = []
                goldEntitiesByOffset[offset].append(entity)
        
        if self.styles["no_merge"]:
            mergeInput = False
            entities = sentenceGraph.entities
        else:
            mergeInput = True
            sentenceGraph.mergeInteractionGraph(True)
            entities = sentenceGraph.mergedEntities
            self.exampleStats.addValue("Duplicate entities skipped", len(sentenceGraph.entities) - len(entities))
        
        exampleIndex = 0
        for entity in entities: # sentenceGraph.entities:
            if type(entity) in types.StringTypes: # dummy entity for intersentence interactions
                continue
            
            eType = entity.get("type")
            assert eType != None, entity.attrib
            eType = str(eType)
            
            interactions = [x[2] for x in sentenceGraph.getOutInteractions(entity, mergeInput)]
            interactions = self.sortInteractionsById(interactions)
            interactionCounts = defaultdict(int)
            validInteractionsByType = defaultdict(list)
            for interaction in interactions:
                if interaction.get("event") != "True":
                    continue
                e1 = sentenceGraph.entitiesById[interaction.get("e1")]
                if interaction.get("e2") in sentenceGraph.entitiesById:
                    e2 = sentenceGraph.entitiesById[interaction.get("e2")]
                    if interaction.get("type") in structureAnalyzer.getValidEdgeTypes(e1.get("type"), e2.get("type")):
                        validInteractionsByType[interaction.get("type")].append(interaction)
                else: # intersentence
                    validInteractionsByType[interaction.get("type")].append(interaction)
                interactionCounts[interaction.get("type")] += 1
            interactionCountString = ",".join([key + "=" + str(interactionCounts[key]) for key in sorted(interactionCounts.keys())])
            #argCombinations = self.getArgumentCombinations(eType, interactions, entity.get("id"))
            intCombinations = []
            validIntTypeCount = 0
            maxArgCount = 0
            if self.debug:
                print >> sys.stderr, entity.get("id"), entity.get("type"), "int:" + interactionCountString, "validInt:" + str(validInteractionsByType)
            for intType in sorted(validInteractionsByType.keys()): # for each argument type the event can have
                validIntTypeCount += 1
                intCombinations.append([])
                minArgs, maxArgs = structureAnalyzer.getArgLimits(entity.get("type"), intType)
                if maxArgs > maxArgCount:
                    maxArgCount = maxArgs
                #if maxArgs > 1: # allow any number of arguments for cases like Binding
                #    maxArgs = len(validInteractionsByType[intType])
                for combLen in range(minArgs, maxArgs+1): # for each valid argument count, get all possible combinations; note that there may be a zero-length combination
                    for singleTypeArgCombination in combinations(validInteractionsByType[intType], combLen):
                        intCombinations[-1].append(singleTypeArgCombination)
                # e.g. theme:[a,b], cause:[d] = [[
            # intCombinations now contains a list of lists, each of which has a tuple for each valid combination
            # of one argument type. Next, we'll make all valid combinations of multiple argument types
            if self.debug:
                print >> sys.stderr, " ", "intCombinations", intCombinations
            argCombinations = combine.combine(*intCombinations)
            if self.debug:
                print >> sys.stderr, " ", "argCombinations", argCombinations
            for i in range(len(argCombinations)):
                argCombinations[i] = sum(argCombinations[i], ())
            #sum(argCombinations, []) # flatten nested list
            if self.debug:
                print >> sys.stderr, " ", "argCombinations flat", argCombinations
            
            for argCombination in argCombinations:
                # Originally binary classification
                if goldGraph != None:
                    isGoldEvent = self.eventIsGold(entity, argCombination, sentenceGraph, goldGraph, goldEntitiesByOffset, goldGraph.interactions)
                    #if eType == "Binding":
                    #    print argCombination[0].get("e1"), len(argCombination), isGoldEvent
                else:
                    isGoldEvent = False
                # Named (multi-)class
                if isGoldEvent:
#                    category = "zeroArg"
#                    if validIntTypeCount == 1:
#                        category = "singleArg" # event has 0-1 arguments (old simple6)
#                    if validIntTypeCount > 1:
#                        category = "multiType" # event has arguments of several types, 0-1 of each (old Regulation)
#                    if maxArgCount > 1:
#                        category = "multiArg" # event can have 2-n of at least one argument type (old Binding)
                    if self.styles["binary"]:
                        category = "pos"
                    else:
                        category = entity.get("type")
                        
                    assert category != None
                else:
                    category = "neg"
                self.exampleStats.beginExample(category)
                
                issues = defaultdict(int)
                # early out for proteins etc.
                if validIntTypeCount == 0 and entity.get("given") == "True":
                    self.exampleStats.filter("given-leaf:" + entity.get("type"))
                    if self.debug:
                        print >> sys.stderr, " ", category +"("+eType+")", "arg combination", argCombination, "LEAF"
                elif not structureAnalyzer.isValidEvent(entity, argCombination, self.documentEntitiesById, issues=issues):
                    for key in issues:
                        self.exampleStats.filter(key)
                    if self.debug:
                        print >> sys.stderr, " ", category, "arg combination", argCombination, "INVALID", issues
                else:
                    if self.debug:
                        print >> sys.stderr, " ", category, "arg combination", argCombination, "VALID"                
                    features = {}
                    argString = ""
                    for arg in argCombination:
                        argString += "," + arg.get("type") + "=" + arg.get("id")
                    extra = {"xtype":"um","e":entity.get("id"),"i":argString[1:],"etype":eType,"class":category}
                    extra["allInt"] = interactionCountString
                    assert type(extra["etype"]) in types.StringTypes, extra
                    assert type(extra["class"]) in types.StringTypes, category
                    assert type(extra["i"]) in types.StringTypes, argString
                    example = self.buildExample(sentenceGraph, paths, entity, argCombination, interactions)
                    example[0] = sentenceGraph.getSentenceId()+".x"+str(exampleIndex)
                    example[1] = self.classSet.getId(category)
                    example[3] = extra
                    #examples.append( example )
                    ExampleUtils.appendExamples([example], outfile)
                    exampleIndex += 1
                self.exampleStats.endExample()
            
        #return examples
        return exampleIndex
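The combination machinery above first enumerates, for each argument type, every tuple of legal size, and then takes the cross product over argument types. A self-contained sketch of the same enumeration, with itertools.product standing in for combine.combine (toy interactions and limits):

# Hedged sketch of the argument-combination enumeration above, with
# itertools.product standing in for combine.combine.
from itertools import combinations, product

validInteractionsByType = {"Theme": ["t1", "t2"], "Cause": ["c1"]}
argLimits = {"Cause": (0, 1), "Theme": (1, 2)}   # (minArgs, maxArgs)

intCombinations = []
for intType in sorted(validInteractionsByType.keys()):
    minArgs, maxArgs = argLimits[intType]
    perType = []
    for combLen in range(minArgs, maxArgs + 1):
        perType.extend(combinations(validInteractionsByType[intType], combLen))
    intCombinations.append(perType)

# Cross product over argument types, each result flattened to one tuple
argCombinations = [sum(combo, ()) for combo in product(*intCombinations)]
print argCombinations
# -> [('t1',), ('t2',), ('t1', 't2'),
#     ('c1', 't1'), ('c1', 't2'), ('c1', 't1', 't2')]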
Esempio n. 55
0
    def buildExamplesFromGraph(self, sentenceGraph, outfile, goldGraph = None, structureAnalyzer=None):
        """
        Build examples for a single sentence. Returns a list of examples.
        See Core/ExampleUtils for example format.
        """
        #examples = []
        exampleIndex = 0
        # example directionality
        if self.styles["directed"] == None and self.styles["undirected"] == None: # determine directedness from corpus
            examplesAreDirected = structureAnalyzer.hasDirectedTargets() if structureAnalyzer != None else True
        elif self.styles["directed"]:
            assert self.styles["undirected"] in [None, False]
            examplesAreDirected = True
        elif self.styles["undirected"]:
            assert self.styles["directed"] in [None, False]
            examplesAreDirected = False
        
        if not self.styles["no_trigger_features"]: 
            self.triggerFeatureBuilder.initSentence(sentenceGraph)
        if self.styles["evex"]: 
            self.evexFeatureBuilder.initSentence(sentenceGraph)
#         if self.styles["sdb_merge"]:
#             self.determineNonOverlappingTypes(structureAnalyzer)
            
        # Filter entities, if needed
        sentenceGraph.mergeInteractionGraph(True)
        entities = sentenceGraph.mergedEntities
        entityToDuplicates = sentenceGraph.mergedEntityToDuplicates
        self.exampleStats.addValue("Duplicate entities skipped", len(sentenceGraph.entities) - len(entities))
        
        # Connect to optional gold graph
        entityToGold = None
        if goldGraph != None:
            entityToGold = EvaluateInteractionXML.mapEntities(entities, goldGraph.entities)
        
        paths = None
        if not self.styles["no_path"]:
            undirected = sentenceGraph.dependencyGraph.toUndirected()
            paths = undirected
            if self.styles["filter_shortest_path"] != None: # For DDI use filter_shortest_path=conj_and
                paths.resetAnalyses() # just in case
                paths.FloydWarshall(self.filterEdge, {"edgeTypes":self.styles["filter_shortest_path"]})
        
        # Generate examples based on interactions between entities or interactions between tokens
        if self.styles["token_nodes"]:
            loopRange = len(sentenceGraph.tokens)
        else:
            loopRange = len(entities)
        for i in range(loopRange-1):
            for j in range(i+1,loopRange):
                eI = None
                eJ = None
                if self.styles["token_nodes"]:
                    tI = sentenceGraph.tokens[i]
                    tJ = sentenceGraph.tokens[j]
                else:
                    eI = entities[i]
                    eJ = entities[j]
                    tI = sentenceGraph.entityHeadTokenByEntity[eI]
                    tJ = sentenceGraph.entityHeadTokenByEntity[eJ]
                    if eI.get("type") == "neg" or eJ.get("type") == "neg":
                        continue
                    if self.styles["skip_extra_triggers"]:
                        if eI.get("source") != None or eJ.get("source") != None:
                            continue
                # only consider paths between entities (NOTE! entities, not only named entities)
                if self.styles["headsOnly"]:
                    if (len(sentenceGraph.tokenIsEntityHead[tI]) == 0) or (len(sentenceGraph.tokenIsEntityHead[tJ]) == 0):
                        continue
                
                examples = self.buildExamplesForPair(tI, tJ, paths, sentenceGraph, goldGraph, entityToGold, eI, eJ, structureAnalyzer, examplesAreDirected)
                for categoryName, features, extra in examples:
                    # make example
                    if self.styles["binary"]:
                        if categoryName != "neg":
                            category = 1
                        else:
                            category = -1
                        extra["categoryName"] = "i"
                    else:
                        category = self.classSet.getId(categoryName)
                    example = [sentenceGraph.getSentenceId()+".x"+str(exampleIndex), category, features, extra]
                    ExampleUtils.appendExamples([example], outfile)
                    exampleIndex += 1

        return exampleIndex
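The nested loop above visits each unordered pair of entities exactly once; whether a pair then yields one example per direction or a single undirected example is decided by examplesAreDirected. The pair enumeration in isolation:

# Hedged sketch of the pairwise entity loop above: every unordered
# pair is visited exactly once.
entities = ["e1", "e2", "e3"]
pairs = []
for i in range(len(entities) - 1):
    for j in range(i + 1, len(entities)):
        pairs.append((entities[i], entities[j]))
print pairs   # -> [('e1', 'e2'), ('e1', 'e3'), ('e2', 'e3')]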
Esempio n. 56
0
    def buildExamplesFromGraph(self,
                               sentenceGraph,
                               outfile,
                               goldGraph=None,
                               structureAnalyzer=None):
        """
        Build one example for each token of the sentence
        """
        if sentenceGraph.sentenceElement.get("origId") in self.skiplist:
            print >> sys.stderr, "Skipping sentence", sentenceGraph.sentenceElement.get(
                "origId")
            return 0  #[]

        #examples = []
        exampleIndex = 0

        self.tokenFeatures = {}
        self.tokenFeatureWeights = {}

        # determine (manually or automatically) the setting for whether sentences with no given entities should be skipped
        buildForNameless = False
        if structureAnalyzer and not structureAnalyzer.hasGroupClass(
                "GIVEN", "ENTITY"
        ):  # no given entities points to no separate NER program being used
            buildForNameless = True
        if self.styles["build_for_nameless"]:  # manually force the setting
            buildForNameless = True
        if self.styles["skip_for_nameless"]:  # manually force the setting
            buildForNameless = False

        # determine whether sentences with no given entities should be skipped
        namedEntityHeadTokens = []
        if not self.styles["names"]:
            namedEntityCount = 0
            for entity in sentenceGraph.entities:
                if entity.get(
                        "given"
                ) == "True":  # known data which can be used for features
                    namedEntityCount += 1
            namedEntityCountFeature = "nameCount_" + str(namedEntityCount)
            # NOTE!!! This will change the number of examples and omit
            # all triggers (positive and negative) from sentences which
            # have no NE:s, possibly giving a too-optimistic performance
            # value. Such sentences can still have triggers from intersentence
            # interactions, but as such events cannot be recovered anyway,
            # looking for these triggers would be pointless.
            if namedEntityCount == 0 and not buildForNameless:  # no names, no need for triggers
                return 0  #[]

            if self.styles["pos_pairs"]:
                namedEntityHeadTokens = self.getNamedEntityHeadTokens(
                    sentenceGraph)
        else:
            for key in sentenceGraph.tokenIsName.keys():
                sentenceGraph.tokenIsName[key] = False

        bagOfWords = {}
        for token in sentenceGraph.tokens:
            text = "bow_" + token.get("text")
            if not bagOfWords.has_key(text):
                bagOfWords[text] = 0
            bagOfWords[text] += 1
            if sentenceGraph.tokenIsName[token]:
                text = "ne_" + text
                if not bagOfWords.has_key(text):
                    bagOfWords[text] = 0
                bagOfWords[text] += 1
        bowFeatures = {}
        for k in sorted(bagOfWords.keys()):
            bowFeatures[self.featureSet.getId(k)] = bagOfWords[k]

        self.inEdgesByToken = {}
        self.outEdgesByToken = {}
        self.edgeSetByToken = {}
        for token in sentenceGraph.tokens:
            #inEdges = sentenceGraph.dependencyGraph.in_edges(token, data=True)
            #fixedInEdges = []
            #for edge in inEdges:
            #    fixedInEdges.append( (edge[0], edge[1], edge[2]["element"]) )
            #inEdges = fixedInEdges
            inEdges = sentenceGraph.dependencyGraph.getInEdges(token)
            #inEdges.sort(compareDependencyEdgesById)
            self.inEdgesByToken[token] = inEdges
            #outEdges = sentenceGraph.dependencyGraph.out_edges(token, data=True)
            #fixedOutEdges = []
            #for edge in outEdges:
            #    fixedOutEdges.append( (edge[0], edge[1], edge[2]["element"]) )
            #outEdges = fixedOutEdges
            outEdges = sentenceGraph.dependencyGraph.getOutEdges(token)
            #outEdges.sort(compareDependencyEdgesById)
            self.outEdgesByToken[token] = outEdges
            self.edgeSetByToken[token] = set(inEdges + outEdges)

        for i in range(len(sentenceGraph.tokens)):
            token = sentenceGraph.tokens[i]

            # CLASS
            if len(sentenceGraph.tokenIsEntityHead[token]) > 0:
                categoryName, entityIds = self.getMergedEntityType(
                    sentenceGraph.tokenIsEntityHead[token])
            else:
                categoryName, entityIds = "neg", None
            self.exampleStats.beginExample(categoryName)

            # Recognize only non-named entities (i.e. interaction words)
            if sentenceGraph.tokenIsName[token] and not self.styles[
                    "names"] and not self.styles["all_tokens"]:
                self.exampleStats.filter("name")
                self.exampleStats.endExample()
                continue
#            if "selftrain_limits" in self.styles:
#                # any predicted entity not part of the self-training set causes example to be rejected
#                filtered = False
#                for entity in sentenceGraph.tokenIsEntityHead[token]:
#                    if entity.get("selftrain") == "False":
#                        self.exampleStats.filter("selftrain_limits")
#                        self.exampleStats.endExample()
#                        filtered = True
#                        break
#                if filtered:
#                    continue
#            if "selftrain_group" in self.styles:
#                # any predicted entity not part of the self-training set causes example to be rejected
#                filtered = False
#                for entity in sentenceGraph.tokenIsEntityHead[token]:
#                    if entity.get("selftraingroup") not in self.selfTrainGroups:
#                        self.exampleStats.filter("selftrain_group")
#                        self.exampleStats.endExample()
#                        filtered = True
#                        break
#                if filtered:
#                    continue
            if self.styles["pos_only"] and categoryName == "neg":
                self.exampleStats.filter("pos_only")
                self.exampleStats.endExample()
                continue

            category = self.classSet.getId(categoryName)
            if category == None:
                self.exampleStats.filter("undefined_class")
                self.exampleStats.endExample()
                continue

            tokenText = token.get("text").lower()
            #            if "stem_gazetteer" in self.styles:
            #                tokenText = PorterStemmer.stem(tokenText)
            #            if ("exclude_gazetteer" in self.styles) and self.gazetteer and tokenText not in self.gazetteer:
            #                features = {}
            #                features[self.featureSet.getId("exclude_gazetteer")] = 1
            #                extra = {"xtype":"token","t":token.get("id"),"excluded":"True"}
            #                if entityIds != None:
            #                    extra["goldIds"] = entityIds
            #                #examples.append( (sentenceGraph.getSentenceId()+".x"+str(exampleIndex),category,features,extra) )
            #                ExampleUtils.appendExamples([(sentenceGraph.getSentenceId()+".x"+str(exampleIndex),category,features,extra)], outfile)
            #                exampleIndex += 1
            #                continue

            # FEATURES
            features = {}

            if not self.styles["names"]:
                features[self.featureSet.getId(namedEntityCountFeature)] = 1
            #for k,v in bagOfWords.iteritems():
            #    features[self.featureSet.getId(k)] = v
            # pre-calculate bow _features_
            features.update(bowFeatures)

            #            for j in range(len(sentenceGraph.tokens)):
            #                text = "bow_" + sentenceGraph.tokens[j].get("text")
            #                if j < i:
            #                    features[self.featureSet.getId("bf_" + text)] = 1
            #                elif j > i:
            #                    features[self.featureSet.getId("af_" + text)] = 1

            # Main features
            text = token.get("text")
            features[self.featureSet.getId("txt_" + text)] = 1
            features[self.featureSet.getId("POS_" + token.get("POS"))] = 1
            stem = PorterStemmer.stem(text)
            features[self.featureSet.getId("stem_" + stem)] = 1
            features[self.featureSet.getId("nonstem_" + text[len(stem):])] = 1

            # Normalized version of the string (if identical to the unnormalized text, the features simply overlap with no effect)
            normalizedText = text.replace("-", "").replace("/", "").replace(
                ",", "").replace("\\", "").replace(" ", "").lower()
            if normalizedText == "bound":  # should be for all irregular verbs
                normalizedText = "bind"
            features[self.featureSet.getId("txt_" + normalizedText)] = 1
            norStem = PorterStemmer.stem(normalizedText)
            features[self.featureSet.getId("stem_" + norStem)] = 1
            features[self.featureSet.getId("nonstem_" +
                                           normalizedText[len(norStem):])] = 1

            ## Subspan features
            #textLower = text.lower()
            #for i in range(1, len(textLower)):
            #    features[self.featureSet.getId("subspanbegin"+str(i)+"_"+textLower[0:i])] = 1
            #    features[self.featureSet.getId("subspanend"+str(i)+"_"+textLower[-i:])] = 1

            # Substring features
            for string in text.split("-"):
                stringLower = string.lower()
                features[self.featureSet.getId("substring_" + stringLower)] = 1
                features[self.featureSet.getId(
                    "substringstem_" + PorterStemmer.stem(stringLower))] = 1

            if not self.styles["no_context"]:
                # Linear order features
                for index in [-3, -2, -1, 1, 2, 3]:
                    if i + index > 0 and i + index < len(sentenceGraph.tokens):
                        self.buildLinearOrderFeatures(sentenceGraph, i + index,
                                                      str(index), features)

                # Linear n-grams
                if self.styles["linear_ngrams"]:
                    self.buildLinearNGram(max(0, i - 1), i, sentenceGraph,
                                          features)
                    self.buildLinearNGram(max(0, i - 2), i, sentenceGraph,
                                          features)

            if self.styles["phospho"]:
                if text.find("hospho") != -1:
                    features[self.featureSet.getId("phospho_found")] = 1
                features[self.featureSet.getId("begin_" +
                                               text[0:2].lower())] = 1
                features[self.featureSet.getId("begin_" +
                                               text[0:3].lower())] = 1

            if self.styles["bb_features"]:
                if text.lower() in self.bacteriaTokens:
                    features[self.featureSet.getId("lpsnBacToken")] = 1

            # Content
            if i > 0 and text[0].isalpha() and text[0].isupper():
                features[self.featureSet.getId("upper_case_start")] = 1
            for j in range(len(text)):
                if j > 0 and text[j].isalpha() and text[j].isupper():
                    features[self.featureSet.getId("upper_case_middle")] = 1
                # numbers and special characters
                if text[j].isdigit():
                    features[self.featureSet.getId("has_digits")] = 1
                    if j > 0 and text[j - 1] == "-":
                        features[self.featureSet.getId(
                            "has_hyphenated_digit")] = 1
                elif text[j] == "-":
                    features[self.featureSet.getId("has_hyphen")] = 1
                elif text[j] == "/":
                    features[self.featureSet.getId("has_fslash")] = 1
                elif text[j] == "\\":
                    features[self.featureSet.getId("has_bslash")] = 1
                # duplets
                if j > 0:
                    features[self.featureSet.getId("dt_" +
                                                   text[j - 1:j +
                                                        1].lower())] = 1
                # triplets
                if j > 1:
                    features[self.featureSet.getId("tt_" +
                                                   text[j - 2:j +
                                                        1].lower())] = 1
                # quadruplets (don't work; slight decrease (0.5 pp) on f-score)
                #if j > 2:
                #    features[self.featureSet.getId("qt_"+text[j-3:j+1].lower())] = 1

            # Attached edges (Hanging in and out edges)
            if not self.styles["no_context"]:
                t1InEdges = self.inEdgesByToken[token]
                for edge in t1InEdges:
                    edgeType = edge[2].get("type")
                    features[self.featureSet.getId("t1HIn_" + edgeType)] = 1
                    features[self.featureSet.getId("t1HIn_" +
                                                   edge[0].get("POS"))] = 1
                    features[self.featureSet.getId("t1HIn_" + edgeType + "_" +
                                                   edge[0].get("POS"))] = 1
                    tokenText = sentenceGraph.getTokenText(edge[0])
                    features[self.featureSet.getId("t1HIn_" + tokenText)] = 1
                    features[self.featureSet.getId("t1HIn_" + edgeType + "_" +
                                                   tokenText)] = 1
                    tokenStem = PorterStemmer.stem(tokenText)
                    features[self.featureSet.getId("t1HIn_" + tokenStem)] = 1
                    features[self.featureSet.getId("t1HIn_" + edgeType + "_" +
                                                   tokenStem)] = 1
                    features[self.featureSet.getId("t1HIn_" + norStem + "_" +
                                                   edgeType + "_" +
                                                   tokenStem)] = 1
                t1OutEdges = self.outEdgesByToken[token]
                for edge in t1OutEdges:
                    edgeType = edge[2].get("type")
                    features[self.featureSet.getId("t1HOut_" + edgeType)] = 1
                    features[self.featureSet.getId("t1HOut_" +
                                                   edge[1].get("POS"))] = 1
                    features[self.featureSet.getId("t1HOut_" + edgeType + "_" +
                                                   edge[1].get("POS"))] = 1
                    tokenText = sentenceGraph.getTokenText(edge[1])
                    features[self.featureSet.getId("t1HOut_" + tokenText)] = 1
                    features[self.featureSet.getId("t1HOut_" + edgeType + "_" +
                                                   tokenText)] = 1
                    tokenStem = PorterStemmer.stem(tokenText)
                    features[self.featureSet.getId("t1HOut_" + tokenStem)] = 1
                    features[self.featureSet.getId("t1HOut_" + edgeType + "_" +
                                                   tokenStem)] = 1
                    features[self.featureSet.getId("t1HOut_" + norStem + "_" +
                                                   edgeType + "_" +
                                                   tokenStem)] = 1

            # REL features
            if self.styles["rel_features"]:
                self.relFeatureBuilder.setFeatureVector(features)
                self.relFeatureBuilder.buildAllFeatures(
                    sentenceGraph.tokens, i)
                self.relFeatureBuilder.setFeatureVector(None)

            # DDI13 features
            if self.styles["ddi13_features"]:
                for index in range(len(normalizedText)):
                    features[self.featureSet.getId("ddi13_fromstart" +
                                                   str(index) + "_" +
                                                   normalizedText[:index +
                                                                  1])] = 1
                    features[self.featureSet.getId("ddi13_fromend" +
                                                   str(index) + "_" +
                                                   normalizedText[index:])] = 1
            if self.styles["drugbank_features"]:
                self.drugFeatureBuilder.setFeatureVector(features)
                self.drugFeatureBuilder.tag = "ddi_"
                self.drugFeatureBuilder.buildDrugFeatures(token)
                self.drugFeatureBuilder.setFeatureVector(None)

            #self.wordNetFeatureBuilder.getTokenFeatures("show", "VBP")
            #tokTxt = token.get("text")
            #tokPOS = token.get("POS")
            #wordNetFeatures = []
            #wordNetFeatures = self.wordNetFeatureBuilder.getTokenFeatures(tokTxt, tokPOS)
            #self.wordNetFeatureBuilder.getTokenFeatures(tokTxt, tokPOS)
            if self.styles["wordnet"]:
                tokTxt = token.get("text")
                tokPOS = token.get("POS")
                wordNetFeatures = self.wordNetFeatureBuilder.getTokenFeatures(
                    tokTxt, tokPOS)
                for wordNetFeature in wordNetFeatures:
                    #print wordNetFeature,
                    features[self.featureSet.getId("WN_" + wordNetFeature)] = 1
                #print

            if self.styles["giuliano"]:
                self.giulianoFeatureBuilder.setFeatureVector(features)
                self.giulianoFeatureBuilder.buildTriggerFeatures(
                    token, sentenceGraph)
                self.giulianoFeatureBuilder.setFeatureVector(None)

            if self.styles["ontobiotope_features"]:
                self.ontobiotopeFeatureBuilder.setFeatureVector(features)
                self.ontobiotopeFeatureBuilder.buildOBOFeaturesForToken(token)
                self.ontobiotopeFeatureBuilder.setFeatureVector(None)

            extra = {"xtype": "token", "t": token.get("id")}
            if self.styles["bb_features"]:
                extra[
                    "trigex"] = "bb"  # Request trigger extension in ExampleWriter
            if self.styles["epi_merge_negated"]:
                extra["unmergeneg"] = "epi"  # Request trigger type unmerging
            if entityIds != None:
                extra[
                    "goldIds"] = entityIds  # The entities to which this example corresponds
            #examples.append( (sentenceGraph.getSentenceId()+".x"+str(exampleIndex),category,features,extra) )

            if self.styles["bb_spans"]:
                for span in sentenceGraph.sentenceElement.iter("span"):
                    if span.get("headOffset") != token.get("charOffset"):
                        continue
                    #if span.get("source") != "spec":
                    #    continue
                    #print span.get("headOffset"), token.get("charOffset"), span.get("source"), token.get("id")
                    features[self.featureSet.getId("span_found")] = 1
                    features[self.featureSet.getId(
                        "span_count")] = 1 + features.get(
                            self.featureSet.getId("span_count"), 0)
                    features[self.featureSet.getId("span_identifier" +
                                                   span.get("identifier"))] = 1
                    features[self.featureSet.getId("span_type" +
                                                   span.get("type"))] = 1
                    features[self.featureSet.getId("span_category" +
                                                   span.get("category"))] = 1
                    features[self.featureSet.getId("span_source" +
                                                   span.get("source"))] = 1

                    if "define_offset" in extra:
                        prevOffset = [
                            int(x) for x in extra["define_offset"].split("-")
                        ]
                        assert len(prevOffset) == 2
                        newOffset = [
                            int(x) for x in span.get("charOffset").split("-")
                        ]
                        assert len(newOffset) == 2
                        prevOffsetRange = abs(prevOffset[0] - prevOffset[1])
                        newOffsetRange = abs(newOffset[0] - newOffset[1])
                        if newOffsetRange > prevOffsetRange:
                            extra["define_offset"] = span.get("charOffset")
                    else:
                        extra["define_offset"] = span.get("charOffset")
                features[self.featureSet.getId("span_count_" + str(
                    features.get(self.featureSet.getId("span_count"), 0)))] = 1

            # chains
            if not self.styles["no_context"]:
                self.buildChains(token, sentenceGraph, features)

            if self.styles["pos_pairs"]:
                self.buildPOSPairs(token, namedEntityHeadTokens, features)

            if self.styles["wordvector"]:
                self.wordVectorFeatureBuilder.setFeatureVector(features)
                self.wordVectorFeatureBuilder.buildFeatures(token)
                self.wordVectorFeatureBuilder.setFeatureVector(None)

            example = (sentenceGraph.getSentenceId() + ".x" +
                       str(exampleIndex), category, features, extra)
            ExampleUtils.appendExamples([example], outfile)
            exampleIndex += 1
            self.exampleStats.endExample()
        #return examples
        return exampleIndex
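Among the many feature groups above, the character duplet/triplet features are easy to isolate. A runnable sketch of just that part, with the raw feature strings standing in for featureSet ids:

# Hedged sketch of the character duplet/triplet features above; the
# raw feature strings stand in for featureSet ids.
text = "Binding"
features = {}
for j in range(len(text)):
    if j > 0:
        features["dt_" + text[j - 1:j + 1].lower()] = 1   # duplets
    if j > 1:
        features["tt_" + text[j - 2:j + 1].lower()] = 1   # triplets
print sorted(features.keys())   # e.g. 'dt_bi', 'dt_di', ..., 'tt_bin', ...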
Esempio n. 57
0
def compareExamples(examples1, examples2, features1, features2=None):
    exampleIter1 = ExampleUtils.readExamples(examples1)
    exampleIter2 = ExampleUtils.readExamples(examples2)
    exampleIter2 = ExampleUtils.readExamples(examples2)
    features1 = IdSet(filename=features1)
    if features2 != None:
        features2 = IdSet(filename=features2)
    else:
        features2 = features1
    # Compare feature sets
    if set(features1.Ids.keys()) != set(features2.Ids.keys()):
        print "Feature sets differ"
    # Compare examples
    counter = ProgressCounter(step=1)
    for e1, e2 in itertools.izip(exampleIter1, exampleIter2):
        counter.update()
        assert e1[0] == e2[0], (removeFeatures(e1), removeFeatures(e2))
        if e1[1] != e2[1]:
            print "Class differs"
            print "  E1", removeFeatures(e1)
            print "  E2", removeFeatures(e2)
        f1 = getFeatureNames(e1, features1)
        f2 = getFeatureNames(e2, features2)
        f1Set = set(f1)
        f2Set = set(f2)
        f1Only = f1Set.difference(f2Set)
        f2Only = f2Set.difference(f1Set)
        if len(f1Only) > 0 or len(f2Only) > 0:
            print "Features differ"
            print "  E1", removeFeatures(e1)
            print "  E2", removeFeatures(e2)
            if len(f1Only) > 0:
                print "  E1-only features:", f1Only
            if len(f2Only) > 0:
                print "  E2-only features:", f2Only
        else:
            assert len(f1) == len(f2)
            fCount = 0
            differ = False
            for feature1, feature2 in zip(f1, f2):
                #f1Id = features1.getId(feature1, createIfNotExist=False)
                #if f1Id == 454 or feature1 == "e1_strength_Positive_regulation":
                #    print "!!!!!!!!!!!", 454, feature1, e1[2][f1Id]
                if feature1 != feature2:
                    if not differ:
                        print "Feature order differs for example", e1[0]
                        differ = True
                    print "[" + feature1 + "/" + feature2 + "](" + str(fCount) + ") ",
                else:
                    f1Id = features1.getId(feature1, createIfNotExist=False)
                    f2Id = features2.getId(feature2, createIfNotExist=False)
                    f1Value = e1[2][f1Id]
                    f2Value = e2[2][f2Id]
                    if f1Value != f2Value:
                        if not differ:
                            print "Feature values differ", e1[0]
                            differ = True
                        print "[" + feature1 + "/" + str(f1Id) + "]" + "[" + str(f1Value) + "/" + str(f2Value) + "]" + "(" + str(fCount) + ") ",
                fCount += 1              
            if differ:
                print
    counter.endUpdate()
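The feature comparison above boils down to symmetric set differences over feature names. In isolation, with toy data:

# Hedged sketch of the feature-set comparison above.
f1Set = set(["bow_cat", "POS_NN", "stem_bind"])
f2Set = set(["bow_cat", "POS_NN", "stem_bound"])
print "E1-only features:", sorted(f1Set.difference(f2Set))   # -> ['stem_bind']
print "E2-only features:", sorted(f2Set.difference(f1Set))   # -> ['stem_bound']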
Esempio n. 58
0
 def test(cls,
          examples,
          modelPath,
          output=None,
          parameters=None,
          forceInternal=False,
          classIds=None):  # , timeout=None):
     """
     Classify examples with a pre-trained model.
     
     @type examples: string (filename) or list (or iterator) of examples
     @param examples: a list or file containing examples in SVM-format
     @type modelPath: string
     @param modelPath: filename of the pre-trained model file
     @type parameters: a dictionary or string
     @param parameters: parameters for the classifier
     @type output: string
     @param output: the name of the predictions file to be written
     @type forceInternal: Boolean
     @param forceInternal: Use python classifier even if SVM Multiclass binary is defined in Settings.py
     """
     #if forceInternal or Settings.SVMMultiClassDir == None:
     #    return cls.testInternal(examples, modelPath, output)
     timer = Timer()
     if type(examples) == types.ListType:
         print >> sys.stderr, "Classifying", len(examples), "examples with SVM-Light model", modelPath
         # This is a classmethod ("cls"), so the original "self" references
         # here were a bug; the helpers are assumed to be class-level.
         examples, predictions = cls.filterClassificationSet(examples, False)
         testPath = cls.tempDir + "/test.dat"
         Example.writeExamples(examples, testPath)
     else:
         print >> sys.stderr, "Classifying file", examples, "with SVM-Light model", modelPath
         testPath = examples
         #examples = Example.readExamples(examples,False)
     if os.environ.has_key("METAWRK"):
         args = [SVMMultiClassClassifier.louhiBinDir + "/svm_classify"]
     else:
         args = [cls.binDir + "/svm_classify"]
     if modelPath == None:
         modelPath = "model"
     if parameters != None:
         parameters = copy.copy(parameters)
         if parameters.has_key("c"):
             del parameters["c"]
         if parameters.has_key("predefined"):
             parameters = copy.copy(parameters)
             modelPath = os.path.join(parameters["predefined"][0],
                                      "classifier/model")
             del parameters["predefined"]
         cls.__addParametersToSubprocessCall(args, parameters)
     if output == None:
         output = "predictions"
         logFile = open("svmlight.log", "at")
     else:
         logFile = open(output + ".log", "wt")
     args += [testPath, modelPath, output]
     #if timeout == None:
     #    timeout = -1
     #print args
     subprocess.call(args, stdout=logFile, stderr=logFile)
     predictionsFile = open(output, "rt")
     lines = predictionsFile.readlines()
     predictionsFile.close()
     predictions = []
     for i in range(len(lines)):
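         # each line holds the predicted class id followed by the per-class scores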
         predictions.append([int(lines[i].split()[0])] +
                            lines[i].split()[1:])
         #predictions.append( (examples[i],int(lines[i].split()[0]),"multiclass",lines[i].split()[1:]) )
     print >> sys.stderr, timer.toString()
     return predictions
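
A hedged usage sketch (the class name SVMMultiClassClassifier is taken from
the snippet itself; the file names are placeholders): assuming the def above
is wrapped as a classmethod, it could be called on a pre-built SVM-format
test file:

# Hypothetical call; "test.dat" and "svm.model" are placeholder paths.
predictions = SVMMultiClassClassifier.test("test.dat", "svm.model",
                                           output="predictions")
for p in predictions[:5]:
    print p[0], p[1:]  # predicted class id, then the per-class scores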
Example no. 59
    def optimize(self,
                 examples,
                 outDir,
                 parameters,
                 classifyExamples,
                 classIds,
                 step="BOTH",
                 evaluator=None,
                 determineThreshold=False,
                 timeout=None,
                 downloadAllModels=False):
        assert step in ["BOTH", "SUBMIT", "RESULTS"], step
        outDir = os.path.abspath(outDir)
        # Initialize training (or reconnect to existing jobs)
        combinations = Parameters.getCombinations(Parameters.get(parameters, valueListKey="c"))
        trained = []
        for combination in combinations:
            trained.append(
                self.train(examples,
                           outDir,
                           combination,
                           classifyExamples,
                           replaceRemoteExamples=(len(trained) == 0),
                           dummy=(step == "RESULTS")))
        if step == "SUBMIT":  # Return already
            classifier = copy.copy(self)
            classifier.setState("OPTIMIZE")
            return classifier

        # Wait for the training to finish
        finalJobStatus = self.connection.waitForJobs(
            [x.getJob() for x in trained])
        # Evaluate the results
        print >> sys.stderr, "Evaluating results"
        #Stream.setIndent(" ")
        bestResult = None
        if evaluator == None:
            evaluator = self.defaultEvaluator
        for i in range(len(combinations)):
            id = trained[i].parameterIdStr
            #Stream.setIndent(" ")
            # Get predictions
            predictions = None
            if trained[i].getStatus() == "FINISHED":
                predictions = trained[i].downloadPredictions()
            else:
                print >> sys.stderr, "No results for combination" + id
                continue
            if downloadAllModels:
                trained[i].downloadModel()
            # Compare to other results
            print >> sys.stderr, "*** Evaluating results for combination" + id + " ***"
            threshold = None
            if determineThreshold:
                print >> sys.stderr, "Thresholding, original micro =",
                evaluation = evaluator.evaluate(
                    classifyExamples,
                    predictions,
                    classIds,
                    os.path.join(outDir,
                                 "evaluation-before-threshold" + id + ".csv"),
                    verbose=False)
                print >> sys.stderr, evaluation.microF.toStringConcise()
                threshold, bestF = evaluator.threshold(classifyExamples,
                                                       predictions)
                print >> sys.stderr, "threshold =", threshold, "at binary fscore", str(
                    bestF)[0:6]
            evaluation = evaluator.evaluate(
                classifyExamples,
                ExampleUtils.loadPredictions(predictions, threshold=threshold),
                classIds, os.path.join(outDir, "evaluation" + id + ".csv"))
            if bestResult == None or evaluation.compare(bestResult[0]) > 0:
                bestResult = [evaluation, trained[i], combinations[i], threshold]
            if not self.connection.isLocal():
                os.remove(predictions)  # remove predictions to save space
        #Stream.setIndent()
        if bestResult == None:
            raise Exception("No results for any parameter combination")
        print >> sys.stderr, "*** Evaluation complete", finalJobStatus, "***"
        print >> sys.stderr, "Selected parameters", bestResult[2]
        classifier = copy.copy(bestResult[1])
        classifier.threshold = bestResult[3]
        classifier.downloadModel()
        return classifier
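
A sketch of a grid search over the SVM c parameter with the optimize method
above; the classifier construction, the job connection object, and all paths
are assumptions for illustration only:

# Hypothetical usage; "connection" is an assumed pre-built job connection
# object, and all file names are placeholders.
import sys
classIds = IdSet(filename="ids.class_names")
classifier = SVMMultiClassClassifier(connection)
best = classifier.optimize("train-examples.gz", "/tmp/optimize-out",
                           {"c": [1000, 10000, 100000]},
                           "devel-examples.gz", classIds,
                           step="BOTH", determineThreshold=False)
print >> sys.stderr, "Selected threshold:", best.threshold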