Example #1
 def test(cls, examples, modelPath, output=None, parameters=None, timeout=None):
     if type(examples) == types.ListType:
         print >> sys.stderr, "Classifying", len(examples), "with All-True Classifier"
         examples, predictions = cls.filterClassificationSet(examples, False)
         testPath = cls.tempDir+"/test.dat"
         Example.writeExamples(examples, testPath)
     else:
         print >> sys.stderr, "Classifying file", examples, "with All-True Classifier"
         testPath = examples
         examples = Example.readExamples(examples,False)
     print >> sys.stderr, "Note! Classification must be binary"
     #examples, predictions = self.filterClassificationSet(examples, True)
     predictions = []
     for example in examples:
         #predictions.append( (example, example[1]) )
         predictions.append( [2] ) #[example[1]] )
     
     if output == None:
         output = "predictions"
     f = open(output, "wt")
     for p in predictions:
         f.write(str(p[0])+"\n")
     f.close()
         
     return predictions
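Nearly every snippet in this section opens with the same checks: an argument may be either the object itself or the name of a file to load it from. A minimal sketch of that load-if-filename idiom factored into a helper; the name loadIfFilename is illustrative, not from the source:

import types

def loadIfFilename(value, loader):
    # Return the value unchanged unless it is a filename string,
    # in which case pass it through the given loader function.
    if type(value) in types.StringTypes:
        return loader(value)
    return value

Each snippet could then read, e.g., examples = loadIfFilename(examples, lambda f: ExampleUtils.readExamples(f, False)).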
Example #2
    def __init__(self, examples, predictions=None, classSet=None):
        if type(classSet) == types.StringType:  # class names are in file
            classSet = IdSet(filename=classSet)
        if type(predictions) == types.StringType:  # predictions are in file
            predictions = ExampleUtils.loadPredictions(predictions)
        if type(examples) == types.StringType:  # examples are in file
            examples = ExampleUtils.readExamples(examples, False)

        SharedTaskEvaluator.corpusElements = Core.SentenceGraph.loadCorpus(
            SharedTaskEvaluator.corpusFilename, SharedTaskEvaluator.parse,
            SharedTaskEvaluator.tokenization)
        # Build interaction xml
        xml = BioTextExampleWriter.write(
            examples, predictions, SharedTaskEvaluator.corpusElements, None,
            SharedTaskEvaluator.ids + ".class_names",
            SharedTaskEvaluator.parse, SharedTaskEvaluator.tokenization)
        #xml = ExampleUtils.writeToInteractionXML(examples, predictions, SharedTaskEvaluator.corpusElements, None, "genia-direct-event-ids.class_names", SharedTaskEvaluator.parse, SharedTaskEvaluator.tokenization)
        # Convert to GENIA format
        gifxmlToGenia(xml,
                      SharedTaskEvaluator.geniaDir,
                      task=SharedTaskEvaluator.task,
                      verbose=False)
        # Use GENIA evaluation tool
        self.results = evaluateSharedTask(SharedTaskEvaluator.geniaDir,
                                          task=SharedTaskEvaluator.task,
                                          evaluations=["approximate"],
                                          verbose=False)
Example #3
    def __init__(self, examples, predictions=None, classSet=None):
        if type(classSet) == types.StringType: # class names are in file
            classSet = IdSet(filename=classSet)
        if type(predictions) == types.StringType: # predictions are in file
            predictions = ExampleUtils.loadPredictions(predictions)
        if type(examples) == types.StringType: # examples are in file
            examples = ExampleUtils.readExamples(examples, False)

        self.classSet = classSet
        # define class ids in alphabetical order
        if classSet != None:
            classNames = sorted(classSet.Ids.keys())
        else:
            classNames = []
        # make an ordered list of class ids
        self.classes = []
        for className in classNames:
            self.classes.append(classSet.getId(className))
        # create data structures for per-class evaluation
        self.dataByClass = {}
        for cls in self.classes:
            self.dataByClass[cls] = EvaluationData()
        # hack for unnamed classes
        if len(self.dataByClass) == 0:
            self.dataByClass[1] = EvaluationData()
            self.dataByClass[2] = EvaluationData()
        
        #self.untypedUndirected = None
        self.untypedCurrentMajorId = None
        self.untypedPredictionQueue = []
        self.untypedUndirected = EvaluationData()
        #self.AUC = None
        if predictions != None:
            self._calculate(examples, predictions)
Example #4
    def __init__(self, examples, predictions=None, classSet=None):
        if type(classSet) == types.StringType:  # class names are in file
            classSet = IdSet(filename=classSet)
        if type(predictions) == types.StringType:  # predictions are in file
            predictions = ExampleUtils.loadPredictions(predictions)
        if type(examples) == types.StringType:  # examples are in file
            examples = ExampleUtils.readExamples(examples, False)

        self.classSet = classSet
        # define class ids in alphabetical order
        if classSet != None:
            classNames = sorted(classSet.Ids.keys())
        else:
            classNames = []
        # make an ordered list of class ids
        self.classes = []
        for className in classNames:
            self.classes.append(classSet.getId(className))
        # create data structures for per-class evaluation
        self.dataByClass = {}
        for cls in self.classes:
            self.dataByClass[cls] = EvaluationData()
        # hack for unnamed classes
        if len(self.dataByClass) == 0:
            self.dataByClass[1] = EvaluationData()
            self.dataByClass[2] = EvaluationData()

        #self.untypedUndirected = None
        self.untypedCurrentMajorId = None
        self.untypedPredictionQueue = []
        self.untypedUndirected = EvaluationData()
        #self.AUC = None
        if predictions != None:
            self._calculate(examples, predictions)
Example #5
 def classify(self,
              examples,
              output,
              model=None,
              finishBeforeReturn=False,
              replaceRemoteFiles=True):
     output = os.path.abspath(output)
     # Get examples
     if type(examples) == types.ListType:
         print >> sys.stderr, "Classifying", len(
             examples), "with All-Correct Classifier"
     else:
         print >> sys.stderr, "Classifying file", examples, "with All-Correct Classifier"
         examples = self.getExampleFile(examples,
                                        upload=False,
                                        replaceRemote=False,
                                        dummy=False)
         examples = Example.readExamples(examples, False)
     # Return a new classifier instance for following the training process and using the model
     classifier = copy.copy(self)
     # Classify
     f = open(output, "wt")
     for example in examples:
         f.write(str(example[1]) + "\n")
     f.close()
     classifier.predictions = output
     return classifier
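The All-Correct classifier above simply echoes each example's gold class, example[1], as its prediction. None of the snippets define the example layout explicitly, but the indexing used throughout this section suggests a 4-tuple of id, class, feature dict and extra info; a small made-up instance under that assumption:

example = ("doc1.s0.x0", 2, {1: 1.0, 42: 1.0}, {"xtype": "edge"})
exampleId, gold, features, extra = example
print "id:", exampleId, "gold class:", gold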
Example #6
    def test(cls,
             examples,
             modelPath,
             output=None,
             parameters=None,
             timeout=None):
        if type(examples) == types.ListType:
            print >> sys.stderr, "Classifying", len(
                examples), "with All-Correct Classifier"
            examples, predictions = cls.filterClassificationSet(
                examples, False)
            testPath = cls.tempDir + "/test.dat"
            Example.writeExamples(examples, testPath)
        else:
            print >> sys.stderr, "Classifying file", examples, "with All-Correct Classifier"
            testPath = examples
            examples = Example.readExamples(examples, False)
        #examples, predictions = self.filterClassificationSet(examples, True)
        predictions = []
        for example in examples:
            #predictions.append( (example, example[1]) )
            predictions.append([example[1]])

        if output == None:
            output = "predictions"
        f = open(output, "wt")
        for p in predictions:
            f.write(str(p[0]) + "\n")
        f.close()

        return predictions
Example #7
    def __init__(self, examples=None, predictions=None, classSet=None):
        if type(classSet) == types.StringType:  # class names are in file
            classSet = IdSet(filename=classSet)
        if type(predictions) == types.StringType:  # predictions are in file
            predictions = ExampleUtils.loadPredictions(predictions)
        if type(examples) == types.StringType:  # examples are in file
            examples = ExampleUtils.readExamples(examples, False)

        self.keep = set(["CPR:3", "CPR:4", "CPR:5", "CPR:6", "CPR:9"])

        self.classSet = classSet
        self.results = None
        self.internal = None
        if predictions != None:
            for example in examples:
                if example[3] != None:
                    print >> sys.stderr, "ChemProt Evaluator:"
                    self._calculateExamples(examples, predictions)
                else:
                    print >> sys.stderr, "No example extra info, skipping ChemProt evaluation"
                break
            self.internal = AveragingMultiClassEvaluator(
                examples, predictions, classSet)
            print >> sys.stderr, "AveragingMultiClassEvaluator:"
            print >> sys.stderr, self.internal.toStringConcise()
Example #8
 def loadExamples(self, examples, predictions):
     if type(predictions) == types.StringType:
         print >> sys.stderr, "Reading predictions from", predictions
         predictions = ExampleUtils.loadPredictions(predictions)
     if type(examples) == types.StringType:
         print >> sys.stderr, "Reading examples from", examples
         examples = ExampleUtils.readExamples(examples, False)
     return examples, predictions
Example #9
 def loadExamples(self, examples, predictions):
     if type(predictions) == types.StringType:
         print >> sys.stderr, "Reading predictions from", predictions
         predictions = ExampleUtils.loadPredictions(predictions)
     if type(examples) == types.StringType:
         print >> sys.stderr, "Reading examples from", examples
         examples = ExampleUtils.readExamples(examples, False)
     return examples, predictions
Example #10
 def test(cls, examples, modelPath, output=None, parameters=None, forceInternal=False): # , timeout=None):
     """
     Classify examples with a pre-trained model.
     
     @type examples: string (filename) or list (or iterator) of examples
     @param examples: a list or file containing examples in SVM-format
     @type modelPath: string
     @param modelPath: filename of the pre-trained model file
     @type parameters: a dictionary or string
     @param parameters: parameters for the classifier
     @type output: string
     @param output: the name of the predictions file to be written
     @type forceInternal: Boolean
     @param forceInternal: Use python classifier even if SVM Multiclass binary is defined in Settings.py
     """
     if forceInternal or Settings.SVMMultiClassDir == None:
         return cls.testInternal(examples, modelPath, output)
     timer = Timer()
     if type(examples) == types.ListType:
         print >> sys.stderr, "Classifying", len(examples), "with SVM-MultiClass model", modelPath
         examples, predictions = cls.filterClassificationSet(examples, False)
         testPath = cls.tempDir+"/test.dat"
         Example.writeExamples(examples, testPath)
     else:
         print >> sys.stderr, "Classifying file", examples, "with SVM-MultiClass model", modelPath
         testPath = cls.stripComments(examples)
         examples = Example.readExamples(examples,False)
     args = ["/home/jari/Programs/liblinear-1.5-poly2/predict"]
     if modelPath == None:
         modelPath = "model"
     if parameters != None:
         parameters = copy.copy(parameters)
         if parameters.has_key("c"):
             del parameters["c"]
         if parameters.has_key("predefined"):
             parameters = copy.copy(parameters)
             modelPath = os.path.join(parameters["predefined"][0],"classifier/model")
             del parameters["predefined"]
         cls.__addParametersToSubprocessCall(args, parameters)
     if output == None:
         output = "predictions"
         logFile = open("svmmulticlass.log","at")
     else:
         logFile = open(output+".log","wt")
     args += [testPath, modelPath, output]
     #if timeout == None:
     #    timeout = -1
     #print args
     subprocess.call(args, stdout = logFile, stderr = logFile)
     predictionsFile = open(output, "rt")
     lines = predictionsFile.readlines()
     predictionsFile.close()
     predictions = []
     for i in range(len(lines)):
         predictions.append( [int(lines[i].split()[0])] + lines[i].split()[1:] )
         #predictions.append( (examples[i],int(lines[i].split()[0]),"multiclass",lines[i].split()[1:]) )
     print >> sys.stderr, timer.toString()
     return predictions
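The test method above shells out to an external predict binary and then reads one prediction per line, taking the first column as the predicted class id and the remaining columns as per-class scores. A hedged sketch of just that invoke-and-parse step under the same file-format assumptions (runExternalClassifier is an illustrative name):

import subprocess

def runExternalClassifier(binary, testPath, modelPath, output):
    # Launch the classifier binary, capturing its console output in a log.
    logFile = open(output + ".log", "wt")
    subprocess.call([binary, testPath, modelPath, output],
                    stdout=logFile, stderr=logFile)
    logFile.close()
    # Parse one prediction per line: predicted class id, then per-class scores.
    predictions = []
    for line in open(output, "rt"):
        columns = line.split()
        predictions.append([int(columns[0])] + columns[1:])
    return predictions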
Example #11
def addExamples(exampleFile, predictionFile, classFile, matrix):
    classSet = IdSet(filename=classFile)
    f = open(predictionFile, "rt")
    for example in ExampleUtils.readExamples(exampleFile, False):
        pred = int(f.readline().split()[0])
        predClasses = classSet.getName(pred)
        goldClasses = classSet.getName(example[1])
        for predClass in predClasses.split("---"):
            for goldClass in goldClasses.split("---"):
                matrix[predClass][goldClass]  # touch the mirror cell so a zero entry exists (matrix is assumed to be a nested defaultdict)
                matrix[goldClass][predClass] += 1
    f.close()
Example #12
def addExamples(exampleFile, predictionFile, classFile, matrix):
    classSet = IdSet(filename=classFile)
    f = open(predictionFile, "rt")
    for example in ExampleUtils.readExamples(exampleFile, False):
        pred = int(f.readline().split()[0])
        predClasses = classSet.getName(pred)
        goldClasses = classSet.getName(example[1])
        for predClass in predClasses.split("---"):
            for goldClass in goldClasses.split("---"):
                matrix[predClass][goldClass]  # touch the mirror cell so a zero entry exists (matrix is assumed to be a nested defaultdict)
                matrix[goldClass][predClass] += 1
    f.close()
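The bare matrix[predClass][goldClass] expression in addExamples only makes sense if matrix is a nested defaultdict: merely touching a cell then creates a zero entry, which keeps the matrix symmetric in its keys for later printing. That is an assumption about the caller rather than something the snippet shows; a minimal sketch of it:

from collections import defaultdict

matrix = defaultdict(lambda: defaultdict(int))
matrix["Binding"]["neg"]         # touch: guarantees a zero cell exists
matrix["neg"]["Binding"] += 1    # count one (gold, predicted) observation
print dict((k, dict(v)) for k, v in matrix.items())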
Example #13
    def write(
        cls,
        examples,
        predictions,
        corpus,
        outputFile,
        classSet=None,
        parse=None,
        tokenization=None,
        goldCorpus=None,
        insertWeights=False,
    ):
        if type(examples) == types.StringType:
            print >>sys.stderr, "Reading examples from", examples
            examples = ExampleUtils.readExamples(examples, False)

        # This looks a bit strange, but should work with the re-iterable
        # generators that readExamples returns
        xType = None
        for example in examples:
            assert example[3].has_key("xtype")
            xType = example[3]["xtype"]
            break

        if xType == "token":
            w = EntityExampleWriter()
            if insertWeights:
                w.insertWeights = True
        elif xType == "edge":
            w = EdgeExampleWriter()
        elif xType == "task3":
            w = ModifierExampleWriter()
        elif xType == "entRel":
            w = EntityRelationExampleWriter()
        elif xType == "phrase":
            w = PhraseTriggerExampleWriter()
        # IF LOCAL
        elif xType == "um":
            w = UnmergingExampleWriter()
        # elif xType == "ue":
        #    w = UnmergedEdgeExampleWriter()
        # elif xType == "asym":
        #    w = AsymmetricEventExampleWriter()
        # ENDIF
        else:
            assert False, ("Unknown entity type", xType)
        return w.writeXML(
            examples, predictions, corpus, outputFile, classSet, parse, tokenization, goldCorpus=goldCorpus
        )
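write() above peeks at the first example for its xtype and then hands the same examples to writeXML for a second pass, which, as its own comment notes, only works if readExamples returns something re-iterable rather than a plain generator. A minimal sketch of such a wrapper, not the actual ExampleUtils implementation:

class ReiterableExamples:
    def __init__(self, filename):
        self.filename = filename
    def __iter__(self):
        # Start a fresh pass over the file each time iteration begins.
        for line in open(self.filename, "rt"):
            yield line.rstrip("\n")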
Example #14
 def __init__(self, examples, predictions=None, classSet=None):
     if type(classSet) == types.StringType: # class names are in file
         classSet = IdSet(filename=classSet)
     if type(predictions) == types.StringType: # predictions are in file
         predictions = ExampleUtils.loadPredictions(predictions)
     if type(examples) == types.StringType: # examples are in file
         examples = ExampleUtils.readExamples(examples, False)
     
     SharedTaskEvaluator.corpusElements = Core.SentenceGraph.loadCorpus(SharedTaskEvaluator.corpusFilename, SharedTaskEvaluator.parse, SharedTaskEvaluator.tokenization)
     # Build interaction xml
     xml = BioTextExampleWriter.write(examples, predictions, SharedTaskEvaluator.corpusElements, None, SharedTaskEvaluator.ids+".class_names", SharedTaskEvaluator.parse, SharedTaskEvaluator.tokenization)
     #xml = ExampleUtils.writeToInteractionXML(examples, predictions, SharedTaskEvaluator.corpusElements, None, "genia-direct-event-ids.class_names", SharedTaskEvaluator.parse, SharedTaskEvaluator.tokenization)
     # Convert to GENIA format
     gifxmlToGenia(xml, SharedTaskEvaluator.geniaDir, task=SharedTaskEvaluator.task, verbose=False)
     # Use GENIA evaluation tool
     self.results = evaluateSharedTask(SharedTaskEvaluator.geniaDir, task=SharedTaskEvaluator.task, evaluations=["approximate"], verbose=False)
Example #15
 def classify(self, examples, output, model=None, finishBeforeReturn=False, replaceRemoteFiles=True):
     output = os.path.abspath(output)
     # Get examples
     if type(examples) == types.ListType:
         print >> sys.stderr, "Classifying", len(examples), "with All-Correct Classifier"
     else:
         print >> sys.stderr, "Classifying file", examples, "with All-Correct Classifier"
         examples = self.getExampleFile(examples, upload=False, replaceRemote=False, dummy=False)
         examples = Example.readExamples(examples, False)
     # Return a new classifier instance for following the training process and using the model
     classifier = copy.copy(self)
     # Classify
     f = open(output, "wt")
     for example in examples:
         f.write(str(example[1]) + "\n")
     f.close()
     classifier.predictions = output
     return classifier
Example #16
    def __init__(self, examples, predictions=None, classSet=None):
        if type(classSet) == types.StringType:  # class names are in file
            classSet = IdSet(filename=classSet)
        if type(predictions) == types.StringType:  # predictions are in file
            predictions = ExampleUtils.loadPredictions(predictions)
        if type(examples) == types.StringType:  # examples are in file
            examples = ExampleUtils.readExamples(examples, False)

        self.classSet = classSet
        self.dataByClass = defaultdict(EvaluationData)

        #self.untypedUndirected = None
        self.untypedCurrentMajorId = None
        self.untypedPredictionQueue = []
        self.untypedUndirected = EvaluationData()
        #self.AUC = None
        if predictions != None:
            self._calculate(examples, predictions)
Example #17
    def __init__(self, examples, predictions=None, classSet=None):
        if type(classSet) == types.StringType: # class names are in file
            classSet = IdSet(filename=classSet)
        if type(predictions) == types.StringType: # predictions are in file
            predictions = ExampleUtils.loadPredictions(predictions)
        if type(examples) == types.StringType: # examples are in file
            examples = ExampleUtils.readExamples(examples, False)

        self.classSet = classSet
        self.dataByClass = defaultdict(EvaluationData)
        
        #self.untypedUndirected = None
        self.untypedCurrentMajorId = None
        self.untypedPredictionQueue = []
        self.untypedUndirected = EvaluationData()
        #self.AUC = None
        if predictions != None:
            self._calculate(examples, predictions)
Example #18
 def __init__(self, examples=None, predictions=None, classSet=None):
     if type(classSet) == types.StringType:  # class names are in file
         classSet = IdSet(filename=classSet)
     if type(predictions) == types.StringType:  # predictions are in file
         predictions = ExampleUtils.loadPredictions(predictions)
     if type(examples) == types.StringType:  # examples are in file
         examples = ExampleUtils.readExamples(examples, False)
     #self.examples = examples
     #self.predictions = predictions
     self.truePositives = 0
     self.falsePositives = 0
     self.trueNegatives = 0
     self.falseNegatives = 0
     self.precision = None
     self.recall = None
     self.fScore = None
     self.AUC = None
     self.type = "binary"
     if predictions != None:
         self._calculate(examples, predictions)
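The binary evaluator above only accumulates the four counts and leaves precision, recall and fScore as None until _calculate fills them in. The standard definitions it presumably applies, guarded against empty denominators:

def calculateMetrics(tp, fp, fn):
    precision = tp / float(tp + fp) if tp + fp > 0 else 0.0
    recall = tp / float(tp + fn) if tp + fn > 0 else 0.0
    if precision + recall > 0:
        return precision, recall, 2 * precision * recall / (precision + recall)
    return precision, recall, 0.0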
Example #19
 def __init__(self, examples, predictions=None, classSet=None):
     if type(classSet) == types.StringType: # class names are in file
         classSet = IdSet(filename=classSet)
     if type(predictions) == types.StringType: # predictions are in file
         predictions = ExampleUtils.loadPredictions(predictions)
     if type(examples) == types.StringType: # examples are in file
         examples = ExampleUtils.readExamples(examples, False)
     
     corpusElements = Core.SentenceGraph.loadCorpus(BXEvaluator.corpusFilename, BXEvaluator.parse, BXEvaluator.tokenization)
     # Build interaction xml
     xml = BioTextExampleWriter.write(examples, predictions, corpusElements, None, BXEvaluator.ids+".class_names", BXEvaluator.parse, BXEvaluator.tokenization)
     xml = ix.splitMergedElements(xml, None)
     xml = ix.recalculateIds(xml, None, True)
     #xml = ExampleUtils.writeToInteractionXML(examples, predictions, SharedTaskEvaluator.corpusElements, None, "genia-direct-event-ids.class_names", SharedTaskEvaluator.parse, SharedTaskEvaluator.tokenization)
     # Convert to GENIA format
     STFormat.ConvertXML.toSTFormat(xml, BXEvaluator.geniaDir, outputTag="a2")
     #gifxmlToGenia(xml, BXEvaluator.geniaDir, task=SharedTaskEvaluator.task, verbose=False)
     # Use GENIA evaluation tool
     self.results = BioNLP11GeniaTools.evaluateBX(BXEvaluator.geniaDir, corpusName=BXEvaluator.corpusTag)
     corpusElements = None
Example #20
 def __init__(self, examples=None, predictions=None, classSet=None):
     if type(classSet) == types.StringType:  # class names are in file
         classSet = IdSet(filename=classSet)
     if type(predictions) == types.StringType:  # predictions are in file
         predictions = ExampleUtils.loadPredictions(predictions)
     if type(examples) == types.StringType:  # examples are in file
         examples = ExampleUtils.readExamples(examples, False)
     # self.examples = examples
     # self.predictions = predictions
     self.truePositives = 0
     self.falsePositives = 0
     self.trueNegatives = 0
     self.falseNegatives = 0
     self.precision = None
     self.recall = None
     self.fScore = None
     self.AUC = None
     self.type = "binary"
     if predictions != None:
         self._calculate(examples, predictions)
Example #21
def polynomizeExamples(exampleFile, outFile, weightFeatures, idSet):
    outFile = open(outFile, "wt")
    addCount = 0
    
    f = open(exampleFile)
    numExamples = sum([1 for line in f])
    f.close()
    counter = ProgressCounter(numExamples, "Polynomize examples", step=0)
    
    weightFeatureIds = {}
    for weightFeature in weightFeatures:
        wId = idSet.getId(weightFeature, False)
        if wId == None:
            sys.exit("Weight vector feature " + weightFeature + " not in id file")
        weightFeatureIds[weightFeature] = wId
    
    print "Polynomizing", exampleFile
    exampleCache = []
    for example in ExampleUtils.readExamples(exampleFile):
        counter.update(1, "Processing example ("+example[0]+"): ")
        features = example[2]
        for i in range(len(weightFeatures)-1):
            wI = weightFeatures[i]
            wIid = weightFeatureIds[wI]
            if not features.has_key(wIid):
                continue
            for j in range(i + 1, len(weightFeatures)):
                wJ = weightFeatures[j]
                wJid = weightFeatureIds[wJ]
                if not features.has_key(wJid):
                    continue
                # Make polynomial feature
                features[idSet.getId(wI + "_AND_" + wJ)] = 1
                addCount += 1
        exampleCache.append(example)
        if len(exampleCache) > 50:
            ExampleUtils.appendExamples(exampleCache, outFile)
            exampleCache = []
    ExampleUtils.appendExamples(exampleCache, outFile)
    outFile.close()
    print "Added", addCount, "polynomial features"
Example #22
def polynomizeExamples(exampleFile, outFile, weightFeatures, idSet):
    outFile = open(outFile, "wt")
    addCount = 0

    f = open(exampleFile)
    numExamples = sum([1 for line in f])
    f.close()
    counter = ProgressCounter(numExamples, "Polynomize examples", step=0)

    weightFeatureIds = {}
    for weightFeature in weightFeatures:
        wId = idSet.getId(weightFeature, False)
        if wId == None:
            sys.exit("Weight vector feature " + weightFeature + " not in id file")
        weightFeatureIds[weightFeature] = wId

    print "Polynomizing", exampleFile
    exampleCache = []
    for example in ExampleUtils.readExamples(exampleFile):
        counter.update(1, "Processing example (" + example[0] + "): ")
        features = example[2]
        for i in range(len(weightFeatures) - 1):
            wI = weightFeatures[i]
            wIid = weightFeatureIds[wI]
            if not features.has_key(wIid):
                continue
            for j in range(i + 1, len(weightFeatures)):
                wJ = weightFeatures[j]
                wJid = weightFeatureIds[wJ]
                if not features.has_key(wJid):
                    continue
                # Make polynomial feature
                features[idSet.getId(wI + "_AND_" + wJ)] = 1
                addCount += 1
        exampleCache.append(example)
        if len(exampleCache) > 50:
            ExampleUtils.appendExamples(exampleCache, outFile)
            exampleCache = []
    ExampleUtils.appendExamples(exampleCache, outFile)
    outFile.close()
    print "Added", addCount, "polynomial features"
Example #23
    def threshold(cls, examples, predictions):
        # Make negative confidence score / true class pairs
        if type(examples) in types.StringTypes:
            examples = ExampleUtils.readExamples(examples, False)
        if type(predictions) in types.StringTypes:
            predictions = ExampleUtils.loadPredictions(predictions)
        pairs = []
        realPositives = 0
        for example, prediction in itertools.izip(examples, predictions):
            trueClass = example[1]
        assert trueClass > 0  # multiclass classification uses positive integers
            if trueClass > 1:
                realPositives += 1
            negClassValue = prediction[1]
            pairs.append((negClassValue, trueClass))
        pairs.sort(reverse=True)
        realNegatives = len(pairs) - realPositives

        # When starting thresholding, all examples are considered positive
        binaryF = EvaluationData()
        binaryF._tp = realPositives
        binaryF._fp = realNegatives
        binaryF._fn = 0
        binaryF.calculateFScore()
        fscore = binaryF.fscore
        threshold = pairs[0][0] - 1.

        # Turn one example negative at a time
        for pair in pairs:
            if pair[1] == 1:  # the real class is negative
                binaryF._fp -= 1  # false positive -> true negative
            else:  # the real class is a positive class
                binaryF._tp -= 1  # true positive -> ...
                binaryF._fn += 1  # ... false negative
            binaryF.calculateFScore()
            if binaryF.fscore > fscore:
                fscore = binaryF.fscore
                threshold = pair[0] + 0.00000001
        return threshold, fscore
Example #24
 def threshold(cls, examples, predictions):
     # Make negative confidence score / true class pairs
     if type(examples) in types.StringTypes:
         examples = ExampleUtils.readExamples(examples, False)
     if type(predictions) in types.StringTypes:
         predictions = ExampleUtils.loadPredictions(predictions)
     pairs = []
     realPositives = 0
     for example, prediction in itertools.izip(examples, predictions):
         trueClass = example[1]
         assert(trueClass > 0) # multiclass classification uses positive integers
         if trueClass > 1:
             realPositives += 1
         negClassValue = prediction[1]
         pairs.append( (negClassValue, trueClass) )
     pairs.sort(reverse=True)
     realNegatives = len(pairs) - realPositives
     
     # When starting thresholding, all examples are considered positive
     binaryF = EvaluationData()
     binaryF._tp = realPositives
     binaryF._fp = realNegatives
     binaryF._fn = 0
     binaryF.calculateFScore()
     fscore = binaryF.fscore
     threshold = pairs[0][0]-1.
     
     # Turn one example negative at a time
     for pair in pairs:
         if pair[1] == 1: # the real class is negative
             binaryF._fp -= 1 # false positive -> true negative
         else: # the real class is a positive class
             binaryF._tp -= 1 # true positive -> ...
             binaryF._fn += 1 # ... false negative
         binaryF.calculateFScore()
         if binaryF.fscore > fscore:
             fscore = binaryF.fscore
             threshold = pair[0]+0.00000001
     return threshold, fscore        
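The threshold sweep above avoids re-scoring the whole set at each candidate threshold: it sorts (negative-class score, true class) pairs in descending order and flips one example at a time from predicted-positive to predicted-negative, adjusting the counts incrementally; the returned threshold is nudged just past the flipped score (the +0.00000001). The bookkeeping on a three-example toy set:

pairs = [(0.9, 1), (0.5, 2), (-0.2, 1)]  # (neg-class score, true class), sorted descending
tp, fp, fn = 1, 2, 0                     # start: every example predicted positive
for score, trueClass in pairs:
    if trueClass == 1:   # real negative: a false positive becomes a true negative
        fp -= 1
    else:                # real positive: a true positive becomes a false negative
        tp -= 1
        fn += 1
    # recompute the F-score here and keep the best score + epsilon as the threshold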
Example #25
    def __init__(self, examples=None, predictions=None, classSet=None):
        if type(classSet) == types.StringType: # class names are in file
            classSet = IdSet(filename=classSet)
        if type(predictions) == types.StringType: # predictions are in file
            predictions = ExampleUtils.loadPredictions(predictions)
        if type(examples) == types.StringType: # examples are in file
            examples = ExampleUtils.readExamples(examples, False)
        
        self.keep = set(["CPR:3", "CPR:4", "CPR:5", "CPR:6", "CPR:9"])

        self.classSet = classSet
        self.results = None
        self.internal = None
        if predictions != None:
            for example in examples:
                if example[3] != None:
                    print >> sys.stderr, "ChemProt Evaluator:"
                    self._calculateExamples(examples, predictions)
                else:
                    print >> sys.stderr, "No example extra info, skipping ChemProt evaluation"
                break
            self.internal = AveragingMultiClassEvaluator(examples, predictions, classSet)
            print >> sys.stderr, "AveragingMultiClassEvaluator:"
            print >> sys.stderr, self.internal.toStringConcise()
Example #26
    def write(cls, examples, predictions, corpus, outputFile, classSet=None, parse=None, tokenization=None, goldCorpus=None, insertWeights=False):
        if type(examples) == types.StringType:
            print >> sys.stderr, "Reading examples from", examples
            examples = ExampleUtils.readExamples(examples, False)
        
        # This looks a bit strange, but should work with the re-iterable
        # generators that readExamples returns
        xType = None
        for example in examples:
            assert example[3].has_key("xtype")
            xType = example[3]["xtype"]
            break
        
        if xType == "token":
            w = EntityExampleWriter()
            if insertWeights:
                w.insertWeights = True
        elif xType == "edge":
            w = EdgeExampleWriter()
        elif xType == "task3":
            w = ModifierExampleWriter()
        elif xType == "entRel":
            w = EntityRelationExampleWriter()
        elif xType == "phrase":
            w = PhraseTriggerExampleWriter()
#IF LOCAL
        elif xType == "um":
            w = UnmergingExampleWriter()
        #elif xType == "ue":
        #    w = UnmergedEdgeExampleWriter()
        #elif xType == "asym":
        #    w = AsymmetricEventExampleWriter()
#ENDIF
        else:
            assert False, ("Unknown entity type", xType)
        return w.writeXML(examples, predictions, corpus, outputFile, classSet, parse, tokenization, goldCorpus=goldCorpus)
Example #27
 def test(cls, examples, modelPath, output=None, parameters=None, forceInternal=False, classIds=None): # , timeout=None):
     """
     Classify examples with a pre-trained model.
     
     @type examples: string (filename) or list (or iterator) of examples
     @param examples: a list or file containing examples in SVM-format
     @type modelPath: string
     @param modelPath: filename of the pre-trained model file
     @type parameters: a dictionary or string
     @param parameters: parameters for the classifier
     @type output: string
     @param output: the name of the predictions file to be written
     @type forceInternal: Boolean
     @param forceInternal: Use python classifier even if SVM Multiclass binary is defined in Settings.py
     """
     if type(parameters) == types.StringType:
         parameters = splitParameters(parameters)
     timer = Timer()
     if type(examples) == types.ListType:
         print >> sys.stderr, "Classifying", len(examples), "with SVM-MultiClass model", modelPath
         examples, predictions = cls.filterClassificationSet(examples, False)
         testPath = cls.tempDir+"/test.dat"
         Example.writeExamples(examples, testPath)
     else:
         print >> sys.stderr, "Classifying file", examples, "with SVM-MultiClass model", modelPath
         testPath = examples
         examples = Example.readExamples(examples,False)
     if parameters != None:
         parameters = copy.copy(parameters)
         if parameters.has_key("c"):
             del parameters["c"]
         if parameters.has_key("predefined"):
             parameters = copy.copy(parameters)
             modelPath = os.path.join(parameters["predefined"][0],"classifier/model")
             del parameters["predefined"]
     # Read model
     if modelPath == None:
         modelPath = "model-multilabel"
     classModels = {}
     if modelPath.endswith(".gz"):
         f = gzip.open(modelPath, "rt")
     else:
         f = open(modelPath, "rt")
     thresholds = {}
     for line in f:
         key, value, threshold = line.split()
         classModels[key] = value
         if threshold != "None":
             thresholds[key] = float(threshold)
         else:
             thresholds[key] = 0.0
     f.close()
     mergedPredictions = []
     if type(classIds) == types.StringType:
         classIds = IdSet(filename=classIds)
     #print classModels
     print "Thresholds", thresholds
     classifierBin = Settings.SVMMultiClassDir+"/svm_multiclass_classify"
     print parameters
     if "classifier" in parameters and "svmperf" in parameters["classifier"]:
         classifierBin = Settings.SVMPerfDir+"/svm_perf_classify"
         parameters = copy.copy(parameters)
         del parameters["classifier"]
     for className in classIds.getNames():
         if className != "neg" and not "---" in className:
             classId = classIds.getId(className)
             if thresholds[str(className)] != 0.0:
                 print >> sys.stderr, "Classifying", className, "with threshold", thresholds[str(className)]
             else:
                 print >> sys.stderr, "Classifying", className
             args = [classifierBin]
             #self.__addParametersToSubprocessCall(args, parameters)
             classOutput = "predictions" + ".cls-" + className
             logFile = open("svmmulticlass" + ".cls-" + className + ".log","at")
             args += [testPath, classModels[str(className)], classOutput]
             print args
             subprocess.call(args, stdout = logFile, stderr = logFile)
             cls.addPredictions(classOutput, mergedPredictions, classId, len(classIds.Ids), threshold=thresholds[str(className)])
     print >> sys.stderr, timer.toString()
     
     predFileName = output
     f = open(predFileName, "wt")
     for mergedPred in mergedPredictions:
         if len(mergedPred[0]) > 1 and "1" in mergedPred[0]:
             mergedPred[0].remove("1")
         mergedPred[1] = str(mergedPred[1])
         mergedPred[0] = ",".join(sorted(list(mergedPred[0])))
         f.write(" ".join(mergedPred) + "\n")
     f.close()
     
     return mergedPredictions
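The multilabel test method above runs one binary model per non-negative class and merges the per-class outputs into a single prediction, dropping the negative class "1" whenever a real label survives and joining the rest with commas. A compact sketch of that merge for a single example; class names, scores and thresholds are made up:

perClassScores = {"Binding": 0.7, "Phosphorylation": -0.1}
thresholds = {"Binding": 0.0, "Phosphorylation": 0.0}
predicted = sorted(name for name in perClassScores
                   if perClassScores[name] > thresholds[name])
if not predicted:
    predicted = ["1"]          # "1" stands for the negative class
print ",".join(predicted)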
Example #28
def compareExamples(examples1, examples2, features1, features2=None):
    exampleIter1 = ExampleUtils.readExamples(examples1)
    exampleIter2 = ExampleUtils.readExamples(examples2)
    features1 = IdSet(filename=features1)
    if features2 != None:
        features2 = IdSet(filename=features2)
    else:
        features2 = features1
    # Compare feature sets
    if set(features1.Ids.keys()) != set(features2.Ids.keys()):
        print "Feature sets differ"
    # Compare examples
    counter = ProgressCounter(step=1)
    for e1, e2 in itertools.izip(exampleIter1, exampleIter2):
        counter.update()
        assert e1[0] == e2[0], (removeFeatures(e1), removeFeatures(e2))
        if e1[1] != e2[1]:
            print "Class differs"
            print "  E1", removeFeatures(e1)
            print "  E2", removeFeatures(e2)
        f1 = getFeatureNames(e1, features1)
        f2 = getFeatureNames(e2, features2)
        f1Set = set(f1)
        f2Set = set(f2)
        f1Only = f1Set.difference(f2Set)
        f2Only = f2Set.difference(f1Set)
        if len(f1Only) > 0 or len(f2Only) > 0:
            print "Features differ"
            print "  E1", removeFeatures(e1)
            print "  E2", removeFeatures(e2)
            if len(f1Only) > 0:
                print "  E1-only features:", f1Only
            if len(f2Only) > 0:
                print "  E2-only features:", f2Only
        else:
            assert len(f1) == len(f2)
            fCount = 0
            differ = False
            for feature1, feature2 in zip(f1, f2):
                #f1Id = features1.getId(feature1, createIfNotExist=False)
                #if f1Id == 454 or feature1 == "e1_strength_Positive_regulation":
                #    print "!!!!!!!!!!!", 454, feature1, e1[2][f1Id]
                if feature1 != feature2:
                    if not differ:
                        print "Feature order differs for example", e1[0]
                        differ = True
                    print "[" + feature1 + "/" + feature2 + "](" + str(
                        fCount) + ") ",
                else:
                    f1Id = features1.getId(feature1, createIfNotExist=False)
                    f2Id = features2.getId(feature2, createIfNotExist=False)
                    f1Value = e1[2][f1Id]
                    f2Value = e2[2][f2Id]
                    if f1Value != f2Value:
                        if not differ:
                            print "Feature values differ", e1[0]
                            differ = True
                        print "[" + feature1 + "/" + str(
                            f1Id) + "]" + "[" + str(f1Value) + "/" + str(
                                f2Value) + "]" + "(" + str(fCount) + ") ",
                fCount += 1
            if differ:
                print
    counter.endUpdate()
Example #29
def compareExamples(examples1, examples2, features1, features2=None):
    exampleIter1 = ExampleUtils.readExamples(examples1)
    exampleIter2 = ExampleUtils.readExamples(examples2)
    features1 = IdSet(filename=features1)
    if features2 != None:
        features2 = IdSet(filename=features2)
    else:
        features2 = features1
    # Compare feature sets
    if set(features1.Ids.keys()) != set(features2.Ids.keys()):
        print "Feature sets differ"
    # Compare examples
    counter = ProgressCounter(step=1)
    for e1, e2 in itertools.izip(exampleIter1, exampleIter2):
        counter.update()
        assert e1[0] == e2[0], (removeFeatures(e1), removeFeatures(e2))
        if e1[1] != e2[1]:
            print "Class differs"
            print "  E1", removeFeatures(e1)
            print "  E2", removeFeatures(e2)
        f1 = getFeatureNames(e1, features1)
        f2 = getFeatureNames(e2, features2)
        f1Set = set(f1)
        f2Set = set(f2)
        f1Only = f1Set.difference(f2Set)
        f2Only = f2Set.difference(f1Set)
        if len(f1Only) > 0 or len(f2Only) > 0:
            print "Features differ"
            print "  E1", removeFeatures(e1)
            print "  E2", removeFeatures(e2)
            if len(f1Only) > 0:
                print "  E1-only features:", f1Only
            if len(f2Only) > 0:
                print "  E2-only features:", f2Only
        else:
            assert len(f1) == len(f2)
            fCount = 0
            differ = False
            for feature1, feature2 in zip(f1, f2):
                #f1Id = features1.getId(feature1, createIfNotExist=False)
                #if f1Id == 454 or feature1 == "e1_strength_Positive_regulation":
                #    print "!!!!!!!!!!!", 454, feature1, e1[2][f1Id]
                if feature1 != feature2:
                    if not differ:
                        print "Feature order differs for example", e1[0]
                        differ = True
                    print "[" + feature1 + "/" + feature2 + "](" + str(fCount) + ") ",
                else:
                    f1Id = features1.getId(feature1, createIfNotExist=False)
                    f2Id = features2.getId(feature2, createIfNotExist=False)
                    f1Value = e1[2][f1Id]
                    f2Value = e2[2][f2Id]
                    if f1Value != f2Value:
                        if not differ:
                            print "Feature values differ", e1[0]
                            differ = True
                        print "[" + feature1 + "/" + str(f1Id) + "]" + "[" + str(f1Value) + "/" + str(f2Value) + "]" + "(" + str(fCount) + ") ",
                fCount += 1              
            if differ:
                print
    counter.endUpdate()
Example #30
    def test(cls,
             examples,
             modelPath,
             output=None,
             parameters=None,
             forceInternal=False,
             classIds=None):  # , timeout=None):
        """
        Classify examples with a pre-trained model.
        
        @type examples: string (filename) or list (or iterator) of examples
        @param examples: a list or file containing examples in SVM-format
        @type modelPath: string
        @param modelPath: filename of the pre-trained model file
        @type parameters: a dictionary or string
        @param parameters: parameters for the classifier
        @type output: string
        @param output: the name of the predictions file to be written
        @type forceInternal: Boolean
        @param forceInternal: Use python classifier even if SVM Multiclass binary is defined in Settings.py
        """
        if type(parameters) == types.StringType:
            parameters = splitParameters(parameters)
        timer = Timer()
        if type(examples) == types.ListType:
            print >> sys.stderr, "Classifying", len(
                examples), "with SVM-MultiClass model", modelPath
            examples, predictions = cls.filterClassificationSet(
                examples, False)
            testPath = cls.tempDir + "/test.dat"
            Example.writeExamples(examples, testPath)
        else:
            print >> sys.stderr, "Classifying file", examples, "with SVM-MultiClass model", modelPath
            testPath = examples
            examples = Example.readExamples(examples, False)
        if parameters != None:
            parameters = copy.copy(parameters)
            if parameters.has_key("c"):
                del parameters["c"]
            if parameters.has_key("predefined"):
                parameters = copy.copy(parameters)
                modelPath = os.path.join(parameters["predefined"][0],
                                         "classifier/model")
                del parameters["predefined"]
        # Read model
        if modelPath == None:
            modelPath = "model-multilabel"
        classModels = {}
        if modelPath.endswith(".gz"):
            f = gzip.open(modelPath, "rt")
        else:
            f = open(modelPath, "rt")
        thresholds = {}
        for line in f:
            key, value, threshold = line.split()
            classModels[key] = value
            if threshold != "None":
                thresholds[key] = float(threshold)
            else:
                thresholds[key] = 0.0
        f.close()
        mergedPredictions = []
        if type(classIds) == types.StringType:
            classIds = IdSet(filename=classIds)
        #print classModels
        print "Thresholds", thresholds
        classifierBin = Settings.SVMMultiClassDir + "/svm_multiclass_classify"
        print parameters
        if "classifier" in parameters and "svmperf" in parameters["classifier"]:
            classifierBin = Settings.SVMPerfDir + "/svm_perf_classify"
            parameters = copy.copy(parameters)
            del parameters["classifier"]
        for className in classIds.getNames():
            if className != "neg" and not "---" in className:
                classId = classIds.getId(className)
                if thresholds[str(className)] != 0.0:
                    print >> sys.stderr, "Classifying", className, "with threshold", thresholds[
                        str(className)]
                else:
                    print >> sys.stderr, "Classifying", className
                args = [classifierBin]
                #self.__addParametersToSubprocessCall(args, parameters)
                classOutput = "predictions" + ".cls-" + className
                logFile = open("svmmulticlass" + ".cls-" + className + ".log",
                               "at")
                args += [testPath, classModels[str(className)], classOutput]
                print args
                subprocess.call(args, stdout=logFile, stderr=logFile)
                cls.addPredictions(classOutput,
                                   mergedPredictions,
                                   classId,
                                   len(classIds.Ids),
                                   threshold=thresholds[str(className)])
        print >> sys.stderr, timer.toString()

        predFileName = output
        f = open(predFileName, "wt")
        for mergedPred in mergedPredictions:
            if len(mergedPred[0]) > 1 and "1" in mergedPred[0]:
                mergedPred[0].remove("1")
            mergedPred[1] = str(mergedPred[1])
            mergedPred[0] = ",".join(sorted(list(mergedPred[0])))
            f.write(" ".join(mergedPred) + "\n")
        f.close()

        return mergedPredictions
Example #31
 def determineThreshold(self, examples, predictions):
     if type(predictions) == types.StringType: # predictions are in file
         predictions = ExampleUtils.loadPredictions(predictions)
     if type(examples) == types.StringType: # examples are in file
         examples = ExampleUtils.readExamples(examples, False)
         
     examplesByClass = {}
     for cls in self.classes:
         examplesByClass[cls] = []
     # prepare examples
     for example, prediction in itertools.izip(examples, predictions):
         # Check true class for multilabel
         trueClass = example[1]
         trueClassName = self.classSet.getName(trueClass)
         assert(trueClass > 0) # multiclass classification uses positive integers
         if "---" in trueClassName:
             trueClass = set()
             for name in trueClassName.split("---"):
                 trueClass.add(self.classSet.getId(name))
         else:
             trueClass = [trueClass]
         # Check prediction for multilabel
         predictedClasses = prediction[0]
         if type(predictedClasses) == types.IntType:
             predictedClasses = [predictedClasses]
         
         for predType in predictedClasses:
             if predType != 1:
                 exTrueClass = 1
                 if predType in trueClass:
                     exTrueClass = 2
                 examplesByClass[predType].append( (prediction[predType], exTrueClass, 2) )
         # positives are negatives for other classes
         for cls in self.classes:
             if cls not in predictedClasses:
                 exTrueClass = 1
                 if cls in trueClass:
                     exTrueClass = 2
                 examplesByClass[cls].append( (prediction[cls], exTrueClass, 1) )
     # do the thresholding
     thresholdByClass = {}
     for cls in self.classes:
         if cls == 1:
             continue
         thresholdByClass[cls] = 0.0
         examplesByClass[cls].sort()
         # Start with all below zero being negative, and all above it being what is predicted
         ev = EvaluationData()
         for example in examplesByClass[cls]:
             #print example
             if example[0] < 0.0:
                 updateF(ev, example[1], 2, 1) # always negative
             else:
                 updateF(ev, example[1], example[2], 1) # what is predicted
         count = 0
         bestF = [self.dataByClass[cls].fscore, None, (0.0, None), None]
         for example in examplesByClass[cls]:
             if example[0] < 0.0:
                 # Remove original example
                 updateF(ev, example[1], 2, -1)
                 # Add new example
                 updateF(ev, example[1], example[2], 1)
                 # Calculate F for this point
             else:
                 # Remove original example
                 updateF(ev, example[1], example[2], -1)
                 # Add new example
                 updateF(ev, example[1], 1, 1)
                 # Calculate F for this point
             ev.calculateFScore()
             #print example, ev.toStringConcise()
             count += 1
             #if self.classSet.getName(cls) == "Binding":
             #    print count, example, ev.toStringConcise()
             if ev.fscore > bestF[0]:
                 bestF = (ev.fscore, count, example, ev.toStringConcise())
                 self.dataByClass[cls] = copy.copy(ev)
         print >> sys.stderr, "Threshold", self.classSet.getName(cls), bestF
         if bestF[2][0] != 0.0:
             thresholdByClass[cls] = bestF[2][0] + 0.00000001
         else:
             thresholdByClass[cls] = 0.0
     #print thresholdByClass
     self.thresholds = thresholdByClass
     #self._calculate(examples, predictions, thresholdByClass)
     #print >> sys.stderr, "Optimal", self.toStringConcise()
     return thresholdByClass
Example #32
    def determineThreshold(self, examples, predictions):
        if type(predictions) == types.StringType:  # predictions are in file
            predictions = ExampleUtils.loadPredictions(predictions)
        if type(examples) == types.StringType:  # examples are in file
            examples = ExampleUtils.readExamples(examples, False)

        examplesByClass = {}
        for cls in self.classes:
            examplesByClass[cls] = []
        # prepare examples
        for example, prediction in itertools.izip(examples, predictions):
            # Check true class for multilabel
            trueClass = example[1]
            trueClassName = self.classSet.getName(trueClass)
            assert trueClass > 0  # multiclass classification uses positive integers
            if "---" in trueClassName:
                trueClass = set()
                for name in trueClassName.split("---"):
                    trueClass.add(self.classSet.getId(name))
            else:
                trueClass = [trueClass]
            # Check prediction for multilabel
            predictedClasses = prediction[0]
            if type(predictedClasses) == types.IntType:
                predictedClasses = [predictedClasses]

            for predType in predictedClasses:
                if predType != 1:
                    exTrueClass = 1
                    if predType in trueClass:
                        exTrueClass = 2
                    examplesByClass[predType].append(
                        (prediction[predType], exTrueClass, 2))
            # positives are negatives for other classes
            for cls in self.classes:
                if cls not in predictedClasses:
                    exTrueClass = 1
                    if cls in trueClass:
                        exTrueClass = 2
                    examplesByClass[cls].append(
                        (prediction[cls], exTrueClass, 1))
        # do the thresholding
        thresholdByClass = {}
        for cls in self.classes:
            if cls == 1:
                continue
            thresholdByClass[cls] = 0.0
            examplesByClass[cls].sort()
            # Start with all below zero being negative, and all above it being what is predicted
            ev = EvaluationData()
            for example in examplesByClass[cls]:
                #print example
                if example[0] < 0.0:
                    updateF(ev, example[1], 2, 1)  # always negative
                else:
                    updateF(ev, example[1], example[2], 1)  # what is predicted
            count = 0
            bestF = [self.dataByClass[cls].fscore, None, (0.0, None), None]
            for example in examplesByClass[cls]:
                if example[0] < 0.0:
                    # Remove original example
                    updateF(ev, example[1], 2, -1)
                    # Add new example
                    updateF(ev, example[1], example[2], 1)
                    # Calculate F for this point
                else:
                    # Remove original example
                    updateF(ev, example[1], example[2], -1)
                    # Add new example
                    updateF(ev, example[1], 1, 1)
                    # Calculate F for this point
                ev.calculateFScore()
                #print example, ev.toStringConcise()
                count += 1
                #if self.classSet.getName(cls) == "Binding":
                #    print count, example, ev.toStringConcise()
                if ev.fscore > bestF[0]:
                    bestF = (ev.fscore, count, example, ev.toStringConcise())
                    self.dataByClass[cls] = copy.copy(ev)
            print >> sys.stderr, "Threshold", self.classSet.getName(cls), bestF
            if bestF[2][0] != 0.0:
                thresholdByClass[cls] = bestF[2][0] + 0.00000001
            else:
                thresholdByClass[cls] = 0.0
        #print thresholdByClass
        self.thresholds = thresholdByClass
        #self._calculate(examples, predictions, thresholdByClass)
        #print >> sys.stderr, "Optimal", self.toStringConcise()
        return thresholdByClass
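determineThreshold returns one offset per class id; at prediction time those offsets are subtracted from the per-class scores before re-taking the argmax, which is what Example #34 below does with prediction[classId] -= bestF[2][0] + 0.00000001. A compact sketch of that application step:

def applyThresholds(prediction, thresholds):
    # prediction: [predictedClass, scoreForClass1, scoreForClass2, ...]
    best = None
    for cls in range(1, len(prediction)):
        adjusted = prediction[cls] - thresholds.get(cls, 0.0)
        if best == None or adjusted > best[0]:
            best = (adjusted, cls)
    prediction[0] = best[1]
    return prediction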
Example #33
    optparser.add_option("-i",
                         "--invariant",
                         default=None,
                         dest="invariant",
                         help="Corpus in analysis format",
                         metavar="FILE")
    optparser.add_option("-v",
                         "--variant",
                         default=None,
                         dest="variant",
                         help="Corpus in analysis format",
                         metavar="FILE")
    (options, args) = optparser.parse_args()

    #invariantExamples = ExampleUtils.readExamples(os.path.join(options.invariant, "examples.txt"))
    variantExamples = ExampleUtils.readExamples(
        os.path.join(options.variant, "test-triggers.examples"))

    invariantFeatureSet = IdSet()
    invariantFeatureSet.load(
        os.path.join(options.invariant, "feature_names.txt"))
    invariantClassSet = IdSet()
    invariantClassSet.load(os.path.join(options.invariant, "class_names.txt"))

    variantFeatureSet = IdSet()
    variantFeatureSet.load(
        os.path.join(options.variant, "test-triggers.examples.feature_names"))
    variantClassSet = IdSet()
    variantClassSet.load(
        os.path.join(options.variant, "test-triggers.examples.class_names"))

    counter = ProgressCounter(len(variantExamples))
Example #34
def threshold(examples, predictionsDir=None, classSet=None):
    if type(classSet) == types.StringType: # class names are in file
        classSet = IdSet(filename=classSet)
    classIds = set()
    if type(examples) == types.StringType: # examples are in file
        examplesTemp = ExampleUtils.readExamples(examples, False)
        examples = []
        for example in examplesTemp:
            examples.append(example)
            classIds.add(example[1])
    classIds = list(classIds)
    classIds.sort()
    
    #multilabel = MultiLabelMultiClassEvaluator(None, None, classSet)
    #multilabel._calculate(examples, predictions)
    #print multilabel.toStringConcise(title="multilabel")
    
    bestThrF = [0]
    bestBaseF = [0]
    predFileNames = []
    for filename in os.listdir(predictionsDir):
        if "predictions" in filename:
            predFileNames.append( (int(filename.rsplit("_")[-1]), filename) )
    predFileNames.sort()
    for predFileName in predFileNames:
        predictionsTemp = ExampleUtils.loadPredictions(os.path.join(predictionsDir, predFileName[1]))
        predictions = []
        for prediction in predictionsTemp:
            predictions.append(prediction)
    
        baseEv = AveragingMultiClassEvaluator(None, None, classSet)
        baseEv._calculate(examples, predictions)
        print "============================"
        print predFileName[1]
        print "============================"
        #print baseEv.toStringConcise(title="baseline")
        
        baseLineF = baseEv.microF.fscore
        for step in [0]:
            for classId in [1]: #classIds:
                cls = None
                if classSet != None:
                    cls = classSet.getName(classId)
                else:
                    cls = str(classId)
                bestF = thresholdClass(examples, predictions, classId, baseLineF)
                # Shift this class's scores by the best threshold found (plus
                # a tiny epsilon so the boundary example flips class)
                for prediction in predictions:
                    prediction[classId] -= bestF[2][0] + 0.00000001
                changed = 0
                for prediction in predictions:
                    maxVal = -999999
                    maxClass = None
                    for i in range(1, len(prediction)):
                        if prediction[i] > maxVal:
                            maxVal = prediction[i]
                            maxClass = i
                    if maxClass != prediction[0]:
                        prediction[0] = maxClass
                        changed += 1
                print step, cls, "changed", changed, bestF[0]
                baseLineF = bestF[0]
        
        if bestF[0] > bestThrF[0]:
            bestThrF = (bestF[0], predFileName[1], bestF[1], bestF[2], bestF[3])
        if baseEv.microF.fscore > bestBaseF[0]:
            bestBaseF = (baseEv.microF.fscore, predFileName[1], baseEv.microF.toStringConcise())
    
        print "-------- Baseline ------------"
        print baseEv.toStringConcise()
        print "-------- Best ------------"
        print bestF[0], bestF[1], bestF[2]
        print bestF[3]
        thEv = AveragingMultiClassEvaluator(None, None, classSet)
        thEv._calculate(examples, predictions)
        print thEv.toStringConcise()
    
    print "=============== All Best ==============="
    print "Threshold", bestThrF
    print "Base", bestBaseF
    sys.exit()

    # NOTE: everything below this point is unreachable (it follows the
    # unconditional sys.exit() above) and references an undefined name
    # 'pairs'; it looks like leftover experimentation code.
    memPredictions = []
    bestEv = baseEv
    bestPair = [None, None, None]
    for p in predictions:
        memPredictions.append(p)
    for pair in pairs:
        modifier = pair[0] + 0.00000001
        changedClass = 0
        for pred in memPredictions:
            negPred = pred[1] - modifier  
            maxVal = negPred
            maxClass = 1
            for i in range(2, len(pred)):
                if pred[i] > maxVal:
                    maxVal = pred[i]
                    maxClass = i
            if pred[0] != maxClass:
                changedClass += 1
            pred[0] = maxClass
        ev = AveragingMultiClassEvaluator(None)
        ev._calculate(examples, memPredictions)
        print pair[0], pair[2], changedClass
        print ev.toStringConcise()
        if ev.compare(bestEv) == 1:
            print "Improved"
            bestPair = pair
            bestEv = ev
    
    print "---------------------------------------------"
    print baseEv.toStringConcise(title="baseline")
    print bestPair[0], bestPair[2] 
    print bestEv.toStringConcise(title="best") 
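
thresholdClass() is called above but not shown on this page. Below is a hedged sketch of such a sweep, consistent with the incremental version in Example #32: every observed score for the class is a candidate threshold, and the one giving the best F-score wins. This naive full rescan per candidate stands in for the original's incremental updateF() bookkeeping, and the (fscore, count, (threshold,), description) return layout is an assumption mirroring how bestF is indexed above:

def thresholdClass(examples, predictions, classId, baseLineF):
    # Try each observed score as a binary decision boundary for classId
    # and keep the threshold with the best F-score.
    scores = sorted(set([pred[classId] for pred in predictions]))
    bestF = (baseLineF, 0, (0.0,), "baseline")
    count = 0
    for th in scores:
        count += 1
        tp = fp = fn = 0
        for example, pred in zip(examples, predictions):
            predicted = pred[classId] > th
            gold = example[1] == classId
            if predicted and gold: tp += 1
            elif predicted: fp += 1
            elif gold: fn += 1
        if tp == 0:
            continue
        f = 2.0 * tp / (2.0 * tp + fp + fn)
        if f > bestF[0]:
            bestF = (f, count, (th,), "tp=%d fp=%d fn=%d" % (tp, fp, fn))
    return bestF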
Example #35
0
    # Import Psyco if available
    try:
        import psyco
        psyco.full()
        print >> sys.stderr, "Found Psyco, using"
    except ImportError:
        print >> sys.stderr, "Psyco not installed"
    
    defaultAnalysisFilename = "/usr/share/biotext/ComplexPPI/BioInferForComplexPPIVisible.xml"
    optparser = OptionParser(usage="%prog [options]\nCreate an html visualization for a corpus.")
    optparser.add_option("-i", "--invariant", default=None, dest="invariant", help="Corpus in analysis format", metavar="FILE")
    optparser.add_option("-v", "--variant", default=None, dest="variant", help="Corpus in analysis format", metavar="FILE")
    (options, args) = optparser.parse_args()
    
    #invariantExamples = ExampleUtils.readExamples(os.path.join(options.invariant, "examples.txt"))
    variantExamples = ExampleUtils.readExamples(os.path.join(options.variant, "test-triggers.examples"))
    
    invariantFeatureSet = IdSet()
    invariantFeatureSet.load(os.path.join(options.invariant, "feature_names.txt"))
    invariantClassSet = IdSet()
    invariantClassSet.load(os.path.join(options.invariant, "class_names.txt"))

    variantFeatureSet = IdSet()
    variantFeatureSet.load(os.path.join(options.variant, "test-triggers.examples.feature_names"))
    variantClassSet = IdSet()
    variantClassSet.load(os.path.join(options.variant, "test-triggers.examples.class_names"))
    
    counter = ProgressCounter(len(variantExamples))
    for example in variantExamples:
        counter.update()
        example[1] = invariantClassSet.getId(variantClassSet.getName(example[1]))
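
The loop above remaps only the class ids. A hedged sketch of the matching feature-id remap, assuming (as in the getFeatureNames usage in Example #36 below) that example[2] holds the {featureId: value} feature dict and that IdSet provides getName/getId lookups:

    for example in variantExamples:
        remapped = {}
        for featureId, value in example[2].items():
            featureName = variantFeatureSet.getName(featureId)
            remapped[invariantFeatureSet.getId(featureName)] = value
        example[2] = remapped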
Example #36
0
        print >> sys.stderr, "Psyco not installed"

    from optparse import OptionParser
    import os
    optparser = OptionParser(description="Analyze SVM example files")
    optparser.add_option("-s", "--source", default=None, dest="source", help="examples", metavar="FILE")
    optparser.add_option("-t", "--target", default=None, dest="target", help="examples")
    optparser.add_option("-f", "--sourceFeatureIds", default=None, dest="sourceFeatureIds", help="examples", metavar="FILE")
    optparser.add_option("-g", "--targetFeatureIds", default=None, dest="targetFeatureIds", help="examples")
    (options, args) = optparser.parse_args()
    
    print "Loading ids"
    sFeatIds = IdSet(filename=options.sourceFeatureIds)
    tFeatIds = IdSet(filename=options.targetFeatureIds)
    print "Loading examples"
    sExamples = ExampleUtils.readExamples(options.source)
    tExamples = ExampleUtils.readExamples(options.target)
    print "Making name sets"
    s = getFeatureNames(sExamples, sFeatIds)
    t = getFeatureNames(tExamples, tFeatIds)
    print "Source features:", len(s)
    print "Target features:", len(t)
    print "Intersection:", len(s & t)
    onlyS = s - t
    onlyT = t - s
    print "Only source:", len(onlyS)
    print "Only target:", len(onlyT)
#    state = {}
#    for n in onlyS:
#        presence = state.setdefault(n, [0,0])
#        presence[0] = 1
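
getFeatureNames() is used above but not defined in this excerpt. A minimal sketch, assuming example[2] is the {featureId: value} feature dict and IdSet.getName maps a numeric id back to its feature name string:

def getFeatureNames(examples, featureIds):
    # Collect the set of feature name strings used by at least one example.
    names = set()
    for example in examples:
        for featureId in example[2]:
            names.add(featureIds.getName(featureId))
    return names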
Example #37
0
 def test(cls,
          examples,
          modelPath,
          output=None,
          parameters=None,
          forceInternal=False):  # , timeout=None):
     """
     Classify examples with a pre-trained model.
     
     @type examples: string (filename) or list (or iterator) of examples
     @param examples: a list or file containing examples in SVM-format
     @type modelPath: string
     @param modelPath: filename of the pre-trained model file
     @type parameters: a dictionary or string
     @param parameters: parameters for the classifier
     @type output: string
     @param output: the name of the predictions file to be written
     @type forceInternal: Boolean
     @param forceInternal: Use python classifier even if SVM Multiclass binary is defined in Settings.py
     """
     if forceInternal or Settings.SVMMultiClassDir == None:
         return cls.testInternal(examples, modelPath, output)
     timer = Timer()
     if type(examples) == types.ListType:
         print >> sys.stderr, "Classifying", len(
             examples), "with SVM-MultiClass model", modelPath
          examples, predictions = cls.filterClassificationSet(
              examples, False)
          testPath = cls.tempDir + "/test.dat"
         Example.writeExamples(examples, testPath)
     else:
         print >> sys.stderr, "Classifying file", examples, "with SVM-MultiClass model", modelPath
         testPath = cls.stripComments(examples)
         examples = Example.readExamples(examples, False)
     args = ["/home/jari/Programs/liblinear-1.5-poly2/predict"]
     if modelPath == None:
         modelPath = "model"
     if parameters != None:
         parameters = copy.copy(parameters)
         if parameters.has_key("c"):
             del parameters["c"]
         if parameters.has_key("predefined"):
             parameters = copy.copy(parameters)
             modelPath = os.path.join(parameters["predefined"][0],
                                      "classifier/model")
             del parameters["predefined"]
          cls.__addParametersToSubprocessCall(args, parameters)
     if output == None:
         output = "predictions"
         logFile = open("svmmulticlass.log", "at")
     else:
         logFile = open(output + ".log", "wt")
     args += [testPath, modelPath, output]
     #if timeout == None:
     #    timeout = -1
     #print args
     subprocess.call(args, stdout=logFile, stderr=logFile)
     predictionsFile = open(output, "rt")
     lines = predictionsFile.readlines()
     predictionsFile.close()
     predictions = []
     for i in range(len(lines)):
         predictions.append([int(lines[i].split()[0])] +
                            lines[i].split()[1:])
         #predictions.append( (examples[i],int(lines[i].split()[0]),"multiclass",lines[i].split()[1:]) )
     print >> sys.stderr, timer.toString()
     return predictions
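
cls.stripComments() is called above but not shown. A hedged sketch of what such a helper plausibly does: SVM-light style example files can carry a trailing "# ..." comment on each line, which the external predict binary will not parse, so a cleaned temporary copy is written first. Both the behavior and the ".stripped" naming are assumptions, not the original implementation:

def stripComments(fileName):
    # Write a copy of the example file with per-line "# ..." comments removed.
    strippedPath = fileName + ".stripped"
    outFile = open(strippedPath, "wt")
    for line in open(fileName, "rt"):
        outFile.write(line.split("#")[0].rstrip() + "\n")
    outFile.close()
    return strippedPath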