def __init__(self, style=None, length=None, types=[], featureSet=None, classSet=None):
        # reset style regardless of input
        #style="trigger_features:typed:directed:no_linear:entities:genia_limits:noMasking:maxFeatures"
        if featureSet == None:
            featureSet = IdSet()
        if classSet == None:
            classSet = IdSet(1)
        else:
            classSet = classSet
        assert( classSet.getId("neg") == 1 )
        
        ExampleBuilder.__init__(self, classSet=classSet, featureSet=featureSet)
        
        defaultNone = ["binary", "trigger_features","typed","directed","no_linear","entities","genia_limits",
            "noAnnType", "noMasking", "maxFeatures", "no_merge", "disable_entity_features", 
            "disable_single_element_features", "disable_ngram_features", "disable_path_edge_features"]
        defaultParameters = {}
        for name in defaultNone:
            defaultParameters[name] = None
        defaultParameters["keep_intersentence"] = False
        defaultParameters["keep_intersentence_gold"] = True
        defaultParameters["no_arg_count_upper_limit"] = False
        self.styles = self._setDefaultParameters(defaultParameters)
        self.styles = self.getParameters(style)
        self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet)
        self.multiEdgeFeatureBuilder.noAnnType = self.styles["noAnnType"]
        self.multiEdgeFeatureBuilder.maskNamedEntities = not self.styles["noMasking"]
        self.multiEdgeFeatureBuilder.maximum = self.styles["maxFeatures"]
        #self.tokenFeatureBuilder = TokenFeatureBuilder(self.featureSet)
        self.pathLengths = length
        assert(self.pathLengths == None)
        self.types = types

        self.triggerFeatureBuilder = TriggerFeatureBuilder(self.featureSet)
        self.triggerFeatureBuilder.useNonNameEntities = True
Example #2
0
    def __init__(self,
                 style=["typed", "directed"],
                 length=None,
                 types=[],
                 featureSet=None,
                 classSet=None):
        if featureSet == None:
            featureSet = IdSet()
        if classSet == None:
            classSet = IdSet(1)
        else:
            classSet = classSet
        assert (classSet.getId("neg") == 1)

        ExampleBuilder.__init__(self, classSet=classSet, featureSet=featureSet)
        if style.find(",") != -1:
            style = style.split(",")
        self.styles = style

        self.negFrac = None
        self.posPairGaz = POSPairGazetteer()
        for s in style:
            if s.find("negFrac") != -1:
                self.negFrac = float(s.split("_")[-1])
                print >> sys.stderr, "Downsampling negatives to", self.negFrac
                self.negRand = random.Random(15)
            elif s.find("posPairGaz") != -1:
                self.posPairGaz = POSPairGazetteer(
                    loadFrom=s.split("_", 1)[-1])

        self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet)
        self.triggerFeatureBuilder = TriggerFeatureBuilder(self.featureSet)
        if "graph_kernel" in self.styles:
            from FeatureBuilders.GraphKernelFeatureBuilder import GraphKernelFeatureBuilder
            self.graphKernelFeatureBuilder = GraphKernelFeatureBuilder(
                self.featureSet)
        if "noAnnType" in self.styles:
            self.multiEdgeFeatureBuilder.noAnnType = True
        if "noMasking" in self.styles:
            self.multiEdgeFeatureBuilder.maskNamedEntities = False
        if "maxFeatures" in self.styles:
            self.multiEdgeFeatureBuilder.maximum = True
        self.tokenFeatureBuilder = TokenFeatureBuilder(self.featureSet)
        if "ontology" in self.styles:
            self.multiEdgeFeatureBuilder.ontologyFeatureBuilder = BioInferOntologyFeatureBuilder(
                self.featureSet)
        if "nodalida" in self.styles:
            self.nodalidaFeatureBuilder = NodalidaFeatureBuilder(
                self.featureSet)
        #IF LOCAL
        if "bioinfer_limits" in self.styles:
            self.bioinferOntologies = OntologyUtils.getBioInferTempOntology()
            #self.bioinferOntologies = OntologyUtils.loadOntologies(OntologyUtils.g_bioInferFileName)
        #ENDIF
        self.pathLengths = length
        assert (self.pathLengths == None)
        self.types = types
        if "random" in self.styles:
            from FeatureBuilders.RandomFeatureBuilder import RandomFeatureBuilder
            self.randomFeatureBuilder = RandomFeatureBuilder(self.featureSet)
Example #3
0
 def __init__(self, style):
     ExampleBuilder.__init__(self)
     self.featureBuilder = EdgeFeatureBuilder(self.featureSet)
     self.style = style
     if not "binary" in style:
         self.classSet = IdSet(1)
         assert (self.classSet.getId("neg") == 1)
    def __init__(self, examples, predictions=None, classSet=None):
        if type(classSet) == types.StringType: # class names are in file
            classSet = IdSet(filename=classSet)
        if type(predictions) == types.StringType: # predictions are in file
            predictions = ExampleUtils.loadPredictions(predictions)
        if type(examples) == types.StringType: # examples are in file
            examples = ExampleUtils.readExamples(examples, False)

        self.classSet = classSet
        # define class ids in alphabetical order
        self.classSet = classSet
        if classSet != None:
            classNames = sorted(classSet.Ids.keys())
        else:
            classNames = []
        # make an ordered list of class ids
        self.classes = []
        for className in classNames:
            self.classes.append(classSet.getId(className))
        # create data structures for per-class evaluation
        self.dataByClass = {}
        for cls in self.classes:
            self.dataByClass[cls] = EvaluationData()
        # hack for unnamed classes
        if len(self.dataByClass) == 0:
            self.dataByClass[1] = EvaluationData()
            self.dataByClass[2] = EvaluationData()
        
        #self.untypedUndirected = None
        self.untypedCurrentMajorId = None
        self.untypedPredictionQueue = []
        self.untypedUndirected = EvaluationData()
        #self.AUC = None
        if predictions != None:
            self._calculate(examples, predictions)
    def __init__(self, style="trigger_features:typed:directed:no_linear:entities:genia_limits:noMasking:maxFeatures", length=None, types=[], featureSet=None, classSet=None):
        # reset style regardless of input
        style="trigger_features:typed:directed:no_linear:entities:genia_limits:noMasking:maxFeatures"
        if featureSet == None:
            featureSet = IdSet()
        if classSet == None:
            classSet = IdSet(1)
        else:
            classSet = classSet
        assert( classSet.getId("neg") == 1 )
        
        ExampleBuilder.__init__(self, classSet=classSet, featureSet=featureSet)
        
        self.styles = self.getParameters(style, ["trigger_features","typed","directed","no_linear","entities","genia_limits",
            "noAnnType", "noMasking", "maxFeatures", "no_merge", "disable_entity_features", 
            "disable_single_element_features", "disable_ngram_features", "disable_path_edge_features"])
        self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet)
        self.multiEdgeFeatureBuilder.noAnnType = self.styles["noAnnType"]
        self.multiEdgeFeatureBuilder.maskNamedEntities = not self.styles["noMasking"]
        self.multiEdgeFeatureBuilder.maximum = self.styles["maxFeatures"]
        self.tokenFeatureBuilder = TokenFeatureBuilder(self.featureSet)
        self.pathLengths = length
        assert(self.pathLengths == None)
        self.types = types

        self.triggerFeatureBuilder = TriggerFeatureBuilder(self.featureSet)
        self.triggerFeatureBuilder.useNonNameEntities = True
Example #6
0
    def __init__(self, style=["typed", "directed", "headsOnly"], length=None, types=[], featureSet=None, classSet=None):
        if featureSet == None:
            featureSet = IdSet()
        if classSet == None:
            classSet = IdSet(1)
        else:
            classSet = classSet
        assert classSet.getId("neg") == 1

        ExampleBuilder.__init__(self, classSet=classSet, featureSet=featureSet)
        self.styles = style

        self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet)
        if "noAnnType" in self.styles:
            self.multiEdgeFeatureBuilder.noAnnType = True
        if "noMasking" in self.styles:
            self.multiEdgeFeatureBuilder.maskNamedEntities = False
        if "maxFeatures" in self.styles:
            self.multiEdgeFeatureBuilder.maximum = True
        # self.tokenFeatureBuilder = TokenFeatureBuilder(self.featureSet)
        # if "ontology" in self.styles:
        #    self.multiEdgeFeatureBuilder.ontologyFeatureBuilder = BioInferOntologyFeatureBuilder(self.featureSet)
        self.pathLengths = length
        assert self.pathLengths == None
        self.types = types
Example #7
0
    def __init__(self,
                 style=["typed", "directed", "headsOnly"],
                 length=None,
                 types=[],
                 featureSet=None,
                 classSet=None):
        if featureSet == None:
            featureSet = IdSet()
        if classSet == None:
            classSet = IdSet(1)
        else:
            classSet = classSet
        assert (classSet.getId("neg") == 1)

        ExampleBuilder.__init__(self, classSet=classSet, featureSet=featureSet)
        self.styles = style

        self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet)
        if "noAnnType" in self.styles:
            self.multiEdgeFeatureBuilder.noAnnType = True
        if "noMasking" in self.styles:
            self.multiEdgeFeatureBuilder.maskNamedEntities = False
        if "maxFeatures" in self.styles:
            self.multiEdgeFeatureBuilder.maximum = True
        #self.tokenFeatureBuilder = TokenFeatureBuilder(self.featureSet)
        #if "ontology" in self.styles:
        #    self.multiEdgeFeatureBuilder.ontologyFeatureBuilder = BioInferOntologyFeatureBuilder(self.featureSet)
        self.pathLengths = length
        assert (self.pathLengths == None)
        self.types = types
Example #8
0
    def __init__(self, examples, predictions=None, classSet=None):
        if type(classSet) == types.StringType:  # class names are in file
            classSet = IdSet(filename=classSet)
        if type(predictions) == types.StringType:  # predictions are in file
            predictions = ExampleUtils.loadPredictions(predictions)
        if type(examples) == types.StringType:  # examples are in file
            examples = ExampleUtils.readExamples(examples, False)

        self.classSet = classSet
        # define class ids in alphabetical order
        self.classSet = classSet
        if classSet != None:
            classNames = sorted(classSet.Ids.keys())
        else:
            classNames = []
        # make an ordered list of class ids
        self.classes = []
        for className in classNames:
            self.classes.append(classSet.getId(className))
        # create data structures for per-class evaluation
        self.dataByClass = {}
        for cls in self.classes:
            self.dataByClass[cls] = EvaluationData()
        # hack for unnamed classes
        if len(self.dataByClass) == 0:
            self.dataByClass[1] = EvaluationData()
            self.dataByClass[2] = EvaluationData()

        #self.untypedUndirected = None
        self.untypedCurrentMajorId = None
        self.untypedPredictionQueue = []
        self.untypedUndirected = EvaluationData()
        #self.AUC = None
        if predictions != None:
            self._calculate(examples, predictions)
 def __init__(self, style=["typed","directed","headsOnly"], length=None, types=[], featureSet=None, classSet=None):
     if featureSet == None:
         featureSet = IdSet()
     if classSet == None:
         classSet = IdSet(1)
     else:
         classSet = classSet
     assert( classSet.getId("neg") == 1 )
     
     ExampleBuilder.__init__(self, classSet=classSet, featureSet=featureSet)
     self.styles = style
     
     self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet)
     if "noAnnType" in self.styles:
         self.multiEdgeFeatureBuilder.noAnnType = True
     if "noMasking" in self.styles:
         self.multiEdgeFeatureBuilder.maskNamedEntities = False
     if "maxFeatures" in self.styles:
         self.multiEdgeFeatureBuilder.maximum = True
     self.tokenFeatureBuilder = TokenFeatureBuilder(self.featureSet)
     self.pathLengths = length
     assert(self.pathLengths == None)
     self.types = types
     if "random" in self.styles:
         from FeatureBuilders.RandomFeatureBuilder import RandomFeatureBuilder
         self.randomFeatureBuilder = RandomFeatureBuilder(self.featureSet)
Example #10
0
 def devectorizePredictions(self, predictions):
     """
     Converts a dense Numpy array of [examples][width][height][features] into
     the corresponding Python list matrices where features are stored in a key-value
     dictionary.
     """
     targetIds = IdSet(filename=self.model.get(self.tag+"ids.classes"), locked=True)
     dimMatrix = int(self.model.getStr("dimMatrix"))
     dimLabels = int(self.model.getStr("dimLabels"))
     predictions = reshape(predictions, (predictions.shape[0], dimMatrix, dimMatrix, dimLabels))
     rangeMatrix = range(dimMatrix)
     labels = np.argmax(predictions, axis=-1)
     values = np.max(predictions, axis=-1)
     minValue = np.min(values)
     maxValue = np.max(values)
     valRange = maxValue - minValue
     print "MINMAX", minValue, maxValue
     devectorized = []
     for exampleIndex in range(predictions.shape[0]):
         #print predictions[exampleIndex]
         devectorized.append([])
         for i in rangeMatrix:
             devectorized[-1].append([])
             for j in rangeMatrix:
                 features = {}
                 devectorized[-1][-1].append(features)
                 maxFeature = labels[exampleIndex][i][j]
                 predValue = predictions[exampleIndex][i][j][maxFeature]
                 features[targetIds.getName(maxFeature)] = float(predValue)
                 features["color"] = self.getColor((predValue - minValue) / valRange)
     return devectorized
Example #11
0
    def __init__(self,
                 style=None,
                 classSet=None,
                 featureSet=None,
                 gazetteerFileName=None):
        global speculationWords

        if classSet == None:
            classSet = IdSet(1)
        assert (classSet.getId("neg") == 1)
        if featureSet == None:
            featureSet = IdSet()

        self.specWords, self.specWordStems = readWords(speculationWords)

        ExampleBuilder.__init__(self, classSet, featureSet)
        #gazetteerFileName="/usr/share/biotext/GeniaChallenge/SharedTaskTriggerTest/gazetteer-train"
        if gazetteerFileName != None:
            self.gazetteer = Gazetteer.loadGztr(gazetteerFileName)
            print >> sys.stderr, "Loaded gazetteer from", gazetteerFileName
        else:
            self.gazetteer = None
        self.styles = self.getParameters(style, {
            "classification": "multiclass",
            "speculation_words": True
        }, {"classification": ("multiclass", "speculation", "negation")})
    def __init__(self, style=None, length=None, types=[], featureSet=None, classSet=None):
        # reset style regardless of input
        #style="trigger_features:typed:directed:no_linear:entities:genia_limits:noMasking:maxFeatures"
        if featureSet == None:
            featureSet = IdSet()
        if classSet == None:
            classSet = IdSet(1)
        else:
            classSet = classSet
        assert( classSet.getId("neg") == 1 )
        
        ExampleBuilder.__init__(self, classSet=classSet, featureSet=featureSet)
        
        defaultNone = ["binary", "trigger_features","typed","directed","no_linear","entities","genia_limits",
            "noAnnType", "noMasking", "maxFeatures", "no_merge", "disable_entity_features", 
            "disable_single_element_features", "disable_ngram_features", "disable_path_edge_features"]
        defaultParameters = {}
        for name in defaultNone:
            defaultParameters[name] = None
        defaultParameters["keep_intersentence"] = False
        defaultParameters["keep_intersentence_gold"] = True
        self.styles = self._setDefaultParameters(defaultParameters)
        self.styles = self.getParameters(style)
        self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet)
        self.multiEdgeFeatureBuilder.noAnnType = self.styles["noAnnType"]
        self.multiEdgeFeatureBuilder.maskNamedEntities = not self.styles["noMasking"]
        self.multiEdgeFeatureBuilder.maximum = self.styles["maxFeatures"]
        #self.tokenFeatureBuilder = TokenFeatureBuilder(self.featureSet)
        self.pathLengths = length
        assert(self.pathLengths == None)
        self.types = types

        self.triggerFeatureBuilder = TriggerFeatureBuilder(self.featureSet)
        self.triggerFeatureBuilder.useNonNameEntities = True
Example #13
0
    def __init__(self, style=None, classSet=None, featureSet=None, gazetteerFileName=None, skiplist=None):
        if classSet == None:
            classSet = IdSet(1)
        if featureSet == None:
            featureSet = IdSet()
        
        ExampleBuilder.__init__(self, classSet, featureSet)
        assert( classSet.getId("neg") == 1 )
        #gazetteerFileName="/usr/share/biotext/GeniaChallenge/SharedTaskTriggerTest/gazetteer-train"
        if gazetteerFileName!=None:
            self.gazetteer=Gazetteer.loadGztr(gazetteerFileName)
            print >> sys.stderr, "Loaded gazetteer from",gazetteerFileName
        else:
            print >> sys.stderr, "No gazetteer loaded"
            self.gazetteer=None
        self._setDefaultParameters(["rel_features", "wordnet", "bb_features", "giuliano", 
                                  "epi_merge_negated", "limit_merged_types", "genia_task1",
                                  "names", "build_for_nameless", "skip_for_nameless",
                                  "pos_only", "all_tokens", "pos_pairs", "linear_ngrams", 
                                  "phospho", "drugbank_features", "ddi13_features", "metamap", 
                                  "only_types", "ontobiotope_features", "bb_spans", "w2v",
                                  "no_context"])
        self.styles = self.getParameters(style)
#        if "selftrain_group" in self.styles:
#            self.selfTrainGroups = set()
#            if "selftrain_group-1" in self.styles:
#                self.selfTrainGroups.add("-1")
#            if "selftrain_group0" in self.styles:
#                self.selfTrainGroups.add("0")
#            if "selftrain_group1" in self.styles:
#                self.selfTrainGroups.add("1")
#            if "selftrain_group2" in self.styles:
#                self.selfTrainGroups.add("2")
#            if "selftrain_group3" in self.styles:
#                self.selfTrainGroups.add("3")
#            print >> sys.stderr, "Self-train-groups:", self.selfTrainGroups
        
        self.skiplist = set()
        if skiplist != None:
            f = open(skiplist, "rt")
            for line in f.readlines():
                self.skiplist.add(line.strip())
            f.close()
        
        if self.styles["rel_features"]:
            self.relFeatureBuilder = RELFeatureBuilder(featureSet)
        if self.styles["wordnet"]:
            self.wordNetFeatureBuilder = WordNetFeatureBuilder(featureSet)
        if self.styles["bb_features"]:
            self.bacteriaTokens = PhraseTriggerExampleBuilder.getBacteriaTokens()
            #self.bacteriaTokens = PhraseTriggerExampleBuilder.getBacteriaTokens(PhraseTriggerExampleBuilder.getBacteriaNames())
        if self.styles["giuliano"]:
            self.giulianoFeatureBuilder = GiulianoFeatureBuilder(featureSet)
        if self.styles["drugbank_features"]:
            self.drugFeatureBuilder = DrugFeatureBuilder(featureSet)
        if self.styles["ontobiotope_features"]:
            self.ontobiotopeFeatureBuilder = OntoBiotopeFeatureBuilder(self.featureSet)
        if self.styles["w2v"]:
            self.wordVectorFeatureBuilder = WordVectorFeatureBuilder(featureSet)
 def __init__(self, style=["typed","directed","headsOnly"], length=None, types=[], featureSet=None, classSet=None, gazetteer=None, pathGazetteer=None, negFrac=None):
     if featureSet == None:
         featureSet = IdSet()
     if classSet == None:
         classSet = IdSet(1)
     else:
         classSet = classSet
     assert( classSet.getId("neg") == 1 )
     
     if gazetteer != None:
         print >> sys.stderr, "Loading gazetteer from", gazetteer
         self.gazetteer=Gazetteer.loadGztr(gazetteer)
     else:
         print >> sys.stderr, "No gazetteer loaded"
         self.gazetteer=None
     
     self.pathGazetteer=None
     self.pathGazetteerDependencies = None
     self.pathGazetteerPairs = None
     if pathGazetteer != None:
         print >> sys.stderr, "Loading path gazetteer from", pathGazetteer
         self.pathGazetteer=PathGazetteer.load(pathGazetteer)
         self.pathGazetteerDependencies = PathGazetteer.getDependencies(self.pathGazetteer)
         self.pathGazetteerPairs = PathGazetteer.getPairs(self.pathGazetteer)
     else:
         print >> sys.stderr, "No path gazetteer loaded"
     
     ExampleBuilder.__init__(self, classSet=classSet, featureSet=featureSet)
     self.styles = style
     self.negFrac = negFrac
     print >> sys.stderr, "Downsampling negatives to", negFrac
     self.negRand = random.Random()
     
     self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet)
     if True:#"noAnnType" in self.styles:
         self.multiEdgeFeatureBuilder.noAnnType = True
     if "noMasking" in self.styles:
         self.multiEdgeFeatureBuilder.maskNamedEntities = False
     if "maxFeatures" in self.styles:
         self.multiEdgeFeatureBuilder.maximum = True
     
     self.triggerFeatureBuilder = TriggerFeatureBuilder(self.featureSet)
     #self.tokenFeatureBuilder = TokenFeatureBuilder(self.featureSet)
     #if "ontology" in self.styles:
     #    self.multiEdgeFeatureBuilder.ontologyFeatureBuilder = BioInferOntologyFeatureBuilder(self.featureSet)
     self.pathLengths = length
     assert(self.pathLengths == None)
     self.types = types
     
     self.eventsByOrigId = {}
     self.headTokensByOrigId = {}
     self.interSentenceEvents = set()
     
     self.examplesByEventOrigId = {}
     self.skippedByType = {}
     self.skippedByTypeAndReason = {}
     self.builtByType = {}
     
     self.gazMatchCache = {}
    def __init__(self,
                 style=None,
                 classSet=None,
                 featureSet=None,
                 gazetteerFileName=None,
                 skiplist=None):
        if classSet == None:
            classSet = IdSet(1)
        if featureSet == None:
            featureSet = IdSet()

        ExampleBuilder.__init__(self, classSet, featureSet)
        assert (classSet.getId("neg") == 1)
        #gazetteerFileName="/usr/share/biotext/GeniaChallenge/SharedTaskTriggerTest/gazetteer-train"
        if gazetteerFileName != None:
            self.gazetteer = Gazetteer.loadGztr(gazetteerFileName)
            print >> sys.stderr, "Loaded gazetteer from", gazetteerFileName
        else:
            print >> sys.stderr, "No gazetteer loaded"
            self.gazetteer = None
        self._setDefaultParameters([
            "rel_features", "wordnet", "bb_features", "giuliano",
            "epi_merge_negated", "limit_merged_types", "genia_task1",
            "build_for_nameless", "pos_only", "all_tokens", "names",
            "pos_pairs", "linear_ngrams", "phospho"
        ])
        self.styles = self.getParameters(style)
        #        if "selftrain_group" in self.styles:
        #            self.selfTrainGroups = set()
        #            if "selftrain_group-1" in self.styles:
        #                self.selfTrainGroups.add("-1")
        #            if "selftrain_group0" in self.styles:
        #                self.selfTrainGroups.add("0")
        #            if "selftrain_group1" in self.styles:
        #                self.selfTrainGroups.add("1")
        #            if "selftrain_group2" in self.styles:
        #                self.selfTrainGroups.add("2")
        #            if "selftrain_group3" in self.styles:
        #                self.selfTrainGroups.add("3")
        #            print >> sys.stderr, "Self-train-groups:", self.selfTrainGroups

        self.skiplist = set()
        if skiplist != None:
            f = open(skiplist, "rt")
            for line in f.readlines():
                self.skiplist.add(line.strip())
            f.close()

        if self.styles["rel_features"]:
            self.relFeatureBuilder = RELFeatureBuilder(featureSet)
        if self.styles["wordnet"]:
            self.wordNetFeatureBuilder = WordNetFeatureBuilder(featureSet)
        if self.styles["bb_features"]:
            self.bacteriaTokens = PhraseTriggerExampleBuilder.getBacteriaTokens(
            )
            #self.bacteriaTokens = PhraseTriggerExampleBuilder.getBacteriaTokens(PhraseTriggerExampleBuilder.getBacteriaNames())
        if self.styles["giuliano"]:
            self.giulianoFeatureBuilder = GiulianoFeatureBuilder(featureSet)
 def __init__(self, style=None, classSet=None, featureSet=None, gazetteerFileName=None):
     if classSet == None:
         classSet = IdSet(1)
     assert( classSet.getId("neg") == 1 )
     if featureSet == None:
         featureSet = IdSet()         
     ExampleBuilder.__init__(self, classSet, featureSet)
     
     self.styles = style
     self.triggerFeatureBuilder = TriggerFeatureBuilder(self.featureSet)
     self.triggerFeatureBuilder.useNonNameEntities = False
Example #17
0
def addExamples(exampleFile, predictionFile, classFile, matrix):
    classSet = IdSet(filename=classFile)
    f = open(predictionFile, "rt")
    for example in ExampleUtils.readExamples(exampleFile, False):
        pred = int(f.readline().split()[0])
        predClasses = classSet.getName(pred)
        goldClasses = classSet.getName(example[1])
        for predClass in predClasses.split("---"):
            for goldClass in goldClasses.split("---"):
                matrix[predClass][goldClass]
                matrix[goldClass][predClass] += 1
    f.close()
Example #18
0
def addExamples(exampleFile, predictionFile, classFile, matrix):
    classSet = IdSet(filename=classFile)
    f = open(predictionFile, "rt")
    for example in ExampleUtils.readExamples(exampleFile, False):
        pred = int(f.readline().split()[0])
        predClasses = classSet.getName(pred)
        goldClasses = classSet.getName(example[1])
        for predClass in predClasses.split("---"):
            for goldClass in goldClasses.split("---"):
                matrix[predClass][goldClass]
                matrix[goldClass][predClass] += 1
    f.close()
Example #19
0
def run(EvaluatorClass,
        inputCorpusFile,
        goldCorpusFile,
        parse,
        tokenization=None,
        target="both",
        entityMatchFunction=compareEntitiesSimple,
        removeIntersentenceInteractions=False,
        errorMatrix=False,
        verbose=False):
    print >> sys.stderr, "##### EvaluateInteractionXML #####"
    print >> sys.stderr, "Comparing input", inputCorpusFile, "to gold", goldCorpusFile
    # Class sets are used to convert the types to ids that the evaluator can use
    classSets = {}
    if EvaluatorClass.type == "binary":
        classSets["entity"] = IdSet(idDict={
            "True": 1,
            "False": -1
        },
                                    locked=True)
        classSets["interaction"] = IdSet(idDict={
            "True": 1,
            "False": -1
        },
                                         locked=True)
        negativeClassId = -1
    elif EvaluatorClass.type == "multiclass":
        classSets["entity"] = IdSet(idDict={"neg": 1}, locked=False)
        classSets["interaction"] = IdSet(idDict={"neg": 1}, locked=False)
        negativeClassId = 1
    else:
        sys.exit("Unknown evaluator type")

    # Load corpus and make sentence graphs
    goldCorpusElements = None
    if goldCorpusFile != None:
        goldCorpusElements = SentenceGraph.loadCorpus(
            goldCorpusFile, parse, tokenization, False,
            removeIntersentenceInteractions)
    predictedCorpusElements = SentenceGraph.loadCorpus(
        inputCorpusFile, parse, tokenization, False,
        removeIntersentenceInteractions)

    # Compare the corpora and print results on screen
    return processCorpora(EvaluatorClass,
                          predictedCorpusElements,
                          goldCorpusElements,
                          target,
                          classSets,
                          negativeClassId,
                          entityMatchFunction,
                          errorMatrix=errorMatrix,
                          verbose=verbose)
    def __init__(self, style=None, classSet=None, featureSet=None, gazetteerFileName=None):
        if classSet == None:
            classSet = IdSet(1)
        assert classSet.getId("neg") == 1
        if featureSet == None:
            featureSet = IdSet()
        ExampleBuilder.__init__(self, classSet, featureSet)

        self._setDefaultParameters(["co_limits"])
        self.styles = self.getParameters(style)
        self.triggerFeatureBuilder = TriggerFeatureBuilder(self.featureSet)
        self.triggerFeatureBuilder.useNonNameEntities = False
        self.phraseTypeCounts = {}
 def __init__(self, style=["typed","directed"], length=None, types=[], featureSet=None, classSet=None):
     if featureSet == None:
         featureSet = IdSet()
     if classSet == None:
         classSet = IdSet(1)
     else:
         classSet = classSet
     assert( classSet.getId("neg") == 1 )
     
     ExampleBuilder.__init__(self, classSet=classSet, featureSet=featureSet)
     if style.find(",") != -1:
         style = style.split(",")
     self.styles = style
     
     self.negFrac = None
     self.posPairGaz = POSPairGazetteer()
     for s in style:
         if s.find("negFrac") != -1:      
             self.negFrac = float(s.split("_")[-1])
             print >> sys.stderr, "Downsampling negatives to", self.negFrac
             self.negRand = random.Random(15)
         elif s.find("posPairGaz") != -1:
             self.posPairGaz = POSPairGazetteer(loadFrom=s.split("_", 1)[-1])
     
     self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet)
     self.triggerFeatureBuilder = TriggerFeatureBuilder(self.featureSet)
     if "graph_kernel" in self.styles:
         from FeatureBuilders.GraphKernelFeatureBuilder import GraphKernelFeatureBuilder
         self.graphKernelFeatureBuilder = GraphKernelFeatureBuilder(self.featureSet)
     if "noAnnType" in self.styles:
         self.multiEdgeFeatureBuilder.noAnnType = True
     if "noMasking" in self.styles:
         self.multiEdgeFeatureBuilder.maskNamedEntities = False
     if "maxFeatures" in self.styles:
         self.multiEdgeFeatureBuilder.maximum = True
     self.tokenFeatureBuilder = TokenFeatureBuilder(self.featureSet)
     if "ontology" in self.styles:
         self.multiEdgeFeatureBuilder.ontologyFeatureBuilder = BioInferOntologyFeatureBuilder(self.featureSet)
     if "nodalida" in self.styles:
         self.nodalidaFeatureBuilder = NodalidaFeatureBuilder(self.featureSet)
     #IF LOCAL
     if "bioinfer_limits" in self.styles:
         self.bioinferOntologies = OntologyUtils.getBioInferTempOntology()
         #self.bioinferOntologies = OntologyUtils.loadOntologies(OntologyUtils.g_bioInferFileName)
     #ENDIF
     self.pathLengths = length
     assert(self.pathLengths == None)
     self.types = types
     if "random" in self.styles:
         from FeatureBuilders.RandomFeatureBuilder import RandomFeatureBuilder
         self.randomFeatureBuilder = RandomFeatureBuilder(self.featureSet)
Example #22
0
 def __init__(self, style=None, classSet=None, featureSet=None):
     if classSet == None:
         classSet = IdSet(1)
     assert( classSet.getId("neg") == 1 )
     if featureSet == None:
         featureSet = IdSet()
     
     ExampleBuilder.__init__(self, classSet, featureSet)
     self.styles = style
     self.timerBuildExamples = Timer(False)
     self.timerCrawl = Timer(False)
     self.timerCrawlPrecalc = Timer(False)
     self.timerMatrix = Timer(False)
     self.timerMatrixPrecalc = Timer(False)
Example #23
0
 def __init__(self, style=None, classSet=None, featureSet=None, gazetteerFileName=None):
     if classSet == None:
         classSet = IdSet(1)
     assert( classSet.getId("neg") == 1 )
     if featureSet == None:
         featureSet = IdSet()
     
     ExampleBuilder.__init__(self, classSet, featureSet)
     if gazetteerFileName!=None:
         self.gazetteer=Gazetteer.loadGztr(gazetteerFileName)
         print >> sys.stderr, "Loaded gazetteer from",gazetteerFileName
     else:
         print >> sys.stderr, "No gazetteer loaded"
         self.gazetteer=None
     self.styles = style
    def __init__(self,
                 style=None,
                 classSet=None,
                 featureSet=None,
                 gazetteerFileName=None):
        if classSet == None:
            classSet = IdSet(1)
        assert (classSet.getId("neg") == 1)
        if featureSet == None:
            featureSet = IdSet()
        ExampleBuilder.__init__(self, classSet, featureSet)

        self.styles = style
        self.triggerFeatureBuilder = TriggerFeatureBuilder(self.featureSet)
        self.triggerFeatureBuilder.useNonNameEntities = False
Example #25
0
    def __init__(self, examples=None, predictions=None, classSet=None):
        if type(classSet) == types.StringType:  # class names are in file
            classSet = IdSet(filename=classSet)
        if type(predictions) == types.StringType:  # predictions are in file
            predictions = ExampleUtils.loadPredictions(predictions)
        if type(examples) == types.StringType:  # examples are in file
            examples = ExampleUtils.readExamples(examples, False)

        self.keep = set(["CPR:3", "CPR:4", "CPR:5", "CPR:6", "CPR:9"])

        self.classSet = classSet
        self.results = None
        self.internal = None
        if predictions != None:
            for example in examples:
                if example[3] != None:
                    print >> sys.stderr, "ChemProt Evaluator:"
                    self._calculateExamples(examples, predictions)
                else:
                    print >> sys.stderr, "No example extra info, skipping ChemProt evaluation"
                break
            self.internal = AveragingMultiClassEvaluator(
                examples, predictions, classSet)
            print >> sys.stderr, "AveragingMultiClassEvaluator:"
            print >> sys.stderr, self.internal.toStringConcise()
Example #26
0
    def __init__(self, examples, predictions=None, classSet=None):
        if type(classSet) == types.StringType:  # class names are in file
            classSet = IdSet(filename=classSet)
        if type(predictions) == types.StringType:  # predictions are in file
            predictions = ExampleUtils.loadPredictions(predictions)
        if type(examples) == types.StringType:  # examples are in file
            examples = ExampleUtils.readExamples(examples, False)

        SharedTaskEvaluator.corpusElements = Core.SentenceGraph.loadCorpus(
            SharedTaskEvaluator.corpusFilename, SharedTaskEvaluator.parse,
            SharedTaskEvaluator.tokenization)
        # Build interaction xml
        xml = BioTextExampleWriter.write(
            examples, predictions, SharedTaskEvaluator.corpusElements, None,
            SharedTaskEvaluator.ids + ".class_names",
            SharedTaskEvaluator.parse, SharedTaskEvaluator.tokenization)
        #xml = ExampleUtils.writeToInteractionXML(examples, predictions, SharedTaskEvaluator.corpusElements, None, "genia-direct-event-ids.class_names", SharedTaskEvaluator.parse, SharedTaskEvaluator.tokenization)
        # Convert to GENIA format
        gifxmlToGenia(xml,
                      SharedTaskEvaluator.geniaDir,
                      task=SharedTaskEvaluator.task,
                      verbose=False)
        # Use GENIA evaluation tool
        self.results = evaluateSharedTask(SharedTaskEvaluator.geniaDir,
                                          task=SharedTaskEvaluator.task,
                                          evaluations=["approximate"],
                                          verbose=False)
 def __init__(self, style):
     ExampleBuilder.__init__(self)
     self.featureBuilder = EdgeFeatureBuilder(self.featureSet)
     self.style = style
     if not "binary" in style:
         self.classSet = IdSet(1)
         assert( self.classSet.getId("neg") == 1 )
    def __init__(self, style=None, classSet=None, featureSet=None, gazetteerFileName=None, skiplist=None):
        if classSet == None:
            classSet = IdSet(1)
        assert classSet.getId("neg") == 1
        if featureSet == None:
            featureSet = IdSet()

        ExampleBuilder.__init__(self, classSet, featureSet)
        # gazetteerFileName="/usr/share/biotext/GeniaChallenge/SharedTaskTriggerTest/gazetteer-train"
        if gazetteerFileName != None:
            self.gazetteer = Gazetteer.loadGztr(gazetteerFileName)
            print >>sys.stderr, "Loaded gazetteer from", gazetteerFileName
        else:
            print >>sys.stderr, "No gazetteer loaded"
            self.gazetteer = None
        self.styles = style

        self.skiplist = set()
        if skiplist != None:
            f = open(skiplist, "rt")
            for line in f.readlines():
                self.skiplist.add(line.strip())
            f.close()

        self.styles = [
            "trigger_features",
            "typed",
            "directed",
            "no_linear",
            "entities",
            "genia_limits",
            "noMasking",
            "maxFeatures",
        ]
        self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet)
        if "graph_kernel" in self.styles:
            from FeatureBuilders.GraphKernelFeatureBuilder import GraphKernelFeatureBuilder

            self.graphKernelFeatureBuilder = GraphKernelFeatureBuilder(self.featureSet)
        if "noAnnType" in self.styles:
            self.multiEdgeFeatureBuilder.noAnnType = True
        if "noMasking" in self.styles:
            self.multiEdgeFeatureBuilder.maskNamedEntities = False
        if "maxFeatures" in self.styles:
            self.multiEdgeFeatureBuilder.maximum = True

        self.triggerFeatureBuilder = TriggerFeatureBuilder(self.featureSet)
Example #29
0
    def __init__(self,
                 style=None,
                 classSet=None,
                 featureSet=None,
                 gazetteerFileName=None,
                 skiplist=None):
        if classSet == None:
            classSet = IdSet(1)
        assert (classSet.getId("neg") == 1)
        if featureSet == None:
            featureSet = IdSet()

        ExampleBuilder.__init__(self, classSet, featureSet)
        #gazetteerFileName="/usr/share/biotext/GeniaChallenge/SharedTaskTriggerTest/gazetteer-train"
        if gazetteerFileName != None:
            self.gazetteer = Gazetteer.loadGztr(gazetteerFileName)
            print >> sys.stderr, "Loaded gazetteer from", gazetteerFileName
        else:
            print >> sys.stderr, "No gazetteer loaded"
            self.gazetteer = None
        self.styles = style

        self.skiplist = set()
        if skiplist != None:
            f = open(skiplist, "rt")
            for line in f.readlines():
                self.skiplist.add(line.strip())
            f.close()

        self.styles = [
            "trigger_features", "typed", "directed", "no_linear", "entities",
            "genia_limits", "noMasking", "maxFeatures"
        ]
        self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet)
        if "graph_kernel" in self.styles:
            from FeatureBuilders.GraphKernelFeatureBuilder import GraphKernelFeatureBuilder
            self.graphKernelFeatureBuilder = GraphKernelFeatureBuilder(
                self.featureSet)
        if "noAnnType" in self.styles:
            self.multiEdgeFeatureBuilder.noAnnType = True
        if "noMasking" in self.styles:
            self.multiEdgeFeatureBuilder.maskNamedEntities = False
        if "maxFeatures" in self.styles:
            self.multiEdgeFeatureBuilder.maximum = True

        self.triggerFeatureBuilder = TriggerFeatureBuilder(self.featureSet)
    def __init__(self,
                 style=None,
                 classSet=None,
                 featureSet=None,
                 gazetteerFileName=None):
        if classSet == None:
            classSet = IdSet(1)
        assert (classSet.getId("neg") == 1)
        if featureSet == None:
            featureSet = IdSet()
        ExampleBuilder.__init__(self, classSet, featureSet)

        self._setDefaultParameters(["co_limits"])
        self.styles = self.getParameters(style)
        self.triggerFeatureBuilder = TriggerFeatureBuilder(self.featureSet)
        self.triggerFeatureBuilder.useNonNameEntities = False
        self.phraseTypeCounts = {}
Example #31
0
 def __init__(self, style=["typed","directed","headsOnly"], length=None, types=[], featureSet=None, classSet=None):
     if featureSet == None:
         featureSet = IdSet()
     if classSet == None:
         classSet = IdSet(1)
     else:
         classSet = classSet
     assert( classSet.getId("neg") == 1 )
     
     ExampleBuilder.__init__(self, classSet=classSet, featureSet=featureSet)
     self.styles = style
     
     self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet)
     
     self.counts = {}
     self.countsPerType = {}
     self.untypedCounts = {}
     self.tokenCounts = {}
Example #32
0
 def __init__(self, style=None, classSet=None, featureSet=None, gazetteerFileName=None):
     if classSet == None:
         classSet = IdSet(1)
     assert( classSet.getId("neg") == 1 )
     if featureSet == None:
         featureSet = IdSet()
     
     ExampleBuilder.__init__(self, classSet, featureSet)
     #gazetteerFileName="/usr/share/biotext/GeniaChallenge/SharedTaskTriggerTest/gazetteer-train"
     if gazetteerFileName!=None:
         self.gazetteer=Gazetteer.loadGztr(gazetteerFileName)
         print >> sys.stderr, "Loaded gazetteer from",gazetteerFileName
     else:
         print >> sys.stderr, "No gazetteer loaded"
         self.gazetteer=None
     self.styles = style
     
     self.excludedPOS = ["","(",")",",",".","CC","EX","FW","LS","MD","PDT","POS","PRP","PRP$","RBR","RBS","RP","WDT","WP","WP$","``"]
Example #33
0
 def __init__(self, style=None, classSet=None, featureSet=None, gazetteerFileName=None):
     global speculationWords
     
     if classSet == None:
         classSet = IdSet(1)
     assert( classSet.getId("neg") == 1 )
     if featureSet == None:
         featureSet = IdSet()
     
     self.specWords, self.specWordStems = readWords(speculationWords) 
     
     ExampleBuilder.__init__(self, classSet, featureSet)
     #gazetteerFileName="/usr/share/biotext/GeniaChallenge/SharedTaskTriggerTest/gazetteer-train"
     if gazetteerFileName!=None:
         self.gazetteer=Gazetteer.loadGztr(gazetteerFileName)
         print >> sys.stderr, "Loaded gazetteer from",gazetteerFileName
     else:
         self.gazetteer=None
     self.styles = self.getParameters(style, {"classification":"multiclass", "speculation_words":True}, {"classification":("multiclass", "speculation", "negation")})
Example #34
0
    def __init__(self,
                 style=None,
                 classSet=None,
                 featureSet=None,
                 gazetteerFileName=None,
                 skiplist=None):
        if classSet == None:
            classSet = IdSet(0)
        if featureSet == None:
            featureSet = IdSet(0)

        ExampleBuilder.__init__(self, classSet, featureSet)

        self.featureIds = self.featureSet
        self.labelIds = self.classSet

        self._setDefaultParameters([
            "directed", "undirected", "cutoff", "annotated_only",
            "all_positive", "wv", "epochs", "html", "autoencode", "lr",
            "patience"
        ])
        self.styles = self.getParameters(style)
        if self.styles["cutoff"]:
            self.styles["cutoff"] = int(self.styles["cutoff"])

        self.wvIndices = None
        self.embeddingMatrices = None
        if self.styles.get("wv") != None:
            indexPath = self.styles.get("wv") + "-indices.json.gz"
            if not os.path.exists(indexPath):
                indexPath = os.path.join(Settings.DATAPATH, "wv", indexPath)
            print >> sys.stderr, "Loading word vector indices from", indexPath
            with gzip.open(indexPath, "rt") as f:
                self.wvIndices = json.load(f)["indices"]
            self.embeddingMatrices = []

        self.dimMatrix = 32
        self.rangeMatrix = range(self.dimMatrix)
        self.featureMatrices = []
        self.labelMatrices = []
        self.tokenLists = []
 def __init__(self, style=["typed","directed","headsOnly"], featureSet=None, classSet=None):
     if featureSet == None:
         featureSet = IdSet()
     if classSet == None:
         classSet = IdSet(1)
     else:
         classSet = classSet
     assert( classSet.getId("neg") == 1 )
     
     ExampleBuilder.__init__(self, classSet=classSet, featureSet=featureSet)
     self.styles = style
     
     self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet)
     #if "noAnnType" in self.styles:
     self.multiEdgeFeatureBuilder.noAnnType = True
     #if "noMasking" in self.styles:
     self.multiEdgeFeatureBuilder.maskNamedEntities = False
     #if "maxFeatures" in self.styles:
     self.multiEdgeFeatureBuilder.maximum = True
     self.triggerFeatureBuilder = TriggerFeatureBuilder(self.featureSet)
     self.triggerFeatureBuilder.useNonNameEntities = False
Example #36
0
 def __init__(self, classSet=None, featureSet=None):
     if(type(classSet) == types.StringType):
         self.classSet = IdSet(filename=classSet)
     else:
         self.classSet = classSet
     
     if(type(featureSet) == types.StringType):
         self.featureSet = IdSet(filename=featureSet)
     else:
         self.featureSet = featureSet
     
     self.featureTag = ""      
     self.exampleStats = ExampleStats()
     self.parse = None
     self.tokenization = None
     #self.idFileTag = None
     self.classIdFilename = None
     self.featureIdFilename = None
     
     self.styles = None
     self._defaultParameters = None
     self._parameterValueLimits = None
     self._setDefaultParameters(["sentenceLimit"])
Example #37
0
    def __init__(self,
                 style=["typed", "directed", "headsOnly"],
                 length=None,
                 types=[],
                 featureSet=None,
                 classSet=None):
        if featureSet == None:
            featureSet = IdSet()
        if classSet == None:
            classSet = IdSet(1)
        else:
            classSet = classSet
        assert (classSet.getId("neg") == 1)

        ExampleBuilder.__init__(self, classSet=classSet, featureSet=featureSet)
        self.styles = style

        self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet)

        self.counts = {}
        self.countsPerType = {}
        self.untypedCounts = {}
        self.tokenCounts = {}
Example #38
0
    def __init__(self, classSet=None, featureSet=None):
        if (type(classSet) == types.StringType):
            self.classSet = IdSet(filename=classSet)
        else:
            self.classSet = classSet

        if (type(featureSet) == types.StringType):
            self.featureSet = IdSet(filename=featureSet)
        else:
            self.featureSet = featureSet

        self.featureTag = ""
        self.exampleStats = ExampleStats()
        self.parse = None
        self.tokenization = None
        #self.idFileTag = None
        self.classIdFilename = None
        self.featureIdFilename = None

        self.styles = {}
        self._defaultParameters = None
        self._parameterValueLimits = None
        self._setDefaultParameters(["sentenceLimit"])
        self.debug = False
Example #39
0
    def __init__(self, examples, predictions=None, classSet=None):
        if type(classSet) == types.StringType:  # class names are in file
            classSet = IdSet(filename=classSet)
        if type(predictions) == types.StringType:  # predictions are in file
            predictions = ExampleUtils.loadPredictions(predictions)
        if type(examples) == types.StringType:  # examples are in file
            examples = ExampleUtils.readExamples(examples, False)

        self.classSet = classSet
        self.dataByClass = defaultdict(EvaluationData)

        #self.untypedUndirected = None
        self.untypedCurrentMajorId = None
        self.untypedPredictionQueue = []
        self.untypedUndirected = EvaluationData()
        #self.AUC = None
        if predictions != None:
            self._calculate(examples, predictions)
Example #40
0
def getClassSet(rows, classSet=None):
    from Core.IdSet import IdSet
    classNames = set()
    for row in rows:
        classNames.add(row["class"])
        classNames.add(row["prediction"])
    
    # In the case of multiclass, give integer id:s for the classes
    if classSet == None:
        classSet = IdSet()
        assert(not ("1" in classNames and "neg" in classNames))
        assert("1" in classNames or "neg" in classNames)
        if "1" in classNames:
            classSet.defineId("1",1)
        else:
            classSet.defineId("neg",1)
    for i in sorted(list(classNames)):
        if i != "1" and i != "neg":
            classSet.getId(i)
    return classSet
Example #41
0
 def getIdSets(self, classIds=None, featureIds=None, allowNewIds=True):
     # Class ids
     #print classIds
     #print featureIds
     if classIds != None and os.path.exists(classIds):
         print >> sys.stderr, "Using predefined class names from", classIds
         classSet = IdSet(allowNewIds=allowNewIds)
         classSet.load(classIds)
     else:
         print >> sys.stderr, "No predefined class names"
         classSet = None
     # Feature ids
     if featureIds != None and os.path.exists(featureIds):
         print >> sys.stderr, "Using predefined feature names from", featureIds
         featureSet = IdSet(allowNewIds=allowNewIds)
         featureSet.load(featureIds)
     else:
         print >> sys.stderr, "No predefined feature names"
         featureSet = None
     return classSet, featureSet
Example #42
0
 def __init__(self, examples=None, predictions=None, classSet=None):
     if type(classSet) == types.StringType:  # class names are in file
         classSet = IdSet(filename=classSet)
     if type(predictions) == types.StringType:  # predictions are in file
         predictions = ExampleUtils.loadPredictions(predictions)
     if type(examples) == types.StringType:  # examples are in file
         examples = ExampleUtils.readExamples(examples, False)
     #self.examples = examples
     #self.predictions = predictions
     self.truePositives = 0
     self.falsePositives = 0
     self.trueNegatives = 0
     self.falseNegatives = 0
     self.precision = None
     self.recall = None
     self.fScore = None
     self.AUC = None
     self.type = "binary"
     if predictions != None:
         self._calculate(examples, predictions)
Example #43
0
 def __init__(self, examples, predictions=None, classSet=None):
     if type(classSet) == types.StringType: # class names are in file
         classSet = IdSet(filename=classSet)
     if type(predictions) == types.StringType: # predictions are in file
         predictions = ExampleUtils.loadPredictions(predictions)
     if type(examples) == types.StringType: # examples are in file
         examples = ExampleUtils.readExamples(examples, False)
     
     corpusElements = Core.SentenceGraph.loadCorpus(BXEvaluator.corpusFilename, BXEvaluator.parse, BXEvaluator.tokenization)
     # Build interaction xml
     xml = BioTextExampleWriter.write(examples, predictions, corpusElements, None, BXEvaluator.ids+".class_names", BXEvaluator.parse, BXEvaluator.tokenization)
     xml = ix.splitMergedElements(xml, None)
     xml = ix.recalculateIds(xml, None, True)
     #xml = ExampleUtils.writeToInteractionXML(examples, predictions, SharedTaskEvaluator.corpusElements, None, "genia-direct-event-ids.class_names", SharedTaskEvaluator.parse, SharedTaskEvaluator.tokenization)
     # Convert to GENIA format
     STFormat.ConvertXML.toSTFormat(xml, BXEvaluator.geniaDir, outputTag="a2")
     #gifxmlToGenia(xml, BXEvaluator.geniaDir, task=SharedTaskEvaluator.task, verbose=False)
     # Use GENIA evaluation tool
     self.results = BioNLP11GeniaTools.evaluateBX(BXEvaluator.geniaDir, corpusName=BXEvaluator.corpusTag)
     corpusElements = None
Example #44
0
def getClassSet(rows, classSet=None):
    from Core.IdSet import IdSet
    classNames = set()
    for row in rows:
        classNames.add(row["class"])
        classNames.add(row["prediction"])
    
    # In the case of multiclass, give integer id:s for the classes
    if classSet == None:
        classSet = IdSet()
        assert(not ("1" in classNames and "neg" in classNames))
        assert("1" in classNames or "neg" in classNames)
        if "1" in classNames:
            classSet.defineId("1",1)
        else:
            classSet.defineId("neg",1)
    for i in sorted(list(classNames)):
        if i != "1" and i != "neg":
            classSet.getId(i)
    return classSet
Example #45
0
 def getIdSets(self, classIds=None, featureIds=None, allowNewIds=True):
     # Class ids
     #print classIds
     #print featureIds
     if classIds != None and os.path.exists(classIds):
         print >> sys.stderr, "Using predefined class names from", classIds
         classSet = IdSet(allowNewIds=allowNewIds)
         classSet.load(classIds)
     else:
         print >> sys.stderr, "No predefined class names"
         classSet = None
     # Feature ids
     if featureIds != None and os.path.exists(featureIds):
         print >> sys.stderr, "Using predefined feature names from", featureIds
         featureSet = IdSet(allowNewIds=allowNewIds)
         featureSet.load(featureIds)
     else:
         print >> sys.stderr, "No predefined feature names"
         featureSet = None
     return classSet, featureSet
Example #46
0
def compareExamples(examples1, examples2, features1, features2=None):
    ExampleUtils.readExamples(examples1)
    exampleIter1 = ExampleUtils.readExamples(examples1)
    exampleIter2 = ExampleUtils.readExamples(examples2)
    features1 = IdSet(filename=features1)
    if features2 != None:
        features2 = IdSet(filename=features2)
    else:
        features2 = features1
    # Compare feature sets
    if set(features1.Ids.keys()) != set(features2.Ids.keys()):
        print "Feature sets differ"
    # Compare examples
    counter = ProgressCounter(step=1)
    for e1, e2 in itertools.izip(exampleIter1, exampleIter2):
        counter.update()
        assert e1[0] == e2[0], (removeFeatures(e1), removeFeatures(e2))
        if e1[1] != e2[1]:
            print "Class differs"
            print "  E1", removeFeatures(e1)
            print "  E2", removeFeatures(e2)
        f1 = getFeatureNames(e1, features1)
        f2 = getFeatureNames(e2, features2)
        f1Set = set(f1)
        f2Set = set(f2)
        f1Only = f1Set.difference(f2Set)
        f2Only = f2Set.difference(f1Set)
        if len(f1Only) > 0 or len(f2Only) > 0:
            print "Features differ"
            print "  E1", removeFeatures(e1)
            print "  E2", removeFeatures(e2)
            if len(f1Only) > 0:
                print "  E1-only features:", f1Only
            if len(f2Only) > 0:
                print "  E2-only features:", f2Only
        else:
            assert len(f1) == len(f2)
            fCount = 0
            differ = False
            for feature1, feature2 in zip(f1, f2):
                #f1Id = features1.getId(feature1, createIfNotExist=False)
                #if f1Id == 454 or feature1 == "e1_strength_Positive_regulation":
                #    print "!!!!!!!!!!!", 454, feature1, e1[2][f1Id]
                if feature1 != feature2:
                    if not differ:
                        print "Feature order differs for example", e1[0]
                        differ = True
                    print "[" + feature1 + "/" + feature2 + "](" + str(fCount) + ") ",
                else:
                    f1Id = features1.getId(feature1, createIfNotExist=False)
                    f2Id = features2.getId(feature2, createIfNotExist=False)
                    f1Value = e1[2][f1Id]
                    f2Value = e2[2][f2Id]
                    if f1Value != f2Value:
                        if not differ:
                            print "Feature values differ", e1[0]
                            differ = True
                        print "[" + feature1 + "/" + str(f1Id) + "]" + "[" + str(f1Value) + "/" + str(f2Value) + "]" + "(" + str(fCount) + ") ",
                fCount += 1              
            if differ:
                print
    counter.endUpdate()
Example #47
0
    def writeXML(self,
                 examples,
                 predictions,
                 corpus,
                 outputFile,
                 classSet=None,
                 parse=None,
                 tokenization=None,
                 goldCorpus=None,
                 exampleStyle=None,
                 structureAnalyzer=None):
        """
        Writes task 3 examples to interaction XML. Assumes task 3 classification
        is done with SVMMulticlass Classifier, used for two classes.
        """
        print >> sys.stderr, "Adding task 3 to Interaction XML"
        examples, predictions = self.loadExamples(examples, predictions)

        if type(classSet) == types.StringType:  # class names are in file
            classSet = IdSet(filename=classSet)
        classIds = None
        if classSet != None:
            classIds = classSet.getIds()

        corpusTree = ETUtils.ETFromObj(corpus)
        corpusRoot = corpusTree.getroot()

        # Determine subtask
        task3Type = None
        for example in examples:
            assert example[3].has_key("t3type")
            task3Type = example[3]["t3type"]
            break
        if task3Type == None:
            if outputFile != None:
                print >> sys.stderr, "Writing corpus to", outputFile
                ETUtils.write(corpusRoot, outputFile)
            return corpusTree
        assert task3Type in ["multiclass", "speculation", "negation"]

        # Remove the task 3 subtask information if it already exists
        for entity in corpusRoot.getiterator("entity"):
            if task3Type == "multiclass":
                entity.set("speculation", "False")
                entity.set("negation", "False")
            elif task3Type == "speculation":
                entity.set("speculation", "False")
            else:  # task3Type == "negation"
                entity.set("negation", "False")

        specMap = {}
        negMap = {}
        for example, prediction in itertools.izip(examples, predictions):
            assert example[3]["xtype"] == "task3"
            if example[3]["t3type"] == "multiclass":
                if isinstance(prediction, dict):
                    encoded = prediction["prediction"]
                    predictedModifiers = [
                        classSet.getName(i) for i in range(len(encoded))
                        if encoded[i] == 1
                    ]
                else:
                    predictedClassName = classSet.getName(prediction[0])
                    predictedModifiers = ""
                    if predictedClassName != "neg":
                        predictedModifiers = predictedClassName.split("---")
                if "negation" in predictedModifiers:
                    assert not negMap.has_key(example[3]["entity"])
                    negMap[example[3]["entity"]] = (True, prediction)
                if "speculation" in predictedModifiers:
                    assert not specMap.has_key(example[3]["entity"])
                    specMap[example[3]["entity"]] = (True, prediction)
            else:
                if example[3]["t3type"] == "speculation":
                    map = specMap
                else:
                    map = negMap
                if prediction[0] != 1:
                    assert not map.has_key(example[3]["entity"])
                    map[example[3]["entity"]] = (True, prediction)
                else:
                    assert not map.has_key(example[3]["entity"])
                    map[example[3]["entity"]] = (False, prediction)

        for entity in corpusRoot.getiterator("entity"):
            eId = entity.get("id")
            if task3Type == "multiclass":
                if specMap.has_key(eId):
                    entity.set("speculation", str(specMap[eId][0]))
                    entity.set(
                        "modConf",
                        self.getPredictionStrengthString(
                            specMap[eId][1], classSet, classIds))
                if negMap.has_key(eId):
                    entity.set("negation", str(negMap[eId][0]))
                    entity.set(
                        "modConf",
                        self.getPredictionStrengthString(
                            negMap[eId][1], classSet, classIds))
            else:
                if task3Type == "speculation":
                    if specMap.has_key(eId):
                        entity.set("speculation", str(specMap[eId][0]))
                        entity.set(
                            "specConf",
                            self.getPredictionStrengthString(
                                specMap[eId][1], classSet, classIds, [""]))
                elif task3Type == "negation":
                    if negMap.has_key(eId):
                        entity.set("negation", str(negMap[eId][0]))
                        entity.set(
                            "negConf",
                            self.getPredictionStrengthString(
                                negMap[eId][1], classSet, classIds,
                                ["", "speculation"]))

        # Write corpus
        if outputFile != None:
            print >> sys.stderr, "Writing corpus to", outputFile
            ETUtils.write(corpusRoot, outputFile)
        return corpusTree
Example #48
0
        import psyco
        psyco.full()
        print >> sys.stderr, "Found Psyco, using"
    except ImportError:
        print >> sys.stderr, "Psyco not installed"
    
    defaultAnalysisFilename = "/usr/share/biotext/ComplexPPI/BioInferForComplexPPIVisible.xml"
    optparser = OptionParser(usage="%prog [options]\nCreate an html visualization for a corpus.")
    optparser.add_option("-i", "--invariant", default=None, dest="invariant", help="Corpus in analysis format", metavar="FILE")
    optparser.add_option("-v", "--variant", default=None, dest="variant", help="Corpus in analysis format", metavar="FILE")
    (options, args) = optparser.parse_args()
    
    #invariantExamples = ExampleUtils.readExamples(os.path.join(options.invariant, "examples.txt"))
    variantExamples = ExampleUtils.readExamples(os.path.join(options.variant, "test-triggers.examples"))
    
    invariantFeatureSet = IdSet()
    invariantFeatureSet.load(os.path.join(options.invariant, "feature_names.txt"))
    invariantClassSet = IdSet()
    invariantClassSet.load(os.path.join(options.invariant, "class_names.txt"))

    variantFeatureSet = IdSet()
    variantFeatureSet.load(os.path.join(options.variant, "test-triggers.examples.feature_names"))
    variantClassSet = IdSet()
    variantClassSet.load(os.path.join(options.variant, "test-triggers.examples.class_names"))
    
    counter = ProgressCounter(len(variantExamples))
    for example in variantExamples:
        counter.update()
        example[1] = invariantClassSet.getId(variantClassSet.getName(example[1]))
        newFeatures = {}
        for k,v in example[2].iteritems():
Example #49
0
 def writeXML(self, examples, predictions, corpus, outputFile, classSet=None, parse=None, tokenization=None, goldCorpus=None):
     #print >> sys.stderr, "Writing output to Interaction XML"
     corpus = self.loadCorpus(corpus, parse, tokenization)
     if goldCorpus != None:
         goldCorpus = self.loadCorpus(corpus, parse, tokenization)
     examples, predictions = self.loadExamples(examples, predictions)
     
     if type(classSet) == types.StringType: # class names are in file
         classSet = IdSet(filename=classSet)
     classIds = None
     if classSet != None:
         classIds = classSet.getIds()
         
     #counter = ProgressCounter(len(corpus.sentences), "Write Examples")
             
     exampleQueue = [] # One sentence's examples
     predictionsByExample = {}
     currentMajorId = None
     prevMajorIds = set()
     processedSentenceIds = set()
     xType = None
     
     count = 0
     for example in examples:
         count += 1
     assert count > 0
     progress = ProgressCounter(count, "Write Examples")
     
     for example, prediction in itertools.izip_longest(examples, predictions):
         assert example != None
         assert prediction != None
         majorId, minorId = example[0].rsplit(".x", 1)
         #if currentMajorId == "GENIA.d114.s9": print "Start"
         if majorId != currentMajorId: # new sentence
             if currentMajorId != None:
                 #if currentMajorId == "GENIA.d114.s9": print "JAA"
                 processedSentenceIds.add(currentMajorId)
                 sentenceObject = corpus.sentencesById[currentMajorId]
                 goldSentence = None
                 if goldCorpus != None:
                     goldSentence = goldCorpus.sentencesById[currentMajorId]
                 self.writeXMLSentence(exampleQueue, predictionsByExample, sentenceObject, classSet, classIds, goldSentence=goldSentence) # process queue
                 progress.update(len(exampleQueue), "Writing examples ("+exampleQueue[-1][0]+"): ")
             exampleQueue = []
             predictionsByExample = {}
             prevMajorIds.add(currentMajorId)
             assert majorId not in prevMajorIds, majorId
             currentMajorId = majorId 
         exampleQueue.append(example) # queue example
         predictionsByExample[example[0]] = prediction
         assert example[3]["xtype"] == self.xType, str(example[3]["xtype"]) + "/" + str(self.xType)
     
     # Process what is still in queue
     if currentMajorId != None:
         processedSentenceIds.add(currentMajorId)
         sentenceObject = corpus.sentencesById[currentMajorId]
         goldSentence = None
         if goldCorpus != None:
             goldSentence = goldCorpus.sentencesById[currentMajorId]
         self.writeXMLSentence(exampleQueue, predictionsByExample, sentenceObject, classSet, classIds, goldSentence=goldSentence) # process queue
         progress.update(len(exampleQueue), "Writing examples ("+exampleQueue[-1][0]+"): ")
         exampleQueue = []
         predictionsByExample = {}
     
     # Process sentences with no examples (e.g. to clear interactions)
     for sentenceId in sorted(corpus.sentencesById.keys()):
         if sentenceId not in processedSentenceIds:
             sentenceObject = corpus.sentencesById[sentenceId]
             goldSentence = None
             if goldCorpus != None:
                 goldSentence = goldCorpus.sentencesById[currentMajorId]
             self.writeXMLSentence([], {}, sentenceObject, classSet, classIds, goldSentence=goldSentence)
     
     # Print statistics
     if len(self.counts) > 0:
         print >> sys.stderr, self.counts
         self.counts = defaultdict(int)
 
     # Write corpus
     if outputFile != None:
         print >> sys.stderr, "Writing corpus to", outputFile
         ETUtils.write(corpus.rootElement, outputFile)
     return corpus.tree
Example #50
0
def readARFF(filename):
    featureSet = IdSet(1)
    classSet = IdSet(0)
    f = open(filename,"rt")
    inData = False
    lines = f.readlines()
    counter = ProgressCounter(len(lines),"ARFFLine")
    examples = []
    for line in lines:
        counter.update(string="Processing line " + str(counter.current + 1) + ": ")
        line = line.strip()
        if len(line) == 0 or line[0] == "%":
            continue
        elif line[0] == "@":
            #print line
            category = line.split()[0].lower()
            if category == "@attribute":
                category, name, type = line.split()
                assert(not inData)
                if name.lower() == "class":
                    name = name.lower()
                    classNames = type[1:-1].split(",")
                    assert(len(classNames)==2)
                    classSet.defineId(classNames[0].strip(),1)
                    classSet.defineId(classNames[1].strip(),-1)
                featureSet.getId(name)
            elif category.lower() == "@relation":
                assert(not inData)
            elif category == "@data":
                inData = True
        else:
            assert(inData)
            count = 1
            features = {}
            for column in line.split(","):
                if featureSet.getName(count) != "class":
                    features[count] = float(column)
                else:
                    classId = classSet.getId(column, False)
                    assert(classId != None)
                count += 1
            exampleCount = str(len(examples))
            exampleId = "BreastCancer.d" + exampleCount + ".s0.x0"
            examples.append([exampleId,classId,features,{}])
                    
    return examples
Example #51
0
 def __init__(self, style=None, types=[], featureSet=None, classSet=None):
     if featureSet == None:
         featureSet = IdSet()
     if classSet == None:
         classSet = IdSet(1)
     else:
         classSet = classSet
     
     ExampleBuilder.__init__(self, classSet=classSet, featureSet=featureSet)
     assert( classSet.getId("neg") == 1 or (len(classSet.Ids)== 2 and classSet.getId("neg") == -1) )
     
     # Basic style = trigger_features:typed:directed:no_linear:entities:auto_limits:noMasking:maxFeatures
     self._setDefaultParameters([
         "directed", "undirected", "headsOnly", "graph_kernel", "noAnnType", "mask_nodes", "limit_features",
         "no_auto_limits", "co_features", "genia_features", "bi_features", #"genia_limits", "epi_limits", "id_limits", "rel_limits", "bb_limits", "bi_limits", "co_limits",
         "genia_task1", "ontology", "nodalida", "bacteria_renaming", "no_trigger_features", "rel_features",
         "drugbank_features", "ddi_mtmx", "evex", "giuliano", "random", "themeOnly", "causeOnly", "no_path", "token_nodes", 
         "skip_extra_triggers", "headsOnly", "graph_kernel", "no_task", "no_dependency", 
         "disable_entity_features", "disable_terminus_features", "disable_single_element_features", 
         "disable_ngram_features", "disable_path_edge_features", "linear_features", "subset", "binary", "pos_only",
         "entity_type", "filter_shortest_path", "maskTypeAsProtein", "keep_neg", "metamap", 
         "sdb_merge", "sdb_features", "ontobiotope_features", "no_self_loops", "full_entities",
         "no_features", "wordnet", "wordvector", "se10t8_undirected", "filter_types", "doc_extra",
         "entity_extra"])
     self.styles = self.getParameters(style)
     #if style == None: # no parameters given
     #    style["typed"] = style["directed"] = style["headsOnly"] = True
     
     self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet, self.styles)
     # NOTE Temporarily re-enabling predicted range
     #self.multiEdgeFeatureBuilder.definePredictedValueRange([], None)
     if self.styles["graph_kernel"]:
         from FeatureBuilders.GraphKernelFeatureBuilder import GraphKernelFeatureBuilder
         self.graphKernelFeatureBuilder = GraphKernelFeatureBuilder(self.featureSet)
     if self.styles["noAnnType"]:
         self.multiEdgeFeatureBuilder.noAnnType = True
     if self.styles["mask_nodes"]:
         self.multiEdgeFeatureBuilder.maskNamedEntities = True
     else:
         self.multiEdgeFeatureBuilder.maskNamedEntities = False
     if not self.styles["limit_features"]:
         self.multiEdgeFeatureBuilder.maximum = True
     if self.styles["genia_task1"]:
         self.multiEdgeFeatureBuilder.filterAnnTypes.add("Entity")
     self.tokenFeatureBuilder = TokenFeatureBuilder(self.featureSet)
     if self.styles["ontology"]:
         self.multiEdgeFeatureBuilder.ontologyFeatureBuilder = BioInferOntologyFeatureBuilder(self.featureSet)
     if self.styles["ontobiotope_features"]:
         self.ontobiotopeFeatureBuilder = OntoBiotopeFeatureBuilder(self.featureSet)
     if self.styles["nodalida"]:
         self.nodalidaFeatureBuilder = NodalidaFeatureBuilder(self.featureSet)
     if self.styles["bacteria_renaming"]:
         self.bacteriaRenamingFeatureBuilder = BacteriaRenamingFeatureBuilder(self.featureSet)
     if not self.styles["no_trigger_features"]:
         self.triggerFeatureBuilder = TriggerFeatureBuilder(self.featureSet, self.styles)
         self.triggerFeatureBuilder.useNonNameEntities = True
         if self.styles["noAnnType"]:
             self.triggerFeatureBuilder.noAnnType = True
         if self.styles["genia_task1"]:
             self.triggerFeatureBuilder.filterAnnTypes.add("Entity")
         #self.bioinferOntologies = OntologyUtils.loadOntologies(OntologyUtils.g_bioInferFileName)
     if self.styles["rel_features"]:
         self.relFeatureBuilder = RELFeatureBuilder(featureSet)
     if self.styles["drugbank_features"]:
         self.drugFeatureBuilder = DrugFeatureBuilder(featureSet)
     if self.styles["evex"]:
         self.evexFeatureBuilder = EVEXFeatureBuilder(featureSet)
     if self.styles["wordnet"]:
         self.wordNetFeatureBuilder = WordNetFeatureBuilder(featureSet)
     if self.styles["wordvector"]:
         self.wordVectorFeatureBuilder = WordVectorFeatureBuilder(featureSet, self.styles)
     if self.styles["giuliano"]:
         self.giulianoFeatureBuilder = GiulianoFeatureBuilder(featureSet)
     self.types = types
     if self.styles["random"]:
         from FeatureBuilders.RandomFeatureBuilder import RandomFeatureBuilder
         self.randomFeatureBuilder = RandomFeatureBuilder(self.featureSet)
            ExampleUtils.appendExamples(exampleCache, outFile)
            exampleCache = []
    ExampleUtils.appendExamples(exampleCache, outFile)
    outFile.close()
    print "Added", addCount, "polynomial features"

if __name__=="__main__":
    # Import Psyco if available
    try:
        import psyco
        psyco.full()
        print >> sys.stderr, "Found Psyco, using"
    except ImportError:
        print >> sys.stderr, "Psyco not installed"
        
    from optparse import OptionParser # For using command line options
    optparser = OptionParser()
    optparser.add_option("-i", "--ids", default=None, dest="ids", help="")
    optparser.add_option("-e", "--examples", default=None, dest="examples", help="")
    optparser.add_option("-j", "--idOutput", default=None, dest="idOutput", help="")
    optparser.add_option("-o", "--output", default=None, dest="output", help="")
    optparser.add_option("-w", "--weights", default=None, dest="weights", help="")
    optparser.add_option("-c", "--cutoff", type="float", default=10.0, dest="cutoff", help="")
    (options, args) = optparser.parse_args()

    #classIds = IdSet(filename="/usr/share/biotext/GeniaChallenge/extension-data/genia/edge-examples/genia-edge-ids.class_names")
    featureIds = IdSet(filename=options.ids)
    weightFeatures = readWeights(options.weights, options.cutoff)
    polynomizeExamples(options.examples, options.output, weightFeatures, featureIds)
    featureIds.write(options.idOutput)
Example #53
0
class ExampleBuilder:
    structureAnalyzer = None
    """
    ExampleBuilder is the abstract base class for specialized example builders.
    Example builders take some data and convert it to examples usable by e.g. SVMs.
    An example builder writes three files, an example-file (in extended Joachim's
    SVM format) and .class_names and .feature_names files, which contain the names
    for the class and feature id-numbers. An example builder can also be given
    pre-existing sets of class and feature ids (optionally in files) so that the
    generated examples are consistent with other, previously generated examples.
    """
    def __init__(self, classSet=None, featureSet=None):
        if (type(classSet) == types.StringType):
            self.classSet = IdSet(filename=classSet)
        else:
            self.classSet = classSet

        if (type(featureSet) == types.StringType):
            self.featureSet = IdSet(filename=featureSet)
        else:
            self.featureSet = featureSet

        self.featureTag = ""
        self.exampleStats = ExampleStats()
        self.parse = None
        self.tokenization = None
        #self.idFileTag = None
        self.classIdFilename = None
        self.featureIdFilename = None

        self.styles = {}
        self._defaultParameters = None
        self._parameterValueLimits = None
        self._setDefaultParameters(["sentenceLimit"])
        self.debug = False

    def hasStyle(self, style):
        return style in self.styles and not self.styles[style]

    def _setDefaultParameters(self, defaults=None, valueLimits=None):
        # Initialize
        if self._defaultParameters == None:
            self._defaultParameters = {}
        if self._parameterValueLimits == None:
            self._parameterValueLimits = {}
        newParameters = Utils.Parameters.get({},
                                             defaults,
                                             valueLimits=valueLimits)
        self._defaultParameters.update(newParameters)
        if valueLimits != None:
            self._parameterValueLimits.update(valueLimits)

    def getParameters(self, parameters):
        return Utils.Parameters.get(parameters,
                                    defaults=self._defaultParameters,
                                    valueLimits=self._parameterValueLimits)

    def setFeature(self, name, value):
        self.features[self.featureSet.getId(self.featureTag + name)] = value

    def getElementCounts(self, filename):
        print >> sys.stderr, "Counting elements:",
        if filename.endswith(".gz"):
            f = gzip.open(filename, "rt")
        else:
            f = open(filename, "rt")
        counts = {"documents": 0, "sentences": 0}
        for line in f:
            if "<document" in line:
                counts["documents"] += 1
            elif "<sentence" in line:
                counts["sentences"] += 1
        f.close()
        print >> sys.stderr, counts
        return counts

    def saveIds(self):
        if self.classIdFilename != None:
            print >> sys.stderr, "Saving class names to", self.classIdFilename
            self.classSet.write(self.classIdFilename)
        else:
            print >> sys.stderr, "Class names not saved"
        if self.featureIdFilename != None:
            print >> sys.stderr, "Saving feature names to", self.featureIdFilename
            self.featureSet.write(self.featureIdFilename)
        else:
            print >> sys.stderr, "Feature names not saved"

    def processCorpus(self,
                      input,
                      output,
                      gold=None,
                      append=False,
                      allowNewIds=True,
                      structureAnalyzer=None):
        # Create intermediate paths if needed
        if os.path.dirname(output) != "" and not os.path.exists(
                os.path.dirname(output)):
            os.makedirs(os.path.dirname(output))
        # Open output file
        openStyle = "wt"
        if append:
            #print "Appending examples"
            openStyle = "at"
        if output.endswith(".gz"):
            outfile = gzip.open(output, openStyle)
        else:
            outfile = open(output, openStyle)

        # Build examples
        self.exampleCount = 0
        if type(input) in types.StringTypes:  # Entered here - Mu
            self.elementCounts = self.getElementCounts(input)
            if self.elementCounts["sentences"] > 0:  # Entered here, 1448 - Mu
                self.progress = ProgressCounter(
                    self.elementCounts["sentences"], "Build examples")
            else:
                self.elementCounts = None
                self.progress = ProgressCounter(None, "Build examples")
        else:
            self.elementCounts = None
            self.progress = ProgressCounter(None, "Build examples")
        # pdb.set_trace()

        # This line generates log below:(getSentences function generates the first 2 lines)
        # Making sentence graphs (GE09.d149.s5): 100.00 % (0:0:1.113)
        # Skipped 381 duplicate interaction edges in SentenceGraphs
        # Defining predicted value range: None - Mu
        self.calculatePredictedRange(
            self.getSentences(input, self.parse, self.tokenization)
        )  # self.parse: mccc; self.tokenization: None

        removeIntersentenceInteractions = True
        if "keep_intersentence" in self.styles and self.styles[
                "keep_intersentence"]:
            print >> sys.stderr, "Keeping intersentence interactions for input corpus"
            removeIntersentenceInteractions = False  # this is True  - Mu
        inputIterator = getCorpusIterator(
            input,
            None,
            self.parse,
            self.tokenization,
            removeIntersentenceInteractions=removeIntersentenceInteractions)

        # pdb.set_trace()
        #goldIterator = []
        if gold != None:  # Entered here - Mu
            removeGoldIntersentenceInteractions = True
            if "keep_intersentence_gold" in self.styles and self.styles[
                    "keep_intersentence_gold"]:
                print >> sys.stderr, "Keeping intersentence interactions for gold corpus"
                removeGoldIntersentenceInteractions = False  # this is False - Mu
            goldIterator = getCorpusIterator(
                gold,
                None,
                self.parse,
                self.tokenization,
                removeIntersentenceInteractions=
                removeGoldIntersentenceInteractions)
            for inputSentences, goldSentences in itertools.izip_longest(
                    inputIterator, goldIterator, fillvalue=None):
                assert inputSentences != None
                assert goldSentences != None
                # pdb.set_trace()
                # see the documentation of function processSentence() in this script
                # inputSentences[1].sentence is the unmerged version
                # inputSentences[1].sentenceGraph is the merged version, meaning that when generating sentenceGraph,
                # duplicated intereactions are removed(actually skipped, not added to the graph, but not really removed) - Mu
                self.processDocument(inputSentences,
                                     goldSentences,
                                     outfile,
                                     structureAnalyzer=structureAnalyzer)
        else:
            for inputSentences in inputIterator:
                self.processDocument(inputSentences,
                                     None,
                                     outfile,
                                     structureAnalyzer=structureAnalyzer)
        outfile.close()
        self.progress.endUpdate()

        # Show statistics
        print >> sys.stderr, "Examples built:", self.exampleCount
        print >> sys.stderr, "Features:", len(self.featureSet.getNames())
        print >> sys.stderr, "Classes:", len(self.classSet.getNames())
        print >> sys.stderr, "Style:", Utils.Parameters.toString(
            self.getParameters(self.styles))
        if self.exampleStats.getExampleCount() > 0:
            self.exampleStats.printStats()

        # Save Ids
        if allowNewIds:
            self.saveIds()

    def processDocument(self,
                        sentences,
                        goldSentences,
                        outfile,
                        structureAnalyzer=None):
        #calculatePredictedRange(self, sentences)
        for i in range(len(sentences)):
            sentence = sentences[i]
            goldSentence = None
            if goldSentences != None:
                goldSentence = goldSentences[i]
            self.progress.update(
                1, "Building examples (" + sentence.sentence.get("id") + "): ")
            self.processSentence(sentence,
                                 outfile,
                                 goldSentence,
                                 structureAnalyzer=structureAnalyzer)

    def processSentence(self,
                        sentence,
                        outfile,
                        goldSentence=None,
                        structureAnalyzer=None):
        '''
        sentence: Utils.InteractionXML.SentenceElements.SentenceElements instance
        sentence.sentence: Element 'sentence' in the xml file
        '''
        # pdb.set_trace()
        # Process filtering rules
        # does NOT entered here since self.styles["sentenceLimit"] is None - Mu
        if "sentenceLimit" in self.styles and self.styles[
                "sentenceLimit"]:  # Rules for limiting which sentences to process
            # Get the rule list
            limitRules = self.styles["sentenceLimit"]
            if type(limitRules) in types.StringTypes:
                limitRules = [limitRules]
            # Get the list of sentence element attribute names
            sentenceElement = sentence.sentence
            sentenceAttributes = sorted(sentenceElement.attrib.keys())
            # Filter sentences based on matching rules to their attribute values
            for rule in limitRules:
                for sentAttr in sentenceAttributes:
                    # Rule are of the form "attr.value" where "attr" is the name
                    # of the attribute to match, and "value" a substring within
                    # that attribute
                    if rule.startswith(sentAttr +
                                       "."):  # rule matches the attribute
                        value = rule.split(
                            ".", 1)[-1]  # get the value part of the rule
                        if value not in sentenceElement.get(
                                sentAttr
                        ):  # rule value must be a substring of the attribute value
                            return  # discard all sentences that do not match all rules
        # Process the sentence
        if sentence.sentenceGraph != None:
            goldGraph = None
            if goldSentence != None:
                goldGraph = goldSentence.sentenceGraph
            # c, sentenceGraph_return, argCombinations_return = self.buildExamplesFromGraph(sentence.sentenceGraph, outfile, goldGraph, structureAnalyzer=structureAnalyzer)
            # self.exampleCount += c
            self.exampleCount += self.buildExamplesFromGraph(
                sentence.sentenceGraph,
                outfile,
                goldGraph,
                structureAnalyzer=structureAnalyzer)
        # return sentenceGraph_return, argCombinations_return

    @classmethod
    def run(cls,
            input,
            output,
            parse,
            tokenization,
            style,
            classIds=None,
            featureIds=None,
            gold=None,
            append=False,
            allowNewIds=True,
            structureAnalyzer=None,
            debug=False):
        print >> sys.stderr, "Running", cls.__name__
        print >> sys.stderr, "  input:", input
        if gold != None:
            print >> sys.stderr, "  gold:", gold
        print >> sys.stderr, "  output:", output, "(append:", str(append) + ")"
        print >> sys.stderr, "  add new class/feature ids:", allowNewIds
        if not isinstance(style, types.StringTypes):
            style = Utils.Parameters.toString(style)
        print >> sys.stderr, "  style:", style
        if tokenization == None:
            print >> sys.stderr, "  parse:", parse
        else:
            print >> sys.stderr, "  parse:", parse + ", tokenization:", tokenization
        classSet, featureSet = cls.getIdSets(
            classIds, featureIds, allowNewIds)  #cls.getIdSets(idFileTag)
        builder = cls(style=style, classSet=classSet, featureSet=featureSet)
        builder.debug = debug
        #builder.idFileTag = idFileTag
        builder.classIdFilename = classIds
        builder.featureIdFilename = featureIds
        builder.parse = parse
        builder.tokenization = tokenization
        builder.processCorpus(input,
                              output,
                              gold,
                              append=append,
                              allowNewIds=allowNewIds,
                              structureAnalyzer=structureAnalyzer)
        return builder

    def buildExamplesFromGraph(self, sentenceGraph, outfile, goldGraph=None):
        raise NotImplementedError

    def definePredictedValueRange(self, sentences, elementName):
        pass

    def getPredictedValueRange(self):
        return None

    @classmethod
    def getIdSets(self, classIds=None, featureIds=None, allowNewIds=True):
        # Class ids
        #print classIds
        #print featureIds
        if classIds != None and os.path.exists(classIds):
            print >> sys.stderr, "Using predefined class names from", classIds
            classSet = IdSet(allowNewIds=allowNewIds)
            classSet.load(classIds)
        else:
            print >> sys.stderr, "No predefined class names"
            classSet = None
        # Feature ids
        if featureIds != None and os.path.exists(featureIds):
            print >> sys.stderr, "Using predefined feature names from", featureIds
            featureSet = IdSet(allowNewIds=allowNewIds)
            featureSet.load(featureIds)
        else:
            print >> sys.stderr, "No predefined feature names"
            featureSet = None
        return classSet, featureSet


#        if idFileTag != None and os.path.exists(idFileTag + ".feature_names.gz") and os.path.exists(idFileTag + ".class_names"):
#            print >> sys.stderr, "Using predefined class and feature names"
#            featureSet = IdSet()
#            featureSet.load(idFileTag + ".feature_names.gz")
#            classSet = IdSet()
#            classSet.load(idFileTag + ".class_names")
#            return classSet, featureSet
#        else:
#            print >> sys.stderr, "No predefined class or feature-names"
#            if idFileTag != None:
#                assert(not os.path.exists(idFileTag + ".feature_names.gz")), idFileTag
#                assert(not os.path.exists(idFileTag + ".class_names")), idFileTag
#            return None, None

    def getSentences(self, input, parse, tokenization, removeNameInfo=False):
        # pdb.set_trace()
        # input is the path to the corpus xml file
        if type(input) != types.ListType:  # Program entered here - Mu
            # Load corpus and make sentence graphs
            # pdb.set_trace()
            corpusElements = Core.SentenceGraph.loadCorpus(
                input, parse, tokenization, removeNameInfo=removeNameInfo)
            sentences = []
            for sentence in corpusElements.sentences:
                if sentence.sentenceGraph != None:  # required for event detection
                    sentences.append([sentence.sentenceGraph, None])
            return sentences
        else:  # assume input is already a list of sentences
            assert (removeNameInfo == False)
            return input

    def calculatePredictedRange(self, sentences):
        print >> sys.stderr, "Defining predicted value range:",
        sentenceElements = []
        for sentence in sentences:
            sentenceElements.append(sentence[0].sentenceElement)
        self.definePredictedValueRange(sentenceElements, "entity")
        print >> sys.stderr, self.getPredictedValueRange()
Example #54
0
 def test(cls, examples, modelPath, output=None, parameters=None, forceInternal=False, classIds=None): # , timeout=None):
     """
     Classify examples with a pre-trained model.
     
     @type examples: string (filename) or list (or iterator) of examples
     @param examples: a list or file containing examples in SVM-format
     @type modelPath: string
     @param modelPath: filename of the pre-trained model file
     @type parameters: a dictionary or string
     @param parameters: parameters for the classifier
     @type output: string
     @param output: the name of the predictions file to be written
     @type forceInternal: Boolean
     @param forceInternal: Use python classifier even if SVM Multiclass binary is defined in Settings.py
     """
     if type(parameters) == types.StringType:
         parameters = splitParameters(parameters)
     timer = Timer()
     if type(examples) == types.ListType:
         print >> sys.stderr, "Classifying", len(examples), "with SVM-MultiClass model", modelPath
         examples, predictions = self.filterClassificationSet(examples, False)
         testPath = self.tempDir+"/test.dat"
         Example.writeExamples(examples, testPath)
     else:
         print >> sys.stderr, "Classifying file", examples, "with SVM-MultiClass model", modelPath
         testPath = examples
         examples = Example.readExamples(examples,False)
     if parameters != None:
         parameters = copy.copy(parameters)
         if parameters.has_key("c"):
             del parameters["c"]
         if parameters.has_key("predefined"):
             parameters = copy.copy(parameters)
             modelPath = os.path.join(parameters["predefined"][0],"classifier/model")
             del parameters["predefined"]
     # Read model
     if modelPath == None:
         modelPath = "model-multilabel"
     classModels = {}
     if modelPath.endswith(".gz"):
         f = gzip.open(modelPath, "rt")
     else:
         f = open(modelPath, "rt")
     thresholds = {}
     for line in f:
         key, value, threshold = line.split()
         classModels[key] = value
         if threshold != "None":
             thresholds[key] = float(threshold)
         else:
             thresholds[key] = 0.0
     f.close()
     mergedPredictions = []
     if type(classIds) == types.StringType:
         classIds = IdSet(filename=classIds)
     #print classModels
     print "Thresholds", thresholds
     classifierBin = Settings.SVMMultiClassDir+"/svm_multiclass_classify"
     print parameters
     if "classifier" in parameters and "svmperf" in parameters["classifier"]:
         classifierBin = Settings.SVMPerfDir+"/svm_perf_classify"
         parameters = copy.copy(parameters)
         del parameters["classifier"]
     for className in classIds.getNames():
         if className != "neg" and not "---" in className:
             classId = classIds.getId(className)
             if thresholds[str(className)] != 0.0:
                 print >> sys.stderr, "Classifying", className, "with threshold", thresholds[str(className)]
             else:
                 print >> sys.stderr, "Classifying", className
             args = [classifierBin]
             #self.__addParametersToSubprocessCall(args, parameters)
             classOutput = "predictions" + ".cls-" + className
             logFile = open("svmmulticlass" + ".cls-" + className + ".log","at")
             args += [testPath, classModels[str(className)], classOutput]
             print args
             subprocess.call(args, stdout = logFile, stderr = logFile)
             cls.addPredictions(classOutput, mergedPredictions, classId, len(classIds.Ids), threshold=thresholds[str(className)])
     print >> sys.stderr, timer.toString()
     
     predFileName = output
     f = open(predFileName, "wt")
     for mergedPred in mergedPredictions:
         if len(mergedPred[0]) > 1 and "1" in mergedPred[0]:
             mergedPred[0].remove("1")
         mergedPred[1] = str(mergedPred[1])
         mergedPred[0] = ",".join(sorted(list(mergedPred[0])))
         f.write(" ".join(mergedPred) + "\n")
     f.close()
     
     return mergedPredictions
Example #55
0
    def __init__(self, style=None, length=None, types=[], featureSet=None, classSet=None):
        if featureSet == None:
            featureSet = IdSet()
        if classSet == None:
            classSet = IdSet(1)
        else:
            classSet = classSet
        assert( classSet.getId("neg") == 1 or (len(classSet.Ids)== 2 and classSet.getId("neg") == -1) )
        
        ExampleBuilder.__init__(self, classSet=classSet, featureSet=featureSet)
        
        self.styles = self.getParameters(style, [
            "typed", "directed", "headsOnly", "graph_kernel", "noAnnType", "noMasking", "maxFeatures",
            "genia_limits", "epi_limits", "id_limits", "rel_limits", "bb_limits", "bi_limits", "co_limits",
            "genia_task1", "ontology", "nodalida", "bacteria_renaming", "trigger_features", "rel_features",
            "ddi_features", "evex", "giuliano", "random", "themeOnly", "causeOnly", "no_path", "entities", 
            "skip_extra_triggers", "headsOnly", "graph_kernel", "trigger_features", "no_task", "no_dependency", 
            "disable_entity_features", "disable_terminus_features", "disable_single_element_features", 
            "disable_ngram_features", "disable_path_edge_features", "no_linear", "subset", "binary", "pos_only",
            "entity_type"
        ])
        if style == None: # no parameters given
            style["typed"] = style["directed"] = style["headsOnly"] = True
#        self.styles = style
#        if "selftrain_group" in self.styles:
#            self.selfTrainGroups = set()
#            if "selftrain_group-1" in self.styles:
#                self.selfTrainGroups.add("-1")
#            if "selftrain_group0" in self.styles:
#                self.selfTrainGroups.add("0")
#            if "selftrain_group1" in self.styles:
#                self.selfTrainGroups.add("1")
#            if "selftrain_group2" in self.styles:
#                self.selfTrainGroups.add("2")
#            if "selftrain_group3" in self.styles:
#                self.selfTrainGroups.add("3")
#            print >> sys.stderr, "Self-train-groups:", self.selfTrainGroups
        
        self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet)
        # NOTE Temporarily re-enabling predicted range
        #self.multiEdgeFeatureBuilder.definePredictedValueRange([], None)
        if self.styles["graph_kernel"]:
            from FeatureBuilders.GraphKernelFeatureBuilder import GraphKernelFeatureBuilder
            self.graphKernelFeatureBuilder = GraphKernelFeatureBuilder(self.featureSet)
        if self.styles["noAnnType"]:
            self.multiEdgeFeatureBuilder.noAnnType = True
        if self.styles["noMasking"]:
            self.multiEdgeFeatureBuilder.maskNamedEntities = False
        if self.styles["maxFeatures"]:
			self.multiEdgeFeatureBuilder.maximum = True
        if self.styles["genia_task1"]:
            self.multiEdgeFeatureBuilder.filterAnnTypes.add("Entity")
        self.tokenFeatureBuilder = TokenFeatureBuilder(self.featureSet)
        if self.styles["ontology"]:
            self.multiEdgeFeatureBuilder.ontologyFeatureBuilder = BioInferOntologyFeatureBuilder(self.featureSet)
        if self.styles["nodalida"]:
            self.nodalidaFeatureBuilder = NodalidaFeatureBuilder(self.featureSet)
        if self.styles["bacteria_renaming"]:
            self.bacteriaRenamingFeatureBuilder = BacteriaRenamingFeatureBuilder(self.featureSet)
        if self.styles["trigger_features"]:
            self.triggerFeatureBuilder = TriggerFeatureBuilder(self.featureSet)
            self.triggerFeatureBuilder.useNonNameEntities = True
            if self.styles["genia_task1"]:
                self.triggerFeatureBuilder.filterAnnTypes.add("Entity")
            #self.bioinferOntologies = OntologyUtils.loadOntologies(OntologyUtils.g_bioInferFileName)
        if self.styles["rel_features"]:
            self.relFeatureBuilder = RELFeatureBuilder(featureSet)
        if self.styles["ddi_features"]:
            self.drugFeatureBuilder = DrugFeatureBuilder(featureSet)
        if self.styles["evex"]:
            self.evexFeatureBuilder = EVEXFeatureBuilder(featureSet)
        if self.styles["giuliano"]:
            self.giulianoFeatureBuilder = GiulianoFeatureBuilder(featureSet)
        self.pathLengths = length
        assert(self.pathLengths == None)
        self.types = types
        if self.styles["random"]:
            from FeatureBuilders.RandomFeatureBuilder import RandomFeatureBuilder
            self.randomFeatureBuilder = RandomFeatureBuilder(self.featureSet)