def __init__(self, style=None, length=None, types=None, featureSet=None, classSet=None):
    """Initialize the event example builder.

    BUGFIX: the mutable default ``types=[]`` was shared between calls; a
    None sentinel is used instead (behavior for callers is unchanged).
    The no-op ``else: classSet = classSet`` branch was removed.
    """
    # reset style regardless of input
    #style="trigger_features:typed:directed:no_linear:entities:genia_limits:noMasking:maxFeatures"
    if types is None:
        types = []
    if featureSet == None:
        featureSet = IdSet()
    if classSet == None:
        classSet = IdSet(1)
    assert( classSet.getId("neg") == 1 )
    ExampleBuilder.__init__(self, classSet=classSet, featureSet=featureSet)
    # Style switches that default to None (off)
    defaultNone = ["binary", "trigger_features","typed","directed","no_linear","entities","genia_limits",
                   "noAnnType", "noMasking", "maxFeatures", "no_merge", "disable_entity_features",
                   "disable_single_element_features", "disable_ngram_features", "disable_path_edge_features"]
    defaultParameters = {}
    for name in defaultNone:
        defaultParameters[name] = None
    defaultParameters["keep_intersentence"] = False
    defaultParameters["keep_intersentence_gold"] = True
    defaultParameters["no_arg_count_upper_limit"] = False
    self.styles = self._setDefaultParameters(defaultParameters)
    self.styles = self.getParameters(style)
    self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet)
    self.multiEdgeFeatureBuilder.noAnnType = self.styles["noAnnType"]
    self.multiEdgeFeatureBuilder.maskNamedEntities = not self.styles["noMasking"]
    self.multiEdgeFeatureBuilder.maximum = self.styles["maxFeatures"]
    #self.tokenFeatureBuilder = TokenFeatureBuilder(self.featureSet)
    self.pathLengths = length
    assert(self.pathLengths == None)
    self.types = types
    self.triggerFeatureBuilder = TriggerFeatureBuilder(self.featureSet)
    self.triggerFeatureBuilder.useNonNameEntities = True
def __init__(self, style=None, length=None, types=None, featureSet=None, classSet=None):
    """Initialize the edge example builder.

    BUGFIX: the old defaults ``style=["typed", "directed"]`` and ``types=[]``
    were shared mutable lists, and the list default crashed on the
    ``style.find(",")`` call below (lists have no ``find``). None sentinels
    are used and ``find`` is only called on string-like styles.
    """
    if style is None:
        style = ["typed", "directed"]
    if types is None:
        types = []
    if featureSet == None:
        featureSet = IdSet()
    if classSet == None:
        classSet = IdSet(1)
    assert (classSet.getId("neg") == 1)
    ExampleBuilder.__init__(self, classSet=classSet, featureSet=featureSet)
    # A comma-separated string is split into a style list
    if hasattr(style, "find") and style.find(",") != -1:
        style = style.split(",")
    self.styles = style
    self.negFrac = None
    self.posPairGaz = POSPairGazetteer()
    for s in style:
        if s.find("negFrac") != -1:
            # e.g. "negFrac_0.5" -> keep 50% of negatives (fixed RNG seed)
            self.negFrac = float(s.split("_")[-1])
            print >> sys.stderr, "Downsampling negatives to", self.negFrac
            self.negRand = random.Random(15)
        elif s.find("posPairGaz") != -1:
            self.posPairGaz = POSPairGazetteer(loadFrom=s.split("_", 1)[-1])
    self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet)
    self.triggerFeatureBuilder = TriggerFeatureBuilder(self.featureSet)
    if "graph_kernel" in self.styles:
        from FeatureBuilders.GraphKernelFeatureBuilder import GraphKernelFeatureBuilder
        self.graphKernelFeatureBuilder = GraphKernelFeatureBuilder(self.featureSet)
    if "noAnnType" in self.styles:
        self.multiEdgeFeatureBuilder.noAnnType = True
    if "noMasking" in self.styles:
        self.multiEdgeFeatureBuilder.maskNamedEntities = False
    if "maxFeatures" in self.styles:
        self.multiEdgeFeatureBuilder.maximum = True
    self.tokenFeatureBuilder = TokenFeatureBuilder(self.featureSet)
    if "ontology" in self.styles:
        self.multiEdgeFeatureBuilder.ontologyFeatureBuilder = BioInferOntologyFeatureBuilder(self.featureSet)
    if "nodalida" in self.styles:
        self.nodalidaFeatureBuilder = NodalidaFeatureBuilder(self.featureSet)
    #IF LOCAL
    if "bioinfer_limits" in self.styles:
        self.bioinferOntologies = OntologyUtils.getBioInferTempOntology()
        #self.bioinferOntologies = OntologyUtils.loadOntologies(OntologyUtils.g_bioInferFileName)
    #ENDIF
    self.pathLengths = length
    assert (self.pathLengths == None)
    self.types = types
    if "random" in self.styles:
        from FeatureBuilders.RandomFeatureBuilder import RandomFeatureBuilder
        self.randomFeatureBuilder = RandomFeatureBuilder(self.featureSet)
def __init__(self, style):
    """Initialize the edge example builder with the given style settings."""
    ExampleBuilder.__init__(self)
    self.featureBuilder = EdgeFeatureBuilder(self.featureSet)
    self.style = style
    # In multiclass mode a fresh class set is built; "neg" must map to id 1.
    if "binary" not in style:
        self.classSet = IdSet(1)
        assert self.classSet.getId("neg") == 1
def __init__(self, examples, predictions=None, classSet=None):
    """Per-class evaluator.

    examples, predictions and classSet may each be given as a filename
    (string), in which case they are loaded from that file. When predictions
    are available, the evaluation is computed immediately.

    BUGFIX: the duplicated ``self.classSet = classSet`` assignment was removed.
    """
    if type(classSet) == types.StringType: # class names are in file
        classSet = IdSet(filename=classSet)
    if type(predictions) == types.StringType: # predictions are in file
        predictions = ExampleUtils.loadPredictions(predictions)
    if type(examples) == types.StringType: # examples are in file
        examples = ExampleUtils.readExamples(examples, False)
    self.classSet = classSet
    # define class ids in alphabetical order
    if classSet != None:
        classNames = sorted(classSet.Ids.keys())
    else:
        classNames = []
    # make an ordered list of class ids
    self.classes = [classSet.getId(className) for className in classNames]
    # create data structures for per-class evaluation
    self.dataByClass = {}
    for cls in self.classes:
        self.dataByClass[cls] = EvaluationData()
    # hack for unnamed classes
    if len(self.dataByClass) == 0:
        self.dataByClass[1] = EvaluationData()
        self.dataByClass[2] = EvaluationData()
    #self.untypedUndirected = None
    self.untypedCurrentMajorId = None
    self.untypedPredictionQueue = []
    self.untypedUndirected = EvaluationData()
    #self.AUC = None
    if predictions != None:
        self._calculate(examples, predictions)
def __init__(self, style="trigger_features:typed:directed:no_linear:entities:genia_limits:noMasking:maxFeatures", length=None, types=None, featureSet=None, classSet=None):
    """Initialize the builder with a fixed style configuration.

    BUGFIX: the mutable default ``types=[]`` was replaced with a None
    sentinel; the no-op ``else: classSet = classSet`` branch was removed.
    """
    # reset style regardless of input
    style="trigger_features:typed:directed:no_linear:entities:genia_limits:noMasking:maxFeatures"
    if types is None:
        types = []
    if featureSet == None:
        featureSet = IdSet()
    if classSet == None:
        classSet = IdSet(1)
    assert( classSet.getId("neg") == 1 )
    ExampleBuilder.__init__(self, classSet=classSet, featureSet=featureSet)
    self.styles = self.getParameters(style, ["trigger_features","typed","directed","no_linear","entities","genia_limits",
        "noAnnType", "noMasking", "maxFeatures", "no_merge", "disable_entity_features",
        "disable_single_element_features", "disable_ngram_features", "disable_path_edge_features"])
    self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet)
    self.multiEdgeFeatureBuilder.noAnnType = self.styles["noAnnType"]
    self.multiEdgeFeatureBuilder.maskNamedEntities = not self.styles["noMasking"]
    self.multiEdgeFeatureBuilder.maximum = self.styles["maxFeatures"]
    self.tokenFeatureBuilder = TokenFeatureBuilder(self.featureSet)
    self.pathLengths = length
    assert(self.pathLengths == None)
    self.types = types
    self.triggerFeatureBuilder = TriggerFeatureBuilder(self.featureSet)
    self.triggerFeatureBuilder.useNonNameEntities = True
def __init__(self, style=None, length=None, types=None, featureSet=None, classSet=None):
    """Initialize the edge example builder.

    BUGFIX: the mutable defaults ``style=["typed","directed","headsOnly"]``
    and ``types=[]`` were shared between calls; None sentinels are used.
    """
    if style is None:
        style = ["typed", "directed", "headsOnly"]
    if types is None:
        types = []
    if featureSet == None:
        featureSet = IdSet()
    if classSet == None:
        classSet = IdSet(1)
    assert classSet.getId("neg") == 1
    ExampleBuilder.__init__(self, classSet=classSet, featureSet=featureSet)
    self.styles = style
    self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet)
    if "noAnnType" in self.styles:
        self.multiEdgeFeatureBuilder.noAnnType = True
    if "noMasking" in self.styles:
        self.multiEdgeFeatureBuilder.maskNamedEntities = False
    if "maxFeatures" in self.styles:
        self.multiEdgeFeatureBuilder.maximum = True
    # self.tokenFeatureBuilder = TokenFeatureBuilder(self.featureSet)
    # if "ontology" in self.styles:
    #     self.multiEdgeFeatureBuilder.ontologyFeatureBuilder = BioInferOntologyFeatureBuilder(self.featureSet)
    self.pathLengths = length
    assert self.pathLengths == None
    self.types = types
def __init__(self, style=None, length=None, types=None, featureSet=None, classSet=None):
    """Initialize the edge example builder.

    BUGFIX: the mutable defaults ``style=["typed","directed","headsOnly"]``
    and ``types=[]`` were shared between calls; None sentinels are used.
    """
    if style is None:
        style = ["typed", "directed", "headsOnly"]
    if types is None:
        types = []
    if featureSet == None:
        featureSet = IdSet()
    if classSet == None:
        classSet = IdSet(1)
    assert (classSet.getId("neg") == 1)
    ExampleBuilder.__init__(self, classSet=classSet, featureSet=featureSet)
    self.styles = style
    self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet)
    if "noAnnType" in self.styles:
        self.multiEdgeFeatureBuilder.noAnnType = True
    if "noMasking" in self.styles:
        self.multiEdgeFeatureBuilder.maskNamedEntities = False
    if "maxFeatures" in self.styles:
        self.multiEdgeFeatureBuilder.maximum = True
    #self.tokenFeatureBuilder = TokenFeatureBuilder(self.featureSet)
    #if "ontology" in self.styles:
    #    self.multiEdgeFeatureBuilder.ontologyFeatureBuilder = BioInferOntologyFeatureBuilder(self.featureSet)
    self.pathLengths = length
    assert (self.pathLengths == None)
    self.types = types
def __init__(self, style=None, length=None, types=None, featureSet=None, classSet=None):
    """Initialize the edge example builder with token features.

    BUGFIX: the mutable defaults ``style=["typed","directed","headsOnly"]``
    and ``types=[]`` were shared between calls; None sentinels are used.
    """
    if style is None:
        style = ["typed", "directed", "headsOnly"]
    if types is None:
        types = []
    if featureSet == None:
        featureSet = IdSet()
    if classSet == None:
        classSet = IdSet(1)
    assert( classSet.getId("neg") == 1 )
    ExampleBuilder.__init__(self, classSet=classSet, featureSet=featureSet)
    self.styles = style
    self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet)
    if "noAnnType" in self.styles:
        self.multiEdgeFeatureBuilder.noAnnType = True
    if "noMasking" in self.styles:
        self.multiEdgeFeatureBuilder.maskNamedEntities = False
    if "maxFeatures" in self.styles:
        self.multiEdgeFeatureBuilder.maximum = True
    self.tokenFeatureBuilder = TokenFeatureBuilder(self.featureSet)
    self.pathLengths = length
    assert(self.pathLengths == None)
    self.types = types
    if "random" in self.styles:
        # imported lazily so the dependency is only needed when used
        from FeatureBuilders.RandomFeatureBuilder import RandomFeatureBuilder
        self.randomFeatureBuilder = RandomFeatureBuilder(self.featureSet)
def devectorizePredictions(self, predictions):
    """
    Converts a dense Numpy array of [examples][width][height][features] into
    the corresponding Python list matrices where features are stored in a key-value dictionary.
    """
    # Class-label id set stored alongside the model; locked = read-only.
    targetIds = IdSet(filename=self.model.get(self.tag+"ids.classes"), locked=True)
    dimMatrix = int(self.model.getStr("dimMatrix"))
    dimLabels = int(self.model.getStr("dimLabels"))
    # Flat predictions -> (examples, width, height, labels)
    predictions = reshape(predictions, (predictions.shape[0], dimMatrix, dimMatrix, dimLabels))
    rangeMatrix = range(dimMatrix)
    # Per-cell argmax label and its score
    labels = np.argmax(predictions, axis=-1)
    values = np.max(predictions, axis=-1)
    # Global min/max used to normalize scores into [0, 1] for coloring
    minValue = np.min(values)
    maxValue = np.max(values)
    valRange = maxValue - minValue
    # NOTE(review): if every value is identical, valRange is 0 and the
    # division below produces inf/nan -- confirm inputs can't be constant.
    print "MINMAX", minValue, maxValue
    devectorized = []
    for exampleIndex in range(predictions.shape[0]):
        #print predictions[exampleIndex]
        devectorized.append([])
        for i in rangeMatrix:
            devectorized[-1].append([])
            for j in rangeMatrix:
                # One dict per matrix cell: {labelName: score, "color": ...}
                features = {}
                devectorized[-1][-1].append(features)
                maxFeature = labels[exampleIndex][i][j]
                predValue = predictions[exampleIndex][i][j][maxFeature]
                features[targetIds.getName(maxFeature)] = float(predValue)
                features["color"] = self.getColor((predValue - minValue) / valRange)
    return devectorized
def __init__(self, style=None, classSet=None, featureSet=None, gazetteerFileName=None):
    """Set up the speculation/negation example builder."""
    global speculationWords
    if classSet == None:
        classSet = IdSet(1)
    assert classSet.getId("neg") == 1
    if featureSet == None:
        featureSet = IdSet()
    # Speculation word lists are shared module-level state.
    self.specWords, self.specWordStems = readWords(speculationWords)
    ExampleBuilder.__init__(self, classSet, featureSet)
    if gazetteerFileName != None:
        self.gazetteer = Gazetteer.loadGztr(gazetteerFileName)
        print >> sys.stderr, "Loaded gazetteer from", gazetteerFileName
    else:
        self.gazetteer = None
    # Defaults plus the allowed values for the "classification" switch.
    allowedClassifications = ("multiclass", "speculation", "negation")
    self.styles = self.getParameters(style,
                                     {"classification": "multiclass", "speculation_words": True},
                                     {"classification": allowedClassifications})
def __init__(self, style=None, length=None, types=None, featureSet=None, classSet=None):
    """Initialize the event example builder.

    BUGFIX: the mutable default ``types=[]`` was shared between calls; a
    None sentinel is used instead. The no-op ``else: classSet = classSet``
    branch was removed.
    """
    # reset style regardless of input
    #style="trigger_features:typed:directed:no_linear:entities:genia_limits:noMasking:maxFeatures"
    if types is None:
        types = []
    if featureSet == None:
        featureSet = IdSet()
    if classSet == None:
        classSet = IdSet(1)
    assert( classSet.getId("neg") == 1 )
    ExampleBuilder.__init__(self, classSet=classSet, featureSet=featureSet)
    # Style switches that default to None (off)
    defaultNone = ["binary", "trigger_features","typed","directed","no_linear","entities","genia_limits",
                   "noAnnType", "noMasking", "maxFeatures", "no_merge", "disable_entity_features",
                   "disable_single_element_features", "disable_ngram_features", "disable_path_edge_features"]
    defaultParameters = {}
    for name in defaultNone:
        defaultParameters[name] = None
    defaultParameters["keep_intersentence"] = False
    defaultParameters["keep_intersentence_gold"] = True
    self.styles = self._setDefaultParameters(defaultParameters)
    self.styles = self.getParameters(style)
    self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet)
    self.multiEdgeFeatureBuilder.noAnnType = self.styles["noAnnType"]
    self.multiEdgeFeatureBuilder.maskNamedEntities = not self.styles["noMasking"]
    self.multiEdgeFeatureBuilder.maximum = self.styles["maxFeatures"]
    #self.tokenFeatureBuilder = TokenFeatureBuilder(self.featureSet)
    self.pathLengths = length
    assert(self.pathLengths == None)
    self.types = types
    self.triggerFeatureBuilder = TriggerFeatureBuilder(self.featureSet)
    self.triggerFeatureBuilder.useNonNameEntities = True
def __init__(self, style=None, classSet=None, featureSet=None, gazetteerFileName=None, skiplist=None):
    """Initialize the trigger example builder.

    skiplist is an optional path to a file with one identifier per line;
    listed items are excluded from example generation.

    BUGFIX: the skiplist file is now opened with a context manager so it is
    closed even if reading fails. Long-dead commented-out self-train code
    was removed.
    """
    if classSet == None:
        classSet = IdSet(1)
    if featureSet == None:
        featureSet = IdSet()
    ExampleBuilder.__init__(self, classSet, featureSet)
    assert( classSet.getId("neg") == 1 )
    #gazetteerFileName="/usr/share/biotext/GeniaChallenge/SharedTaskTriggerTest/gazetteer-train"
    if gazetteerFileName!=None:
        self.gazetteer=Gazetteer.loadGztr(gazetteerFileName)
        print >> sys.stderr, "Loaded gazetteer from",gazetteerFileName
    else:
        print >> sys.stderr, "No gazetteer loaded"
        self.gazetteer=None
    self._setDefaultParameters(["rel_features", "wordnet", "bb_features", "giuliano", "epi_merge_negated",
        "limit_merged_types", "genia_task1", "names", "build_for_nameless", "skip_for_nameless",
        "pos_only", "all_tokens", "pos_pairs", "linear_ngrams", "phospho", "drugbank_features",
        "ddi13_features", "metamap", "only_types", "ontobiotope_features", "bb_spans", "w2v", "no_context"])
    self.styles = self.getParameters(style)
    self.skiplist = set()
    if skiplist != None:
        with open(skiplist, "rt") as f:
            for line in f:
                self.skiplist.add(line.strip())
    # Optional feature builders, enabled by style switches
    if self.styles["rel_features"]:
        self.relFeatureBuilder = RELFeatureBuilder(featureSet)
    if self.styles["wordnet"]:
        self.wordNetFeatureBuilder = WordNetFeatureBuilder(featureSet)
    if self.styles["bb_features"]:
        self.bacteriaTokens = PhraseTriggerExampleBuilder.getBacteriaTokens()
        #self.bacteriaTokens = PhraseTriggerExampleBuilder.getBacteriaTokens(PhraseTriggerExampleBuilder.getBacteriaNames())
    if self.styles["giuliano"]:
        self.giulianoFeatureBuilder = GiulianoFeatureBuilder(featureSet)
    if self.styles["drugbank_features"]:
        self.drugFeatureBuilder = DrugFeatureBuilder(featureSet)
    if self.styles["ontobiotope_features"]:
        self.ontobiotopeFeatureBuilder = OntoBiotopeFeatureBuilder(self.featureSet)
    if self.styles["w2v"]:
        self.wordVectorFeatureBuilder = WordVectorFeatureBuilder(featureSet)
def __init__(self, style=None, length=None, types=None, featureSet=None, classSet=None, gazetteer=None, pathGazetteer=None, negFrac=None):
    """Initialize the gazetteer-driven event example builder.

    BUGFIX: the mutable defaults ``style=["typed","directed","headsOnly"]``
    and ``types=[]`` were shared between calls; None sentinels are used.
    """
    if style is None:
        style = ["typed", "directed", "headsOnly"]
    if types is None:
        types = []
    if featureSet == None:
        featureSet = IdSet()
    if classSet == None:
        classSet = IdSet(1)
    assert( classSet.getId("neg") == 1 )
    if gazetteer != None:
        print >> sys.stderr, "Loading gazetteer from", gazetteer
        self.gazetteer=Gazetteer.loadGztr(gazetteer)
    else:
        print >> sys.stderr, "No gazetteer loaded"
        self.gazetteer=None
    self.pathGazetteer=None
    self.pathGazetteerDependencies = None
    self.pathGazetteerPairs = None
    if pathGazetteer != None:
        print >> sys.stderr, "Loading path gazetteer from", pathGazetteer
        self.pathGazetteer=PathGazetteer.load(pathGazetteer)
        self.pathGazetteerDependencies = PathGazetteer.getDependencies(self.pathGazetteer)
        self.pathGazetteerPairs = PathGazetteer.getPairs(self.pathGazetteer)
    else:
        print >> sys.stderr, "No path gazetteer loaded"
    ExampleBuilder.__init__(self, classSet=classSet, featureSet=featureSet)
    self.styles = style
    self.negFrac = negFrac
    print >> sys.stderr, "Downsampling negatives to", negFrac
    self.negRand = random.Random()
    self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet)
    # NOTE: deliberately always on (the original style check is disabled)
    if True:#"noAnnType" in self.styles:
        self.multiEdgeFeatureBuilder.noAnnType = True
    if "noMasking" in self.styles:
        self.multiEdgeFeatureBuilder.maskNamedEntities = False
    if "maxFeatures" in self.styles:
        self.multiEdgeFeatureBuilder.maximum = True
    self.triggerFeatureBuilder = TriggerFeatureBuilder(self.featureSet)
    #self.tokenFeatureBuilder = TokenFeatureBuilder(self.featureSet)
    #if "ontology" in self.styles:
    #    self.multiEdgeFeatureBuilder.ontologyFeatureBuilder = BioInferOntologyFeatureBuilder(self.featureSet)
    self.pathLengths = length
    assert(self.pathLengths == None)
    self.types = types
    # Bookkeeping for events/examples and skip statistics
    self.eventsByOrigId = {}
    self.headTokensByOrigId = {}
    self.interSentenceEvents = set()
    self.examplesByEventOrigId = {}
    self.skippedByType = {}
    self.skippedByTypeAndReason = {}
    self.builtByType = {}
    self.gazMatchCache = {}
def __init__(self, style=None, classSet=None, featureSet=None, gazetteerFileName=None, skiplist=None):
    """Initialize the trigger example builder.

    BUGFIX: the skiplist file is now opened with a context manager so it is
    closed even if reading fails.
    """
    if classSet == None:
        classSet = IdSet(1)
    if featureSet == None:
        featureSet = IdSet()
    ExampleBuilder.__init__(self, classSet, featureSet)
    assert (classSet.getId("neg") == 1)
    #gazetteerFileName="/usr/share/biotext/GeniaChallenge/SharedTaskTriggerTest/gazetteer-train"
    if gazetteerFileName != None:
        self.gazetteer = Gazetteer.loadGztr(gazetteerFileName)
        print >> sys.stderr, "Loaded gazetteer from", gazetteerFileName
    else:
        print >> sys.stderr, "No gazetteer loaded"
        self.gazetteer = None
    self._setDefaultParameters([
        "rel_features", "wordnet", "bb_features", "giuliano", "epi_merge_negated",
        "limit_merged_types", "genia_task1", "build_for_nameless", "pos_only",
        "all_tokens", "names", "pos_pairs", "linear_ngrams", "phospho"
    ])
    self.styles = self.getParameters(style)
    self.skiplist = set()
    if skiplist != None:
        with open(skiplist, "rt") as f:
            for line in f:
                self.skiplist.add(line.strip())
    # Optional feature builders, enabled by style switches
    if self.styles["rel_features"]:
        self.relFeatureBuilder = RELFeatureBuilder(featureSet)
    if self.styles["wordnet"]:
        self.wordNetFeatureBuilder = WordNetFeatureBuilder(featureSet)
    if self.styles["bb_features"]:
        self.bacteriaTokens = PhraseTriggerExampleBuilder.getBacteriaTokens()
        #self.bacteriaTokens = PhraseTriggerExampleBuilder.getBacteriaTokens(PhraseTriggerExampleBuilder.getBacteriaNames())
    if self.styles["giuliano"]:
        self.giulianoFeatureBuilder = GiulianoFeatureBuilder(featureSet)
def __init__(self, style=None, classSet=None, featureSet=None, gazetteerFileName=None):
    """Initialize the builder; trigger features exclude non-name entities."""
    if classSet == None:
        classSet = IdSet(1)
    assert classSet.getId("neg") == 1
    if featureSet == None:
        featureSet = IdSet()
    ExampleBuilder.__init__(self, classSet, featureSet)
    self.styles = style
    # Trigger features are built without non-name entities.
    builder = TriggerFeatureBuilder(self.featureSet)
    builder.useNonNameEntities = False
    self.triggerFeatureBuilder = builder
def addExamples(exampleFile, predictionFile, classFile, matrix):
    """Accumulate gold/predicted class pairs from files into a confusion matrix.

    Multi-label class names are joined with "---" and counted pairwise.

    BUGFIX: the prediction file is now opened with a context manager so it
    is closed even if reading an example or prediction fails.
    """
    classSet = IdSet(filename=classFile)
    with open(predictionFile, "rt") as f:
        for example in ExampleUtils.readExamples(exampleFile, False):
            pred = int(f.readline().split()[0])
            predClasses = classSet.getName(pred)
            goldClasses = classSet.getName(example[1])
            for predClass in predClasses.split("---"):
                for goldClass in goldClasses.split("---"):
                    # NOTE(review): this bare expression has no effect unless
                    # matrix is a defaultdict (where indexing inserts the
                    # cell); kept for that possible side effect -- confirm
                    # whether it should have been "+= 1".
                    matrix[predClass][goldClass]
                    matrix[goldClass][predClass] += 1
def run(EvaluatorClass, inputCorpusFile, goldCorpusFile, parse, tokenization=None, target="both", entityMatchFunction=compareEntitiesSimple, removeIntersentenceInteractions=False, errorMatrix=False, verbose=False):
    """Compare a predicted interaction-XML corpus against a gold corpus and
    return the evaluation results produced by processCorpora."""
    print >> sys.stderr, "##### EvaluateInteractionXML #####"
    print >> sys.stderr, "Comparing input", inputCorpusFile, "to gold", goldCorpusFile
    # Class sets are used to convert the types to ids that the evaluator can use
    classSets = {}
    evaluatorType = EvaluatorClass.type
    if evaluatorType == "binary":
        for elementType in ("entity", "interaction"):
            classSets[elementType] = IdSet(idDict={"True": 1, "False": -1}, locked=True)
        negativeClassId = -1
    elif evaluatorType == "multiclass":
        for elementType in ("entity", "interaction"):
            classSets[elementType] = IdSet(idDict={"neg": 1}, locked=False)
        negativeClassId = 1
    else:
        sys.exit("Unknown evaluator type")
    # Load corpora and build sentence graphs; the gold corpus is optional.
    goldCorpusElements = None
    if goldCorpusFile != None:
        goldCorpusElements = SentenceGraph.loadCorpus(goldCorpusFile, parse, tokenization,
                                                      False, removeIntersentenceInteractions)
    predictedCorpusElements = SentenceGraph.loadCorpus(inputCorpusFile, parse, tokenization,
                                                       False, removeIntersentenceInteractions)
    # Compare the corpora and print results on screen
    return processCorpora(EvaluatorClass, predictedCorpusElements, goldCorpusElements,
                          target, classSets, negativeClassId, entityMatchFunction,
                          errorMatrix=errorMatrix, verbose=verbose)
def __init__(self, style=None, classSet=None, featureSet=None, gazetteerFileName=None):
    """Initialize the phrase-based builder; tracks phrase-type statistics."""
    if classSet == None:
        classSet = IdSet(1)
    assert (classSet.getId("neg") == 1)
    if featureSet == None:
        featureSet = IdSet()
    ExampleBuilder.__init__(self, classSet, featureSet)
    # "co_limits" is the only recognized style switch.
    self._setDefaultParameters(["co_limits"])
    self.styles = self.getParameters(style)
    self.triggerFeatureBuilder = TriggerFeatureBuilder(self.featureSet)
    self.triggerFeatureBuilder.useNonNameEntities = False
    # Counts of phrase types seen while building examples.
    self.phraseTypeCounts = {}
def __init__(self, style=None, length=None, types=None, featureSet=None, classSet=None):
    """Initialize the edge example builder.

    BUGFIX: the old defaults ``style=["typed","directed"]`` and ``types=[]``
    were shared mutable lists, and the list default crashed on the
    ``style.find(",")`` call below (lists have no ``find``). None sentinels
    are used and ``find`` is only called on string-like styles.
    """
    if style is None:
        style = ["typed", "directed"]
    if types is None:
        types = []
    if featureSet == None:
        featureSet = IdSet()
    if classSet == None:
        classSet = IdSet(1)
    assert( classSet.getId("neg") == 1 )
    ExampleBuilder.__init__(self, classSet=classSet, featureSet=featureSet)
    # A comma-separated string is split into a style list
    if hasattr(style, "find") and style.find(",") != -1:
        style = style.split(",")
    self.styles = style
    self.negFrac = None
    self.posPairGaz = POSPairGazetteer()
    for s in style:
        if s.find("negFrac") != -1:
            # e.g. "negFrac_0.5" -> keep 50% of negatives (fixed RNG seed)
            self.negFrac = float(s.split("_")[-1])
            print >> sys.stderr, "Downsampling negatives to", self.negFrac
            self.negRand = random.Random(15)
        elif s.find("posPairGaz") != -1:
            self.posPairGaz = POSPairGazetteer(loadFrom=s.split("_", 1)[-1])
    self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet)
    self.triggerFeatureBuilder = TriggerFeatureBuilder(self.featureSet)
    if "graph_kernel" in self.styles:
        from FeatureBuilders.GraphKernelFeatureBuilder import GraphKernelFeatureBuilder
        self.graphKernelFeatureBuilder = GraphKernelFeatureBuilder(self.featureSet)
    if "noAnnType" in self.styles:
        self.multiEdgeFeatureBuilder.noAnnType = True
    if "noMasking" in self.styles:
        self.multiEdgeFeatureBuilder.maskNamedEntities = False
    if "maxFeatures" in self.styles:
        self.multiEdgeFeatureBuilder.maximum = True
    self.tokenFeatureBuilder = TokenFeatureBuilder(self.featureSet)
    if "ontology" in self.styles:
        self.multiEdgeFeatureBuilder.ontologyFeatureBuilder = BioInferOntologyFeatureBuilder(self.featureSet)
    if "nodalida" in self.styles:
        self.nodalidaFeatureBuilder = NodalidaFeatureBuilder(self.featureSet)
    #IF LOCAL
    if "bioinfer_limits" in self.styles:
        self.bioinferOntologies = OntologyUtils.getBioInferTempOntology()
        #self.bioinferOntologies = OntologyUtils.loadOntologies(OntologyUtils.g_bioInferFileName)
    #ENDIF
    self.pathLengths = length
    assert(self.pathLengths == None)
    self.types = types
    if "random" in self.styles:
        from FeatureBuilders.RandomFeatureBuilder import RandomFeatureBuilder
        self.randomFeatureBuilder = RandomFeatureBuilder(self.featureSet)
def __init__(self, style=None, classSet=None, featureSet=None):
    """Initialize the builder with (disabled) profiling timers."""
    if classSet == None:
        classSet = IdSet(1)
    assert classSet.getId("neg") == 1
    if featureSet == None:
        featureSet = IdSet()
    ExampleBuilder.__init__(self, classSet, featureSet)
    self.styles = style
    # Disabled timers for profiling the example-building phases.
    for timerName in ("timerBuildExamples", "timerCrawl", "timerCrawlPrecalc",
                      "timerMatrix", "timerMatrixPrecalc"):
        setattr(self, timerName, Timer(False))
def __init__(self, style=None, classSet=None, featureSet=None, gazetteerFileName=None):
    """Initialize the builder, optionally loading a trigger gazetteer."""
    if classSet == None:
        classSet = IdSet(1)
    assert classSet.getId("neg") == 1
    if featureSet == None:
        featureSet = IdSet()
    ExampleBuilder.__init__(self, classSet, featureSet)
    # The gazetteer is optional; its absence is reported, not an error.
    if gazetteerFileName != None:
        self.gazetteer = Gazetteer.loadGztr(gazetteerFileName)
        print >> sys.stderr, "Loaded gazetteer from", gazetteerFileName
    else:
        print >> sys.stderr, "No gazetteer loaded"
        self.gazetteer = None
    self.styles = style
def __init__(self, style=None, classSet=None, featureSet=None, gazetteerFileName=None):
    """Initialize the builder; trigger features exclude non-name entities."""
    if classSet == None:
        classSet = IdSet(1)
    assert classSet.getId("neg") == 1
    if featureSet == None:
        featureSet = IdSet()
    ExampleBuilder.__init__(self, classSet, featureSet)
    self.styles = style
    # Non-name entities are excluded from trigger feature generation.
    self.triggerFeatureBuilder = TriggerFeatureBuilder(self.featureSet)
    self.triggerFeatureBuilder.useNonNameEntities = False
def __init__(self, examples=None, predictions=None, classSet=None):
    # ChemProt evaluator: runs the task-specific evaluation when per-example
    # extra info is available, then always runs the generic averaging evaluator.
    # examples/predictions/classSet may each be a filename (string).
    if type(classSet) == types.StringType: # class names are in file
        classSet = IdSet(filename=classSet)
    if type(predictions) == types.StringType: # predictions are in file
        predictions = ExampleUtils.loadPredictions(predictions)
    if type(examples) == types.StringType: # examples are in file
        examples = ExampleUtils.readExamples(examples, False)
    # Only these CPR relation groups are scored by the official task.
    self.keep = set(["CPR:3", "CPR:4", "CPR:5", "CPR:6", "CPR:9"])
    self.classSet = classSet
    self.results = None
    self.internal = None
    if predictions != None:
        # Only the first example is inspected: its extra-info slot (index 3)
        # decides whether the ChemProt-specific evaluation can run at all.
        # NOTE(review): assumes examples is non-empty when predictions are
        # given (examples=None would raise here) -- confirm with callers.
        for example in examples:
            if example[3] != None:
                print >> sys.stderr, "ChemProt Evaluator:"
                self._calculateExamples(examples, predictions)
            else:
                print >> sys.stderr, "No example extra info, skipping ChemProt evaluation"
            break
        self.internal = AveragingMultiClassEvaluator(examples, predictions, classSet)
        print >> sys.stderr, "AveragingMultiClassEvaluator:"
        print >> sys.stderr, self.internal.toStringConcise()
def __init__(self, examples, predictions=None, classSet=None):
    # Shared-task evaluator: converts predictions to interaction XML, then to
    # GENIA format, and finally runs the external GENIA evaluation tool.
    # Relies on SharedTaskEvaluator class-level configuration (corpusFilename,
    # parse, tokenization, ids, geniaDir, task) being set beforehand.
    # examples/predictions/classSet may each be a filename (string).
    if type(classSet) == types.StringType: # class names are in file
        classSet = IdSet(filename=classSet)
    if type(predictions) == types.StringType: # predictions are in file
        predictions = ExampleUtils.loadPredictions(predictions)
    if type(examples) == types.StringType: # examples are in file
        examples = ExampleUtils.readExamples(examples, False)
    # Cached on the class so repeated evaluations reuse the loaded corpus.
    SharedTaskEvaluator.corpusElements = Core.SentenceGraph.loadCorpus(SharedTaskEvaluator.corpusFilename, SharedTaskEvaluator.parse, SharedTaskEvaluator.tokenization)
    # Build interaction xml
    xml = BioTextExampleWriter.write(examples, predictions, SharedTaskEvaluator.corpusElements, None, SharedTaskEvaluator.ids + ".class_names", SharedTaskEvaluator.parse, SharedTaskEvaluator.tokenization)
    #xml = ExampleUtils.writeToInteractionXML(examples, predictions, SharedTaskEvaluator.corpusElements, None, "genia-direct-event-ids.class_names", SharedTaskEvaluator.parse, SharedTaskEvaluator.tokenization)
    # Convert to GENIA format
    gifxmlToGenia(xml, SharedTaskEvaluator.geniaDir, task=SharedTaskEvaluator.task, verbose=False)
    # Use GENIA evaluation tool
    self.results = evaluateSharedTask(SharedTaskEvaluator.geniaDir, task=SharedTaskEvaluator.task, evaluations=["approximate"], verbose=False)
def __init__(self, style):
    """Initialize the edge example builder for the given style."""
    ExampleBuilder.__init__(self)
    self.featureBuilder = EdgeFeatureBuilder(self.featureSet)
    self.style = style
    # Multiclass mode: rebuild the class set so that "neg" maps to id 1.
    if "binary" not in style:
        self.classSet = IdSet(1)
        assert self.classSet.getId("neg") == 1
def __init__(self, style=None, classSet=None, featureSet=None, gazetteerFileName=None, skiplist=None):
    """Initialize the builder with a fixed style configuration.

    NOTE: the ``style`` parameter is accepted for interface compatibility
    but is deliberately overridden by a hard-coded style list below.

    BUGFIX: removed the dead ``self.styles = style`` assignment (it was
    unconditionally overwritten) and opened the skiplist file with a
    context manager so it is closed even on error.
    """
    if classSet == None:
        classSet = IdSet(1)
    assert classSet.getId("neg") == 1
    if featureSet == None:
        featureSet = IdSet()
    ExampleBuilder.__init__(self, classSet, featureSet)
    # gazetteerFileName="/usr/share/biotext/GeniaChallenge/SharedTaskTriggerTest/gazetteer-train"
    if gazetteerFileName != None:
        self.gazetteer = Gazetteer.loadGztr(gazetteerFileName)
        print >>sys.stderr, "Loaded gazetteer from", gazetteerFileName
    else:
        print >>sys.stderr, "No gazetteer loaded"
        self.gazetteer = None
    self.skiplist = set()
    if skiplist != None:
        with open(skiplist, "rt") as f:
            for line in f:
                self.skiplist.add(line.strip())
    self.styles = [
        "trigger_features",
        "typed",
        "directed",
        "no_linear",
        "entities",
        "genia_limits",
        "noMasking",
        "maxFeatures",
    ]
    self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet)
    if "graph_kernel" in self.styles:
        from FeatureBuilders.GraphKernelFeatureBuilder import GraphKernelFeatureBuilder
        self.graphKernelFeatureBuilder = GraphKernelFeatureBuilder(self.featureSet)
    if "noAnnType" in self.styles:
        self.multiEdgeFeatureBuilder.noAnnType = True
    if "noMasking" in self.styles:
        self.multiEdgeFeatureBuilder.maskNamedEntities = False
    if "maxFeatures" in self.styles:
        self.multiEdgeFeatureBuilder.maximum = True
    self.triggerFeatureBuilder = TriggerFeatureBuilder(self.featureSet)
def __init__(self, style=None, classSet=None, featureSet=None, gazetteerFileName=None, skiplist=None):
    """Initialize the builder with a fixed style configuration.

    NOTE: the ``style`` parameter is accepted for interface compatibility
    but is deliberately overridden by a hard-coded style list below.

    BUGFIX: removed the dead ``self.styles = style`` assignment (it was
    unconditionally overwritten) and opened the skiplist file with a
    context manager so it is closed even on error.
    """
    if classSet == None:
        classSet = IdSet(1)
    assert (classSet.getId("neg") == 1)
    if featureSet == None:
        featureSet = IdSet()
    ExampleBuilder.__init__(self, classSet, featureSet)
    #gazetteerFileName="/usr/share/biotext/GeniaChallenge/SharedTaskTriggerTest/gazetteer-train"
    if gazetteerFileName != None:
        self.gazetteer = Gazetteer.loadGztr(gazetteerFileName)
        print >> sys.stderr, "Loaded gazetteer from", gazetteerFileName
    else:
        print >> sys.stderr, "No gazetteer loaded"
        self.gazetteer = None
    self.skiplist = set()
    if skiplist != None:
        with open(skiplist, "rt") as f:
            for line in f:
                self.skiplist.add(line.strip())
    self.styles = [
        "trigger_features", "typed", "directed", "no_linear", "entities",
        "genia_limits", "noMasking", "maxFeatures"
    ]
    self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet)
    if "graph_kernel" in self.styles:
        from FeatureBuilders.GraphKernelFeatureBuilder import GraphKernelFeatureBuilder
        self.graphKernelFeatureBuilder = GraphKernelFeatureBuilder(self.featureSet)
    if "noAnnType" in self.styles:
        self.multiEdgeFeatureBuilder.noAnnType = True
    if "noMasking" in self.styles:
        self.multiEdgeFeatureBuilder.maskNamedEntities = False
    if "maxFeatures" in self.styles:
        self.multiEdgeFeatureBuilder.maximum = True
    self.triggerFeatureBuilder = TriggerFeatureBuilder(self.featureSet)
def __init__(self, style=None, classSet=None, featureSet=None, gazetteerFileName=None):
    """Initialize the phrase-based builder with co-occurrence limits support."""
    if classSet == None:
        classSet = IdSet(1)
    assert classSet.getId("neg") == 1
    if featureSet == None:
        featureSet = IdSet()
    ExampleBuilder.__init__(self, classSet, featureSet)
    # Only the "co_limits" switch is recognized in the style string.
    self._setDefaultParameters(["co_limits"])
    self.styles = self.getParameters(style)
    # Trigger features are built without non-name entities.
    self.triggerFeatureBuilder = TriggerFeatureBuilder(self.featureSet)
    self.triggerFeatureBuilder.useNonNameEntities = False
    self.phraseTypeCounts = {}
def __init__(self, style=None, length=None, types=None, featureSet=None, classSet=None):
    """Statistics-collecting edge example builder.

    BUGFIX: the mutable defaults ``style=["typed","directed","headsOnly"]``
    and ``types=[]`` were shared between calls; None sentinels are used.
    The no-op ``else: classSet = classSet`` branch was removed.
    """
    if style is None:
        style = ["typed", "directed", "headsOnly"]
    if types is None:
        types = []
    if featureSet == None:
        featureSet = IdSet()
    if classSet == None:
        classSet = IdSet(1)
    assert( classSet.getId("neg") == 1 )
    ExampleBuilder.__init__(self, classSet=classSet, featureSet=featureSet)
    self.styles = style
    self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet)
    # Per-type and untyped counters populated while building examples.
    self.counts = {}
    self.countsPerType = {}
    self.untypedCounts = {}
    self.tokenCounts = {}
def __init__(self, style=None, classSet=None, featureSet=None, gazetteerFileName=None):
    """Initialize the builder, optionally loading a trigger gazetteer."""
    if classSet == None:
        classSet = IdSet(1)
    assert classSet.getId("neg") == 1
    if featureSet == None:
        featureSet = IdSet()
    ExampleBuilder.__init__(self, classSet, featureSet)
    # The gazetteer is optional; its absence is reported, not an error.
    if gazetteerFileName != None:
        self.gazetteer = Gazetteer.loadGztr(gazetteerFileName)
        print >> sys.stderr, "Loaded gazetteer from", gazetteerFileName
    else:
        print >> sys.stderr, "No gazetteer loaded"
        self.gazetteer = None
    self.styles = style
    # POS tags never considered as trigger-word candidates.
    self.excludedPOS = ["", "(", ")", ",", ".", "CC", "EX", "FW", "LS", "MD",
                        "PDT", "POS", "PRP", "PRP$", "RBR", "RBS", "RP",
                        "WDT", "WP", "WP$", "``"]
def __init__(self, style=None, classSet=None, featureSet=None, gazetteerFileName=None):
    """Initialize the modifier (speculation/negation) example builder."""
    global speculationWords
    if classSet == None:
        classSet = IdSet(1)
    assert classSet.getId("neg") == 1
    if featureSet == None:
        featureSet = IdSet()
    # Speculation cue words and their stems, read from the module-level list.
    self.specWords, self.specWordStems = readWords(speculationWords)
    ExampleBuilder.__init__(self, classSet, featureSet)
    #gazetteerFileName="/usr/share/biotext/GeniaChallenge/SharedTaskTriggerTest/gazetteer-train"
    self.gazetteer = None
    if gazetteerFileName != None:
        self.gazetteer = Gazetteer.loadGztr(gazetteerFileName)
        print >> sys.stderr, "Loaded gazetteer from", gazetteerFileName
    # Defaults: multiclass classification with speculation-word features;
    # "classification" is limited to the three listed values.
    defaults = {"classification": "multiclass", "speculation_words": True}
    limits = {"classification": ("multiclass", "speculation", "negation")}
    self.styles = self.getParameters(style, defaults, limits)
def __init__(self, style=None, classSet=None, featureSet=None, gazetteerFileName=None, skiplist=None):
    """Builder for matrix-style examples.

    When the "wv" style parameter names a word-vector index file, its
    gzipped JSON indices are loaded (falling back to Settings.DATAPATH/wv
    when the path does not exist as given).

    :param skiplist: accepted for interface compatibility; unused here.
    """
    if classSet == None:
        classSet = IdSet(0)
    if featureSet == None:
        featureSet = IdSet(0)
    ExampleBuilder.__init__(self, classSet, featureSet)
    # Aliases used by the matrix-building code.
    self.featureIds = self.featureSet
    self.labelIds = self.classSet
    self._setDefaultParameters([
        "directed", "undirected", "cutoff", "annotated_only", "all_positive",
        "wv", "epochs", "html", "autoencode", "lr", "patience"
    ])
    self.styles = self.getParameters(style)
    if self.styles["cutoff"]:
        self.styles["cutoff"] = int(self.styles["cutoff"])
    self.wvIndices = None
    self.embeddingMatrices = None
    wvPath = self.styles.get("wv")  # hoisted: was looked up twice
    if wvPath != None:
        indexPath = wvPath + "-indices.json.gz"
        if not os.path.exists(indexPath):
            indexPath = os.path.join(Settings.DATAPATH, "wv", indexPath)
        print >> sys.stderr, "Loading word vector indices from", indexPath
        # BUGFIX: Python 2's gzip.open() rejects text mode "rt" (it raises
        # IOError); open in binary mode instead, which json.load accepts.
        with gzip.open(indexPath, "rb") as f:
            self.wvIndices = json.load(f)["indices"]
        self.embeddingMatrices = []
    self.dimMatrix = 32
    self.rangeMatrix = range(self.dimMatrix)
    self.featureMatrices = []
    self.labelMatrices = []
    self.tokenLists = []
def __init__(self, style=None, featureSet=None, classSet=None):
    """Trigger/edge builder over head tokens only.

    BUGFIX: the original used a mutable list as the default value of
    "style" (shared across all instances); replaced with a None sentinel
    expanding to the same default, so observable behavior is unchanged.
    """
    if style == None:
        style = ["typed", "directed", "headsOnly"]
    if featureSet == None:
        featureSet = IdSet()
    if classSet == None:
        classSet = IdSet(1)
    assert classSet.getId("neg") == 1
    ExampleBuilder.__init__(self, classSet=classSet, featureSet=featureSet)
    self.styles = style
    self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet)
    # Trigger features are limited to name entities for this builder.
    self.triggerFeatureBuilder = TriggerFeatureBuilder(self.featureSet)
    self.triggerFeatureBuilder.useNonNameEntities = False
def __init__(self, classSet=None, featureSet=None):
    """Shared builder state.

    A string classSet/featureSet argument is interpreted as the filename
    of a saved IdSet and loaded from disk.
    """
    if type(classSet) == types.StringType:
        classSet = IdSet(filename=classSet)
    self.classSet = classSet
    if type(featureSet) == types.StringType:
        featureSet = IdSet(filename=featureSet)
    self.featureSet = featureSet
    self.featureTag = ""
    self.exampleStats = ExampleStats()
    self.parse = None
    self.tokenization = None
    #self.idFileTag = None
    self.classIdFilename = None
    self.featureIdFilename = None
    # Parameter-handling state; see _setDefaultParameters/getParameters.
    self.styles = None
    self._defaultParameters = None
    self._parameterValueLimits = None
    self._setDefaultParameters(["sentenceLimit"])
def __init__(self, style=None, length=None, types=None, featureSet=None, classSet=None):
    """Edge-statistics builder (duplicate variant).

    BUGFIX: the original declared mutable default arguments
    (style=["typed", "directed", "headsOnly"], types=[]), shared between
    all instances; replaced with None sentinels that expand to equivalent
    fresh values, so the observable defaults are unchanged.

    :param length: unused here, kept for interface compatibility.
    :param types: unused here, kept for interface compatibility.
    """
    if style == None:
        style = ["typed", "directed", "headsOnly"]
    if types == None:
        types = []
    if featureSet == None:
        featureSet = IdSet()
    if classSet == None:
        classSet = IdSet(1)
    assert (classSet.getId("neg") == 1)
    ExampleBuilder.__init__(self, classSet=classSet, featureSet=featureSet)
    self.styles = style
    self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet)
    # Statistics accumulators filled during example building.
    self.counts = {}
    self.countsPerType = {}
    self.untypedCounts = {}
    self.tokenCounts = {}
def __init__(self, classSet=None, featureSet=None):
    """Shared builder state (debug-enabled variant).

    A string classSet/featureSet argument is interpreted as the filename
    of a saved IdSet and loaded from disk.
    """
    if type(classSet) == types.StringType:
        classSet = IdSet(filename=classSet)
    self.classSet = classSet
    if type(featureSet) == types.StringType:
        featureSet = IdSet(filename=featureSet)
    self.featureSet = featureSet
    self.featureTag = ""
    self.exampleStats = ExampleStats()
    self.parse = None
    self.tokenization = None
    #self.idFileTag = None
    self.classIdFilename = None
    self.featureIdFilename = None
    # Parameter-handling state; see _setDefaultParameters/getParameters.
    self.styles = {}
    self._defaultParameters = None
    self._parameterValueLimits = None
    self._setDefaultParameters(["sentenceLimit"])
    self.debug = False
def __init__(self, examples, predictions=None, classSet=None):
    """Per-class evaluator; string arguments are treated as file paths."""
    # Resolve file-based arguments into in-memory objects.
    if type(classSet) == types.StringType:
        classSet = IdSet(filename=classSet)
    if type(predictions) == types.StringType:
        predictions = ExampleUtils.loadPredictions(predictions)
    if type(examples) == types.StringType:
        examples = ExampleUtils.readExamples(examples, False)
    self.classSet = classSet
    # One EvaluationData accumulator per class, created on demand.
    self.dataByClass = defaultdict(EvaluationData)
    #self.untypedUndirected = None
    # State for the untyped/undirected evaluation pass.
    self.untypedCurrentMajorId = None
    self.untypedPredictionQueue = []
    self.untypedUndirected = EvaluationData()
    #self.AUC = None
    # Compute results right away when predictions were provided.
    if predictions != None:
        self._calculate(examples, predictions)
def getClassSet(rows, classSet=None):
    """Collect class/prediction names from result rows into an IdSet.

    Exactly one of the names "1" / "neg" must occur; it is pinned to id 1,
    and the remaining names receive consecutive ids in sorted order.
    """
    from Core.IdSet import IdSet
    classNames = set()
    for row in rows:
        classNames.add(row["class"])
        classNames.add(row["prediction"])
    # In the case of multiclass, give integer id:s for the classes
    if classSet == None:
        classSet = IdSet()
    # The negative class may be called either "1" or "neg", never both.
    assert not ("1" in classNames and "neg" in classNames)
    assert "1" in classNames or "neg" in classNames
    negName = "1" if "1" in classNames else "neg"
    classSet.defineId(negName, 1)
    for name in sorted(classNames):
        if name != "1" and name != "neg":
            classSet.getId(name)
    return classSet
def getIdSets(self, classIds=None, featureIds=None, allowNewIds=True):
    """Load predefined class/feature IdSets from disk when available.

    Returns (classSet, featureSet); either element is None when no file
    was given or the given path does not exist.
    """
    classSet = None
    if classIds != None and os.path.exists(classIds):
        print >> sys.stderr, "Using predefined class names from", classIds
        classSet = IdSet(allowNewIds=allowNewIds)
        classSet.load(classIds)
    else:
        print >> sys.stderr, "No predefined class names"
    featureSet = None
    if featureIds != None and os.path.exists(featureIds):
        print >> sys.stderr, "Using predefined feature names from", featureIds
        featureSet = IdSet(allowNewIds=allowNewIds)
        featureSet.load(featureIds)
    else:
        print >> sys.stderr, "No predefined feature names"
    return classSet, featureSet
def __init__(self, examples=None, predictions=None, classSet=None):
    """Binary-classification evaluator.

    String arguments are treated as file paths and loaded; the statistics
    are computed immediately when predictions are available.
    """
    # Resolve file-based arguments into in-memory objects.
    if type(classSet) == types.StringType:
        classSet = IdSet(filename=classSet)
    if type(predictions) == types.StringType:
        predictions = ExampleUtils.loadPredictions(predictions)
    if type(examples) == types.StringType:
        examples = ExampleUtils.readExamples(examples, False)
    #self.examples = examples
    #self.predictions = predictions
    # Confusion-matrix counts start at zero.
    for countAttr in ("truePositives", "falsePositives",
                      "trueNegatives", "falseNegatives"):
        setattr(self, countAttr, 0)
    # Derived metrics are filled in by _calculate.
    for metricAttr in ("precision", "recall", "fScore", "AUC"):
        setattr(self, metricAttr, None)
    self.type = "binary"
    if predictions != None:
        self._calculate(examples, predictions)
def __init__(self, examples, predictions=None, classSet=None):
    """Evaluate BX examples with the official GENIA (BioNLP'11) tools.

    Builds an interaction-XML corpus from the examples/predictions,
    converts it to shared-task (ST) format, and runs the external
    evaluator; the parsed results are stored in self.results. Relies on
    the BXEvaluator class attributes (corpusFilename, parse, tokenization,
    ids, geniaDir, corpusTag) having been set beforehand.
    """
    # Resolve file-based arguments into in-memory objects.
    if type(classSet) == types.StringType: # class names are in file
        classSet = IdSet(filename=classSet)
    if type(predictions) == types.StringType: # predictions are in file
        predictions = ExampleUtils.loadPredictions(predictions)
    if type(examples) == types.StringType: # examples are in file
        examples = ExampleUtils.readExamples(examples, False)
    corpusElements = Core.SentenceGraph.loadCorpus(BXEvaluator.corpusFilename, BXEvaluator.parse, BXEvaluator.tokenization)
    # Build interaction xml
    xml = BioTextExampleWriter.write(examples, predictions, corpusElements, None, BXEvaluator.ids+".class_names", BXEvaluator.parse, BXEvaluator.tokenization)
    xml = ix.splitMergedElements(xml, None)
    xml = ix.recalculateIds(xml, None, True)
    #xml = ExampleUtils.writeToInteractionXML(examples, predictions, SharedTaskEvaluator.corpusElements, None, "genia-direct-event-ids.class_names", SharedTaskEvaluator.parse, SharedTaskEvaluator.tokenization)
    # Convert to GENIA format
    STFormat.ConvertXML.toSTFormat(xml, BXEvaluator.geniaDir, outputTag="a2")
    #gifxmlToGenia(xml, BXEvaluator.geniaDir, task=SharedTaskEvaluator.task, verbose=False)
    # Use GENIA evaluation tool
    self.results = BioNLP11GeniaTools.evaluateBX(BXEvaluator.geniaDir, corpusName=BXEvaluator.corpusTag)
    # Drop the corpus reference; only self.results is kept afterwards.
    corpusElements = None
def compareExamples(examples1, examples2, features1, features2=None): ExampleUtils.readExamples(examples1) exampleIter1 = ExampleUtils.readExamples(examples1) exampleIter2 = ExampleUtils.readExamples(examples2) features1 = IdSet(filename=features1) if features2 != None: features2 = IdSet(filename=features2) else: features2 = features1 # Compare feature sets if set(features1.Ids.keys()) != set(features2.Ids.keys()): print "Feature sets differ" # Compare examples counter = ProgressCounter(step=1) for e1, e2 in itertools.izip(exampleIter1, exampleIter2): counter.update() assert e1[0] == e2[0], (removeFeatures(e1), removeFeatures(e2)) if e1[1] != e2[1]: print "Class differs" print " E1", removeFeatures(e1) print " E2", removeFeatures(e2) f1 = getFeatureNames(e1, features1) f2 = getFeatureNames(e2, features2) f1Set = set(f1) f2Set = set(f2) f1Only = f1Set.difference(f2Set) f2Only = f2Set.difference(f1Set) if len(f1Only) > 0 or len(f2Only) > 0: print "Features differ" print " E1", removeFeatures(e1) print " E2", removeFeatures(e2) if len(f1Only) > 0: print " E1-only features:", f1Only if len(f2Only) > 0: print " E2-only features:", f2Only else: assert len(f1) == len(f2) fCount = 0 differ = False for feature1, feature2 in zip(f1, f2): #f1Id = features1.getId(feature1, createIfNotExist=False) #if f1Id == 454 or feature1 == "e1_strength_Positive_regulation": # print "!!!!!!!!!!!", 454, feature1, e1[2][f1Id] if feature1 != feature2: if not differ: print "Feature order differs for example", e1[0] differ = True print "[" + feature1 + "/" + feature2 + "](" + str(fCount) + ") ", else: f1Id = features1.getId(feature1, createIfNotExist=False) f2Id = features2.getId(feature2, createIfNotExist=False) f1Value = e1[2][f1Id] f2Value = e2[2][f2Id] if f1Value != f2Value: if not differ: print "Feature values differ", e1[0] differ = True print "[" + feature1 + "/" + str(f1Id) + "]" + "[" + str(f1Value) + "/" + str(f2Value) + "]" + "(" + str(fCount) + ") ", fCount += 1 if differ: print 
counter.endUpdate()
def writeXML(self, examples, predictions, corpus, outputFile, classSet=None, parse=None, tokenization=None, goldCorpus=None, exampleStyle=None, structureAnalyzer=None):
    """
    Writes task 3 examples to interaction XML. Assumes task 3 classification
    is done with SVMMulticlass Classifier, used for two classes.

    Returns the corpus ElementTree; writes it to outputFile when given.
    The subtask ("multiclass", "speculation" or "negation") is read from
    the t3type field of the first example.
    """
    print >> sys.stderr, "Adding task 3 to Interaction XML"
    examples, predictions = self.loadExamples(examples, predictions)
    if type(classSet) == types.StringType: # class names are in file
        classSet = IdSet(filename=classSet)
    classIds = None
    if classSet != None:
        classIds = classSet.getIds()
    corpusTree = ETUtils.ETFromObj(corpus)
    corpusRoot = corpusTree.getroot()
    # Determine subtask
    task3Type = None
    for example in examples:
        assert example[3].has_key("t3type")
        task3Type = example[3]["t3type"]
        break
    # No examples at all: write the corpus unchanged and return.
    if task3Type == None:
        if outputFile != None:
            print >> sys.stderr, "Writing corpus to", outputFile
            ETUtils.write(corpusRoot, outputFile)
        return corpusTree
    assert task3Type in ["multiclass", "speculation", "negation"]
    # Remove the task 3 subtask information if it already exists
    for entity in corpusRoot.getiterator("entity"):
        if task3Type == "multiclass":
            entity.set("speculation", "False")
            entity.set("negation", "False")
        elif task3Type == "speculation":
            entity.set("speculation", "False")
        else: # task3Type == "negation"
            entity.set("negation", "False")
    # Collect per-entity modifier decisions: entity id -> (flag, prediction).
    specMap = {}
    negMap = {}
    for example, prediction in itertools.izip(examples, predictions):
        assert example[3]["xtype"] == "task3"
        if example[3]["t3type"] == "multiclass":
            # A dict prediction carries a 0/1 encoding per class id; a list
            # prediction carries a single class whose name may join several
            # modifiers with "---".
            if isinstance(prediction, dict):
                encoded = prediction["prediction"]
                predictedModifiers = [ classSet.getName(i) for i in range(len(encoded)) if encoded[i] == 1 ]
            else:
                predictedClassName = classSet.getName(prediction[0])
                predictedModifiers = ""
                if predictedClassName != "neg":
                    predictedModifiers = predictedClassName.split("---")
            if "negation" in predictedModifiers:
                assert not negMap.has_key(example[3]["entity"])
                negMap[example[3]["entity"]] = (True, prediction)
            if "speculation" in predictedModifiers:
                assert not specMap.has_key(example[3]["entity"])
                specMap[example[3]["entity"]] = (True, prediction)
        else:
            # Binary subtask: class 1 is the negative class.
            if example[3]["t3type"] == "speculation":
                map = specMap
            else:
                map = negMap
            if prediction[0] != 1:
                assert not map.has_key(example[3]["entity"])
                map[example[3]["entity"]] = (True, prediction)
            else:
                assert not map.has_key(example[3]["entity"])
                map[example[3]["entity"]] = (False, prediction)
    # Transfer the collected decisions onto the entity elements.
    for entity in corpusRoot.getiterator("entity"):
        eId = entity.get("id")
        if task3Type == "multiclass":
            if specMap.has_key(eId):
                entity.set("speculation", str(specMap[eId][0]))
                entity.set("modConf", self.getPredictionStrengthString(specMap[eId][1], classSet, classIds))
            if negMap.has_key(eId):
                entity.set("negation", str(negMap[eId][0]))
                entity.set("modConf", self.getPredictionStrengthString(negMap[eId][1], classSet, classIds))
        else:
            if task3Type == "speculation":
                if specMap.has_key(eId):
                    entity.set("speculation", str(specMap[eId][0]))
                    entity.set("specConf", self.getPredictionStrengthString(specMap[eId][1], classSet, classIds, [""]))
            elif task3Type == "negation":
                if negMap.has_key(eId):
                    entity.set("negation", str(negMap[eId][0]))
                    entity.set("negConf", self.getPredictionStrengthString(negMap[eId][1], classSet, classIds, ["", "speculation"]))
    # Write corpus
    if outputFile != None:
        print >> sys.stderr, "Writing corpus to", outputFile
        ETUtils.write(corpusRoot, outputFile)
    return corpusTree
import psyco psyco.full() print >> sys.stderr, "Found Psyco, using" except ImportError: print >> sys.stderr, "Psyco not installed" defaultAnalysisFilename = "/usr/share/biotext/ComplexPPI/BioInferForComplexPPIVisible.xml" optparser = OptionParser(usage="%prog [options]\nCreate an html visualization for a corpus.") optparser.add_option("-i", "--invariant", default=None, dest="invariant", help="Corpus in analysis format", metavar="FILE") optparser.add_option("-v", "--variant", default=None, dest="variant", help="Corpus in analysis format", metavar="FILE") (options, args) = optparser.parse_args() #invariantExamples = ExampleUtils.readExamples(os.path.join(options.invariant, "examples.txt")) variantExamples = ExampleUtils.readExamples(os.path.join(options.variant, "test-triggers.examples")) invariantFeatureSet = IdSet() invariantFeatureSet.load(os.path.join(options.invariant, "feature_names.txt")) invariantClassSet = IdSet() invariantClassSet.load(os.path.join(options.invariant, "class_names.txt")) variantFeatureSet = IdSet() variantFeatureSet.load(os.path.join(options.variant, "test-triggers.examples.feature_names")) variantClassSet = IdSet() variantClassSet.load(os.path.join(options.variant, "test-triggers.examples.class_names")) counter = ProgressCounter(len(variantExamples)) for example in variantExamples: counter.update() example[1] = invariantClassSet.getId(variantClassSet.getName(example[1])) newFeatures = {} for k,v in example[2].iteritems():
def writeXML(self, examples, predictions, corpus, outputFile, classSet=None, parse=None, tokenization=None, goldCorpus=None):
    """Write examples and their predictions into an interaction-XML corpus.

    Examples are grouped per sentence by the major part of their example id
    ("<sentenceId>.x<n>") and flushed one sentence at a time through
    writeXMLSentence(). Sentences without any examples are processed last
    (e.g. to clear stale interactions). Returns the corpus ElementTree.
    """
    #print >> sys.stderr, "Writing output to Interaction XML"
    corpus = self.loadCorpus(corpus, parse, tokenization)
    if goldCorpus != None:
        # BUGFIX: the original loaded "corpus" here instead of "goldCorpus",
        # silently replacing the gold corpus with the input corpus.
        goldCorpus = self.loadCorpus(goldCorpus, parse, tokenization)
    examples, predictions = self.loadExamples(examples, predictions)
    if type(classSet) == types.StringType: # class names are in file
        classSet = IdSet(filename=classSet)
    classIds = None
    if classSet != None:
        classIds = classSet.getIds()
    exampleQueue = [] # One sentence's examples
    predictionsByExample = {}
    currentMajorId = None
    prevMajorIds = set()
    processedSentenceIds = set()
    xType = None
    # Count the examples so the progress counter knows the total.
    count = 0
    for example in examples:
        count += 1
    assert count > 0
    progress = ProgressCounter(count, "Write Examples")
    for example, prediction in itertools.izip_longest(examples, predictions):
        # izip_longest pads the shorter input with None; both inputs must
        # therefore have the same length.
        assert example != None
        assert prediction != None
        majorId, minorId = example[0].rsplit(".x", 1)
        if majorId != currentMajorId: # new sentence
            if currentMajorId != None:
                processedSentenceIds.add(currentMajorId)
                sentenceObject = corpus.sentencesById[currentMajorId]
                goldSentence = None
                if goldCorpus != None:
                    goldSentence = goldCorpus.sentencesById[currentMajorId]
                # process queue
                self.writeXMLSentence(exampleQueue, predictionsByExample, sentenceObject, classSet, classIds, goldSentence=goldSentence)
                progress.update(len(exampleQueue), "Writing examples ("+exampleQueue[-1][0]+"): ")
            exampleQueue = []
            predictionsByExample = {}
            prevMajorIds.add(currentMajorId)
            # Examples must arrive grouped by sentence: a sentence id may
            # never reappear after another sentence has started.
            assert majorId not in prevMajorIds, majorId
            currentMajorId = majorId
        exampleQueue.append(example) # queue example
        predictionsByExample[example[0]] = prediction
        assert example[3]["xtype"] == self.xType, str(example[3]["xtype"]) + "/" + str(self.xType)
    # Process what is still in queue
    if currentMajorId != None:
        processedSentenceIds.add(currentMajorId)
        sentenceObject = corpus.sentencesById[currentMajorId]
        goldSentence = None
        if goldCorpus != None:
            goldSentence = goldCorpus.sentencesById[currentMajorId]
        # process queue
        self.writeXMLSentence(exampleQueue, predictionsByExample, sentenceObject, classSet, classIds, goldSentence=goldSentence)
        progress.update(len(exampleQueue), "Writing examples ("+exampleQueue[-1][0]+"): ")
        exampleQueue = []
        predictionsByExample = {}
    # Process sentences with no examples (e.g. to clear interactions)
    for sentenceId in sorted(corpus.sentencesById.keys()):
        if sentenceId not in processedSentenceIds:
            sentenceObject = corpus.sentencesById[sentenceId]
            goldSentence = None
            if goldCorpus != None:
                # BUGFIX: the original indexed with the stale currentMajorId
                # left over from the main loop; use this sentence's own id.
                goldSentence = goldCorpus.sentencesById[sentenceId]
            self.writeXMLSentence([], {}, sentenceObject, classSet, classIds, goldSentence=goldSentence)
    # Print statistics
    if len(self.counts) > 0:
        print >> sys.stderr, self.counts
        self.counts = defaultdict(int)
    # Write corpus
    if outputFile != None:
        print >> sys.stderr, "Writing corpus to", outputFile
        ETUtils.write(corpus.rootElement, outputFile)
    return corpus.tree
def readARFF(filename):
    """Parse a two-class ARFF file into example lists.

    Returns examples as [exampleId, classId, features, extra] lists. The
    class attribute must be named "class" and declare exactly two class
    names, which are mapped to ids 1 and -1.

    BUGFIX: the original opened the file and never closed it; "with"
    guarantees the handle is released.
    """
    featureSet = IdSet(1)
    classSet = IdSet(0)
    # Read all lines up front so the progress counter knows the total.
    with open(filename, "rt") as f:
        lines = f.readlines()
    inData = False
    counter = ProgressCounter(len(lines), "ARFFLine")
    examples = []
    for line in lines:
        counter.update(string="Processing line " + str(counter.current + 1) + ": ")
        line = line.strip()
        # Skip blank lines and % comments.
        if len(line) == 0 or line[0] == "%":
            continue
        elif line[0] == "@":
            # Header section: @relation / @attribute / @data.
            category = line.split()[0].lower()
            if category == "@attribute":
                # Renamed the original local "type" to avoid shadowing the builtin.
                category, name, attrType = line.split()
                assert (not inData)
                if name.lower() == "class":
                    name = name.lower()
                    # The class names are listed in braces: {a,b}.
                    classNames = attrType[1:-1].split(",")
                    assert (len(classNames) == 2)
                    classSet.defineId(classNames[0].strip(), 1)
                    classSet.defineId(classNames[1].strip(), -1)
                featureSet.getId(name)
            elif category.lower() == "@relation":
                assert (not inData)
            elif category == "@data":
                inData = True
        else:
            # Data section: one comma-separated value per declared attribute.
            assert (inData)
            count = 1
            features = {}
            for column in line.split(","):
                if featureSet.getName(count) != "class":
                    features[count] = float(column)
                else:
                    classId = classSet.getId(column, False)
                    assert (classId != None)
                count += 1
            exampleCount = str(len(examples))
            exampleId = "BreastCancer.d" + exampleCount + ".s0.x0"
            examples.append([exampleId, classId, features, {}])
    return examples
def __init__(self, style=None, types=[], featureSet=None, classSet=None):
    """Main edge example builder: configures feature builders from style flags.

    :param style: style parameter string/dict; parsed via getParameters
                  against the large default list below.
    :param types: stored in self.types; NOTE(review): mutable default
                  argument ([]) is shared between calls — safe only if
                  never mutated; confirm before changing.
    :param featureSet: feature IdSet; created when None.
    :param classSet: class IdSet; created (with "neg" == 1) when None.
    """
    if featureSet == None: featureSet = IdSet()
    if classSet == None: classSet = IdSet(1)
    else: classSet = classSet
    ExampleBuilder.__init__(self, classSet=classSet, featureSet=featureSet)
    # Accept either the multiclass convention ("neg" == 1) or a binary
    # two-class set where "neg" == -1.
    assert( classSet.getId("neg") == 1 or (len(classSet.Ids)== 2 and classSet.getId("neg") == -1) )
    # Basic style = trigger_features:typed:directed:no_linear:entities:auto_limits:noMasking:maxFeatures
    # Register every recognized style switch (all default to None/off).
    self._setDefaultParameters([
        "directed", "undirected", "headsOnly", "graph_kernel", "noAnnType",
        "mask_nodes", "limit_features", "no_auto_limits", "co_features",
        "genia_features", "bi_features",
        #"genia_limits",
        "epi_limits", "id_limits", "rel_limits", "bb_limits", "bi_limits",
        "co_limits", "genia_task1", "ontology", "nodalida",
        "bacteria_renaming", "no_trigger_features", "rel_features",
        "drugbank_features", "ddi_mtmx", "evex", "giuliano", "random",
        "themeOnly", "causeOnly", "no_path", "token_nodes",
        "skip_extra_triggers", "headsOnly", "graph_kernel", "no_task",
        "no_dependency", "disable_entity_features",
        "disable_terminus_features", "disable_single_element_features",
        "disable_ngram_features", "disable_path_edge_features",
        "linear_features", "subset", "binary", "pos_only", "entity_type",
        "filter_shortest_path", "maskTypeAsProtein", "keep_neg", "metamap",
        "sdb_merge", "sdb_features", "ontobiotope_features",
        "no_self_loops", "full_entities", "no_features", "wordnet",
        "wordvector", "se10t8_undirected", "filter_types", "doc_extra",
        "entity_extra"])
    self.styles = self.getParameters(style)
    #if style == None: # no parameters given
    #    style["typed"] = style["directed"] = style["headsOnly"] = True
    self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet, self.styles)
    # NOTE Temporarily re-enabling predicted range
    #self.multiEdgeFeatureBuilder.definePredictedValueRange([], None)
    # Each style switch below enables the corresponding optional feature
    # builder or tweaks the multi-edge builder's configuration.
    if self.styles["graph_kernel"]:
        from FeatureBuilders.GraphKernelFeatureBuilder import GraphKernelFeatureBuilder
        self.graphKernelFeatureBuilder = GraphKernelFeatureBuilder(self.featureSet)
    if self.styles["noAnnType"]:
        self.multiEdgeFeatureBuilder.noAnnType = True
    if self.styles["mask_nodes"]:
        self.multiEdgeFeatureBuilder.maskNamedEntities = True
    else:
        self.multiEdgeFeatureBuilder.maskNamedEntities = False
    if not self.styles["limit_features"]:
        self.multiEdgeFeatureBuilder.maximum = True
    if self.styles["genia_task1"]:
        self.multiEdgeFeatureBuilder.filterAnnTypes.add("Entity")
    self.tokenFeatureBuilder = TokenFeatureBuilder(self.featureSet)
    if self.styles["ontology"]:
        self.multiEdgeFeatureBuilder.ontologyFeatureBuilder = BioInferOntologyFeatureBuilder(self.featureSet)
    if self.styles["ontobiotope_features"]:
        self.ontobiotopeFeatureBuilder = OntoBiotopeFeatureBuilder(self.featureSet)
    if self.styles["nodalida"]:
        self.nodalidaFeatureBuilder = NodalidaFeatureBuilder(self.featureSet)
    if self.styles["bacteria_renaming"]:
        self.bacteriaRenamingFeatureBuilder = BacteriaRenamingFeatureBuilder(self.featureSet)
    if not self.styles["no_trigger_features"]:
        self.triggerFeatureBuilder = TriggerFeatureBuilder(self.featureSet, self.styles)
        self.triggerFeatureBuilder.useNonNameEntities = True
        if self.styles["noAnnType"]:
            self.triggerFeatureBuilder.noAnnType = True
        if self.styles["genia_task1"]:
            self.triggerFeatureBuilder.filterAnnTypes.add("Entity")
    #self.bioinferOntologies = OntologyUtils.loadOntologies(OntologyUtils.g_bioInferFileName)
    if self.styles["rel_features"]:
        self.relFeatureBuilder = RELFeatureBuilder(featureSet)
    if self.styles["drugbank_features"]:
        self.drugFeatureBuilder = DrugFeatureBuilder(featureSet)
    if self.styles["evex"]:
        self.evexFeatureBuilder = EVEXFeatureBuilder(featureSet)
    if self.styles["wordnet"]:
        self.wordNetFeatureBuilder = WordNetFeatureBuilder(featureSet)
    if self.styles["wordvector"]:
        self.wordVectorFeatureBuilder = WordVectorFeatureBuilder(featureSet, self.styles)
    if self.styles["giuliano"]:
        self.giulianoFeatureBuilder = GiulianoFeatureBuilder(featureSet)
    self.types = types
    if self.styles["random"]:
        from FeatureBuilders.RandomFeatureBuilder import RandomFeatureBuilder
        self.randomFeatureBuilder = RandomFeatureBuilder(self.featureSet)
ExampleUtils.appendExamples(exampleCache, outFile) exampleCache = [] ExampleUtils.appendExamples(exampleCache, outFile) outFile.close() print "Added", addCount, "polynomial features" if __name__=="__main__": # Import Psyco if available try: import psyco psyco.full() print >> sys.stderr, "Found Psyco, using" except ImportError: print >> sys.stderr, "Psyco not installed" from optparse import OptionParser # For using command line options optparser = OptionParser() optparser.add_option("-i", "--ids", default=None, dest="ids", help="") optparser.add_option("-e", "--examples", default=None, dest="examples", help="") optparser.add_option("-j", "--idOutput", default=None, dest="idOutput", help="") optparser.add_option("-o", "--output", default=None, dest="output", help="") optparser.add_option("-w", "--weights", default=None, dest="weights", help="") optparser.add_option("-c", "--cutoff", type="float", default=10.0, dest="cutoff", help="") (options, args) = optparser.parse_args() #classIds = IdSet(filename="/usr/share/biotext/GeniaChallenge/extension-data/genia/edge-examples/genia-edge-ids.class_names") featureIds = IdSet(filename=options.ids) weightFeatures = readWeights(options.weights, options.cutoff) polynomizeExamples(options.examples, options.output, weightFeatures, featureIds) featureIds.write(options.idOutput)
class ExampleBuilder:
    # Optional StructureAnalyzer shared by subclasses (class-level default).
    structureAnalyzer = None
    """
    ExampleBuilder is the abstract base class for specialized example builders.
    Example builders take some data and convert it to examples usable by e.g.
    SVMs. An example builder writes three files, an example-file (in extended
    Joachim's SVM format) and .class_names and .feature_names files, which
    contain the names for the class and feature id-numbers. An example builder
    can also be given pre-existing sets of class and feature ids (optionally
    in files) so that the generated examples are consistent with other,
    previously generated examples.
    """

    def __init__(self, classSet=None, featureSet=None):
        # A string argument is interpreted as the filename of a saved IdSet.
        if (type(classSet) == types.StringType):
            self.classSet = IdSet(filename=classSet)
        else:
            self.classSet = classSet
        if (type(featureSet) == types.StringType):
            self.featureSet = IdSet(filename=featureSet)
        else:
            self.featureSet = featureSet
        self.featureTag = ""
        self.exampleStats = ExampleStats()
        self.parse = None
        self.tokenization = None
        #self.idFileTag = None
        self.classIdFilename = None
        self.featureIdFilename = None
        # Parameter-handling state; see _setDefaultParameters/getParameters.
        self.styles = {}
        self._defaultParameters = None
        self._parameterValueLimits = None
        self._setDefaultParameters(["sentenceLimit"])
        self.debug = False

    def hasStyle(self, style):
        # NOTE(review): returns True only when the style key exists AND its
        # value is falsy; the "and not" looks inverted — confirm against callers.
        return style in self.styles and not self.styles[style]

    def _setDefaultParameters(self, defaults=None, valueLimits=None):
        """Merge additional default parameters (and value limits) into this builder."""
        # Initialize
        if self._defaultParameters == None:
            self._defaultParameters = {}
        if self._parameterValueLimits == None:
            self._parameterValueLimits = {}
        newParameters = Utils.Parameters.get({}, defaults, valueLimits=valueLimits)
        self._defaultParameters.update(newParameters)
        if valueLimits != None:
            self._parameterValueLimits.update(valueLimits)

    def getParameters(self, parameters):
        """Parse a parameter string/dict against the accumulated defaults and limits."""
        return Utils.Parameters.get(parameters, defaults=self._defaultParameters, valueLimits=self._parameterValueLimits)

    def setFeature(self, name, value):
        # Convenience: store a feature on the example currently being built.
        self.features[self.featureSet.getId(self.featureTag + name)] = value

    def getElementCounts(self, filename):
        """Count <document> and <sentence> elements by scanning the file's lines."""
        print >> sys.stderr, "Counting elements:",
        if filename.endswith(".gz"):
            # NOTE(review): gzip text mode "rt" is not supported on Python 2
            # (raises IOError) — confirm which interpreter runs this.
            f = gzip.open(filename, "rt")
        else:
            f = open(filename, "rt")
        counts = {"documents": 0, "sentences": 0}
        for line in f:
            if "<document" in line:
                counts["documents"] += 1
            elif "<sentence" in line:
                counts["sentences"] += 1
        f.close()
        print >> sys.stderr, counts
        return counts

    def saveIds(self):
        """Write the class and feature IdSets to their configured filenames, if any."""
        if self.classIdFilename != None:
            print >> sys.stderr, "Saving class names to", self.classIdFilename
            self.classSet.write(self.classIdFilename)
        else:
            print >> sys.stderr, "Class names not saved"
        if self.featureIdFilename != None:
            print >> sys.stderr, "Saving feature names to", self.featureIdFilename
            self.featureSet.write(self.featureIdFilename)
        else:
            print >> sys.stderr, "Feature names not saved"

    def processCorpus(self, input, output, gold=None, append=False, allowNewIds=True, structureAnalyzer=None):
        """Build examples for a whole corpus and write them to the output file.

        Iterates the input corpus (and, when given, the gold corpus in
        lockstep) sentence by sentence, delegating to processDocument.
        """
        # Create intermediate paths if needed
        if os.path.dirname(output) != "" and not os.path.exists(os.path.dirname(output)):
            os.makedirs(os.path.dirname(output))
        # Open output file
        openStyle = "wt"
        if append:
            #print "Appending examples"
            openStyle = "at"
        if output.endswith(".gz"):
            outfile = gzip.open(output, openStyle)
        else:
            outfile = open(output, openStyle)
        # Build examples
        self.exampleCount = 0
        if type(input) in types.StringTypes: # Entered here - Mu
            self.elementCounts = self.getElementCounts(input)
            if self.elementCounts["sentences"] > 0: # Entered here, 1448 - Mu
                self.progress = ProgressCounter(self.elementCounts["sentences"], "Build examples")
            else:
                self.elementCounts = None
                self.progress = ProgressCounter(None, "Build examples")
        else:
            self.elementCounts = None
            self.progress = ProgressCounter(None, "Build examples")
        # pdb.set_trace()
        # This line generates log below:(getSentences function generates the first 2 lines)
        # Making sentence graphs (GE09.d149.s5): 100.00 % (0:0:1.113)
        # Skipped 381 duplicate interaction edges in SentenceGraphs
        # Defining predicted value range: None - Mu
        self.calculatePredictedRange(self.getSentences(input, self.parse, self.tokenization)) # self.parse: mccc; self.tokenization: None
        removeIntersentenceInteractions = True
        if "keep_intersentence" in self.styles and self.styles["keep_intersentence"]:
            print >> sys.stderr, "Keeping intersentence interactions for input corpus"
            removeIntersentenceInteractions = False # this is True - Mu
        inputIterator = getCorpusIterator(input, None, self.parse, self.tokenization, removeIntersentenceInteractions=removeIntersentenceInteractions)
        # pdb.set_trace()
        #goldIterator = []
        if gold != None: # Entered here - Mu
            removeGoldIntersentenceInteractions = True
            if "keep_intersentence_gold" in self.styles and self.styles["keep_intersentence_gold"]:
                print >> sys.stderr, "Keeping intersentence interactions for gold corpus"
                removeGoldIntersentenceInteractions = False # this is False - Mu
            goldIterator = getCorpusIterator(gold, None, self.parse, self.tokenization, removeIntersentenceInteractions=removeGoldIntersentenceInteractions)
            # Input and gold corpora must stay aligned document-for-document.
            for inputSentences, goldSentences in itertools.izip_longest(inputIterator, goldIterator, fillvalue=None):
                assert inputSentences != None
                assert goldSentences != None
                # pdb.set_trace()
                # see the documentation of function processSentence() in this script
                # inputSentences[1].sentence is the unmerged version
                # inputSentences[1].sentenceGraph is the merged version, meaning that when generating sentenceGraph,
                # duplicated intereactions are removed(actually skipped, not added to the graph, but not really removed) - Mu
                self.processDocument(inputSentences, goldSentences, outfile, structureAnalyzer=structureAnalyzer)
        else:
            for inputSentences in inputIterator:
                self.processDocument(inputSentences, None, outfile, structureAnalyzer=structureAnalyzer)
        outfile.close()
        self.progress.endUpdate()
        # Show statistics
        print >> sys.stderr, "Examples built:", self.exampleCount
        print >> sys.stderr, "Features:", len(self.featureSet.getNames())
        print >> sys.stderr, "Classes:", len(self.classSet.getNames())
        print >> sys.stderr, "Style:", Utils.Parameters.toString(self.getParameters(self.styles))
        if self.exampleStats.getExampleCount() > 0:
            self.exampleStats.printStats()
        # Save Ids
        if allowNewIds:
            self.saveIds()

    def processDocument(self, sentences, goldSentences, outfile, structureAnalyzer=None):
        """Process one document's sentences (paired with gold sentences when given)."""
        #calculatePredictedRange(self, sentences)
        for i in range(len(sentences)):
            sentence = sentences[i]
            goldSentence = None
            if goldSentences != None:
                goldSentence = goldSentences[i]
            self.progress.update(1, "Building examples (" + sentence.sentence.get("id") + "): ")
            self.processSentence(sentence, outfile, goldSentence, structureAnalyzer=structureAnalyzer)

    def processSentence(self, sentence, outfile, goldSentence=None, structureAnalyzer=None):
        '''
        sentence: Utils.InteractionXML.SentenceElements.SentenceElements instance
        sentence.sentence: Element 'sentence' in the xml file
        '''
        # pdb.set_trace()
        # Process filtering rules
        # does NOT entered here since self.styles["sentenceLimit"] is None - Mu
        if "sentenceLimit" in self.styles and self.styles["sentenceLimit"]: # Rules for limiting which sentences to process
            # Get the rule list
            limitRules = self.styles["sentenceLimit"]
            if type(limitRules) in types.StringTypes:
                limitRules = [limitRules]
            # Get the list of sentence element attribute names
            sentenceElement = sentence.sentence
            sentenceAttributes = sorted(sentenceElement.attrib.keys())
            # Filter sentences based on matching rules to their attribute values
            for rule in limitRules:
                for sentAttr in sentenceAttributes:
                    # Rule are of the form "attr.value" where "attr" is the name
                    # of the attribute to match, and "value" a substring within
                    # that attribute
                    if rule.startswith(sentAttr + "."): # rule matches the attribute
                        value = rule.split(".", 1)[-1] # get the value part of the rule
                        if value not in sentenceElement.get(sentAttr): # rule value must be a substring of the attribute value
                            return # discard all sentences that do not match all rules
        # Process the sentence
        if sentence.sentenceGraph != None:
            goldGraph = None
            if goldSentence != None:
                goldGraph = goldSentence.sentenceGraph
            # c, sentenceGraph_return, argCombinations_return = self.buildExamplesFromGraph(sentence.sentenceGraph, outfile, goldGraph, structureAnalyzer=structureAnalyzer)
            # self.exampleCount += c
            self.exampleCount += self.buildExamplesFromGraph(sentence.sentenceGraph, outfile, goldGraph, structureAnalyzer=structureAnalyzer)
            # return sentenceGraph_return, argCombinations_return

    @classmethod
    def run(cls, input, output, parse, tokenization, style, classIds=None, featureIds=None, gold=None, append=False, allowNewIds=True, structureAnalyzer=None, debug=False):
        """Construct a builder of this class and process the given corpus with it."""
        print >> sys.stderr, "Running", cls.__name__
        print >> sys.stderr, " input:", input
        if gold != None:
            print >> sys.stderr, " gold:", gold
        print >> sys.stderr, " output:", output, "(append:", str(append) + ")"
        print >> sys.stderr, " add new class/feature ids:", allowNewIds
        if not isinstance(style, types.StringTypes):
            style = Utils.Parameters.toString(style)
        print >> sys.stderr, " style:", style
        if tokenization == None:
            print >> sys.stderr, " parse:", parse
        else:
            print >> sys.stderr, " parse:", parse + ", tokenization:", tokenization
        classSet, featureSet = cls.getIdSets(classIds, featureIds, allowNewIds) #cls.getIdSets(idFileTag)
        builder = cls(style=style, classSet=classSet, featureSet=featureSet)
        builder.debug = debug
        #builder.idFileTag = idFileTag
        builder.classIdFilename = classIds
        builder.featureIdFilename = featureIds
        builder.parse = parse
        builder.tokenization = tokenization
        builder.processCorpus(input, output, gold, append=append, allowNewIds=allowNewIds, structureAnalyzer=structureAnalyzer)
        return builder

    def buildExamplesFromGraph(self, sentenceGraph, outfile, goldGraph=None):
        # Subclasses must implement the actual example generation.
        raise NotImplementedError

    def definePredictedValueRange(self, sentences, elementName):
        # Optional hook; default is a no-op.
        pass

    def getPredictedValueRange(self):
        # Optional hook; default reports no range.
        return None

    @classmethod
    def getIdSets(self, classIds=None, featureIds=None, allowNewIds=True):
        # Class ids
        #print classIds
        #print featureIds
if classIds != None and os.path.exists(classIds): print >> sys.stderr, "Using predefined class names from", classIds classSet = IdSet(allowNewIds=allowNewIds) classSet.load(classIds) else: print >> sys.stderr, "No predefined class names" classSet = None # Feature ids if featureIds != None and os.path.exists(featureIds): print >> sys.stderr, "Using predefined feature names from", featureIds featureSet = IdSet(allowNewIds=allowNewIds) featureSet.load(featureIds) else: print >> sys.stderr, "No predefined feature names" featureSet = None return classSet, featureSet # if idFileTag != None and os.path.exists(idFileTag + ".feature_names.gz") and os.path.exists(idFileTag + ".class_names"): # print >> sys.stderr, "Using predefined class and feature names" # featureSet = IdSet() # featureSet.load(idFileTag + ".feature_names.gz") # classSet = IdSet() # classSet.load(idFileTag + ".class_names") # return classSet, featureSet # else: # print >> sys.stderr, "No predefined class or feature-names" # if idFileTag != None: # assert(not os.path.exists(idFileTag + ".feature_names.gz")), idFileTag # assert(not os.path.exists(idFileTag + ".class_names")), idFileTag # return None, None def getSentences(self, input, parse, tokenization, removeNameInfo=False): # pdb.set_trace() # input is the path to the corpus xml file if type(input) != types.ListType: # Program entered here - Mu # Load corpus and make sentence graphs # pdb.set_trace() corpusElements = Core.SentenceGraph.loadCorpus( input, parse, tokenization, removeNameInfo=removeNameInfo) sentences = [] for sentence in corpusElements.sentences: if sentence.sentenceGraph != None: # required for event detection sentences.append([sentence.sentenceGraph, None]) return sentences else: # assume input is already a list of sentences assert (removeNameInfo == False) return input def calculatePredictedRange(self, sentences): print >> sys.stderr, "Defining predicted value range:", sentenceElements = [] for sentence in sentences: 
sentenceElements.append(sentence[0].sentenceElement) self.definePredictedValueRange(sentenceElements, "entity") print >> sys.stderr, self.getPredictedValueRange()
def test(cls, examples, modelPath, output=None, parameters=None, forceInternal=False, classIds=None): # , timeout=None):
    """
    Classify examples with a pre-trained model.

    Runs one external SVM-MultiClass (or svm_perf) binary classifier per
    class name, merges the per-class predictions, and writes the merged
    predictions to the output file.

    @type examples: string (filename) or list (or iterator) of examples
    @param examples: a list or file containing examples in SVM-format
    @type modelPath: string
    @param modelPath: filename of the pre-trained model file (a text index
        of "class model threshold" lines; None falls back to "model-multilabel")
    @type parameters: a dictionary or string
    @param parameters: parameters for the classifier
    @type output: string
    @param output: the name of the predictions file to be written
    @type forceInternal: Boolean
    @param forceInternal: Use python classifier even if SVM Multiclass binary is defined in Settings.py
    """
    # NOTE(review): first parameter is 'cls' but the list-input branch below
    # uses 'self' (undefined in this scope) -- it would raise NameError if
    # examples is passed as a list. Confirm the intended receiver
    # (cls vs. an instance) before exercising that branch.
    if type(parameters) == types.StringType:
        parameters = splitParameters(parameters)
    timer = Timer()
    if type(examples) == types.ListType:
        print >> sys.stderr, "Classifying", len(examples), "with SVM-MultiClass model", modelPath
        examples, predictions = self.filterClassificationSet(examples, False)
        testPath = self.tempDir+"/test.dat"
        Example.writeExamples(examples, testPath)
    else:
        print >> sys.stderr, "Classifying file", examples, "with SVM-MultiClass model", modelPath
        testPath = examples
        examples = Example.readExamples(examples,False)
    if parameters != None:
        parameters = copy.copy(parameters)
        # "c" is a training-only parameter; drop it for classification.
        if parameters.has_key("c"):
            del parameters["c"]
        if parameters.has_key("predefined"):
            parameters = copy.copy(parameters)
            modelPath = os.path.join(parameters["predefined"][0],"classifier/model")
            del parameters["predefined"]
    # Read model
    if modelPath == None:
        modelPath = "model-multilabel"
    # The model file maps each class name to its per-class model file and
    # an optional decision threshold ("None" means use 0.0).
    classModels = {}
    if modelPath.endswith(".gz"):
        f = gzip.open(modelPath, "rt")
    else:
        f = open(modelPath, "rt")
    thresholds = {}
    for line in f:
        key, value, threshold = line.split()
        classModels[key] = value
        if threshold != "None":
            thresholds[key] = float(threshold)
        else:
            thresholds[key] = 0.0
    f.close()
    mergedPredictions = []
    if type(classIds) == types.StringType:
        classIds = IdSet(filename=classIds)
    #print classModels
    print "Thresholds", thresholds
    classifierBin = Settings.SVMMultiClassDir+"/svm_multiclass_classify"
    print parameters
    # NOTE(review): this membership test assumes parameters is not None here;
    # it raises TypeError when parameters was omitted -- confirm callers
    # always pass parameters.
    if "classifier" in parameters and "svmperf" in parameters["classifier"]:
        classifierBin = Settings.SVMPerfDir+"/svm_perf_classify"
        parameters = copy.copy(parameters)
        del parameters["classifier"]
    # Run one external classification per class (skipping "neg" and
    # combined "---" classes), merging results as we go.
    for className in classIds.getNames():
        if className != "neg" and not "---" in className:
            classId = classIds.getId(className)
            if thresholds[str(className)] != 0.0:
                print >> sys.stderr, "Classifying", className, "with threshold", thresholds[str(className)]
            else:
                print >> sys.stderr, "Classifying", className
            args = [classifierBin]
            #self.__addParametersToSubprocessCall(args, parameters)
            classOutput = "predictions" + ".cls-" + className
            logFile = open("svmmulticlass" + ".cls-" + className + ".log","at")
            args += [testPath, classModels[str(className)], classOutput]
            print args
            subprocess.call(args, stdout = logFile, stderr = logFile)
            cls.addPredictions(classOutput, mergedPredictions, classId, len(classIds.Ids), threshold=thresholds[str(className)])
    print >> sys.stderr, timer.toString()
    predFileName = output # NOTE(review): output=None would make open() fail below; confirm output is always given
    f = open(predFileName, "wt")
    for mergedPred in mergedPredictions:
        # Drop the generic positive class "1" when a more specific class
        # was also predicted for this example.
        if len(mergedPred[0]) > 1 and "1" in mergedPred[0]:
            mergedPred[0].remove("1")
        mergedPred[1] = str(mergedPred[1])
        mergedPred[0] = ",".join(sorted(list(mergedPred[0])))
        f.write(" ".join(mergedPred) + "\n")
    f.close()
    return mergedPredictions
def __init__(self, style=None, length=None, types=None, featureSet=None, classSet=None):
    """
    Initialize an edge (interaction) example builder.

    @param style: parameter string/dict selecting builder options (see the
        option list passed to getParameters below); None enables the
        defaults "typed", "directed" and "headsOnly"
    @param length: path-length limit; unused, must be None (asserted below)
    @param types: optional list of interaction types; None means no filter
    @param featureSet: IdSet of feature names (created here if None)
    @param classSet: IdSet of class names; "neg" must have id 1, or -1 in
        a two-class (binary) set
    """
    if featureSet is None:
        featureSet = IdSet()
    if classSet is None:
        classSet = IdSet(1)
    # The negative class must have a fixed, known id.
    assert classSet.getId("neg") == 1 or (len(classSet.Ids) == 2 and classSet.getId("neg") == -1)
    ExampleBuilder.__init__(self, classSet=classSet, featureSet=featureSet)
    # Parse the style string/dict against the full list of recognized
    # option names (duplicates in the list are harmless).
    self.styles = self.getParameters(style, [
        "typed", "directed", "headsOnly", "graph_kernel", "noAnnType", "noMasking",
        "maxFeatures", "genia_limits", "epi_limits", "id_limits", "rel_limits",
        "bb_limits", "bi_limits", "co_limits", "genia_task1", "ontology", "nodalida",
        "bacteria_renaming", "trigger_features", "rel_features", "ddi_features",
        "evex", "giuliano", "random", "themeOnly", "causeOnly", "no_path", "entities",
        "skip_extra_triggers", "headsOnly", "graph_kernel", "trigger_features",
        "no_task", "no_dependency", "disable_entity_features",
        "disable_terminus_features", "disable_single_element_features",
        "disable_ngram_features", "disable_path_edge_features", "no_linear",
        "subset", "binary", "pos_only", "entity_type"])
    if style is None: # no parameters given, enable the defaults
        # BUGFIX: the original assigned style["typed"] = ... here, which
        # raises TypeError because style is None on this branch; the
        # defaults belong in the parsed self.styles dictionary.
        self.styles["typed"] = self.styles["directed"] = self.styles["headsOnly"] = True
    self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet)
    # NOTE Temporarily re-enabling predicted range
    #self.multiEdgeFeatureBuilder.definePredictedValueRange([], None)
    # Optional feature builders, instantiated only when their style flag is on.
    if self.styles["graph_kernel"]:
        from FeatureBuilders.GraphKernelFeatureBuilder import GraphKernelFeatureBuilder
        self.graphKernelFeatureBuilder = GraphKernelFeatureBuilder(self.featureSet)
    if self.styles["noAnnType"]:
        self.multiEdgeFeatureBuilder.noAnnType = True
    if self.styles["noMasking"]:
        self.multiEdgeFeatureBuilder.maskNamedEntities = False
    if self.styles["maxFeatures"]:
        self.multiEdgeFeatureBuilder.maximum = True
    if self.styles["genia_task1"]:
        self.multiEdgeFeatureBuilder.filterAnnTypes.add("Entity")
    self.tokenFeatureBuilder = TokenFeatureBuilder(self.featureSet)
    if self.styles["ontology"]:
        self.multiEdgeFeatureBuilder.ontologyFeatureBuilder = BioInferOntologyFeatureBuilder(self.featureSet)
    if self.styles["nodalida"]:
        self.nodalidaFeatureBuilder = NodalidaFeatureBuilder(self.featureSet)
    if self.styles["bacteria_renaming"]:
        self.bacteriaRenamingFeatureBuilder = BacteriaRenamingFeatureBuilder(self.featureSet)
    if self.styles["trigger_features"]:
        self.triggerFeatureBuilder = TriggerFeatureBuilder(self.featureSet)
        self.triggerFeatureBuilder.useNonNameEntities = True
        if self.styles["genia_task1"]:
            self.triggerFeatureBuilder.filterAnnTypes.add("Entity")
        #self.bioinferOntologies = OntologyUtils.loadOntologies(OntologyUtils.g_bioInferFileName)
    if self.styles["rel_features"]:
        self.relFeatureBuilder = RELFeatureBuilder(featureSet)
    if self.styles["ddi_features"]:
        self.drugFeatureBuilder = DrugFeatureBuilder(featureSet)
    if self.styles["evex"]:
        self.evexFeatureBuilder = EVEXFeatureBuilder(featureSet)
    if self.styles["giuliano"]:
        self.giulianoFeatureBuilder = GiulianoFeatureBuilder(featureSet)
    self.pathLengths = length
    assert self.pathLengths is None # path-length limiting is not supported
    # BUGFIX: the original signature used the mutable default types=[],
    # which is shared across all instances; None now stands in for
    # "no filter" and is normalized to a fresh list here.
    self.types = [] if types is None else types
    if self.styles["random"]:
        from FeatureBuilders.RandomFeatureBuilder import RandomFeatureBuilder
        self.randomFeatureBuilder = RandomFeatureBuilder(self.featureSet)