def __init__(self, style=None, length=None, types=None, featureSet=None, classSet=None):
    """Initialize an edge/trigger example builder.

    style      -- parameter string/dict parsed by getParameters (None = defaults)
    length     -- path length limit; must be None (variable lengths unsupported)
    types      -- optional list of example types (None = empty list)
    featureSet -- IdSet of feature names (created if None)
    classSet   -- IdSet of class names (created if None; "neg" must be id 1)
    """
    # BUG FIX: 'types=[]' was a shared mutable default; use a None sentinel.
    if types is None:
        types = []
    if featureSet == None:
        featureSet = IdSet()
    if classSet == None:
        classSet = IdSet(1)
    # (removed dead no-op branch 'else: classSet = classSet')
    assert( classSet.getId("neg") == 1 )
    ExampleBuilder.__init__(self, classSet=classSet, featureSet=featureSet)
    #style="trigger_features:typed:directed:no_linear:entities:genia_limits:noMasking:maxFeatures"
    # Parameters that default to None, i.e. are off unless enabled via 'style'
    defaultNone = ["binary", "trigger_features", "typed", "directed", "no_linear", "entities",
                   "genia_limits", "noAnnType", "noMasking", "maxFeatures", "no_merge",
                   "disable_entity_features", "disable_single_element_features",
                   "disable_ngram_features", "disable_path_edge_features"]
    defaultParameters = {}
    for name in defaultNone:
        defaultParameters[name] = None
    defaultParameters["keep_intersentence"] = False
    defaultParameters["keep_intersentence_gold"] = True
    defaultParameters["no_arg_count_upper_limit"] = False
    self.styles = self._setDefaultParameters(defaultParameters)
    self.styles = self.getParameters(style)
    self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet)
    self.multiEdgeFeatureBuilder.noAnnType = self.styles["noAnnType"]
    self.multiEdgeFeatureBuilder.maskNamedEntities = not self.styles["noMasking"]
    self.multiEdgeFeatureBuilder.maximum = self.styles["maxFeatures"]
    #self.tokenFeatureBuilder = TokenFeatureBuilder(self.featureSet)
    self.pathLengths = length
    assert(self.pathLengths == None)  # variable path lengths are not supported
    self.types = types
    self.triggerFeatureBuilder = TriggerFeatureBuilder(self.featureSet)
    self.triggerFeatureBuilder.useNonNameEntities = True
def __init__(self, style=None, length=None, types=None, featureSet=None, classSet=None):
    """Initialize an example builder with optional negative downsampling.

    style      -- list of style flags, or a comma-separated string
                  (None = ["typed", "directed"])
    length     -- path length limit; must be None
    types      -- optional list of example types (None = empty list)
    featureSet -- IdSet of feature names (created if None)
    classSet   -- IdSet of class names (created if None; "neg" must be id 1)
    """
    # BUG FIX: mutable defaults replaced with None sentinels.
    if style is None:
        style = ["typed", "directed"]
    if types is None:
        types = []
    if featureSet == None:
        featureSet = IdSet()
    if classSet == None:
        classSet = IdSet(1)
    assert (classSet.getId("neg") == 1)
    ExampleBuilder.__init__(self, classSet=classSet, featureSet=featureSet)
    # BUG FIX: guard the string-only .find call — the default style is a
    # list, which has no .find, so the old code crashed on the default.
    if hasattr(style, "split") and style.find(",") != -1:
        style = style.split(",")
    self.styles = style
    self.negFrac = None
    self.posPairGaz = POSPairGazetteer()
    for s in style:
        if s.find("negFrac") != -1:
            # e.g. "negFrac_0.5" -> keep 50% of negatives
            self.negFrac = float(s.split("_")[-1])
            print >> sys.stderr, "Downsampling negatives to", self.negFrac
            self.negRand = random.Random(15)  # fixed seed for reproducibility
        elif s.find("posPairGaz") != -1:
            self.posPairGaz = POSPairGazetteer(loadFrom=s.split("_", 1)[-1])
    self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet)
    self.triggerFeatureBuilder = TriggerFeatureBuilder(self.featureSet)
    if "graph_kernel" in self.styles:
        from FeatureBuilders.GraphKernelFeatureBuilder import GraphKernelFeatureBuilder
        self.graphKernelFeatureBuilder = GraphKernelFeatureBuilder(self.featureSet)
    if "noAnnType" in self.styles:
        self.multiEdgeFeatureBuilder.noAnnType = True
    if "noMasking" in self.styles:
        self.multiEdgeFeatureBuilder.maskNamedEntities = False
    if "maxFeatures" in self.styles:
        self.multiEdgeFeatureBuilder.maximum = True
    self.tokenFeatureBuilder = TokenFeatureBuilder(self.featureSet)
    if "ontology" in self.styles:
        self.multiEdgeFeatureBuilder.ontologyFeatureBuilder = BioInferOntologyFeatureBuilder(self.featureSet)
    if "nodalida" in self.styles:
        self.nodalidaFeatureBuilder = NodalidaFeatureBuilder(self.featureSet)
    #IF LOCAL
    if "bioinfer_limits" in self.styles:
        self.bioinferOntologies = OntologyUtils.getBioInferTempOntology()
        #self.bioinferOntologies = OntologyUtils.loadOntologies(OntologyUtils.g_bioInferFileName)
    #ENDIF
    self.pathLengths = length
    assert (self.pathLengths == None)  # variable path lengths are not supported
    self.types = types
    if "random" in self.styles:
        from FeatureBuilders.RandomFeatureBuilder import RandomFeatureBuilder
        self.randomFeatureBuilder = RandomFeatureBuilder(self.featureSet)
def __init__(self, style=None, classSet=None, featureSet=None, gazetteerFileName=None):
    """Speculation/negation example builder; optionally loads a trigger gazetteer."""
    global speculationWords
    if classSet == None:
        classSet = IdSet(1)
    assert (classSet.getId("neg") == 1)  # "neg" must always map to class id 1
    if featureSet == None:
        featureSet = IdSet()
    # Cue-word lists used for speculation features
    self.specWords, self.specWordStems = readWords(speculationWords)
    ExampleBuilder.__init__(self, classSet, featureSet)
    #gazetteerFileName="/usr/share/biotext/GeniaChallenge/SharedTaskTriggerTest/gazetteer-train"
    if gazetteerFileName != None:
        self.gazetteer = Gazetteer.loadGztr(gazetteerFileName)
        print >> sys.stderr, "Loaded gazetteer from", gazetteerFileName
    else:
        self.gazetteer = None
    # "classification" selects the task and is limited to the listed values
    defaults = {"classification": "multiclass", "speculation_words": True}
    limits = {"classification": ("multiclass", "speculation", "negation")}
    self.styles = self.getParameters(style, defaults, limits)
def __init__(self, examples, predictions=None, classSet=None):
    """Initialize an evaluation over examples and predictions.

    examples, predictions and classSet may each be given either as objects
    or as filenames (strings), in which case they are loaded here.
    Evaluation is calculated immediately when predictions are available.
    """
    if type(classSet) == types.StringType:  # class names are in file
        classSet = IdSet(filename=classSet)
    if type(predictions) == types.StringType:  # predictions are in file
        predictions = ExampleUtils.loadPredictions(predictions)
    if type(examples) == types.StringType:  # examples are in file
        examples = ExampleUtils.readExamples(examples, False)
    # BUG FIX: removed duplicated 'self.classSet = classSet' assignment
    self.classSet = classSet
    # define class ids in alphabetical order
    if classSet != None:
        classNames = sorted(classSet.Ids.keys())
    else:
        classNames = []
    # make an ordered list of class ids
    self.classes = []
    for className in classNames:
        self.classes.append(classSet.getId(className))
    # create data structures for per-class evaluation
    self.dataByClass = {}
    for cls in self.classes:
        self.dataByClass[cls] = EvaluationData()
    # hack for unnamed classes
    if len(self.dataByClass) == 0:
        self.dataByClass[1] = EvaluationData()
        self.dataByClass[2] = EvaluationData()
    #self.untypedUndirected = None
    self.untypedCurrentMajorId = None
    self.untypedPredictionQueue = []
    self.untypedUndirected = EvaluationData()
    #self.AUC = None
    if predictions != None:
        self._calculate(examples, predictions)
def __init__(self, style=None, length=None, types=None, featureSet=None, classSet=None):
    """Initialize an edge/trigger example builder.

    style      -- parameter string/dict parsed by getParameters (None = defaults)
    length     -- path length limit; must be None (variable lengths unsupported)
    types      -- optional list of example types (None = empty list)
    featureSet -- IdSet of feature names (created if None)
    classSet   -- IdSet of class names (created if None; "neg" must be id 1)
    """
    # BUG FIX: 'types=[]' was a shared mutable default; use a None sentinel.
    if types is None:
        types = []
    if featureSet == None:
        featureSet = IdSet()
    if classSet == None:
        classSet = IdSet(1)
    # (removed dead no-op branch 'else: classSet = classSet')
    assert( classSet.getId("neg") == 1 )
    ExampleBuilder.__init__(self, classSet=classSet, featureSet=featureSet)
    #style="trigger_features:typed:directed:no_linear:entities:genia_limits:noMasking:maxFeatures"
    # Parameters that default to None, i.e. are off unless enabled via 'style'
    defaultNone = ["binary", "trigger_features", "typed", "directed", "no_linear", "entities",
                   "genia_limits", "noAnnType", "noMasking", "maxFeatures", "no_merge",
                   "disable_entity_features", "disable_single_element_features",
                   "disable_ngram_features", "disable_path_edge_features"]
    defaultParameters = {}
    for name in defaultNone:
        defaultParameters[name] = None
    defaultParameters["keep_intersentence"] = False
    defaultParameters["keep_intersentence_gold"] = True
    self.styles = self._setDefaultParameters(defaultParameters)
    self.styles = self.getParameters(style)
    self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet)
    self.multiEdgeFeatureBuilder.noAnnType = self.styles["noAnnType"]
    self.multiEdgeFeatureBuilder.maskNamedEntities = not self.styles["noMasking"]
    self.multiEdgeFeatureBuilder.maximum = self.styles["maxFeatures"]
    #self.tokenFeatureBuilder = TokenFeatureBuilder(self.featureSet)
    self.pathLengths = length
    assert(self.pathLengths == None)  # variable path lengths are not supported
    self.types = types
    self.triggerFeatureBuilder = TriggerFeatureBuilder(self.featureSet)
    self.triggerFeatureBuilder.useNonNameEntities = True
def __init__(self, style=None, length=None, types=None, featureSet=None, classSet=None):
    """Initialize an edge example builder driven by a list of style flags.

    style      -- list of style flags (None = ["typed", "directed", "headsOnly"])
    length     -- path length limit; must be None
    types      -- optional list of example types (None = empty list)
    featureSet -- IdSet of feature names (created if None)
    classSet   -- IdSet of class names (created if None; "neg" must be id 1)
    """
    # BUG FIX: mutable defaults replaced with None sentinels.
    if style is None:
        style = ["typed", "directed", "headsOnly"]
    if types is None:
        types = []
    if featureSet == None:
        featureSet = IdSet()
    if classSet == None:
        classSet = IdSet(1)
    assert (classSet.getId("neg") == 1)
    ExampleBuilder.__init__(self, classSet=classSet, featureSet=featureSet)
    self.styles = style
    self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet)
    if "noAnnType" in self.styles:
        self.multiEdgeFeatureBuilder.noAnnType = True
    if "noMasking" in self.styles:
        self.multiEdgeFeatureBuilder.maskNamedEntities = False
    if "maxFeatures" in self.styles:
        self.multiEdgeFeatureBuilder.maximum = True
    #self.tokenFeatureBuilder = TokenFeatureBuilder(self.featureSet)
    #if "ontology" in self.styles:
    #    self.multiEdgeFeatureBuilder.ontologyFeatureBuilder = BioInferOntologyFeatureBuilder(self.featureSet)
    self.pathLengths = length
    assert (self.pathLengths == None)  # variable path lengths are not supported
    self.types = types
def __init__(self, style=None, length=None, types=None, featureSet=None, classSet=None):
    """Initialize an edge example builder with token features and an optional
    random-feature baseline.

    style      -- list of style flags (None = ["typed", "directed", "headsOnly"])
    length     -- path length limit; must be None
    types      -- optional list of example types (None = empty list)
    featureSet -- IdSet of feature names (created if None)
    classSet   -- IdSet of class names (created if None; "neg" must be id 1)
    """
    # BUG FIX: mutable defaults replaced with None sentinels.
    if style is None:
        style = ["typed", "directed", "headsOnly"]
    if types is None:
        types = []
    if featureSet == None:
        featureSet = IdSet()
    if classSet == None:
        classSet = IdSet(1)
    assert( classSet.getId("neg") == 1 )
    ExampleBuilder.__init__(self, classSet=classSet, featureSet=featureSet)
    self.styles = style
    self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet)
    if "noAnnType" in self.styles:
        self.multiEdgeFeatureBuilder.noAnnType = True
    if "noMasking" in self.styles:
        self.multiEdgeFeatureBuilder.maskNamedEntities = False
    if "maxFeatures" in self.styles:
        self.multiEdgeFeatureBuilder.maximum = True
    self.tokenFeatureBuilder = TokenFeatureBuilder(self.featureSet)
    self.pathLengths = length
    assert(self.pathLengths == None)  # variable path lengths are not supported
    self.types = types
    if "random" in self.styles:
        from FeatureBuilders.RandomFeatureBuilder import RandomFeatureBuilder
        self.randomFeatureBuilder = RandomFeatureBuilder(self.featureSet)
def __init__(self, style="trigger_features:typed:directed:no_linear:entities:genia_limits:noMasking:maxFeatures",
             length=None, types=None, featureSet=None, classSet=None):
    """Initialize an event example builder with a FIXED style configuration.

    NOTE: the 'style' argument is deliberately overwritten below, so the
    passed-in value is ignored (kept for interface compatibility).
    """
    # reset style regardless of input
    style = "trigger_features:typed:directed:no_linear:entities:genia_limits:noMasking:maxFeatures"
    # BUG FIX: 'types=[]' was a shared mutable default; use a None sentinel.
    if types is None:
        types = []
    if featureSet == None:
        featureSet = IdSet()
    if classSet == None:
        classSet = IdSet(1)
    # (removed dead no-op branch 'else: classSet = classSet')
    assert( classSet.getId("neg") == 1 )
    ExampleBuilder.__init__(self, classSet=classSet, featureSet=featureSet)
    self.styles = self.getParameters(style,
                                     ["trigger_features", "typed", "directed", "no_linear", "entities",
                                      "genia_limits", "noAnnType", "noMasking", "maxFeatures", "no_merge",
                                      "disable_entity_features", "disable_single_element_features",
                                      "disable_ngram_features", "disable_path_edge_features"])
    self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet)
    self.multiEdgeFeatureBuilder.noAnnType = self.styles["noAnnType"]
    self.multiEdgeFeatureBuilder.maskNamedEntities = not self.styles["noMasking"]
    self.multiEdgeFeatureBuilder.maximum = self.styles["maxFeatures"]
    self.tokenFeatureBuilder = TokenFeatureBuilder(self.featureSet)
    self.pathLengths = length
    assert(self.pathLengths == None)  # variable path lengths are not supported
    self.types = types
    self.triggerFeatureBuilder = TriggerFeatureBuilder(self.featureSet)
    self.triggerFeatureBuilder.useNonNameEntities = True
def __init__(self, style=None, length=None, types=None, featureSet=None, classSet=None):
    """Initialize an edge example builder driven by a list of style flags.

    style      -- list of style flags (None = ["typed", "directed", "headsOnly"])
    length     -- path length limit; must be None
    types      -- optional list of example types (None = empty list)
    featureSet -- IdSet of feature names (created if None)
    classSet   -- IdSet of class names (created if None; "neg" must be id 1)
    """
    # BUG FIX: mutable defaults replaced with None sentinels.
    if style is None:
        style = ["typed", "directed", "headsOnly"]
    if types is None:
        types = []
    if featureSet == None:
        featureSet = IdSet()
    if classSet == None:
        classSet = IdSet(1)
    assert classSet.getId("neg") == 1
    ExampleBuilder.__init__(self, classSet=classSet, featureSet=featureSet)
    self.styles = style
    self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet)
    if "noAnnType" in self.styles:
        self.multiEdgeFeatureBuilder.noAnnType = True
    if "noMasking" in self.styles:
        self.multiEdgeFeatureBuilder.maskNamedEntities = False
    if "maxFeatures" in self.styles:
        self.multiEdgeFeatureBuilder.maximum = True
    # self.tokenFeatureBuilder = TokenFeatureBuilder(self.featureSet)
    # if "ontology" in self.styles:
    #     self.multiEdgeFeatureBuilder.ontologyFeatureBuilder = BioInferOntologyFeatureBuilder(self.featureSet)
    self.pathLengths = length
    assert self.pathLengths == None  # variable path lengths are not supported
    self.types = types
def __init__(self, style=None, classSet=None, featureSet=None, gazetteerFileName=None, skiplist=None):
    """Trigger example builder with optional gazetteer, skip list and a large
    set of optional feature-builder styles."""
    if classSet == None:
        classSet = IdSet(1)
    if featureSet == None:
        featureSet = IdSet()
    ExampleBuilder.__init__(self, classSet, featureSet)
    assert( classSet.getId("neg") == 1 )  # "neg" must always map to class id 1
    #gazetteerFileName="/usr/share/biotext/GeniaChallenge/SharedTaskTriggerTest/gazetteer-train"
    if gazetteerFileName != None:
        self.gazetteer = Gazetteer.loadGztr(gazetteerFileName)
        print >> sys.stderr, "Loaded gazetteer from", gazetteerFileName
    else:
        print >> sys.stderr, "No gazetteer loaded"
        self.gazetteer = None
    # All recognized style switches (off unless enabled via 'style')
    self._setDefaultParameters(["rel_features", "wordnet", "bb_features", "giuliano",
                                "epi_merge_negated", "limit_merged_types", "genia_task1",
                                "names", "build_for_nameless", "skip_for_nameless",
                                "pos_only", "all_tokens", "pos_pairs", "linear_ngrams",
                                "phospho", "drugbank_features", "ddi13_features", "metamap",
                                "only_types", "ontobiotope_features", "bb_spans", "w2v",
                                "no_context"])
    self.styles = self.getParameters(style)
    # if "selftrain_group" in self.styles:
    #     self.selfTrainGroups = set()
    #     if "selftrain_group-1" in self.styles:
    #         self.selfTrainGroups.add("-1")
    #     if "selftrain_group0" in self.styles:
    #         self.selfTrainGroups.add("0")
    #     if "selftrain_group1" in self.styles:
    #         self.selfTrainGroups.add("1")
    #     if "selftrain_group2" in self.styles:
    #         self.selfTrainGroups.add("2")
    #     if "selftrain_group3" in self.styles:
    #         self.selfTrainGroups.add("3")
    #     print >> sys.stderr, "Self-train-groups:", self.selfTrainGroups
    # Example ids listed in the skip-list file are excluded from building
    self.skiplist = set()
    if skiplist != None:
        f = open(skiplist, "rt")
        for line in f.readlines():
            self.skiplist.add(line.strip())
        f.close()
    # Instantiate only the feature builders enabled by the style flags
    if self.styles["rel_features"]:
        self.relFeatureBuilder = RELFeatureBuilder(featureSet)
    if self.styles["wordnet"]:
        self.wordNetFeatureBuilder = WordNetFeatureBuilder(featureSet)
    if self.styles["bb_features"]:
        self.bacteriaTokens = PhraseTriggerExampleBuilder.getBacteriaTokens()
        #self.bacteriaTokens = PhraseTriggerExampleBuilder.getBacteriaTokens(PhraseTriggerExampleBuilder.getBacteriaNames())
    if self.styles["giuliano"]:
        self.giulianoFeatureBuilder = GiulianoFeatureBuilder(featureSet)
    if self.styles["drugbank_features"]:
        self.drugFeatureBuilder = DrugFeatureBuilder(featureSet)
    if self.styles["ontobiotope_features"]:
        self.ontobiotopeFeatureBuilder = OntoBiotopeFeatureBuilder(self.featureSet)
    if self.styles["w2v"]:
        self.wordVectorFeatureBuilder = WordVectorFeatureBuilder(featureSet)
def __init__(self, style=None, length=None, types=None, featureSet=None, classSet=None,
             gazetteer=None, pathGazetteer=None, negFrac=None):
    """Initialize an event example builder with optional trigger and path
    gazetteers and negative downsampling.

    style         -- list of style flags (None = ["typed", "directed", "headsOnly"])
    length        -- path length limit; must be None
    types         -- optional list of example types (None = empty list)
    gazetteer     -- filename of a trigger gazetteer, or None
    pathGazetteer -- filename of a path gazetteer, or None
    negFrac       -- fraction of negatives to keep, or None for all
    """
    # BUG FIX: mutable defaults replaced with None sentinels.
    if style is None:
        style = ["typed", "directed", "headsOnly"]
    if types is None:
        types = []
    if featureSet == None:
        featureSet = IdSet()
    if classSet == None:
        classSet = IdSet(1)
    assert( classSet.getId("neg") == 1 )
    if gazetteer != None:
        print >> sys.stderr, "Loading gazetteer from", gazetteer
        self.gazetteer = Gazetteer.loadGztr(gazetteer)
    else:
        print >> sys.stderr, "No gazetteer loaded"
        self.gazetteer = None
    self.pathGazetteer = None
    self.pathGazetteerDependencies = None
    self.pathGazetteerPairs = None
    if pathGazetteer != None:
        print >> sys.stderr, "Loading path gazetteer from", pathGazetteer
        self.pathGazetteer = PathGazetteer.load(pathGazetteer)
        self.pathGazetteerDependencies = PathGazetteer.getDependencies(self.pathGazetteer)
        self.pathGazetteerPairs = PathGazetteer.getPairs(self.pathGazetteer)
    else:
        print >> sys.stderr, "No path gazetteer loaded"
    ExampleBuilder.__init__(self, classSet=classSet, featureSet=featureSet)
    self.styles = style
    self.negFrac = negFrac
    print >> sys.stderr, "Downsampling negatives to", negFrac
    self.negRand = random.Random()  # unseeded: sampling differs between runs
    self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet)
    if True:  # "noAnnType" in self.styles: (annotation types always suppressed here)
        self.multiEdgeFeatureBuilder.noAnnType = True
    if "noMasking" in self.styles:
        self.multiEdgeFeatureBuilder.maskNamedEntities = False
    if "maxFeatures" in self.styles:
        self.multiEdgeFeatureBuilder.maximum = True
    self.triggerFeatureBuilder = TriggerFeatureBuilder(self.featureSet)
    #self.tokenFeatureBuilder = TokenFeatureBuilder(self.featureSet)
    #if "ontology" in self.styles:
    #    self.multiEdgeFeatureBuilder.ontologyFeatureBuilder = BioInferOntologyFeatureBuilder(self.featureSet)
    self.pathLengths = length
    assert(self.pathLengths == None)  # variable path lengths are not supported
    self.types = types
    # Per-event bookkeeping used while building and for statistics
    self.eventsByOrigId = {}
    self.headTokensByOrigId = {}
    self.interSentenceEvents = set()
    self.examplesByEventOrigId = {}
    self.skippedByType = {}
    self.skippedByTypeAndReason = {}
    self.builtByType = {}
    self.gazMatchCache = {}
def readARFF(filename):
    """Read a WEKA ARFF file and return a list of examples.

    Each example is [exampleId, classId, featureDict, {}]. Exactly two class
    values are supported (mapped to ids 1 and -1).
    """
    featureSet = IdSet(1)
    classSet = IdSet(0)
    # BUG FIX: the input file was never closed
    f = open(filename, "rt")
    try:
        lines = f.readlines()
    finally:
        f.close()
    inData = False
    counter = ProgressCounter(len(lines), "ARFFLine")
    examples = []
    for line in lines:
        counter.update(string="Processing line " + str(counter.current + 1) + ": ")
        line = line.strip()
        if len(line) == 0 or line[0] == "%":
            continue  # skip blank lines and comments
        elif line[0] == "@":
            # header section: @relation / @attribute / @data
            category = line.split()[0].lower()
            if category == "@attribute":
                # BUG FIX: renamed 'type' -> 'attrType' (shadowed the builtin)
                category, name, attrType = line.split()
                assert(not inData)
                if name.lower() == "class":
                    name = name.lower()
                    classNames = attrType[1:-1].split(",")
                    assert(len(classNames) == 2)  # binary classification only
                    classSet.defineId(classNames[0].strip(), 1)
                    classSet.defineId(classNames[1].strip(), -1)
                featureSet.getId(name)
            elif category.lower() == "@relation":
                assert(not inData)
            elif category == "@data":
                inData = True
        else:
            # data section: one comma-separated example per line
            assert(inData)
            count = 1
            features = {}
            classId = None  # BUG FIX: guard against a row with no class column
            for column in line.split(","):
                if featureSet.getName(count) != "class":
                    features[count] = float(column)
                else:
                    classId = classSet.getId(column, False)
                    assert(classId != None)
                count += 1
            exampleCount = str(len(examples))
            exampleId = "BreastCancer.d" + exampleCount + ".s0.x0"
            examples.append([exampleId, classId, features, {}])
    return examples
def __init__(self, style=None, classSet=None, featureSet=None, gazetteerFileName=None, skiplist=None):
    """Trigger example builder with optional gazetteer and skip list."""
    if classSet == None:
        classSet = IdSet(1)
    if featureSet == None:
        featureSet = IdSet()
    ExampleBuilder.__init__(self, classSet, featureSet)
    assert (classSet.getId("neg") == 1)  # "neg" must always map to class id 1
    #gazetteerFileName="/usr/share/biotext/GeniaChallenge/SharedTaskTriggerTest/gazetteer-train"
    if gazetteerFileName != None:
        self.gazetteer = Gazetteer.loadGztr(gazetteerFileName)
        print >> sys.stderr, "Loaded gazetteer from", gazetteerFileName
    else:
        print >> sys.stderr, "No gazetteer loaded"
        self.gazetteer = None
    # Recognized style switches (off unless enabled via 'style')
    self._setDefaultParameters(["rel_features", "wordnet", "bb_features", "giuliano",
                                "epi_merge_negated", "limit_merged_types", "genia_task1",
                                "build_for_nameless", "pos_only", "all_tokens", "names",
                                "pos_pairs", "linear_ngrams", "phospho"])
    self.styles = self.getParameters(style)
    # if "selftrain_group" in self.styles:
    #     self.selfTrainGroups = set()
    #     if "selftrain_group-1" in self.styles:
    #         self.selfTrainGroups.add("-1")
    #     if "selftrain_group0" in self.styles:
    #         self.selfTrainGroups.add("0")
    #     if "selftrain_group1" in self.styles:
    #         self.selfTrainGroups.add("1")
    #     if "selftrain_group2" in self.styles:
    #         self.selfTrainGroups.add("2")
    #     if "selftrain_group3" in self.styles:
    #         self.selfTrainGroups.add("3")
    #     print >> sys.stderr, "Self-train-groups:", self.selfTrainGroups
    # Example ids listed in the skip-list file are excluded from building
    self.skiplist = set()
    if skiplist != None:
        f = open(skiplist, "rt")
        for line in f.readlines():
            self.skiplist.add(line.strip())
        f.close()
    # Instantiate only the feature builders enabled by the style flags
    if self.styles["rel_features"]:
        self.relFeatureBuilder = RELFeatureBuilder(featureSet)
    if self.styles["wordnet"]:
        self.wordNetFeatureBuilder = WordNetFeatureBuilder(featureSet)
    if self.styles["bb_features"]:
        self.bacteriaTokens = PhraseTriggerExampleBuilder.getBacteriaTokens()
        #self.bacteriaTokens = PhraseTriggerExampleBuilder.getBacteriaTokens(PhraseTriggerExampleBuilder.getBacteriaNames())
    if self.styles["giuliano"]:
        self.giulianoFeatureBuilder = GiulianoFeatureBuilder(featureSet)
def getClassSet(rows, classSet=None):
    """Build (or extend) an IdSet covering every class name that occurs in the
    'class' or 'prediction' field of the given rows.

    The negative class ("1" or "neg", exactly one of which must be present)
    is pinned to id 1 when a fresh IdSet is created.
    """
    from Core.IdSet import IdSet
    observed = set()
    for row in rows:
        observed.add(row["class"])
        observed.add(row["prediction"])
    # In the case of multiclass, give integer id:s for the classes
    if classSet == None:
        classSet = IdSet()
        assert(not ("1" in observed and "neg" in observed))
        assert("1" in observed or "neg" in observed)
        if "1" in observed:
            classSet.defineId("1", 1)
        else:
            classSet.defineId("neg", 1)
    for className in sorted(observed):
        if className != "1" and className != "neg":
            classSet.getId(className)
    return classSet
def __init__(self, style=None, classSet=None, featureSet=None, gazetteerFileName=None):
    """Minimal trigger example builder (named entities only)."""
    if classSet == None:
        classSet = IdSet(1)
    assert( classSet.getId("neg") == 1 )  # "neg" must always map to class id 1
    if featureSet == None:
        featureSet = IdSet()
    ExampleBuilder.__init__(self, classSet, featureSet)
    self.styles = style
    self.triggerFeatureBuilder = TriggerFeatureBuilder(self.featureSet)
    # Restrict trigger features to named entities
    self.triggerFeatureBuilder.useNonNameEntities = False
def __init__(self, style=None, classSet=None, featureSet=None, gazetteerFileName=None):
    """Coreference example builder with an optional 'co_limits' style switch."""
    if classSet == None:
        classSet = IdSet(1)
    assert classSet.getId("neg") == 1  # "neg" must always map to class id 1
    if featureSet == None:
        featureSet = IdSet()
    ExampleBuilder.__init__(self, classSet, featureSet)
    self._setDefaultParameters(["co_limits"])
    self.styles = self.getParameters(style)
    self.triggerFeatureBuilder = TriggerFeatureBuilder(self.featureSet)
    # Restrict trigger features to named entities
    self.triggerFeatureBuilder.useNonNameEntities = False
    # Counts of phrase types seen while building examples
    self.phraseTypeCounts = {}
def __init__(self, style=None, length=None, types=None, featureSet=None, classSet=None):
    """Initialize an example builder with optional negative downsampling.

    style      -- list of style flags, or a comma-separated string
                  (None = ["typed", "directed"])
    length     -- path length limit; must be None
    types      -- optional list of example types (None = empty list)
    featureSet -- IdSet of feature names (created if None)
    classSet   -- IdSet of class names (created if None; "neg" must be id 1)
    """
    # BUG FIX: mutable defaults replaced with None sentinels.
    if style is None:
        style = ["typed", "directed"]
    if types is None:
        types = []
    if featureSet == None:
        featureSet = IdSet()
    if classSet == None:
        classSet = IdSet(1)
    assert( classSet.getId("neg") == 1 )
    ExampleBuilder.__init__(self, classSet=classSet, featureSet=featureSet)
    # BUG FIX: guard the string-only .find call — the default style is a
    # list, which has no .find, so the old code crashed on the default.
    if hasattr(style, "split") and style.find(",") != -1:
        style = style.split(",")
    self.styles = style
    self.negFrac = None
    self.posPairGaz = POSPairGazetteer()
    for s in style:
        if s.find("negFrac") != -1:
            # e.g. "negFrac_0.5" -> keep 50% of negatives
            self.negFrac = float(s.split("_")[-1])
            print >> sys.stderr, "Downsampling negatives to", self.negFrac
            self.negRand = random.Random(15)  # fixed seed for reproducibility
        elif s.find("posPairGaz") != -1:
            self.posPairGaz = POSPairGazetteer(loadFrom=s.split("_", 1)[-1])
    self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet)
    self.triggerFeatureBuilder = TriggerFeatureBuilder(self.featureSet)
    if "graph_kernel" in self.styles:
        from FeatureBuilders.GraphKernelFeatureBuilder import GraphKernelFeatureBuilder
        self.graphKernelFeatureBuilder = GraphKernelFeatureBuilder(self.featureSet)
    if "noAnnType" in self.styles:
        self.multiEdgeFeatureBuilder.noAnnType = True
    if "noMasking" in self.styles:
        self.multiEdgeFeatureBuilder.maskNamedEntities = False
    if "maxFeatures" in self.styles:
        self.multiEdgeFeatureBuilder.maximum = True
    self.tokenFeatureBuilder = TokenFeatureBuilder(self.featureSet)
    if "ontology" in self.styles:
        self.multiEdgeFeatureBuilder.ontologyFeatureBuilder = BioInferOntologyFeatureBuilder(self.featureSet)
    if "nodalida" in self.styles:
        self.nodalidaFeatureBuilder = NodalidaFeatureBuilder(self.featureSet)
    #IF LOCAL
    if "bioinfer_limits" in self.styles:
        self.bioinferOntologies = OntologyUtils.getBioInferTempOntology()
        #self.bioinferOntologies = OntologyUtils.loadOntologies(OntologyUtils.g_bioInferFileName)
    #ENDIF
    self.pathLengths = length
    assert(self.pathLengths == None)  # variable path lengths are not supported
    self.types = types
    if "random" in self.styles:
        from FeatureBuilders.RandomFeatureBuilder import RandomFeatureBuilder
        self.randomFeatureBuilder = RandomFeatureBuilder(self.featureSet)
def __init__(self, style=None, classSet=None, featureSet=None):
    """Example builder instrumented with (disabled) timers for profiling."""
    if classSet == None:
        classSet = IdSet(1)
    assert( classSet.getId("neg") == 1 )  # "neg" must always map to class id 1
    if featureSet == None:
        featureSet = IdSet()
    ExampleBuilder.__init__(self, classSet, featureSet)
    self.styles = style
    # Timers start disabled (False); enable individually when profiling
    self.timerBuildExamples = Timer(False)
    self.timerCrawl = Timer(False)
    self.timerCrawlPrecalc = Timer(False)
    self.timerMatrix = Timer(False)
    self.timerMatrixPrecalc = Timer(False)
def __init__(self, style=None, classSet=None, featureSet=None, gazetteerFileName=None):
    """Example builder with an optional trigger gazetteer."""
    if classSet == None:
        classSet = IdSet(1)
    assert( classSet.getId("neg") == 1 )  # "neg" must always map to class id 1
    if featureSet == None:
        featureSet = IdSet()
    ExampleBuilder.__init__(self, classSet, featureSet)
    if gazetteerFileName != None:
        self.gazetteer = Gazetteer.loadGztr(gazetteerFileName)
        print >> sys.stderr, "Loaded gazetteer from", gazetteerFileName
    else:
        print >> sys.stderr, "No gazetteer loaded"
        self.gazetteer = None
    self.styles = style
def __init__(self, style=None, classSet=None, featureSet=None, gazetteerFileName=None):
    """Minimal trigger example builder (named entities only)."""
    if classSet == None:
        classSet = IdSet(1)
    assert (classSet.getId("neg") == 1)  # "neg" must always map to class id 1
    if featureSet == None:
        featureSet = IdSet()
    ExampleBuilder.__init__(self, classSet, featureSet)
    self.styles = style
    self.triggerFeatureBuilder = TriggerFeatureBuilder(self.featureSet)
    # Restrict trigger features to named entities
    self.triggerFeatureBuilder.useNonNameEntities = False
def __init__(self, style=None, classSet=None, featureSet=None, gazetteerFileName=None, skiplist=None):
    """Event example builder with a FIXED style configuration.

    NOTE: the 'style' argument is ignored (kept for interface compatibility);
    a hard-coded style list is always used.
    """
    if classSet == None:
        classSet = IdSet(1)
    assert classSet.getId("neg") == 1  # "neg" must always map to class id 1
    if featureSet == None:
        featureSet = IdSet()
    ExampleBuilder.__init__(self, classSet, featureSet)
    # gazetteerFileName="/usr/share/biotext/GeniaChallenge/SharedTaskTriggerTest/gazetteer-train"
    if gazetteerFileName != None:
        self.gazetteer = Gazetteer.loadGztr(gazetteerFileName)
        print >>sys.stderr, "Loaded gazetteer from", gazetteerFileName
    else:
        print >>sys.stderr, "No gazetteer loaded"
        self.gazetteer = None
    # Example ids listed in the skip-list file are excluded from building
    self.skiplist = set()
    if skiplist != None:
        f = open(skiplist, "rt")
        for line in f.readlines():
            self.skiplist.add(line.strip())
        f.close()
    # BUG FIX: removed the dead store 'self.styles = style' that was
    # immediately overwritten by this fixed configuration.
    self.styles = [
        "trigger_features",
        "typed",
        "directed",
        "no_linear",
        "entities",
        "genia_limits",
        "noMasking",
        "maxFeatures",
    ]
    self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet)
    if "graph_kernel" in self.styles:
        from FeatureBuilders.GraphKernelFeatureBuilder import GraphKernelFeatureBuilder
        self.graphKernelFeatureBuilder = GraphKernelFeatureBuilder(self.featureSet)
    if "noAnnType" in self.styles:
        self.multiEdgeFeatureBuilder.noAnnType = True
    if "noMasking" in self.styles:
        self.multiEdgeFeatureBuilder.maskNamedEntities = False
    if "maxFeatures" in self.styles:
        self.multiEdgeFeatureBuilder.maximum = True
    self.triggerFeatureBuilder = TriggerFeatureBuilder(self.featureSet)
def __init__(self, style=None, classSet=None, featureSet=None, gazetteerFileName=None):
    """Coreference example builder with an optional 'co_limits' style switch."""
    if classSet == None:
        classSet = IdSet(1)
    assert (classSet.getId("neg") == 1)  # "neg" must always map to class id 1
    if featureSet == None:
        featureSet = IdSet()
    ExampleBuilder.__init__(self, classSet, featureSet)
    self._setDefaultParameters(["co_limits"])
    self.styles = self.getParameters(style)
    self.triggerFeatureBuilder = TriggerFeatureBuilder(self.featureSet)
    # Restrict trigger features to named entities
    self.triggerFeatureBuilder.useNonNameEntities = False
    # Counts of phrase types seen while building examples
    self.phraseTypeCounts = {}
def __init__(self, style=None, classSet=None, featureSet=None, gazetteerFileName=None, skiplist=None):
    """Event example builder with a FIXED style configuration.

    NOTE: the 'style' argument is ignored (kept for interface compatibility);
    a hard-coded style list is always used.
    """
    if classSet == None:
        classSet = IdSet(1)
    assert (classSet.getId("neg") == 1)  # "neg" must always map to class id 1
    if featureSet == None:
        featureSet = IdSet()
    ExampleBuilder.__init__(self, classSet, featureSet)
    #gazetteerFileName="/usr/share/biotext/GeniaChallenge/SharedTaskTriggerTest/gazetteer-train"
    if gazetteerFileName != None:
        self.gazetteer = Gazetteer.loadGztr(gazetteerFileName)
        print >> sys.stderr, "Loaded gazetteer from", gazetteerFileName
    else:
        print >> sys.stderr, "No gazetteer loaded"
        self.gazetteer = None
    # Example ids listed in the skip-list file are excluded from building
    self.skiplist = set()
    if skiplist != None:
        f = open(skiplist, "rt")
        for line in f.readlines():
            self.skiplist.add(line.strip())
        f.close()
    # BUG FIX: removed the dead store 'self.styles = style' that was
    # immediately overwritten by this fixed configuration.
    self.styles = [
        "trigger_features", "typed", "directed", "no_linear", "entities",
        "genia_limits", "noMasking", "maxFeatures"
    ]
    self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet)
    if "graph_kernel" in self.styles:
        from FeatureBuilders.GraphKernelFeatureBuilder import GraphKernelFeatureBuilder
        self.graphKernelFeatureBuilder = GraphKernelFeatureBuilder(self.featureSet)
    if "noAnnType" in self.styles:
        self.multiEdgeFeatureBuilder.noAnnType = True
    if "noMasking" in self.styles:
        self.multiEdgeFeatureBuilder.maskNamedEntities = False
    if "maxFeatures" in self.styles:
        self.multiEdgeFeatureBuilder.maximum = True
    self.triggerFeatureBuilder = TriggerFeatureBuilder(self.featureSet)
def __init__(self, style=None, classSet=None, featureSet=None, gazetteerFileName=None):
    """Example builder with an optional gazetteer and a POS-tag exclusion list."""
    if classSet == None:
        classSet = IdSet(1)
    assert( classSet.getId("neg") == 1 )  # "neg" must always map to class id 1
    if featureSet == None:
        featureSet = IdSet()
    ExampleBuilder.__init__(self, classSet, featureSet)
    #gazetteerFileName="/usr/share/biotext/GeniaChallenge/SharedTaskTriggerTest/gazetteer-train"
    if gazetteerFileName != None:
        self.gazetteer = Gazetteer.loadGztr(gazetteerFileName)
        print >> sys.stderr, "Loaded gazetteer from", gazetteerFileName
    else:
        print >> sys.stderr, "No gazetteer loaded"
        self.gazetteer = None
    self.styles = style
    # Penn Treebank POS tags whose tokens are never considered as candidates
    self.excludedPOS = ["", "(", ")", ",", ".", "CC", "EX", "FW", "LS", "MD",
                        "PDT", "POS", "PRP", "PRP$", "RBR", "RBS", "RP",
                        "WDT", "WP", "WP$", "``"]
def __init__(self, style=None, length=None, types=None, featureSet=None, classSet=None):
    """Initialize a statistics-collecting edge example builder.

    style      -- list of style flags (None = ["typed", "directed", "headsOnly"])
    length     -- path length limit (stored only)
    types      -- optional list of example types (None = empty list)
    featureSet -- IdSet of feature names (created if None)
    classSet   -- IdSet of class names (created if None; "neg" must be id 1)
    """
    # BUG FIX: mutable defaults replaced with None sentinels.
    if style is None:
        style = ["typed", "directed", "headsOnly"]
    if types is None:
        types = []
    if featureSet == None:
        featureSet = IdSet()
    if classSet == None:
        classSet = IdSet(1)
    assert( classSet.getId("neg") == 1 )
    ExampleBuilder.__init__(self, classSet=classSet, featureSet=featureSet)
    self.styles = style
    self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet)
    # Statistics accumulated while building examples
    self.counts = {}
    self.countsPerType = {}
    self.untypedCounts = {}
    self.tokenCounts = {}
def __init__(self, style=None, classSet=None, featureSet=None, gazetteerFileName=None):
    """Speculation/negation example builder; optionally loads a trigger gazetteer."""
    global speculationWords
    if classSet == None:
        classSet = IdSet(1)
    assert( classSet.getId("neg") == 1 )  # "neg" must always map to class id 1
    if featureSet == None:
        featureSet = IdSet()
    # Cue-word lists used for speculation features
    self.specWords, self.specWordStems = readWords(speculationWords)
    ExampleBuilder.__init__(self, classSet, featureSet)
    #gazetteerFileName="/usr/share/biotext/GeniaChallenge/SharedTaskTriggerTest/gazetteer-train"
    if gazetteerFileName != None:
        self.gazetteer = Gazetteer.loadGztr(gazetteerFileName)
        print >> sys.stderr, "Loaded gazetteer from", gazetteerFileName
    else:
        self.gazetteer = None
    # "classification" selects the task and is limited to the listed values
    defaults = {"classification": "multiclass", "speculation_words": True}
    limits = {"classification": ("multiclass", "speculation", "negation")}
    self.styles = self.getParameters(style, defaults, limits)
def __init__(self, style=None, featureSet=None, classSet=None):
    """Initialize an edge+trigger example builder.

    style      -- list of style flags (None = ["typed", "directed", "headsOnly"])
    featureSet -- IdSet of feature names (created if None)
    classSet   -- IdSet of class names (created if None; "neg" must be id 1)
    """
    # BUG FIX: mutable default style list replaced with a None sentinel.
    if style is None:
        style = ["typed", "directed", "headsOnly"]
    if featureSet == None:
        featureSet = IdSet()
    if classSet == None:
        classSet = IdSet(1)
    assert( classSet.getId("neg") == 1 )
    ExampleBuilder.__init__(self, classSet=classSet, featureSet=featureSet)
    self.styles = style
    self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet)
    #if "noAnnType" in self.styles:
    #    self.multiEdgeFeatureBuilder.noAnnType = True
    #if "noMasking" in self.styles:
    #    self.multiEdgeFeatureBuilder.maskNamedEntities = False
    #if "maxFeatures" in self.styles:
    #    self.multiEdgeFeatureBuilder.maximum = True
    self.triggerFeatureBuilder = TriggerFeatureBuilder(self.featureSet)
    # Restrict trigger features to named entities
    self.triggerFeatureBuilder.useNonNameEntities = False
def __init__(self, style=None, length=None, types=None, featureSet=None, classSet=None):
    """Initialize a statistics-collecting edge example builder.

    style      -- list of style flags (None = ["typed", "directed", "headsOnly"])
    length     -- path length limit (stored only)
    types      -- optional list of example types (None = empty list)
    featureSet -- IdSet of feature names (created if None)
    classSet   -- IdSet of class names (created if None; "neg" must be id 1)
    """
    # BUG FIX: mutable defaults replaced with None sentinels.
    if style is None:
        style = ["typed", "directed", "headsOnly"]
    if types is None:
        types = []
    if featureSet == None:
        featureSet = IdSet()
    if classSet == None:
        classSet = IdSet(1)
    assert (classSet.getId("neg") == 1)
    ExampleBuilder.__init__(self, classSet=classSet, featureSet=featureSet)
    self.styles = style
    self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet)
    # Statistics accumulated while building examples
    self.counts = {}
    self.countsPerType = {}
    self.untypedCounts = {}
    self.tokenCounts = {}
class ExampleBuilder:
    # Optional StructureAnalyzer shared by subclasses; set externally.
    structureAnalyzer = None
    """ ExampleBuilder is the abstract base class for specialized example builders. Example
    builders take some data and convert it to examples usable by e.g. SVMs. An example builder
    writes three files, an example-file (in extended Joachim's SVM format) and .class_names and
    .feature_names files, which contain the names for the class and feature id-numbers. An example
    builder can also be given pre-existing sets of class and feature ids (optionally in files) so that the
    generated examples are consistent with other, previously generated examples.
    """
    def __init__(self, classSet=None, featureSet=None):
        # classSet/featureSet may be given as filenames (loaded into IdSets)
        # or as ready IdSet objects; both may also be None.
        if (type(classSet) == types.StringType):
            self.classSet = IdSet(filename=classSet)
        else:
            self.classSet = classSet
        if (type(featureSet) == types.StringType):
            self.featureSet = IdSet(filename=featureSet)
        else:
            self.featureSet = featureSet
        self.featureTag = ""  # prefix prepended to every feature name in setFeature
        self.exampleStats = ExampleStats()
        self.parse = None  # parse name, set by run()
        self.tokenization = None  # tokenization name, set by run()
        #self.idFileTag = None
        self.classIdFilename = None  # where saveIds writes class names (None = don't save)
        self.featureIdFilename = None  # where saveIds writes feature names (None = don't save)
        self.styles = {}
        self._defaultParameters = None
        self._parameterValueLimits = None
        self._setDefaultParameters(["sentenceLimit"])
        self.debug = False

    def hasStyle(self, style):
        # NOTE(review): this returns True only when the style key exists AND
        # its value is falsy ("and not"), which looks inverted for a
        # "has this style enabled" check -- confirm intent against callers.
        return style in self.styles and not self.styles[style]

    def _setDefaultParameters(self, defaults=None, valueLimits=None):
        """Merge new default parameters (and optional value limits) into the
        accumulated per-class defaults."""
        # Initialize
        if self._defaultParameters == None:
            self._defaultParameters = {}
        if self._parameterValueLimits == None:
            self._parameterValueLimits = {}
        newParameters = Utils.Parameters.get({}, defaults, valueLimits=valueLimits)
        self._defaultParameters.update(newParameters)
        if valueLimits != None:
            self._parameterValueLimits.update(valueLimits)

    def getParameters(self, parameters):
        """Parse a parameter string/dict against the accumulated defaults and limits."""
        return Utils.Parameters.get(parameters, defaults=self._defaultParameters, valueLimits=self._parameterValueLimits)

    def setFeature(self, name, value):
        # Writes into self.features, which a subclass must bind to the current
        # example's feature dict before calling this.
        self.features[self.featureSet.getId(self.featureTag + name)] = value

    def getElementCounts(self, filename):
        """Count <document> and <sentence> elements in an interaction-XML file
        by scanning the raw text (supports .gz)."""
        print >> sys.stderr, "Counting elements:",
        if filename.endswith(".gz"):
            f = gzip.open(filename, "rt")
        else:
            f = open(filename, "rt")
        counts = {"documents": 0, "sentences": 0}
        for line in f:
            if "<document" in line:
                counts["documents"] += 1
            elif "<sentence" in line:
                counts["sentences"] += 1
        f.close()
        print >> sys.stderr, counts
        return counts

    def saveIds(self):
        """Write the class and feature name mappings to their configured files, if any."""
        if self.classIdFilename != None:
            print >> sys.stderr, "Saving class names to", self.classIdFilename
            self.classSet.write(self.classIdFilename)
        else:
            print >> sys.stderr, "Class names not saved"
        if self.featureIdFilename != None:
            print >> sys.stderr, "Saving feature names to", self.featureIdFilename
            self.featureSet.write(self.featureIdFilename)
        else:
            print >> sys.stderr, "Feature names not saved"

    def processCorpus(self, input, output, gold=None, append=False, allowNewIds=True, structureAnalyzer=None):
        """Build examples for a whole corpus and write them to 'output'.

        input: corpus filename (or pre-loaded sentence list); gold: optional
        gold-annotation corpus iterated in lockstep with the input.
        """
        # Create intermediate paths if needed
        if os.path.dirname(output) != "" and not os.path.exists(os.path.dirname(output)):
            os.makedirs(os.path.dirname(output))
        # Open output file
        openStyle = "wt"
        if append:
            #print "Appending examples"
            openStyle = "at"
        if output.endswith(".gz"):
            outfile = gzip.open(output, openStyle)
        else:
            outfile = open(output, openStyle)
        # Build examples
        self.exampleCount = 0
        if type(input) in types.StringTypes: # Entered here - Mu
            self.elementCounts = self.getElementCounts(input)
            if self.elementCounts["sentences"] > 0: # Entered here, 1448 - Mu
                self.progress = ProgressCounter(self.elementCounts["sentences"], "Build examples")
            else:
                self.elementCounts = None
                self.progress = ProgressCounter(None, "Build examples")
        else:
            self.elementCounts = None
            self.progress = ProgressCounter(None, "Build examples")
        # pdb.set_trace()
        # This line generates log below:(getSentences function generates the first 2 lines)
        # Making sentence graphs (GE09.d149.s5): 100.00 % (0:0:1.113)
        # Skipped 381 duplicate interaction edges in SentenceGraphs
        # Defining predicted value range: None - Mu
        self.calculatePredictedRange( self.getSentences(input, self.parse, self.tokenization) ) # self.parse: mccc; self.tokenization: None
        removeIntersentenceInteractions = True
        if "keep_intersentence" in self.styles and self.styles["keep_intersentence"]:
            print >> sys.stderr, "Keeping intersentence interactions for input corpus"
            removeIntersentenceInteractions = False # this is True - Mu
        inputIterator = getCorpusIterator(input, None, self.parse, self.tokenization, removeIntersentenceInteractions=removeIntersentenceInteractions)
        # pdb.set_trace()
        #goldIterator = []
        if gold != None: # Entered here - Mu
            removeGoldIntersentenceInteractions = True
            if "keep_intersentence_gold" in self.styles and self.styles["keep_intersentence_gold"]:
                print >> sys.stderr, "Keeping intersentence interactions for gold corpus"
                removeGoldIntersentenceInteractions = False # this is False - Mu
            goldIterator = getCorpusIterator(gold, None, self.parse, self.tokenization, removeIntersentenceInteractions=removeGoldIntersentenceInteractions)
            # Iterate input and gold documents in lockstep; both iterators must
            # yield the same number of documents (asserted below).
            for inputSentences, goldSentences in itertools.izip_longest(inputIterator, goldIterator, fillvalue=None):
                assert inputSentences != None
                assert goldSentences != None
                # pdb.set_trace()
                # see the documentation of function processSentence() in this script
                # inputSentences[1].sentence is the unmerged version
                # inputSentences[1].sentenceGraph is the merged version, meaning that when generating sentenceGraph,
                # duplicated intereactions are removed(actually skipped, not added to the graph, but not really removed) - Mu
                self.processDocument(inputSentences, goldSentences, outfile, structureAnalyzer=structureAnalyzer)
        else:
            for inputSentences in inputIterator:
                self.processDocument(inputSentences, None, outfile, structureAnalyzer=structureAnalyzer)
        outfile.close()
        self.progress.endUpdate()
        # Show statistics
        print >> sys.stderr, "Examples built:", self.exampleCount
        print >> sys.stderr, "Features:", len(self.featureSet.getNames())
        print >> sys.stderr, "Classes:", len(self.classSet.getNames())
        print >> sys.stderr, "Style:", Utils.Parameters.toString( self.getParameters(self.styles) )
        if self.exampleStats.getExampleCount() > 0:
            self.exampleStats.printStats()
        # Save Ids
        if allowNewIds:
            self.saveIds()

    def processDocument(self, sentences, goldSentences, outfile, structureAnalyzer=None):
        """Build examples for every sentence of one document; goldSentences
        (when given) is indexed in parallel with sentences."""
        #calculatePredictedRange(self, sentences)
        for i in range(len(sentences)):
            sentence = sentences[i]
            goldSentence = None
            if goldSentences != None:
                goldSentence = goldSentences[i]
            self.progress.update(1, "Building examples (" + sentence.sentence.get("id") + "): ")
            self.processSentence(sentence, outfile, goldSentence, structureAnalyzer=structureAnalyzer)

    def processSentence(self, sentence, outfile, goldSentence=None, structureAnalyzer=None):
        '''
        sentence: Utils.InteractionXML.SentenceElements.SentenceElements instance
        sentence.sentence: Element 'sentence' in the xml file
        '''
        # pdb.set_trace()
        # Process filtering rules
        # does NOT entered here since self.styles["sentenceLimit"] is None - Mu
        if "sentenceLimit" in self.styles and self.styles["sentenceLimit"]:
            # Rules for limiting which sentences to process
            # Get the rule list
            limitRules = self.styles["sentenceLimit"]
            if type(limitRules) in types.StringTypes:
                limitRules = [limitRules]
            # Get the list of sentence element attribute names
            sentenceElement = sentence.sentence
            sentenceAttributes = sorted(sentenceElement.attrib.keys())
            # Filter sentences based on matching rules to their attribute values
            for rule in limitRules:
                for sentAttr in sentenceAttributes:
                    # Rule are of the form "attr.value" where "attr" is the name
                    # of the attribute to match, and "value" a substring within
                    # that attribute
                    if rule.startswith(sentAttr + "."): # rule matches the attribute
                        value = rule.split(".", 1)[-1] # get the value part of the rule
                        if value not in sentenceElement.get(sentAttr): # rule value must be a substring of the attribute value
                            return # discard all sentences that do not match all rules
        # Process the sentence
        if sentence.sentenceGraph != None:
            goldGraph = None
            if goldSentence != None:
                goldGraph = goldSentence.sentenceGraph
            # c, sentenceGraph_return, argCombinations_return = self.buildExamplesFromGraph(sentence.sentenceGraph, outfile, goldGraph, structureAnalyzer=structureAnalyzer)
            # self.exampleCount += c
            self.exampleCount += self.buildExamplesFromGraph(sentence.sentenceGraph, outfile, goldGraph, structureAnalyzer=structureAnalyzer)
            # return sentenceGraph_return, argCombinations_return

    @classmethod
    def run(cls, input, output, parse, tokenization, style, classIds=None, featureIds=None, gold=None, append=False, allowNewIds=True, structureAnalyzer=None, debug=False):
        """Convenience entry point: construct a builder of this class and
        process the whole corpus; returns the builder instance."""
        print >> sys.stderr, "Running", cls.__name__
        print >> sys.stderr, " input:", input
        if gold != None:
            print >> sys.stderr, " gold:", gold
        print >> sys.stderr, " output:", output, "(append:", str(append) + ")"
        print >> sys.stderr, " add new class/feature ids:", allowNewIds
        if not isinstance(style, types.StringTypes):
            style = Utils.Parameters.toString(style)
        print >> sys.stderr, " style:", style
        if tokenization == None:
            print >> sys.stderr, " parse:", parse
        else:
            print >> sys.stderr, " parse:", parse + ", tokenization:", tokenization
        classSet, featureSet = cls.getIdSets(classIds, featureIds, allowNewIds) #cls.getIdSets(idFileTag)
        builder = cls(style=style, classSet=classSet, featureSet=featureSet)
        builder.debug = debug
        #builder.idFileTag = idFileTag
        builder.classIdFilename = classIds
        builder.featureIdFilename = featureIds
        builder.parse = parse
        builder.tokenization = tokenization
        builder.processCorpus(input, output, gold, append=append, allowNewIds=allowNewIds, structureAnalyzer=structureAnalyzer)
        return builder

    def buildExamplesFromGraph(self, sentenceGraph, outfile, goldGraph=None):
        # Subclasses must implement example generation for one sentence graph
        # and return the number of examples written.
        raise NotImplementedError

    def definePredictedValueRange(self, sentences, elementName):
        # Optional hook for regression-style builders; default is a no-op.
        pass

    def getPredictedValueRange(self):
        return None

    @classmethod
    def getIdSets(self, classIds=None, featureIds=None, allowNewIds=True):
        """Load class and feature IdSets from files when those files exist;
        returns (classSet, featureSet), either of which may be None."""
        # Class ids
        #print classIds
        #print featureIds
        if classIds != None and os.path.exists(classIds):
            print >> sys.stderr, "Using predefined class names from", classIds
            classSet = IdSet(allowNewIds=allowNewIds)
            classSet.load(classIds)
        else:
            print >> sys.stderr, "No predefined class names"
            classSet = None
        # Feature ids
        if featureIds != None and os.path.exists(featureIds):
            print >> sys.stderr, "Using predefined feature names from", featureIds
            featureSet = IdSet(allowNewIds=allowNewIds)
            featureSet.load(featureIds)
        else:
            print >> sys.stderr, "No predefined feature names"
            featureSet = None
        return classSet, featureSet
        #        if idFileTag != None and os.path.exists(idFileTag + ".feature_names.gz") and os.path.exists(idFileTag + ".class_names"):
        #            print >> sys.stderr, "Using predefined class and feature names"
        #            featureSet = IdSet()
        #            featureSet.load(idFileTag + ".feature_names.gz")
        #            classSet = IdSet()
        #            classSet.load(idFileTag + ".class_names")
        #            return classSet, featureSet
        #        else:
        #            print >> sys.stderr, "No predefined class or feature-names"
        #            if idFileTag != None:
        #                assert(not os.path.exists(idFileTag + ".feature_names.gz")), idFileTag
        #                assert(not os.path.exists(idFileTag + ".class_names")), idFileTag
        #            return None, None

    def getSentences(self, input, parse, tokenization, removeNameInfo=False):
        """Load the corpus (when 'input' is a filename) and return a list of
        [sentenceGraph, None] pairs; passes through a pre-built list as-is."""
        # pdb.set_trace()
        # input is the path to the corpus xml file
        if type(input) != types.ListType: # Program entered here - Mu
            # Load corpus and make sentence graphs
            # pdb.set_trace()
            corpusElements = Core.SentenceGraph.loadCorpus(input, parse, tokenization, removeNameInfo=removeNameInfo)
            sentences = []
            for sentence in corpusElements.sentences:
                if sentence.sentenceGraph != None: # required for event detection
                    sentences.append([sentence.sentenceGraph, None])
            return sentences
        else: # assume input is already a list of sentences
            assert (removeNameInfo == False)
            return input

    def calculatePredictedRange(self, sentences):
        """Feed all sentence elements to definePredictedValueRange (a no-op in
        this base class) and report the resulting range."""
        print >> sys.stderr, "Defining predicted value range:",
        sentenceElements = []
        for sentence in sentences:
            sentenceElements.append(sentence[0].sentenceElement)
        self.definePredictedValueRange(sentenceElements, "entity")
        print >> sys.stderr, self.getPredictedValueRange()
def test(cls, examples, modelPath, output=None, parameters=None, forceInternal=False, classIds=None): # , timeout=None): """ Classify examples with a pre-trained model. @type examples: string (filename) or list (or iterator) of examples @param examples: a list or file containing examples in SVM-format @type modelPath: string @param modelPath: filename of the pre-trained model file @type parameters: a dictionary or string @param parameters: parameters for the classifier @type output: string @param output: the name of the predictions file to be written @type forceInternal: Boolean @param forceInternal: Use python classifier even if SVM Multiclass binary is defined in Settings.py """ if type(parameters) == types.StringType: parameters = splitParameters(parameters) timer = Timer() if type(examples) == types.ListType: print >> sys.stderr, "Classifying", len(examples), "with SVM-MultiClass model", modelPath examples, predictions = self.filterClassificationSet(examples, False) testPath = self.tempDir+"/test.dat" Example.writeExamples(examples, testPath) else: print >> sys.stderr, "Classifying file", examples, "with SVM-MultiClass model", modelPath testPath = examples examples = Example.readExamples(examples,False) if parameters != None: parameters = copy.copy(parameters) if parameters.has_key("c"): del parameters["c"] if parameters.has_key("predefined"): parameters = copy.copy(parameters) modelPath = os.path.join(parameters["predefined"][0],"classifier/model") del parameters["predefined"] # Read model if modelPath == None: modelPath = "model-multilabel" classModels = {} if modelPath.endswith(".gz"): f = gzip.open(modelPath, "rt") else: f = open(modelPath, "rt") thresholds = {} for line in f: key, value, threshold = line.split() classModels[key] = value if threshold != "None": thresholds[key] = float(threshold) else: thresholds[key] = 0.0 f.close() mergedPredictions = [] if type(classIds) == types.StringType: classIds = IdSet(filename=classIds) #print classModels print 
"Thresholds", thresholds classifierBin = Settings.SVMMultiClassDir+"/svm_multiclass_classify" print parameters if "classifier" in parameters and "svmperf" in parameters["classifier"]: classifierBin = Settings.SVMPerfDir+"/svm_perf_classify" parameters = copy.copy(parameters) del parameters["classifier"] for className in classIds.getNames(): if className != "neg" and not "---" in className: classId = classIds.getId(className) if thresholds[str(className)] != 0.0: print >> sys.stderr, "Classifying", className, "with threshold", thresholds[str(className)] else: print >> sys.stderr, "Classifying", className args = [classifierBin] #self.__addParametersToSubprocessCall(args, parameters) classOutput = "predictions" + ".cls-" + className logFile = open("svmmulticlass" + ".cls-" + className + ".log","at") args += [testPath, classModels[str(className)], classOutput] print args subprocess.call(args, stdout = logFile, stderr = logFile) cls.addPredictions(classOutput, mergedPredictions, classId, len(classIds.Ids), threshold=thresholds[str(className)]) print >> sys.stderr, timer.toString() predFileName = output f = open(predFileName, "wt") for mergedPred in mergedPredictions: if len(mergedPred[0]) > 1 and "1" in mergedPred[0]: mergedPred[0].remove("1") mergedPred[1] = str(mergedPred[1]) mergedPred[0] = ",".join(sorted(list(mergedPred[0]))) f.write(" ".join(mergedPred) + "\n") f.close() return mergedPredictions
def compareExamples(examples1, examples2, features1, features2=None): ExampleUtils.readExamples(examples1) exampleIter1 = ExampleUtils.readExamples(examples1) exampleIter2 = ExampleUtils.readExamples(examples2) features1 = IdSet(filename=features1) if features2 != None: features2 = IdSet(filename=features2) else: features2 = features1 # Compare feature sets if set(features1.Ids.keys()) != set(features2.Ids.keys()): print "Feature sets differ" # Compare examples counter = ProgressCounter(step=1) for e1, e2 in itertools.izip(exampleIter1, exampleIter2): counter.update() assert e1[0] == e2[0], (removeFeatures(e1), removeFeatures(e2)) if e1[1] != e2[1]: print "Class differs" print " E1", removeFeatures(e1) print " E2", removeFeatures(e2) f1 = getFeatureNames(e1, features1) f2 = getFeatureNames(e2, features2) f1Set = set(f1) f2Set = set(f2) f1Only = f1Set.difference(f2Set) f2Only = f2Set.difference(f1Set) if len(f1Only) > 0 or len(f2Only) > 0: print "Features differ" print " E1", removeFeatures(e1) print " E2", removeFeatures(e2) if len(f1Only) > 0: print " E1-only features:", f1Only if len(f2Only) > 0: print " E2-only features:", f2Only else: assert len(f1) == len(f2) fCount = 0 differ = False for feature1, feature2 in zip(f1, f2): #f1Id = features1.getId(feature1, createIfNotExist=False) #if f1Id == 454 or feature1 == "e1_strength_Positive_regulation": # print "!!!!!!!!!!!", 454, feature1, e1[2][f1Id] if feature1 != feature2: if not differ: print "Feature order differs for example", e1[0] differ = True print "[" + feature1 + "/" + feature2 + "](" + str(fCount) + ") ", else: f1Id = features1.getId(feature1, createIfNotExist=False) f2Id = features2.getId(feature2, createIfNotExist=False) f1Value = e1[2][f1Id] f2Value = e2[2][f2Id] if f1Value != f2Value: if not differ: print "Feature values differ", e1[0] differ = True print "[" + feature1 + "/" + str(f1Id) + "]" + "[" + str(f1Value) + "/" + str(f2Value) + "]" + "(" + str(fCount) + ") ", fCount += 1 if differ: print 
counter.endUpdate()
def __init__(self, style=None, length=None, types=None, featureSet=None, classSet=None):
    """Initialize the edge example builder and its optional feature builders.

    style: parameter string/dict of style flags (see the list below); when
        None, "typed", "directed" and "headsOnly" are enabled by default.
    length/types: legacy parameters; length must be None, types defaults to
        an empty list (built per call -- the original used a shared mutable
        default argument).
    featureSet/classSet: optional pre-existing IdSets.
    """
    if types is None:
        types = []
    if featureSet == None:
        featureSet = IdSet()
    if classSet == None:
        classSet = IdSet(1)
    assert( classSet.getId("neg") == 1 or (len(classSet.Ids) == 2 and classSet.getId("neg") == -1) )
    ExampleBuilder.__init__(self, classSet=classSet, featureSet=featureSet)
    self.styles = self.getParameters(style, [
        "typed", "directed", "headsOnly", "graph_kernel", "noAnnType",
        "noMasking", "maxFeatures", "genia_limits", "epi_limits", "id_limits",
        "rel_limits", "bb_limits", "bi_limits", "co_limits", "genia_task1",
        "ontology", "nodalida", "bacteria_renaming", "trigger_features",
        "rel_features", "ddi_features", "evex", "giuliano", "random",
        "themeOnly", "causeOnly", "no_path", "entities", "skip_extra_triggers",
        "headsOnly", "graph_kernel", "trigger_features", "no_task",
        "no_dependency", "disable_entity_features", "disable_terminus_features",
        "disable_single_element_features", "disable_ngram_features",
        "disable_path_edge_features", "no_linear", "subset", "binary",
        "pos_only", "entity_type"
    ])
    if style == None: # no parameters given
        # BUG FIX: the original did style["typed"] = ... here, which subscripts
        # None and raises TypeError; the defaults must go into self.styles.
        self.styles["typed"] = self.styles["directed"] = self.styles["headsOnly"] = True
    self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet)
    # NOTE Temporarily re-enabling predicted range
    #self.multiEdgeFeatureBuilder.definePredictedValueRange([], None)
    if self.styles["graph_kernel"]:
        from FeatureBuilders.GraphKernelFeatureBuilder import GraphKernelFeatureBuilder
        self.graphKernelFeatureBuilder = GraphKernelFeatureBuilder(self.featureSet)
    if self.styles["noAnnType"]:
        self.multiEdgeFeatureBuilder.noAnnType = True
    if self.styles["noMasking"]:
        self.multiEdgeFeatureBuilder.maskNamedEntities = False
    if self.styles["maxFeatures"]:
        self.multiEdgeFeatureBuilder.maximum = True
    if self.styles["genia_task1"]:
        self.multiEdgeFeatureBuilder.filterAnnTypes.add("Entity")
    self.tokenFeatureBuilder = TokenFeatureBuilder(self.featureSet)
    if self.styles["ontology"]:
        self.multiEdgeFeatureBuilder.ontologyFeatureBuilder = BioInferOntologyFeatureBuilder(self.featureSet)
    if self.styles["nodalida"]:
        self.nodalidaFeatureBuilder = NodalidaFeatureBuilder(self.featureSet)
    if self.styles["bacteria_renaming"]:
        self.bacteriaRenamingFeatureBuilder = BacteriaRenamingFeatureBuilder(self.featureSet)
    if self.styles["trigger_features"]:
        self.triggerFeatureBuilder = TriggerFeatureBuilder(self.featureSet)
        self.triggerFeatureBuilder.useNonNameEntities = True
        if self.styles["genia_task1"]:
            self.triggerFeatureBuilder.filterAnnTypes.add("Entity")
        #self.bioinferOntologies = OntologyUtils.loadOntologies(OntologyUtils.g_bioInferFileName)
    if self.styles["rel_features"]:
        self.relFeatureBuilder = RELFeatureBuilder(featureSet)
    if self.styles["ddi_features"]:
        self.drugFeatureBuilder = DrugFeatureBuilder(featureSet)
    if self.styles["evex"]:
        self.evexFeatureBuilder = EVEXFeatureBuilder(featureSet)
    if self.styles["giuliano"]:
        self.giulianoFeatureBuilder = GiulianoFeatureBuilder(featureSet)
    self.pathLengths = length
    assert(self.pathLengths == None)
    self.types = types
    if self.styles["random"]:
        from FeatureBuilders.RandomFeatureBuilder import RandomFeatureBuilder
        self.randomFeatureBuilder = RandomFeatureBuilder(self.featureSet)
def test(cls, examples, modelPath, output=None, parameters=None, forceInternal=False, classIds=None): # , timeout=None): """ Classify examples with a pre-trained model. @type examples: string (filename) or list (or iterator) of examples @param examples: a list or file containing examples in SVM-format @type modelPath: string @param modelPath: filename of the pre-trained model file @type parameters: a dictionary or string @param parameters: parameters for the classifier @type output: string @param output: the name of the predictions file to be written @type forceInternal: Boolean @param forceInternal: Use python classifier even if SVM Multiclass binary is defined in Settings.py """ if type(parameters) == types.StringType: parameters = splitParameters(parameters) timer = Timer() if type(examples) == types.ListType: print >> sys.stderr, "Classifying", len( examples), "with SVM-MultiClass model", modelPath examples, predictions = self.filterClassificationSet( examples, False) testPath = self.tempDir + "/test.dat" Example.writeExamples(examples, testPath) else: print >> sys.stderr, "Classifying file", examples, "with SVM-MultiClass model", modelPath testPath = examples examples = Example.readExamples(examples, False) if parameters != None: parameters = copy.copy(parameters) if parameters.has_key("c"): del parameters["c"] if parameters.has_key("predefined"): parameters = copy.copy(parameters) modelPath = os.path.join(parameters["predefined"][0], "classifier/model") del parameters["predefined"] # Read model if modelPath == None: modelPath = "model-multilabel" classModels = {} if modelPath.endswith(".gz"): f = gzip.open(modelPath, "rt") else: f = open(modelPath, "rt") thresholds = {} for line in f: key, value, threshold = line.split() classModels[key] = value if threshold != "None": thresholds[key] = float(threshold) else: thresholds[key] = 0.0 f.close() mergedPredictions = [] if type(classIds) == types.StringType: classIds = IdSet(filename=classIds) #print classModels 
print "Thresholds", thresholds classifierBin = Settings.SVMMultiClassDir + "/svm_multiclass_classify" print parameters if "classifier" in parameters and "svmperf" in parameters["classifier"]: classifierBin = Settings.SVMPerfDir + "/svm_perf_classify" parameters = copy.copy(parameters) del parameters["classifier"] for className in classIds.getNames(): if className != "neg" and not "---" in className: classId = classIds.getId(className) if thresholds[str(className)] != 0.0: print >> sys.stderr, "Classifying", className, "with threshold", thresholds[ str(className)] else: print >> sys.stderr, "Classifying", className args = [classifierBin] #self.__addParametersToSubprocessCall(args, parameters) classOutput = "predictions" + ".cls-" + className logFile = open("svmmulticlass" + ".cls-" + className + ".log", "at") args += [testPath, classModels[str(className)], classOutput] print args subprocess.call(args, stdout=logFile, stderr=logFile) cls.addPredictions(classOutput, mergedPredictions, classId, len(classIds.Ids), threshold=thresholds[str(className)]) print >> sys.stderr, timer.toString() predFileName = output f = open(predFileName, "wt") for mergedPred in mergedPredictions: if len(mergedPred[0]) > 1 and "1" in mergedPred[0]: mergedPred[0].remove("1") mergedPred[1] = str(mergedPred[1]) mergedPred[0] = ",".join(sorted(list(mergedPred[0]))) f.write(" ".join(mergedPred) + "\n") f.close() return mergedPredictions
def compareExamples(examples1, examples2, features1, features2=None): ExampleUtils.readExamples(examples1) exampleIter1 = ExampleUtils.readExamples(examples1) exampleIter2 = ExampleUtils.readExamples(examples2) features1 = IdSet(filename=features1) if features2 != None: features2 = IdSet(filename=features2) else: features2 = features1 # Compare feature sets if set(features1.Ids.keys()) != set(features2.Ids.keys()): print "Feature sets differ" # Compare examples counter = ProgressCounter(step=1) for e1, e2 in itertools.izip(exampleIter1, exampleIter2): counter.update() assert e1[0] == e2[0], (removeFeatures(e1), removeFeatures(e2)) if e1[1] != e2[1]: print "Class differs" print " E1", removeFeatures(e1) print " E2", removeFeatures(e2) f1 = getFeatureNames(e1, features1) f2 = getFeatureNames(e2, features2) f1Set = set(f1) f2Set = set(f2) f1Only = f1Set.difference(f2Set) f2Only = f2Set.difference(f1Set) if len(f1Only) > 0 or len(f2Only) > 0: print "Features differ" print " E1", removeFeatures(e1) print " E2", removeFeatures(e2) if len(f1Only) > 0: print " E1-only features:", f1Only if len(f2Only) > 0: print " E2-only features:", f2Only else: assert len(f1) == len(f2) fCount = 0 differ = False for feature1, feature2 in zip(f1, f2): #f1Id = features1.getId(feature1, createIfNotExist=False) #if f1Id == 454 or feature1 == "e1_strength_Positive_regulation": # print "!!!!!!!!!!!", 454, feature1, e1[2][f1Id] if feature1 != feature2: if not differ: print "Feature order differs for example", e1[0] differ = True print "[" + feature1 + "/" + feature2 + "](" + str( fCount) + ") ", else: f1Id = features1.getId(feature1, createIfNotExist=False) f2Id = features2.getId(feature2, createIfNotExist=False) f1Value = e1[2][f1Id] f2Value = e2[2][f2Id] if f1Value != f2Value: if not differ: print "Feature values differ", e1[0] differ = True print "[" + feature1 + "/" + str( f1Id) + "]" + "[" + str(f1Value) + "/" + str( f2Value) + "]" + "(" + str(fCount) + ") ", fCount += 1 if differ: print 
counter.endUpdate()
class SingleEdgeExampleBuilder(ExampleBuilder):
    """ Builds examples based on parse dependencies. An example is generated for
    each dependency. If there is an annotated interaction edge between those tokens,
    then the example is positive, otherwise negative. Optionally examples can be generated
    only between tokens that are heads of entities.
    """
    def __init__(self, style):
        # style: collection of style flag strings ("binary", "directed", "headsOnly").
        ExampleBuilder.__init__(self)
        self.featureBuilder = EdgeFeatureBuilder(self.featureSet)
        self.style = style
        if not "binary" in style:
            # Multiclass mode: class ids come from an IdSet where "neg" is 1.
            self.classSet = IdSet(1)
            assert (self.classSet.getId("neg") == 1)

    def buildExamples(self, sentenceGraph):
        """Generate one example per dependency edge of the sentence graph;
        returns the list of (id, class, features, extra) example tuples."""
        examples = []
        exampleIndex = 0
        dependencyEdges = sentenceGraph.dependencyGraph.edges()
        for depEdge in dependencyEdges:
            # Optionally restrict candidates to tokens that head an entity.
            if "headsOnly" in self.style:
                if (sentenceGraph.tokenIsEntityHead[depEdge[0]] == None) or (sentenceGraph.tokenIsEntityHead[depEdge[1]] == None):
                    continue
            edgeFound = False
            # Interactions in the same direction as the dependency.
            if sentenceGraph.interactionGraph.has_edge(depEdge[0], depEdge[1]):
                intEdges = sentenceGraph.interactionGraph.get_edge(depEdge[0], depEdge[1])
                for intEdge in intEdges:
                    examples.append(self.buildExample(depEdge, intEdge, False, exampleIndex, sentenceGraph))
                    exampleIndex += 1
                edgeFound = True
            elif "directed" in self.style:
                # Directed mode: emit a negative for the missing direction.
                examples.append(self.buildExample(depEdge, None, None, exampleIndex, sentenceGraph))
                exampleIndex += 1
            # Interactions in the reverse direction of the dependency.
            if sentenceGraph.interactionGraph.has_edge(depEdge[1], depEdge[0]):
                intEdges = sentenceGraph.interactionGraph.get_edge(depEdge[1], depEdge[0])
                for intEdge in intEdges:
                    examples.append(self.buildExample(depEdge, intEdge, True, exampleIndex, sentenceGraph))
                    exampleIndex += 1
                edgeFound = True
            elif "directed" in self.style:
                examples.append(self.buildExample(depEdge, None, None, exampleIndex, sentenceGraph))
                exampleIndex += 1
            # Undirected mode: one negative per dependency with no interaction.
            if (not edgeFound) and (not "directed" in self.style):
                examples.append(self.buildExample(depEdge, None, None, exampleIndex, sentenceGraph))
                exampleIndex += 1
        return examples

    def buildExample(self, depEdge, intEdge, isReverse, exampleIndex, sentenceGraph):
        """Build one example tuple for a dependency edge; intEdge is the
        matching interaction element or None for a negative example."""
        if "binary" in self.style:
            # Binary mode: single class "i", +1 for positives, -1 for negatives.
            categoryName = "i"
            if intEdge != None:
                category = 1
            else:
                category = -1
        else:
            if intEdge != None:
                categoryName = intEdge.attrib["type"]
                # In directed mode reversed interactions get their own "_rev" class.
                if isReverse and "directed" in self.style:
                    categoryName += "_rev"
                category = self.classSet.getId(categoryName)
            else:
                categoryName = "neg"
                category = 1
        features = self.buildFeatures(depEdge, sentenceGraph)
        # Define extra attributes f.e. for the visualizer; t1/t2 are ordered by
        # token id so "deprev" records whether the dependency ran backwards.
        if int(depEdge[0].attrib["id"].split("_")[-1]) < int(depEdge[1].attrib["id"].split("_")[-1]):
            extra = { "xtype": "edge", "type": categoryName, "t1": depEdge[0], "t2": depEdge[1] }
            extra["deprev"] = False
        else:
            extra = { "xtype": "edge", "type": categoryName, "t1": depEdge[1], "t2": depEdge[0] }
            extra["deprev"] = True
        return (sentenceGraph.getSentenceId() + ".x" + str(exampleIndex), category, features, extra)

    def buildFeatures(self, depEdge, sentenceGraph):
        """Collect edge, attached-edge and linear-order features for one
        dependency edge into a fresh feature dict."""
        features = {}
        self.featureBuilder.setFeatureVector(features)
        self.featureBuilder.buildEdgeFeatures(depEdge, sentenceGraph, "dep_", text=True, POS=True, annType=True, maskNames=True)
        self.featureBuilder.buildAttachedEdgeFeatures(depEdge, sentenceGraph, "", text=False, POS=True, annType=False, maskNames=True)
        self.featureBuilder.buildLinearOrderFeatures(depEdge)
        # Detach the feature dict so later calls cannot write into it.
        self.featureBuilder.setFeatureVector(None)
        return features
# Build one-vs-rest example files and id mappings for each GENIA trigger class,
# then set up the (possibly remote) training connection per class.
triggerClasses = ["Binding", "Gene_expression", "Localization", "Negative_regulation", "Phosphorylation", "Positive_regulation", "Protein_catabolism", "Regulation", "Transcription"]
classSet = IdSet(filename=TRIGGER_IDS+".class_names")
for triggerClass in triggerClasses:
    # Split the multiclass train/test example files into per-class files.
    makeOneClassExamples(TRIGGER_TRAIN_EXAMPLE_FILE, TRIGGER_TRAIN_EXAMPLE_FILE + "-" + triggerClass, triggerClass, classSet)
    makeOneClassExamples(TRIGGER_TEST_EXAMPLE_FILE, TRIGGER_TEST_EXAMPLE_FILE + "-" + triggerClass, triggerClass, classSet)
    # Two-class id mapping for this class: "neg" plus the trigger class itself.
    d = {"neg":1, triggerClass:classSet.getId(triggerClass, False)}
    triggerClassIds = IdSet(idDict = d)
    TRIGGER_CLASS_IDS = "trigger-ids-"+triggerClass+".class_names"
    triggerClassIds.write(TRIGGER_CLASS_IDS)
    print >> sys.stderr, "Trigger models for parse", PARSE_TAG, "for class", triggerClass
    TRIGGER_CLASSIFIER_PARAMS="c:" + options.triggerParams
    if "local" not in options.csc:
        # Remote training on a CSC cluster; "clear" wipes the remote work dir.
        clear = False
        if "clear" in options.csc:
            clear = True
        if "louhi" in options.csc:
            c = CSCConnection(CSC_WORKDIR+"/trigger-models-"+triggerClass, "*****@*****.**", clear)
        else:
            c = CSCConnection(CSC_WORKDIR+"/trigger-models-"+triggerClass, "*****@*****.**", clear)
    else:
        # Local training: no remote connection needed.
        c = None
class ExampleBuilder:
    # Optional shared structure analyzer; subclasses may set this (class-level)
    structureAnalyzer = None
    """
    ExampleBuilder is the abstract base class for specialized example builders.
    Example builders take some data and convert it to examples usable by e.g. SVMs.
    An example builder writes three files, an example-file (in extended Joachim's
    SVM format) and .class_names and .feature_names files, which contain the names
    for the class and feature id-numbers. An example builder can also be given
    pre-existing sets of class and feature ids (optionally in files) so that the
    generated examples are consistent with other, previously generated examples.
    """
    def __init__(self, classSet=None, featureSet=None):
        # classSet/featureSet may be given as a filename (string) or as an
        # already-constructed IdSet instance
        if(type(classSet) == types.StringType):
            self.classSet = IdSet(filename=classSet)
        else:
            self.classSet = classSet

        if(type(featureSet) == types.StringType):
            self.featureSet = IdSet(filename=featureSet)
        else:
            self.featureSet = featureSet

        self.featureTag = ""
        self.exampleStats = ExampleStats()
        self.parse = None
        self.tokenization = None
        #self.idFileTag = None
        self.classIdFilename = None
        self.featureIdFilename = None

        # Parameter handling: subclasses register their defaults via
        # _setDefaultParameters and resolve user input via getParameters
        self.styles = None
        self._defaultParameters = None
        self._parameterValueLimits = None
        self._setDefaultParameters(["sentenceLimit"])
        self.debug = False

    def _setDefaultParameters(self, defaults=None, valueLimits=None):
        """Register default parameter values (and optional allowed values)."""
        # Initialize
        if self._defaultParameters == None:
            self._defaultParameters = {}
        if self._parameterValueLimits == None:
            self._parameterValueLimits = {}
        newParameters = Utils.Parameters.get({}, defaults, valueLimits=valueLimits)
        self._defaultParameters.update(newParameters)
        if valueLimits != None:
            self._parameterValueLimits.update(valueLimits)

    def getParameters(self, parameters):
        """Resolve a parameter string/dict against the registered defaults."""
        return Utils.Parameters.get(parameters, defaults=self._defaultParameters, valueLimits=self._parameterValueLimits)

    def setFeature(self, name, value):
        # Sets a feature in the currently active feature vector (self.features)
        self.features[self.featureSet.getId(self.featureTag+name)] = value

    def getElementCounts(self, filename):
        """Count <document> and <sentence> elements in an (optionally gzipped) XML file."""
        print >> sys.stderr, "Counting elements:",
        if filename.endswith(".gz"):
            f = gzip.open(filename, "rt")
        else:
            f = open(filename, "rt")
        counts = {"documents":0, "sentences":0}
        # Plain substring scan instead of XML parsing, for speed
        for line in f:
            if "<document" in line:
                counts["documents"] += 1
            elif "<sentence" in line:
                counts["sentences"] += 1
        f.close()
        print >> sys.stderr, counts
        return counts

    def saveIds(self):
        """Write the class and feature id sets to their configured files, if any."""
        if self.classIdFilename != None:
            print >> sys.stderr, "Saving class names to", self.classIdFilename
            self.classSet.write(self.classIdFilename)
        else:
            print >> sys.stderr, "Class names not saved"
        if self.featureIdFilename != None:
            print >> sys.stderr, "Saving feature names to", self.featureIdFilename
            self.featureSet.write(self.featureIdFilename)
        else:
            print >> sys.stderr, "Feature names not saved"

    def processCorpus(self, input, output, gold=None, append=False, allowNewIds=True, structureAnalyzer=None):
        """
        Build examples for a whole corpus and write them to 'output'.
        'input' may be a filename or an already-loaded corpus; 'gold' is an
        optional parallel gold-annotation corpus iterated in lockstep.
        """
        # Create intermediate paths if needed
        if os.path.dirname(output) != "" and not os.path.exists(os.path.dirname(output)):
            os.makedirs(os.path.dirname(output))
        # Open output file
        openStyle = "wt"
        if append:
            #print "Appending examples"
            openStyle = "at"
        if output.endswith(".gz"):
            outfile = gzip.open(output, openStyle)
        else:
            outfile = open(output, openStyle)

        # Build examples
        self.exampleCount = 0
        # When input is a filename, pre-count sentences for a better progress bar
        if type(input) in types.StringTypes:
            self.elementCounts = self.getElementCounts(input)
            if self.elementCounts["sentences"] > 0:
                self.progress = ProgressCounter(self.elementCounts["sentences"], "Build examples")
            else:
                self.elementCounts = None
                self.progress = ProgressCounter(None, "Build examples")
        else:
            self.elementCounts = None
            self.progress = ProgressCounter(None, "Build examples")

        self.calculatePredictedRange(self.getSentences(input, self.parse, self.tokenization))

        removeIntersentenceInteractions = True
        if "keep_intersentence" in self.styles and self.styles["keep_intersentence"]:
            print >> sys.stderr, "Keeping intersentence interactions for input corpus"
            removeIntersentenceInteractions = False
        inputIterator = getCorpusIterator(input, None, self.parse, self.tokenization, removeIntersentenceInteractions=removeIntersentenceInteractions)

        #goldIterator = []
        if gold != None:
            removeGoldIntersentenceInteractions = True
            if "keep_intersentence_gold" in self.styles and self.styles["keep_intersentence_gold"]:
                print >> sys.stderr, "Keeping intersentence interactions for gold corpus"
                removeGoldIntersentenceInteractions = False
            goldIterator = getCorpusIterator(gold, None, self.parse, self.tokenization, removeIntersentenceInteractions=removeGoldIntersentenceInteractions)
            # Input and gold corpora must have the same number of documents;
            # izip_longest + the asserts below catch a length mismatch
            for inputSentences, goldSentences in itertools.izip_longest(inputIterator, goldIterator, fillvalue=None):
                assert inputSentences != None
                assert goldSentences != None
                self.processDocument(inputSentences, goldSentences, outfile, structureAnalyzer=structureAnalyzer)
        else:
            for inputSentences in inputIterator:
                self.processDocument(inputSentences, None, outfile, structureAnalyzer=structureAnalyzer)
        outfile.close()
        self.progress.endUpdate()

        # Show statistics
        print >> sys.stderr, "Examples built:", self.exampleCount
        print >> sys.stderr, "Features:", len(self.featureSet.getNames())
        print >> sys.stderr, "Style:", Utils.Parameters.toString(self.getParameters(self.styles))
        if self.exampleStats.getExampleCount() > 0:
            self.exampleStats.printStats()

        # Save Ids
        if allowNewIds:
            self.saveIds()

    def processDocument(self, sentences, goldSentences, outfile, structureAnalyzer=None):
        """Process one document's sentences (with optional parallel gold sentences)."""
        #calculatePredictedRange(self, sentences)
        for i in range(len(sentences)):
            sentence = sentences[i]
            goldSentence = None
            if goldSentences != None:
                goldSentence = goldSentences[i]
            self.progress.update(1, "Building examples ("+sentence.sentence.get("id")+"): ")
            self.processSentence(sentence, outfile, goldSentence, structureAnalyzer=structureAnalyzer)

    def processSentence(self, sentence, outfile, goldSentence=None, structureAnalyzer=None):
        """Apply sentence-filtering rules, then build examples for the sentence graph."""
        # Process filtering rules
        if self.styles["sentenceLimit"]: # Rules for limiting which sentences to process
            # Get the rule list
            limitRules = self.styles["sentenceLimit"]
            if type(limitRules) in types.StringTypes:
                limitRules = [limitRules]
            # Get the list of sentence element attribute names
            sentenceElement = sentence.sentence
            sentenceAttributes = sorted(sentenceElement.attrib.keys())
            # Filter sentences based on matching rules to their attribute values
            for rule in limitRules:
                for sentAttr in sentenceAttributes:
                    # Rule are of the form "attr.value" where "attr" is the name
                    # of the attribute to match, and "value" a substring within
                    # that attribute
                    if rule.startswith(sentAttr + "."): # rule matches the attribute
                        value = rule.split(".", 1)[-1] # get the value part of the rule
                        if value not in sentenceElement.get(sentAttr): # rule value must be a substring of the attribute value
                            return # discard all sentences that do not match all rules
        # Process the sentence
        if sentence.sentenceGraph != None:
            goldGraph = None
            if goldSentence != None:
                goldGraph = goldSentence.sentenceGraph
            self.exampleCount += self.buildExamplesFromGraph(sentence.sentenceGraph, outfile, goldGraph, structureAnalyzer=structureAnalyzer)

    @classmethod
    def run(cls, input, output, parse, tokenization, style, classIds=None, featureIds=None, gold=None, append=False, allowNewIds=True, structureAnalyzer=None, debug=False):
        """Convenience entry point: construct a builder of this class and process a corpus."""
        print >> sys.stderr, "Running", cls.__name__
        print >> sys.stderr, " input:", input
        if gold != None:
            print >> sys.stderr, " gold:", gold
        print >> sys.stderr, " output:", output, "(append:", str(append) + ")"
        print >> sys.stderr, " add new class/feature ids:", allowNewIds
        if not isinstance(style, types.StringTypes):
            style = Utils.Parameters.toString(style)
        print >> sys.stderr, " style:", style
        if tokenization == None:
            print >> sys.stderr, " parse:", parse
        else:
            print >> sys.stderr, " parse:", parse + ", tokenization:", tokenization
        classSet, featureSet = cls.getIdSets(classIds, featureIds, allowNewIds) #cls.getIdSets(idFileTag)
        builder = cls(style=style, classSet=classSet, featureSet=featureSet)
        builder.debug = debug
        #builder.idFileTag = idFileTag
        builder.classIdFilename = classIds
        builder.featureIdFilename = featureIds
        builder.parse = parse ; builder.tokenization = tokenization
        builder.processCorpus(input, output, gold, append=append, allowNewIds=allowNewIds, structureAnalyzer=structureAnalyzer)
        return builder

    def buildExamplesFromGraph(self, sentenceGraph, outfile, goldGraph=None):
        # Subclasses must implement this; returns the number of examples built
        raise NotImplementedError

    def definePredictedValueRange(self, sentences, elementName):
        # Optional hook for subclasses; no-op by default
        pass

    def getPredictedValueRange(self):
        # Optional hook for subclasses; no range by default
        return None

    @classmethod
    def getIdSets(self, classIds=None, featureIds=None, allowNewIds=True):
        """Load pre-existing class/feature IdSets from files, when the files exist."""
        # NOTE(review): declared @classmethod but first parameter is named
        # "self" -- works, but "cls" would be conventional
        # Class ids
        #print classIds
        #print featureIds
        if classIds != None and os.path.exists(classIds):
            print >> sys.stderr, "Using predefined class names from", classIds
            classSet = IdSet(allowNewIds=allowNewIds)
            classSet.load(classIds)
        else:
            print >> sys.stderr, "No predefined class names"
            classSet = None
        # Feature ids
        if featureIds != None and os.path.exists(featureIds):
            print >> sys.stderr, "Using predefined feature names from", featureIds
            featureSet = IdSet(allowNewIds=allowNewIds)
            featureSet.load(featureIds)
        else:
            print >> sys.stderr, "No predefined feature names"
            featureSet = None
        return classSet, featureSet

        # if idFileTag != None and os.path.exists(idFileTag + ".feature_names.gz") and os.path.exists(idFileTag + ".class_names"):
        #     print >> sys.stderr, "Using predefined class and feature names"
        #     featureSet = IdSet()
        #     featureSet.load(idFileTag + ".feature_names.gz")
        #     classSet = IdSet()
        #     classSet.load(idFileTag + ".class_names")
        #     return classSet, featureSet
        # else:
        #     print >> sys.stderr, "No predefined class or feature-names"
        #     if idFileTag != None:
        #         assert(not os.path.exists(idFileTag + ".feature_names.gz")), idFileTag
        #         assert(not os.path.exists(idFileTag + ".class_names")), idFileTag
        #     return None, None

    def getSentences(self, input, parse, tokenization, removeNameInfo=False):
        """Return a list of [sentenceGraph, None] pairs, loading the corpus if needed."""
        if type(input) != types.ListType: # Load corpus and make sentence graphs
            corpusElements = Core.SentenceGraph.loadCorpus(input, parse, tokenization, removeNameInfo=removeNameInfo)
            sentences = []
            for sentence in corpusElements.sentences:
                if sentence.sentenceGraph != None: # required for event detection
                    sentences.append( [sentence.sentenceGraph,None] )
            return sentences
        else: # assume input is already a list of sentences
            assert(removeNameInfo == False)
            return input

    def calculatePredictedRange(self, sentences):
        """Feed all sentence elements to definePredictedValueRange (subclass hook)."""
        print >> sys.stderr, "Defining predicted value range:",
        sentenceElements = []
        for sentence in sentences:
            sentenceElements.append(sentence[0].sentenceElement)
        self.definePredictedValueRange(sentenceElements, "entity")
        print >> sys.stderr, self.getPredictedValueRange()
# Script: remap ("realign") the class and feature ids of a "variant" example
# set into the id space of an "invariant" example set, so the two are
# comparable, and write the result to realignedExamples.txt.
defaultAnalysisFilename = "/usr/share/biotext/ComplexPPI/BioInferForComplexPPIVisible.xml"
optparser = OptionParser(usage="%prog [options]\nCreate an html visualization for a corpus.")
optparser.add_option("-i", "--invariant", default=None, dest="invariant", help="Corpus in analysis format", metavar="FILE")
optparser.add_option("-v", "--variant", default=None, dest="variant", help="Corpus in analysis format", metavar="FILE")
(options, args) = optparser.parse_args()

#invariantExamples = ExampleUtils.readExamples(os.path.join(options.invariant, "examples.txt"))
variantExamples = ExampleUtils.readExamples(os.path.join(options.variant, "test-triggers.examples"))

# Id sets of the reference ("invariant") example space
invariantFeatureSet = IdSet()
invariantFeatureSet.load(os.path.join(options.invariant, "feature_names.txt"))
invariantClassSet = IdSet()
invariantClassSet.load(os.path.join(options.invariant, "class_names.txt"))

# Id sets of the example space being remapped ("variant")
variantFeatureSet = IdSet()
variantFeatureSet.load(os.path.join(options.variant, "test-triggers.examples.feature_names"))
variantClassSet = IdSet()
variantClassSet.load(os.path.join(options.variant, "test-triggers.examples.class_names"))

counter = ProgressCounter(len(variantExamples))
for example in variantExamples:
    counter.update()
    # Remap the class id via its name (example[1] = class, example[2] = features)
    example[1] = invariantClassSet.getId(variantClassSet.getName(example[1]))
    newFeatures = {}
    for k,v in example[2].iteritems():
        # Remap each feature id via its name, keeping the value
        newFeatures[ invariantFeatureSet.getId(variantFeatureSet.getName(k)) ] = v
    example[2] = newFeatures
ExampleUtils.writeExamples(variantExamples, os.path.join(options.variant, "realignedExamples.txt"))
# Split the multiclass trigger example files into per-class (one-vs-rest)
# example files, write a two-class id set per event class, and open the
# connection used for training each per-class trigger model.
triggerClasses = [
    "Binding", "Gene_expression", "Localization", "Negative_regulation",
    "Phosphorylation", "Positive_regulation", "Protein_catabolism",
    "Regulation", "Transcription"
]
classSet = IdSet(filename=TRIGGER_IDS + ".class_names")
for triggerClass in triggerClasses:
    # Per-class train/test example files (this class vs. negatives)
    makeOneClassExamples(TRIGGER_TRAIN_EXAMPLE_FILE,
                         TRIGGER_TRAIN_EXAMPLE_FILE + "-" + triggerClass,
                         triggerClass, classSet)
    makeOneClassExamples(TRIGGER_TEST_EXAMPLE_FILE,
                         TRIGGER_TEST_EXAMPLE_FILE + "-" + triggerClass,
                         triggerClass, classSet)
    # Two-class id set: "neg" (id 1) plus the current trigger class
    d = {"neg": 1, triggerClass: classSet.getId(triggerClass, False)}
    triggerClassIds = IdSet(idDict=d)
    TRIGGER_CLASS_IDS = "trigger-ids-" + triggerClass + ".class_names"
    triggerClassIds.write(TRIGGER_CLASS_IDS)
    print >> sys.stderr, "Trigger models for parse", PARSE_TAG, "for class", triggerClass
    TRIGGER_CLASSIFIER_PARAMS = "c:" + options.triggerParams
    # Remote CSC connection unless running locally
    if "local" not in options.csc:
        clear = False
        if "clear" in options.csc:
            clear = True
        # NOTE(review): both branches look identical here (account string is
        # masked) -- confirm the intended host difference for "louhi"
        if "louhi" in options.csc:
            c = CSCConnection(CSC_WORKDIR + "/trigger-models-" + triggerClass,
                              "*****@*****.**", clear)
        else:
            c = CSCConnection(CSC_WORKDIR + "/trigger-models-" + triggerClass,
                              "*****@*****.**", clear)
# Remap ("realign") the class and feature ids of the "variant" examples into
# the id space of the "invariant" example set, then write the result.
variantExamples = ExampleUtils.readExamples(
    os.path.join(options.variant, "test-triggers.examples"))

# Id sets of the reference ("invariant") example space
invariantFeatureSet = IdSet()
invariantFeatureSet.load(
    os.path.join(options.invariant, "feature_names.txt"))
invariantClassSet = IdSet()
invariantClassSet.load(os.path.join(options.invariant, "class_names.txt"))

# Id sets of the example space being remapped ("variant")
variantFeatureSet = IdSet()
variantFeatureSet.load(
    os.path.join(options.variant, "test-triggers.examples.feature_names"))
variantClassSet = IdSet()
variantClassSet.load(
    os.path.join(options.variant, "test-triggers.examples.class_names"))

counter = ProgressCounter(len(variantExamples))
for example in variantExamples:
    counter.update()
    # Remap the class id via its name (example[1] = class, example[2] = features)
    example[1] = invariantClassSet.getId(
        variantClassSet.getName(example[1]))
    newFeatures = {}
    for k, v in example[2].iteritems():
        # Remap each feature id via its name, keeping the value
        newFeatures[invariantFeatureSet.getId(
            variantFeatureSet.getName(k))] = v
    example[2] = newFeatures
ExampleUtils.writeExamples(
    variantExamples,
    os.path.join(options.variant, "realignedExamples.txt"))
class TokenRoleMultiEdgeTypeExampleBuilder(ExampleBuilder):
    """
    Builds one example per direction of each shortest dependency path
    (length <= 4) between two entity head tokens. An example is labeled with
    the interaction type between the path's end tokens, or "neg" when the
    interaction graph has no such edge.
    """
    def __init__(self):
        ExampleBuilder.__init__(self)
        self.classSet = IdSet(1)
        # class id 1 is reserved for the negative class
        assert (self.classSet.getId("neg") == 1)

    # Results slightly nondeterministic because when there are multiple edges between two
    # tokens, this currently returns only one, and their order is not defined.
    def getEdges(self, graph, path):
        """
        Map a token path to a list of (edge, isForward) pairs, one per step.
        isForward is True when the dependency edge points along the path.
        """
        pathEdges = []
        edges = graph.edges()
        for i in range(1, len(path)):
            found = False
            for edge in edges:
                if edge[0] == path[i - 1] and edge[1] == path[i]:
                    pathEdges.append((edge, True))
                    found = True
                elif edge[1] == path[i - 1] and edge[0] == path[i]:
                    pathEdges.append((edge, False))
                    found = True
                if found == True:
                    break # take only the first matching edge for this step
            assert (found == True)
        return pathEdges

    def addType(self, token, features, sentenceGraph, prefix="annType_"):
        """Add an entity-type feature for token, if it is an entity head."""
        if sentenceGraph.tokenIsEntityHead[token] != None:
            # FIX: use the prefix parameter instead of a hard-coded "annType_".
            # Identical output for all existing callers, which pass the default.
            features[self.featureSet.getId(
                prefix +
                sentenceGraph.tokenIsEntityHead[token].attrib["type"])] = 1

    def buildExamples(self, sentenceGraph):
        """Build examples for all entity-head token pairs connected by a short path."""
        examples = []
        exampleIndex = 0
        undirected = sentenceGraph.dependencyGraph.to_undirected()
        paths = NX.all_pairs_shortest_path(undirected, cutoff=4)
        for i in range(len(sentenceGraph.tokens) - 1):
            for j in range(i + 1, len(sentenceGraph.tokens)):
                tI = sentenceGraph.tokens[i]
                tJ = sentenceGraph.tokens[j]
                # only consider paths between entities (NOTE! entities, not
                # only named entities)
                if (sentenceGraph.tokenIsEntityHead[tI] == None) or (sentenceGraph.tokenIsEntityHead[tJ] == None):
                    continue
                # find the path (either direction may be present in the map)
                if paths.has_key(tI) and paths[tI].has_key(tJ):
                    path = paths[tI][tJ]
                elif paths.has_key(tJ) and paths[tJ].has_key(tI):
                    path = paths[tJ][tI]
                else:
                    continue
                if len(path) > 1:
                    # one example per direction: path and reversed path
                    if sentenceGraph.interactionGraph.has_edge(path[0], path[-1]):
                        categoryName = sentenceGraph.interactionGraph.get_edge(
                            path[0], path[-1]).attrib["type"]
                        self.buildExample(path, sentenceGraph, categoryName,
                                          examples, exampleIndex)
                        exampleIndex += 1
                    else:
                        self.buildExample(path, sentenceGraph, "neg",
                                          examples, exampleIndex)
                        exampleIndex += 1
                    if sentenceGraph.interactionGraph.has_edge(path[-1], path[0]):
                        categoryName = sentenceGraph.interactionGraph.get_edge(
                            path[-1], path[0]).attrib["type"]
                        self.buildExample(path[::-1], sentenceGraph, categoryName,
                                          examples, exampleIndex)
                        exampleIndex += 1
                    else:
                        self.buildExample(path[::-1], sentenceGraph, "neg",
                                          examples, exampleIndex)
                        exampleIndex += 1
        return examples

    def buildExample(self, path, sentenceGraph, categoryName, examples, exampleIndex):
        """Build one example for a token path and append it to 'examples'."""
        # define features
        features = {}
        edges = self.getEdges(sentenceGraph.dependencyGraph, path)
        features[self.featureSet.getId("len_edges_" + str(len(edges)))] = 1
        features[self.featureSet.getId("len")] = len(edges)
        self.buildPathRoleFeatures(path, edges, sentenceGraph, features)
        self.buildEdgeCombinations(edges, sentenceGraph, features)
        for edge in edges:
            self.buildPathEdgeFeatures(edge[0], sentenceGraph, features)
        # define extra attributes (e.g. for the visualizer): t1/t2 ordered by
        # sentence position, deprev records whether they were swapped
        if int(path[0].attrib["id"].split("_")[-1]) < int(
                path[-1].attrib["id"].split("_")[-1]):
            extra = {"xtype": "edge", "type": "i", "t1": path[0], "t2": path[-1]}
            extra["deprev"] = False
        else:
            extra = {"xtype": "edge", "type": "i", "t1": path[-1], "t2": path[0]}
            extra["deprev"] = True
        # make example
        category = self.classSet.getId(categoryName)
        examples.append((sentenceGraph.getSentenceId() + ".x" + str(exampleIndex),
                         category, features, extra))

    def buildPathRoleFeatures(self, pathTokens, pathEdges, sentenceGraph, features):
        """Features describing the role of each internal token on the path."""
        # terminus tokens
        features[self.featureSet.getId("tokTerm1POS_" + pathTokens[0].attrib["POS"])] = 1
        features[self.featureSet.getId(
            "tokTerm1txt_" + sentenceGraph.getTokenText(pathTokens[0]))] = 1
        features[self.featureSet.getId("tokTerm2POS_" + pathTokens[-1].attrib["POS"])] = 1
        features[self.featureSet.getId(
            "tokTerm2txt_" + sentenceGraph.getTokenText(pathTokens[-1]))] = 1
        # internal tokens, classified by the directions of the surrounding edges
        for i in range(1, len(pathEdges)):
            if pathEdges[i - 1][1] and pathEdges[i][1]:
                features[self.featureSet.getId(
                    "depRight1_" + pathEdges[i - 1][0][2].attrib["type"])] = 1
                features[self.featureSet.getId(
                    "depRight2_" + pathEdges[i][0][2].attrib["type"])] = 1
                features[self.featureSet.getId(
                    "tokRightPOS_" + pathTokens[i].attrib["POS"])] = 1
                features[self.featureSet.getId(
                    "tokRightTxt_" + sentenceGraph.getTokenText(pathTokens[i]))] = 1
            elif (not pathEdges[i - 1][1]) and (not pathEdges[i][1]):
                features[self.featureSet.getId(
                    "depLeft1_" + pathEdges[i - 1][0][2].attrib["type"])] = 1
                features[self.featureSet.getId(
                    "depLeft2_" + pathEdges[i][0][2].attrib["type"])] = 1
                features[self.featureSet.getId(
                    "tokLeftPOS_" + pathTokens[i].attrib["POS"])] = 1
                features[self.featureSet.getId(
                    "tokLeftTxt_" + sentenceGraph.getTokenText(pathTokens[i]))] = 1
            elif (not pathEdges[i - 1][1]) and pathEdges[i][1]:
                features[self.featureSet.getId(
                    "depTop1_" + pathEdges[i - 1][0][2].attrib["type"])] = 1
                features[self.featureSet.getId(
                    "depTop2_" + pathEdges[i][0][2].attrib["type"])] = 1
                features[self.featureSet.getId(
                    "tokTopPOS_" + pathTokens[i].attrib["POS"])] = 1
                features[self.featureSet.getId(
                    "tokTopTxt_" + sentenceGraph.getTokenText(pathTokens[i]))] = 1
            elif pathEdges[i - 1][1] and (not pathEdges[i][1]):
                features[self.featureSet.getId(
                    "depBottom1_" + pathEdges[i - 1][0][2].attrib["type"])] = 1
                features[self.featureSet.getId(
                    "depBottom2_" + pathEdges[i][0][2].attrib["type"])] = 1
                features[self.featureSet.getId(
                    "tokBottomPOS_" + pathTokens[i].attrib["POS"])] = 1
                features[self.featureSet.getId(
                    "tokBottomTxt_" + sentenceGraph.getTokenText(pathTokens[i]))] = 1

    def buildPathEdgeFeatures(self, depEdge, sentenceGraph, features):
        """Features of one dependency edge: type, token texts, POS and entity types."""
        depType = depEdge[2].attrib["type"]
        features[self.featureSet.getId("dep_" + depType)] = 1
        # Token 1
        features[self.featureSet.getId(
            "txt_" + sentenceGraph.getTokenText(depEdge[0]))] = 1
        features[self.featureSet.getId("POS_" + depEdge[0].attrib["POS"])] = 1
        self.addType(depEdge[0], features, sentenceGraph, prefix="annType_")
        # Token 2
        features[self.featureSet.getId(
            "txt_" + sentenceGraph.getTokenText(depEdge[1]))] = 1
        features[self.featureSet.getId("POS_" + depEdge[1].attrib["POS"])] = 1
        self.addType(depEdge[1], features, sentenceGraph, prefix="annType_")

    def buildEdgeCombinations(self, edges, sentenceGraph, features):
        """Direction-aware features of the edge sequence along the path."""
        # Edges directed relative to the path
        for i in range(len(edges)):
            depType = edges[i][0][2].attrib["type"]
            if edges[i][1]:
                features[self.featureSet.getId("dep_" + depType + ">")] = 1
            else:
                features[self.featureSet.getId("dep_<" + depType)] = 1
        # Internal tokens adjacent to the terminus edges
        if edges[0][1]:
            features[self.featureSet.getId("internalPOS_" + edges[0][0][0].attrib["POS"])] = 1
            features[self.featureSet.getId(
                "internalTxt_" + sentenceGraph.getTokenText(edges[0][0][0]))] = 1
        else:
            features[self.featureSet.getId("internalPOS_" + edges[0][0][1].attrib["POS"])] = 1
            features[self.featureSet.getId(
                "internalTxt_" + sentenceGraph.getTokenText(edges[0][0][1]))] = 1
        if edges[-1][1]:
            features[self.featureSet.getId("internalPOS_" + edges[-1][0][1].attrib["POS"])] = 1
            features[self.featureSet.getId(
                "internalTxt_" + sentenceGraph.getTokenText(edges[-1][0][1]))] = 1
        else:
            features[self.featureSet.getId("internalPOS_" + edges[-1][0][0].attrib["POS"])] = 1
            features[self.featureSet.getId(
                "internalTxt_" + sentenceGraph.getTokenText(edges[-1][0][0]))] = 1
        # Fully internal edges and their tokens
        for i in range(1, len(edges) - 1):
            features[self.featureSet.getId("internalPOS_" + edges[i][0][0].attrib["POS"])] = 1
            features[self.featureSet.getId(
                "internalTxt_" + sentenceGraph.getTokenText(edges[i][0][0]))] = 1
            features[self.featureSet.getId("internalPOS_" + edges[i][0][1].attrib["POS"])] = 1
            features[self.featureSet.getId(
                "internalTxt_" + sentenceGraph.getTokenText(edges[i][0][1]))] = 1
            features[self.featureSet.getId("internalDep_" + edges[i][0][2].attrib["type"])] = 1
        # Edge direction bigrams
        for i in range(1, len(edges)):
            type1 = edges[i - 1][0][2].attrib["type"]
            type2 = edges[i][0][2].attrib["type"]
            # FIX: the original tested edges[i][0] / edges[i-1][0] (the edge
            # tuple, which is always truthy) instead of the direction flag
            # edges[i][1], so the "<...>" bigram was never produced (it fell
            # into the "<...<" branch). Use the boolean flags directly.
            if edges[i - 1][1] and edges[i][1]:
                features[self.featureSet.getId("dep_" + type1 + ">" + type2 + ">")] = 1
            elif edges[i - 1][1] and (not edges[i][1]):
                features[self.featureSet.getId("dep_" + type1 + ">" + type2 + "<")] = 1
            elif (not edges[i - 1][1]) and (not edges[i][1]):
                features[self.featureSet.getId("dep_" + type1 + "<" + type2 + "<")] = 1
            elif (not edges[i - 1][1]) and edges[i][1]:
                features[self.featureSet.getId("dep_" + type1 + "<" + type2 + ">")] = 1

    def buildTerminusFeatures(self, token, prefix, sentenceGraph, features):
        """Features of dependencies attached to a terminus token of the path."""
        # Attached edges
        t1InEdges = sentenceGraph.dependencyGraph.in_edges(token)
        for edge in t1InEdges:
            features[self.featureSet.getId(prefix + "HangingIn_" + edge[2].attrib["type"])] = 1
            features[self.featureSet.getId(prefix + "HangingIn_" + edge[0].attrib["POS"])] = 1
            # FIX: was hard-coded "t1HangingIn_"; use prefix so both termini
            # get correctly named text features (identical for prefix "t1")
            features[self.featureSet.getId(
                prefix + "HangingIn_" + sentenceGraph.getTokenText(edge[0]))] = 1
        t1OutEdges = sentenceGraph.dependencyGraph.out_edges(token)
        for edge in t1OutEdges:
            features[self.featureSet.getId(prefix + "HangingOut_" + edge[2].attrib["type"])] = 1
            features[self.featureSet.getId(prefix + "HangingOut_" + edge[1].attrib["POS"])] = 1
            # FIX: was hard-coded "t1HangingOut_" (see above)
            features[self.featureSet.getId(
                prefix + "HangingOut_" + sentenceGraph.getTokenText(edge[1]))] = 1
class SingleEdgeExampleBuilder(ExampleBuilder):
    """
    Builds examples based on parse dependencies. An example is generated for
    each dependency. If there is an annotated interaction edge between those
    tokens, then the example is positive, otherwise negative. Optionally
    examples can be generated only between tokens that are heads of entities.
    """
    def __init__(self, style):
        # style: collection of flags; the ones read here and in buildExamples
        # are "binary", "headsOnly" and "directed"
        ExampleBuilder.__init__(self)
        self.featureBuilder = EdgeFeatureBuilder(self.featureSet)
        self.style = style
        if not "binary" in style:
            # multiclass mode: class id 1 is reserved for the negative class
            self.classSet = IdSet(1)
            assert( self.classSet.getId("neg") == 1 )

    def buildExamples(self, sentenceGraph):
        """Build one example per (dependency, interaction) combination in the sentence."""
        examples = []
        exampleIndex = 0
        dependencyEdges = sentenceGraph.dependencyGraph.edges()
        for depEdge in dependencyEdges:
            if "headsOnly" in self.style:
                # skip dependencies whose tokens are not both entity heads
                if (sentenceGraph.tokenIsEntityHead[depEdge[0]] == None) or (sentenceGraph.tokenIsEntityHead[depEdge[1]] == None):
                    continue
            edgeFound = False
            # interactions along the dependency direction
            if sentenceGraph.interactionGraph.has_edge(depEdge[0], depEdge[1]):
                intEdges = sentenceGraph.interactionGraph.get_edge(depEdge[0], depEdge[1])
                for intEdge in intEdges:
                    examples.append( self.buildExample(depEdge, intEdge, False, exampleIndex, sentenceGraph) )
                    exampleIndex += 1
                edgeFound = True
            elif "directed" in self.style:
                # directed mode: emit a negative for the missing forward direction
                examples.append( self.buildExample(depEdge, None, None, exampleIndex, sentenceGraph) )
                exampleIndex += 1
            # interactions against the dependency direction
            if sentenceGraph.interactionGraph.has_edge(depEdge[1], depEdge[0]):
                intEdges = sentenceGraph.interactionGraph.get_edge(depEdge[1], depEdge[0])
                for intEdge in intEdges:
                    examples.append( self.buildExample(depEdge, intEdge, True, exampleIndex, sentenceGraph) )
                    exampleIndex += 1
                edgeFound = True
            elif "directed" in self.style:
                # directed mode: emit a negative for the missing reverse direction
                examples.append( self.buildExample(depEdge, None, None, exampleIndex, sentenceGraph) )
                exampleIndex += 1
            # undirected mode: one negative if no interaction in either direction
            if (not edgeFound) and (not "directed" in self.style):
                examples.append( self.buildExample(depEdge, None, None, exampleIndex, sentenceGraph) )
                exampleIndex += 1
        return examples

    def buildExample(self, depEdge, intEdge, isReverse, exampleIndex, sentenceGraph):
        """Return one example tuple (id, class, features, extra) for a dependency."""
        if "binary" in self.style:
            # binary mode: positive/negative only (+1/-1)
            categoryName = "i"
            if intEdge != None:
                category = 1
            else:
                category = -1
        else:
            if intEdge != None:
                categoryName = intEdge.attrib["type"]
                if isReverse and "directed" in self.style:
                    categoryName += "_rev"
                category = self.classSet.getId(categoryName)
            else:
                categoryName = "neg"
                category = 1 # "neg" is always id 1 (asserted in __init__)
        features = self.buildFeatures(depEdge,sentenceGraph)
        # Define extra attributes f.e. for the visualizer: t1/t2 ordered by
        # token position in the sentence; deprev marks a swapped dependency
        if int(depEdge[0].attrib["id"].split("_")[-1]) < int(depEdge[1].attrib["id"].split("_")[-1]):
            extra = {"xtype":"edge","type":categoryName,"t1":depEdge[0],"t2":depEdge[1]}
            extra["deprev"] = False
        else:
            extra = {"xtype":"edge","type":categoryName,"t1":depEdge[1],"t2":depEdge[0]}
            extra["deprev"] = True
        return (sentenceGraph.getSentenceId()+".x"+str(exampleIndex),category,features,extra)

    def buildFeatures(self, depEdge, sentenceGraph):
        """Collect edge, attached-edge and linear order features for depEdge."""
        features = {}
        self.featureBuilder.setFeatureVector(features)
        self.featureBuilder.buildEdgeFeatures(depEdge, sentenceGraph, "dep_", text=True, POS=True, annType=True, maskNames=True)
        self.featureBuilder.buildAttachedEdgeFeatures(depEdge, sentenceGraph, "", text=False, POS=True, annType=False, maskNames=True)
        self.featureBuilder.buildLinearOrderFeatures(depEdge)
        self.featureBuilder.setFeatureVector(None)
        return features
def __init__(self, style=None, length=None, types=None,
             featureSet=None, classSet=None, gazetteer=None,
             pathGazetteer=None, negFrac=None):
    """
    Initialize the example builder.

    style: list of style flag strings (default: ["typed", "directed", "headsOnly"])
    length: must be None (path length limiting is not supported; asserted below)
    types: list of types (default: empty list)
    featureSet/classSet: optional pre-existing IdSets; "neg" must be class id 1
    gazetteer/pathGazetteer: optional gazetteer files to load
    negFrac: fraction of negatives to keep when downsampling (None = keep all)
    """
    # FIX: the original used mutable default arguments (style=[...], types=[]),
    # which are shared between all calls and can leak mutations across
    # instances. Use None sentinels and build fresh lists per call instead;
    # behavior is identical for every caller.
    if style is None:
        style = ["typed", "directed", "headsOnly"]
    if types is None:
        types = []
    if featureSet == None:
        featureSet = IdSet()
    if classSet == None:
        classSet = IdSet(1)
    assert (classSet.getId("neg") == 1)

    # Token gazetteer (optional)
    if gazetteer != None:
        print >> sys.stderr, "Loading gazetteer from", gazetteer
        self.gazetteer = Gazetteer.loadGztr(gazetteer)
    else:
        print >> sys.stderr, "No gazetteer loaded"
        self.gazetteer = None

    # Path gazetteer (optional), plus its derived dependency/pair lookups
    self.pathGazetteer = None
    self.pathGazetteerDependencies = None
    self.pathGazetteerPairs = None
    if pathGazetteer != None:
        print >> sys.stderr, "Loading path gazetteer from", pathGazetteer
        self.pathGazetteer = PathGazetteer.load(pathGazetteer)
        self.pathGazetteerDependencies = PathGazetteer.getDependencies(
            self.pathGazetteer)
        self.pathGazetteerPairs = PathGazetteer.getPairs(
            self.pathGazetteer)
    else:
        print >> sys.stderr, "No path gazetteer loaded"

    ExampleBuilder.__init__(self, classSet=classSet, featureSet=featureSet)
    self.styles = style

    # Negative downsampling; printed even when disabled (negFrac == None)
    self.negFrac = negFrac
    print >> sys.stderr, "Downsampling negatives to", negFrac
    # NOTE(review): unseeded RNG makes negative downsampling nondeterministic
    # between runs -- confirm whether a fixed seed is wanted here
    self.negRand = random.Random()

    # Feature builders
    self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet)
    # always on (original code: 'if True: #"noAnnType" in self.styles:')
    self.multiEdgeFeatureBuilder.noAnnType = True
    if "noMasking" in self.styles:
        self.multiEdgeFeatureBuilder.maskNamedEntities = False
    if "maxFeatures" in self.styles:
        self.multiEdgeFeatureBuilder.maximum = True
    self.triggerFeatureBuilder = TriggerFeatureBuilder(self.featureSet)

    # Path length limiting is not supported by this builder
    self.pathLengths = length
    assert (self.pathLengths == None)
    self.types = types

    # Bookkeeping for event construction and statistics
    self.eventsByOrigId = {}
    self.headTokensByOrigId = {}
    self.interSentenceEvents = set()
    self.examplesByEventOrigId = {}
    self.skippedByType = {}
    self.skippedByTypeAndReason = {}
    self.builtByType = {}
    self.gazMatchCache = {}
class SingleDependencyTypeExampleBuilder(ExampleBuilder):
    """
    Generates one example per dependency edge connecting two entity head
    tokens. The example is labeled with the interaction type between the
    tokens ("_rev"-suffixed when the interaction runs against the dependency
    direction), or "neg" when no interaction connects them.
    """
    def __init__(self):
        ExampleBuilder.__init__(self)
        self.classSet = IdSet(1)
        # "neg" must always map to class id 1
        assert( self.classSet.getId("neg") == 1 )
        self.featureBuilder = EdgeFeatureBuilder(self.featureSet)

    def buildExamples(self, sentenceGraph):
        """Return a list of example tuples, one per candidate dependency."""
        builtExamples = []
        exampleCount = 0
        interactions = sentenceGraph.interactionGraph
        for depEdge in sentenceGraph.dependencyGraph.edges():
            # Only dependencies whose tokens are both entity heads qualify
            if (sentenceGraph.tokenIsEntityHead[depEdge[0]] == None) or (sentenceGraph.tokenIsEntityHead[depEdge[1]] == None):
                continue
            if interactions.has_edge(depEdge[0], depEdge[1]):
                # interaction(s) along the dependency direction
                for interaction in interactions.get_edge(depEdge[0], depEdge[1]):
                    builtExamples.append(self.buildExample(depEdge, interaction, False, exampleCount, sentenceGraph))
                    exampleCount += 1
            elif interactions.has_edge(depEdge[1], depEdge[0]):
                # interaction(s) against the dependency direction
                for interaction in interactions.get_edge(depEdge[1], depEdge[0]):
                    builtExamples.append(self.buildExample(depEdge, interaction, True, exampleCount, sentenceGraph))
                    exampleCount += 1
            else:
                # no interaction between the tokens: negative example
                builtExamples.append(self.buildExample(depEdge, None, None, exampleCount, sentenceGraph))
                exampleCount += 1
        return builtExamples

    def buildExample(self, depEdge, intEdge, isReverse, exampleIndex, sentenceGraph):
        """Return one example tuple (id, class, features, extra) for depEdge."""
        if intEdge == None:
            categoryName = "neg"
            category = 1 # "neg" is always id 1 (asserted in __init__)
        else:
            categoryName = intEdge.attrib["type"]
            if isReverse:
                categoryName += "_rev"
            category = self.classSet.getId(categoryName)
        features = self.buildFeatures(depEdge, sentenceGraph)
        # Extra attributes (e.g. for the visualizer): order t1/t2 by token
        # position in the sentence, and record whether the pair was swapped
        firstToken, secondToken = depEdge[0], depEdge[1]
        swapped = int(firstToken.attrib["id"].split("_")[-1]) >= int(secondToken.attrib["id"].split("_")[-1])
        if swapped:
            firstToken, secondToken = secondToken, firstToken
        extra = {"xtype": "edge", "type": categoryName,
                 "t1": firstToken, "t2": secondToken, "deprev": swapped}
        return (sentenceGraph.getSentenceId() + ".x" + str(exampleIndex), category, features, extra)

    def buildFeatures(self, depEdge, sentenceGraph):
        """Collect edge, attached-edge and linear order features for depEdge."""
        features = {}
        self.featureBuilder.setFeatureVector(features)
        # Features of the dependency edge itself
        self.featureBuilder.buildEdgeFeatures(depEdge, sentenceGraph, "dep_", text=True, POS=True, annType=True, maskNames=True)
        # Features of dependencies attached to the edge's tokens
        self.featureBuilder.buildAttachedEdgeFeatures(depEdge, sentenceGraph, "", text=False, POS=True, annType=False, maskNames=True)
        # Linear order of the tokens in the sentence
        self.featureBuilder.buildLinearOrderFeatures(depEdge)
        self.featureBuilder.setFeatureVector(None)
        return features
def __init__(self, style=None, length=None, types=None, featureSet=None, classSet=None):
    """Initialize the edge example builder and its feature builders.

    style      -- parameter string/list parsed by getParameters; None selects
                  the defaults typed + directed + headsOnly
    length     -- maximum path length; currently unused and must be None
    types      -- optional list of edge types to build examples for
    featureSet -- IdSet of feature names (a new one is created when None)
    classSet   -- IdSet of class names; "neg" must map to id 1 (or to -1 in
                  a two-class set)
    """
    if featureSet is None:
        featureSet = IdSet()
    if classSet is None:
        classSet = IdSet(1)
    assert (classSet.getId("neg") == 1 or (len(classSet.Ids) == 2 and classSet.getId("neg") == -1))
    ExampleBuilder.__init__(self, classSet=classSet, featureSet=featureSet)
    # Recognized style parameters (accidental duplicates removed from the
    # original list: "headsOnly", "graph_kernel", "trigger_features").
    self.styles = self.getParameters(style, [
        "typed", "directed", "headsOnly", "graph_kernel", "noAnnType",
        "noMasking", "maxFeatures", "genia_limits", "epi_limits", "id_limits",
        "rel_limits", "bb_limits", "bi_limits", "co_limits", "genia_task1",
        "ontology", "nodalida", "bacteria_renaming", "trigger_features",
        "rel_features", "ddi_features", "evex", "giuliano", "random",
        "themeOnly", "causeOnly", "no_path", "entities", "skip_extra_triggers",
        "no_task", "no_dependency", "disable_entity_features",
        "disable_terminus_features", "disable_single_element_features",
        "disable_ngram_features", "disable_path_edge_features", "no_linear",
        "subset", "binary", "pos_only", "entity_type"])
    if style is None:  # no parameters given: enable the defaults
        # BUGFIX: the original assigned into the None 'style' object
        # (style["typed"] = ...), raising TypeError whenever style was
        # omitted; the defaults belong in the parsed self.styles dict.
        self.styles["typed"] = self.styles["directed"] = self.styles["headsOnly"] = True
    self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet)
    # NOTE Temporarily re-enabling predicted range
    #self.multiEdgeFeatureBuilder.definePredictedValueRange([], None)
    if self.styles["graph_kernel"]:
        # Imported lazily: only needed when graph kernel features are on.
        from FeatureBuilders.GraphKernelFeatureBuilder import GraphKernelFeatureBuilder
        self.graphKernelFeatureBuilder = GraphKernelFeatureBuilder(self.featureSet)
    if self.styles["noAnnType"]:
        self.multiEdgeFeatureBuilder.noAnnType = True
    if self.styles["noMasking"]:
        self.multiEdgeFeatureBuilder.maskNamedEntities = False
    if self.styles["maxFeatures"]:
        self.multiEdgeFeatureBuilder.maximum = True
    if self.styles["genia_task1"]:
        # GENIA task 1 ignores "Entity" annotations.
        self.multiEdgeFeatureBuilder.filterAnnTypes.add("Entity")
    self.tokenFeatureBuilder = TokenFeatureBuilder(self.featureSet)
    if self.styles["ontology"]:
        self.multiEdgeFeatureBuilder.ontologyFeatureBuilder = BioInferOntologyFeatureBuilder(self.featureSet)
    if self.styles["nodalida"]:
        self.nodalidaFeatureBuilder = NodalidaFeatureBuilder(self.featureSet)
    if self.styles["bacteria_renaming"]:
        self.bacteriaRenamingFeatureBuilder = BacteriaRenamingFeatureBuilder(self.featureSet)
    if self.styles["trigger_features"]:
        self.triggerFeatureBuilder = TriggerFeatureBuilder(self.featureSet)
        self.triggerFeatureBuilder.useNonNameEntities = True
        if self.styles["genia_task1"]:
            self.triggerFeatureBuilder.filterAnnTypes.add("Entity")
        #self.bioinferOntologies = OntologyUtils.loadOntologies(OntologyUtils.g_bioInferFileName)
    if self.styles["rel_features"]:
        self.relFeatureBuilder = RELFeatureBuilder(featureSet)
    if self.styles["ddi_features"]:
        self.drugFeatureBuilder = DrugFeatureBuilder(featureSet)
    if self.styles["evex"]:
        self.evexFeatureBuilder = EVEXFeatureBuilder(featureSet)
    if self.styles["giuliano"]:
        self.giulianoFeatureBuilder = GiulianoFeatureBuilder(featureSet)
    self.pathLengths = length
    assert (self.pathLengths is None)
    # BUGFIX: 'types=[]' was a shared mutable default argument; use None as
    # the default and materialize a fresh list here (backward-compatible).
    self.types = types if types is not None else []
    if self.styles["random"]:
        from FeatureBuilders.RandomFeatureBuilder import RandomFeatureBuilder
        self.randomFeatureBuilder = RandomFeatureBuilder(self.featureSet)
def __init__(self, style=None, types=[], featureSet=None, classSet=None):
    """Initialize the builder and instantiate the feature builders selected
    by the given style parameters.

    style      -- parameter string/list parsed by getParameters
    types      -- optional list of edge types to build examples for
    featureSet -- IdSet of feature names (a new one is created when None)
    classSet   -- IdSet of class names; "neg" must map to id 1 (or to -1 in
                  a two-class set)
    """
    if featureSet == None:
        featureSet = IdSet()
    if classSet == None:
        classSet = IdSet(1)
    else:
        classSet = classSet
    ExampleBuilder.__init__(self, classSet=classSet, featureSet=featureSet)
    assert( classSet.getId("neg") == 1 or (len(classSet.Ids)== 2 and classSet.getId("neg") == -1) )
    # Basic style = trigger_features:typed:directed:no_linear:entities:auto_limits:noMasking:maxFeatures
    # Declare every parameter name this builder understands; getParameters
    # below parses the user-supplied style against this set.
    self._setDefaultParameters([
        "directed", "undirected", "headsOnly", "graph_kernel", "noAnnType",
        "mask_nodes", "limit_features", "no_auto_limits", "co_features",
        "genia_features", "bi_features", #"genia_limits",
        "epi_limits", "id_limits", "rel_limits", "bb_limits", "bi_limits",
        "co_limits", "genia_task1", "ontology", "nodalida",
        "bacteria_renaming", "no_trigger_features", "rel_features",
        "drugbank_features", "ddi_mtmx", "evex", "giuliano", "random",
        "themeOnly", "causeOnly", "no_path", "token_nodes",
        "skip_extra_triggers", "headsOnly", "graph_kernel", "no_task",
        "no_dependency", "disable_entity_features",
        "disable_terminus_features", "disable_single_element_features",
        "disable_ngram_features", "disable_path_edge_features",
        "linear_features", "subset", "binary", "pos_only", "entity_type",
        "filter_shortest_path", "maskTypeAsProtein", "keep_neg", "metamap",
        "sdb_merge", "sdb_features", "ontobiotope_features", "no_self_loops",
        "full_entities", "no_features", "wordnet", "wordvector",
        "se10t8_undirected", "filter_types", "doc_extra", "entity_extra"])
    self.styles = self.getParameters(style)
    #if style == None: # no parameters given
    #    style["typed"] = style["directed"] = style["headsOnly"] = True
    self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet, self.styles)
    # NOTE Temporarily re-enabling predicted range
    #self.multiEdgeFeatureBuilder.definePredictedValueRange([], None)
    if self.styles["graph_kernel"]:
        # Imported lazily: only needed when graph kernel features are on.
        from FeatureBuilders.GraphKernelFeatureBuilder import GraphKernelFeatureBuilder
        self.graphKernelFeatureBuilder = GraphKernelFeatureBuilder(self.featureSet)
    if self.styles["noAnnType"]:
        self.multiEdgeFeatureBuilder.noAnnType = True
    # Named-entity masking is opt-in via "mask_nodes".
    if self.styles["mask_nodes"]:
        self.multiEdgeFeatureBuilder.maskNamedEntities = True
    else:
        self.multiEdgeFeatureBuilder.maskNamedEntities = False
    if not self.styles["limit_features"]:
        self.multiEdgeFeatureBuilder.maximum = True
    if self.styles["genia_task1"]:
        # GENIA task 1 ignores "Entity" annotations.
        self.multiEdgeFeatureBuilder.filterAnnTypes.add("Entity")
    self.tokenFeatureBuilder = TokenFeatureBuilder(self.featureSet)
    if self.styles["ontology"]:
        self.multiEdgeFeatureBuilder.ontologyFeatureBuilder = BioInferOntologyFeatureBuilder(self.featureSet)
    if self.styles["ontobiotope_features"]:
        self.ontobiotopeFeatureBuilder = OntoBiotopeFeatureBuilder(self.featureSet)
    if self.styles["nodalida"]:
        self.nodalidaFeatureBuilder = NodalidaFeatureBuilder(self.featureSet)
    if self.styles["bacteria_renaming"]:
        self.bacteriaRenamingFeatureBuilder = BacteriaRenamingFeatureBuilder(self.featureSet)
    # Trigger features are on by default; "no_trigger_features" disables them.
    if not self.styles["no_trigger_features"]:
        self.triggerFeatureBuilder = TriggerFeatureBuilder(self.featureSet, self.styles)
        self.triggerFeatureBuilder.useNonNameEntities = True
        if self.styles["noAnnType"]:
            self.triggerFeatureBuilder.noAnnType = True
        if self.styles["genia_task1"]:
            self.triggerFeatureBuilder.filterAnnTypes.add("Entity")
        #self.bioinferOntologies = OntologyUtils.loadOntologies(OntologyUtils.g_bioInferFileName)
    if self.styles["rel_features"]:
        self.relFeatureBuilder = RELFeatureBuilder(featureSet)
    if self.styles["drugbank_features"]:
        self.drugFeatureBuilder = DrugFeatureBuilder(featureSet)
    if self.styles["evex"]:
        self.evexFeatureBuilder = EVEXFeatureBuilder(featureSet)
    if self.styles["wordnet"]:
        self.wordNetFeatureBuilder = WordNetFeatureBuilder(featureSet)
    if self.styles["wordvector"]:
        self.wordVectorFeatureBuilder = WordVectorFeatureBuilder(featureSet, self.styles)
    if self.styles["giuliano"]:
        self.giulianoFeatureBuilder = GiulianoFeatureBuilder(featureSet)
    self.types = types
    if self.styles["random"]:
        from FeatureBuilders.RandomFeatureBuilder import RandomFeatureBuilder
        self.randomFeatureBuilder = RandomFeatureBuilder(self.featureSet)
class TokenRoleMultiEdgeTypeExampleBuilder(ExampleBuilder):
    """Builds examples from the shortest undirected dependency path (at most
    4 edges) between every pair of entity-head tokens, one example per
    direction.

    Features describe the terminal tokens, the role of each internal token
    given the directions of its two path edges, and direction-aware
    dependency-type unigrams and bigrams along the path.
    """
    def __init__(self):
        ExampleBuilder.__init__(self)
        self.classSet = IdSet(1)
        # Class id 1 must be the negative class.
        assert( self.classSet.getId("neg") == 1 )

    # Results slightly nondeterministic because when there are multiple edges
    # between two tokens, this currently returns only one, and their order is
    # not defined.
    def getEdges(self, graph, path):
        """Return [(edge, isForward), ...] for consecutive token pairs of
        path; isForward is True when the dependency points along the path
        (edge[0] == previous token, edge[1] == next token)."""
        pathEdges = []
        edges = graph.edges()
        for i in range(1, len(path)):
            found = False
            for edge in edges:
                if edge[0] == path[i-1] and edge[1] == path[i]:
                    pathEdges.append((edge, True))
                    found = True
                elif edge[1] == path[i-1] and edge[0] == path[i]:
                    pathEdges.append((edge, False))
                    found = True
                if found == True:
                    break
            # Every consecutive token pair of a shortest path must be joined
            # by some dependency edge.
            assert(found==True)
        return pathEdges

    def addType(self, token, features, sentenceGraph, prefix="annType_"):
        """Add the annotation type of token's entity (if any) as a feature.

        BUGFIX: the original ignored the prefix parameter and always used the
        literal "annType_"; the prefix is now honoured (all current callers
        pass the default, so their behaviour is unchanged).
        """
        if sentenceGraph.tokenIsEntityHead[token] != None:
            features[self.featureSet.getId(prefix+sentenceGraph.tokenIsEntityHead[token].attrib["type"])] = 1

    def buildExamples(self, sentenceGraph):
        """Return examples for all entity-head token pairs connected by an
        undirected dependency path of length <= 4; each pair yields one
        example per path direction."""
        examples = []
        exampleIndex = 0
        undirected = sentenceGraph.dependencyGraph.to_undirected()
        paths = NX.all_pairs_shortest_path(undirected, cutoff=4)
        for i in range(len(sentenceGraph.tokens)-1):
            for j in range(i+1,len(sentenceGraph.tokens)):
                tI = sentenceGraph.tokens[i]
                tJ = sentenceGraph.tokens[j]
                # only consider paths between entities (NOTE! entities, not
                # only named entities)
                if (sentenceGraph.tokenIsEntityHead[tI] == None) or (sentenceGraph.tokenIsEntityHead[tJ] == None):
                    continue
                # find the path (dict.has_key replaced with the portable
                # "in" operator; same semantics)
                if tI in paths and tJ in paths[tI]:
                    path = paths[tI][tJ]
                elif tJ in paths and tI in paths[tJ]:
                    path = paths[tJ][tI]
                else:
                    continue
                if len(path) > 1:#> 2:
                    # define class: label with the interaction type when an
                    # interaction spans the path terminals, "neg" otherwise
                    if sentenceGraph.interactionGraph.has_edge(path[0], path[-1]):
                        categoryName = sentenceGraph.interactionGraph.get_edge(path[0], path[-1]).attrib["type"]
                        self.buildExample(path, sentenceGraph, categoryName, examples, exampleIndex)
                        exampleIndex += 1
                    else:
                        self.buildExample(path, sentenceGraph, "neg", examples, exampleIndex)
                        exampleIndex += 1
                    # The reversed path is classified independently.
                    if sentenceGraph.interactionGraph.has_edge(path[-1], path[0]):
                        categoryName = sentenceGraph.interactionGraph.get_edge(path[-1], path[0]).attrib["type"]
                        #categoryName += "_rev"
                        self.buildExample(path[::-1], sentenceGraph, categoryName, examples, exampleIndex)
                        exampleIndex += 1
                    else:
                        self.buildExample(path[::-1], sentenceGraph, "neg", examples, exampleIndex)
                        exampleIndex += 1
        return examples

    def buildExample(self, path, sentenceGraph, categoryName, examples, exampleIndex):
        """Append one (id, class, features, extra) example for path (direction
        matters) to examples."""
        # define features
        features = {}
        edges = self.getEdges(sentenceGraph.dependencyGraph, path)
        features[self.featureSet.getId("len_edges_"+str(len(edges)))] = 1
        features[self.featureSet.getId("len")] = len(edges)
        self.buildPathRoleFeatures(path, edges, sentenceGraph, features)
        self.buildEdgeCombinations(edges, sentenceGraph, features)
        #self.buildTerminusFeatures(path[0], "t1", sentenceGraph, features)
        #self.buildTerminusFeatures(path[-1], "t2", sentenceGraph, features)
        for edge in edges:
            self.buildPathEdgeFeatures(edge[0], sentenceGraph, features)
        # define extra attributes: t1/t2 are stored in linear (token id)
        # order; "deprev" records whether that order reverses the path
        if int(path[0].attrib["id"].split("_")[-1]) < int(path[-1].attrib["id"].split("_")[-1]):
            extra = {"xtype":"edge","type":"i","t1":path[0],"t2":path[-1]}
            extra["deprev"] = False
        else:
            extra = {"xtype":"edge","type":"i","t1":path[-1],"t2":path[0]}
            extra["deprev"] = True
        # make example
        category = self.classSet.getId(categoryName)
        examples.append( (sentenceGraph.getSentenceId()+".x"+str(exampleIndex),category,features,extra) )

    def buildPathRoleFeatures(self, pathTokens, pathEdges, sentenceGraph, features):
        """Add terminal-token features and, for every internal token, features
        for its role given the directions of its two path edges:
        Right (both edges forward), Left (both backward), Top (both point
        away from the token), Bottom (both point towards it)."""
        features[self.featureSet.getId("tokTerm1POS_"+pathTokens[0].attrib["POS"])] = 1
        features[self.featureSet.getId("tokTerm1txt_"+sentenceGraph.getTokenText(pathTokens[0]))] = 1
        features[self.featureSet.getId("tokTerm2POS_"+pathTokens[-1].attrib["POS"])] = 1
        features[self.featureSet.getId("tokTerm2txt_"+sentenceGraph.getTokenText(pathTokens[-1]))] = 1
        for i in range(1,len(pathEdges)):
            if pathEdges[i-1][1] and pathEdges[i][1]:
                features[self.featureSet.getId("depRight1_"+pathEdges[i-1][0][2].attrib["type"])] = 1
                features[self.featureSet.getId("depRight2_"+pathEdges[i][0][2].attrib["type"])] = 1
                features[self.featureSet.getId("tokRightPOS_"+pathTokens[i].attrib["POS"])] = 1
                features[self.featureSet.getId("tokRightTxt_"+sentenceGraph.getTokenText(pathTokens[i]))] = 1
            elif (not pathEdges[i-1][1]) and (not pathEdges[i][1]):
                features[self.featureSet.getId("depLeft1_"+pathEdges[i-1][0][2].attrib["type"])] = 1
                features[self.featureSet.getId("depLeft2_"+pathEdges[i][0][2].attrib["type"])] = 1
                features[self.featureSet.getId("tokLeftPOS_"+pathTokens[i].attrib["POS"])] = 1
                features[self.featureSet.getId("tokLeftTxt_"+sentenceGraph.getTokenText(pathTokens[i]))] = 1
            elif (not pathEdges[i-1][1]) and pathEdges[i][1]:
                features[self.featureSet.getId("depTop1_"+pathEdges[i-1][0][2].attrib["type"])] = 1
                features[self.featureSet.getId("depTop2_"+pathEdges[i][0][2].attrib["type"])] = 1
                features[self.featureSet.getId("tokTopPOS_"+pathTokens[i].attrib["POS"])] = 1
                features[self.featureSet.getId("tokTopTxt_"+sentenceGraph.getTokenText(pathTokens[i]))] = 1
            elif pathEdges[i-1][1] and (not pathEdges[i][1]):
                features[self.featureSet.getId("depBottom1_"+pathEdges[i-1][0][2].attrib["type"])] = 1
                features[self.featureSet.getId("depBottom2_"+pathEdges[i][0][2].attrib["type"])] = 1
                features[self.featureSet.getId("tokBottomPOS_"+pathTokens[i].attrib["POS"])] = 1
                features[self.featureSet.getId("tokBottomTxt_"+sentenceGraph.getTokenText(pathTokens[i]))] = 1

    def buildPathEdgeFeatures(self, depEdge, sentenceGraph, features):
        """Add type, text, POS and annotation-type features for one path edge."""
        depType = depEdge[2].attrib["type"]
        features[self.featureSet.getId("dep_"+depType)] = 1
        # Token 1
        features[self.featureSet.getId("txt_"+sentenceGraph.getTokenText(depEdge[0]))] = 1
        features[self.featureSet.getId("POS_"+depEdge[0].attrib["POS"])] = 1
        self.addType(depEdge[0], features, sentenceGraph, prefix="annType_")
        # Token 2
        features[self.featureSet.getId("txt_"+sentenceGraph.getTokenText(depEdge[1]))] = 1
        features[self.featureSet.getId("POS_"+depEdge[1].attrib["POS"])] = 1
        self.addType(depEdge[1], features, sentenceGraph, prefix="annType_")

    def buildEdgeCombinations(self, edges, sentenceGraph, features):
        """Add per-edge direction features, internal-token features and
        direction-aware dependency-type bigrams for the path edges."""
        # Edges directed relative to the path
        for i in range(len(edges)):
            depType = edges[i][0][2].attrib["type"]
            if edges[i][1]:
                features[self.featureSet.getId("dep_"+depType+">")] = 1
            else:
                features[self.featureSet.getId("dep_<"+depType)] = 1
        # Internal tokens: the inner endpoint of the first and last edge,
        # plus both endpoints of every middle edge.
        if edges[0][1]:
            features[self.featureSet.getId("internalPOS_"+edges[0][0][0].attrib["POS"])]=1
            features[self.featureSet.getId("internalTxt_"+sentenceGraph.getTokenText(edges[0][0][0]))]=1
        else:
            features[self.featureSet.getId("internalPOS_"+edges[0][0][1].attrib["POS"])]=1
            features[self.featureSet.getId("internalTxt_"+sentenceGraph.getTokenText(edges[0][0][1]))]=1
        if edges[-1][1]:
            features[self.featureSet.getId("internalPOS_"+edges[-1][0][1].attrib["POS"])]=1
            features[self.featureSet.getId("internalTxt_"+sentenceGraph.getTokenText(edges[-1][0][1]))]=1
        else:
            features[self.featureSet.getId("internalPOS_"+edges[-1][0][0].attrib["POS"])]=1
            features[self.featureSet.getId("internalTxt_"+sentenceGraph.getTokenText(edges[-1][0][0]))]=1
        for i in range(1,len(edges)-1):
            features[self.featureSet.getId("internalPOS_"+edges[i][0][0].attrib["POS"])]=1
            features[self.featureSet.getId("internalTxt_"+sentenceGraph.getTokenText(edges[i][0][0]))]=1
            features[self.featureSet.getId("internalPOS_"+edges[i][0][1].attrib["POS"])]=1
            features[self.featureSet.getId("internalTxt_"+sentenceGraph.getTokenText(edges[i][0][1]))]=1
            features[self.featureSet.getId("internalDep_"+edges[i][0][2].attrib["type"])]=1
        # Direction-aware dependency-type bigrams.
        for i in range(1,len(edges)):
            type1 = edges[i-1][0][2].attrib["type"]
            type2 = edges[i][0][2].attrib["type"]
            # BUGFIX: the original tested edges[i][0] / edges[i-1][0] -- the
            # edge tuple itself, which is always truthy -- instead of the
            # boolean direction flag at index 1 (as buildPathRoleFeatures
            # correctly does), so mixed-direction bigrams were mislabelled.
            if edges[i-1][1] and edges[i][1]:
                features[self.featureSet.getId("dep_"+type1+">"+type2+">")] = 1
            elif edges[i-1][1] and not edges[i][1]:
                features[self.featureSet.getId("dep_"+type1+">"+type2+"<")] = 1
            elif not edges[i-1][1] and not edges[i][1]:
                features[self.featureSet.getId("dep_"+type1+"<"+type2+"<")] = 1
            else:
                features[self.featureSet.getId("dep_"+type1+"<"+type2+">")] = 1

    def buildTerminusFeatures(self, token, prefix, sentenceGraph, features):
        """Add features for dependencies hanging off token; feature names are
        namespaced by prefix (e.g. "t1"/"t2")."""
        # Attached edges
        t1InEdges = sentenceGraph.dependencyGraph.in_edges(token)
        for edge in t1InEdges:
            features[self.featureSet.getId(prefix+"HangingIn_"+edge[2].attrib["type"])] = 1
            features[self.featureSet.getId(prefix+"HangingIn_"+edge[0].attrib["POS"])] = 1
            # BUGFIX: token-text features were hard-coded to the "t1" prefix,
            # so t2's text features collided with t1's; use prefix like the
            # type and POS features above.
            features[self.featureSet.getId(prefix+"HangingIn_"+sentenceGraph.getTokenText(edge[0]))] = 1
        t1OutEdges = sentenceGraph.dependencyGraph.out_edges(token)
        for edge in t1OutEdges:
            features[self.featureSet.getId(prefix+"HangingOut_"+edge[2].attrib["type"])] = 1
            features[self.featureSet.getId(prefix+"HangingOut_"+edge[1].attrib["POS"])] = 1
            features[self.featureSet.getId(prefix+"HangingOut_"+sentenceGraph.getTokenText(edge[1]))] = 1