class ExampleBuilder:
    """
    ExampleBuilder is the abstract base class for specialized example builders.
    Example builders take some data and convert it to examples usable by e.g. SVMs.
    An example builder writes three files, an example-file (in extended Joachim's
    SVM format) and .class_names and .feature_names files, which contain the names
    for the class and feature id-numbers. An example builder can also be given
    pre-existing sets of class and feature ids (optionally in files) so that the
    generated examples are consistent with other, previously generated examples.

    Subclasses must implement buildExamplesFromGraph().
    """
    structureAnalyzer = None

    def __init__(self, classSet=None, featureSet=None):
        """
        @param classSet: an id set for class names, or the name of a file from
        which an IdSet is constructed
        @param featureSet: an id set for feature names, or the name of a file
        from which an IdSet is constructed
        """
        if isinstance(classSet, types.StringTypes):
            self.classSet = IdSet(filename=classSet)
        else:
            self.classSet = classSet

        if isinstance(featureSet, types.StringTypes):
            self.featureSet = IdSet(filename=featureSet)
        else:
            self.featureSet = featureSet

        self.featureTag = ""  # prefix added to every feature name in setFeature
        self.exampleStats = ExampleStats()
        self.parse = None
        self.tokenization = None
        self.classIdFilename = None
        self.featureIdFilename = None
        self.styles = {}
        self._defaultParameters = None
        self._parameterValueLimits = None
        self._setDefaultParameters(["sentenceLimit"])
        self.debug = False

    def hasStyle(self, style):
        """
        Return True if the style is defined in self.styles and has a true value.
        """
        return style in self.styles and bool(self.styles[style])

    def _setDefaultParameters(self, defaults=None, valueLimits=None):
        """
        Add parameters (and optional value limits) to the defaults used by
        getParameters(). Repeated calls accumulate.
        """
        # Initialize
        if self._defaultParameters is None:
            self._defaultParameters = {}
        if self._parameterValueLimits is None:
            self._parameterValueLimits = {}
        newParameters = Utils.Parameters.get({}, defaults, valueLimits=valueLimits)
        self._defaultParameters.update(newParameters)
        if valueLimits is not None:
            self._parameterValueLimits.update(valueLimits)

    def getParameters(self, parameters):
        """
        Process parameters with Utils.Parameters.get, using this builder's
        accumulated defaults and value limits.
        """
        return Utils.Parameters.get(parameters,
                                    defaults=self._defaultParameters,
                                    valueLimits=self._parameterValueLimits)

    def setFeature(self, name, value):
        """
        Store value in self.features under the id of the tagged feature name.
        NOTE(review): self.features is not created in __init__; presumably
        the subclass sets it up per example -- confirm.
        """
        self.features[self.featureSet.getId(self.featureTag + name)] = value

    def getElementCounts(self, filename):
        """
        Count the lines of a (possibly gzipped) file that contain "<document"
        or "<sentence". A line containing both is counted as a document only.

        @return: a dictionary with the keys "documents" and "sentences"
        """
        print >> sys.stderr, "Counting elements:",
        if filename.endswith(".gz"):
            f = gzip.open(filename, "rt")
        else:
            f = open(filename, "rt")
        counts = {"documents": 0, "sentences": 0}
        try:
            for line in f:
                if "<document" in line:
                    counts["documents"] += 1
                elif "<sentence" in line:
                    counts["sentences"] += 1
        finally:
            # Close the file also when reading fails
            f.close()
        print >> sys.stderr, counts
        return counts

    def saveIds(self):
        """
        Write the class and feature id sets to their files, for those which
        have a filename defined.
        """
        if self.classIdFilename is not None:
            print >> sys.stderr, "Saving class names to", self.classIdFilename
            self.classSet.write(self.classIdFilename)
        else:
            print >> sys.stderr, "Class names not saved"
        if self.featureIdFilename is not None:
            print >> sys.stderr, "Saving feature names to", self.featureIdFilename
            self.featureSet.write(self.featureIdFilename)
        else:
            print >> sys.stderr, "Feature names not saved"

    def processCorpus(self, input, output, gold=None, append=False, allowNewIds=True, structureAnalyzer=None):
        """
        Build examples for all documents of the input corpus and write them
        to the output file.

        @param input: the input corpus; when it is a string, it is treated as a
        filename whose elements are counted for the progress display
        @param output: the name of the example file (gzipped if it ends with ".gz")
        @param gold: an optional gold corpus, iterated in parallel with the input
        @param append: append to the output file instead of overwriting it
        @param allowNewIds: if True, the id sets are saved after processing
        @param structureAnalyzer: passed through to buildExamplesFromGraph
        """
        # Create intermediate paths if needed
        outDir = os.path.dirname(output)
        if outDir != "" and not os.path.exists(outDir):
            os.makedirs(outDir)
        # Open output file
        openStyle = "wt"
        if append:
            openStyle = "at"
        if output.endswith(".gz"):
            outfile = gzip.open(output, openStyle)
        else:
            outfile = open(output, openStyle)
        try:
            # Build examples
            self.exampleCount = 0
            self.elementCounts = None
            if type(input) in types.StringTypes:
                self.elementCounts = self.getElementCounts(input)
                if self.elementCounts["sentences"] <= 0:
                    self.elementCounts = None
            if self.elementCounts is not None:
                self.progress = ProgressCounter(self.elementCounts["sentences"], "Build examples")
            else:
                self.progress = ProgressCounter(None, "Build examples")

            # getSentences loads the corpus and makes the sentence graphs
            self.calculatePredictedRange(self.getSentences(input, self.parse, self.tokenization))

            removeIntersentenceInteractions = True
            if self.styles.get("keep_intersentence"):
                print >> sys.stderr, "Keeping intersentence interactions for input corpus"
                removeIntersentenceInteractions = False
            inputIterator = getCorpusIterator(
                input, None, self.parse, self.tokenization,
                removeIntersentenceInteractions=removeIntersentenceInteractions)

            if gold is not None:
                removeGoldIntersentenceInteractions = True
                if self.styles.get("keep_intersentence_gold"):
                    print >> sys.stderr, "Keeping intersentence interactions for gold corpus"
                    removeGoldIntersentenceInteractions = False
                goldIterator = getCorpusIterator(
                    gold, None, self.parse, self.tokenization,
                    removeIntersentenceInteractions=removeGoldIntersentenceInteractions)
                # izip_longest pads the shorter iterator with None, so the
                # assertions catch corpora with different document counts
                for inputSentences, goldSentences in itertools.izip_longest(
                        inputIterator, goldIterator, fillvalue=None):
                    assert inputSentences is not None, "gold corpus has more documents than input"
                    assert goldSentences is not None, "input corpus has more documents than gold"
                    self.processDocument(inputSentences, goldSentences, outfile,
                                         structureAnalyzer=structureAnalyzer)
            else:
                for inputSentences in inputIterator:
                    self.processDocument(inputSentences, None, outfile,
                                         structureAnalyzer=structureAnalyzer)
        finally:
            # Do not leak the output file if example building fails
            outfile.close()
        self.progress.endUpdate()

        # Show statistics
        print >> sys.stderr, "Examples built:", self.exampleCount
        print >> sys.stderr, "Features:", len(self.featureSet.getNames())
        print >> sys.stderr, "Classes:", len(self.classSet.getNames())
        print >> sys.stderr, "Style:", Utils.Parameters.toString(self.getParameters(self.styles))
        if self.exampleStats.getExampleCount() > 0:
            self.exampleStats.printStats()

        # Save Ids
        if allowNewIds:
            self.saveIds()

    def processDocument(self, sentences, goldSentences, outfile, structureAnalyzer=None):
        """
        Build examples for each sentence of a document. If gold sentences are
        given, they are matched with the input sentences by index.
        """
        for i, sentence in enumerate(sentences):
            goldSentence = None
            if goldSentences is not None:
                goldSentence = goldSentences[i]
            self.progress.update(1, "Building examples (" + sentence.sentence.get("id") + "): ")
            self.processSentence(sentence, outfile, goldSentence, structureAnalyzer=structureAnalyzer)

    def processSentence(self, sentence, outfile, goldSentence=None, structureAnalyzer=None):
        """
        Build examples for one sentence, unless it is discarded by the
        "sentenceLimit" style. Sentences without a sentence graph are skipped.

        @param sentence: Utils.InteractionXML.SentenceElements.SentenceElements
        instance; sentence.sentence is the 'sentence' element of the XML file and
        sentence.sentenceGraph the graph built from it
        @param goldSentence: optional gold counterpart of sentence
        """
        # Process filtering rules
        limitRules = self.styles.get("sentenceLimit")
        if limitRules:  # Rules for limiting which sentences to process
            if isinstance(limitRules, types.StringTypes):
                limitRules = [limitRules]
            # Get the list of sentence element attribute names
            sentenceElement = sentence.sentence
            sentenceAttributes = sorted(sentenceElement.attrib.keys())
            # Filter sentences based on matching rules to their attribute values
            for rule in limitRules:
                for sentAttr in sentenceAttributes:
                    # Rules are of the form "attr.value" where "attr" is the name
                    # of the attribute to match, and "value" a substring within
                    # that attribute
                    if rule.startswith(sentAttr + "."):  # rule matches the attribute
                        value = rule.split(".", 1)[-1]  # get the value part of the rule
                        if value not in sentenceElement.get(sentAttr):
                            return  # discard all sentences that do not match all rules
        # Process the sentence
        if sentence.sentenceGraph is not None:
            goldGraph = None
            if goldSentence is not None:
                goldGraph = goldSentence.sentenceGraph
            self.exampleCount += self.buildExamplesFromGraph(
                sentence.sentenceGraph, outfile, goldGraph, structureAnalyzer=structureAnalyzer)

    @classmethod
    def run(cls, input, output, parse, tokenization, style, classIds=None, featureIds=None,
            gold=None, append=False, allowNewIds=True, structureAnalyzer=None, debug=False):
        """
        Construct a builder of class cls and process a corpus with it.

        The builder is constructed with a "style" keyword argument, which the
        constructor of this base class does not accept, so run() must be called
        on a subclass whose constructor does.

        @return: the builder used for processing
        """
        print >> sys.stderr, "Running", cls.__name__
        print >> sys.stderr, "  input:", input
        if gold is not None:
            print >> sys.stderr, "  gold:", gold
        print >> sys.stderr, "  output:", output, "(append:", str(append) + ")"
        print >> sys.stderr, "  add new class/feature ids:", allowNewIds
        if not isinstance(style, types.StringTypes):
            style = Utils.Parameters.toString(style)
        print >> sys.stderr, "  style:", style
        if tokenization is None:
            print >> sys.stderr, "  parse:", parse
        else:
            print >> sys.stderr, "  parse:", parse + ", tokenization:", tokenization
        classSet, featureSet = cls.getIdSets(classIds, featureIds, allowNewIds)
        builder = cls(style=style, classSet=classSet, featureSet=featureSet)
        builder.debug = debug
        builder.classIdFilename = classIds
        builder.featureIdFilename = featureIds
        builder.parse = parse
        builder.tokenization = tokenization
        builder.processCorpus(input, output, gold, append=append, allowNewIds=allowNewIds,
                              structureAnalyzer=structureAnalyzer)
        return builder

    def buildExamplesFromGraph(self, sentenceGraph, outfile, goldGraph=None, structureAnalyzer=None):
        """
        Build the examples for one sentence graph. Must be implemented by
        subclasses; the return value is added to self.exampleCount by
        processSentence, which always passes the structureAnalyzer keyword.
        """
        raise NotImplementedError

    def definePredictedValueRange(self, sentences, elementName):
        """Hook for subclasses, does nothing by default."""
        pass

    def getPredictedValueRange(self):
        """Hook for subclasses, returns None by default."""
        return None

    @classmethod
    def getIdSets(cls, classIds=None, featureIds=None, allowNewIds=True):
        """
        Load the class and feature id sets from files.

        @return: a tuple (classSet, featureSet), where each item is None if the
        corresponding filename is None or the file does not exist
        """
        # Class ids
        if classIds is not None and os.path.exists(classIds):
            print >> sys.stderr, "Using predefined class names from", classIds
            classSet = IdSet(allowNewIds=allowNewIds)
            classSet.load(classIds)
        else:
            print >> sys.stderr, "No predefined class names"
            classSet = None
        # Feature ids
        if featureIds is not None and os.path.exists(featureIds):
            print >> sys.stderr, "Using predefined feature names from", featureIds
            featureSet = IdSet(allowNewIds=allowNewIds)
            featureSet.load(featureIds)
        else:
            print >> sys.stderr, "No predefined feature names"
            featureSet = None
        return classSet, featureSet

    def getSentences(self, input, parse, tokenization, removeNameInfo=False):
        """
        Return the sentences of the input as a list of [sentenceGraph, None]
        items. If input is already a list, it is returned as is, otherwise the
        corpus is loaded with Core.SentenceGraph.loadCorpus and sentences
        without a sentence graph are left out.
        """
        if isinstance(input, list):
            # assume input is already a list of sentences
            assert removeNameInfo == False
            return input
        # Load corpus and make sentence graphs
        corpusElements = Core.SentenceGraph.loadCorpus(
            input, parse, tokenization, removeNameInfo=removeNameInfo)
        # The sentence graph is required for event detection
        return [[sentence.sentenceGraph, None]
                for sentence in corpusElements.sentences
                if sentence.sentenceGraph is not None]

    def calculatePredictedRange(self, sentences):
        """
        Call definePredictedValueRange for the "entity" elements with the
        sentence elements of the given [sentenceGraph, ...] items.
        """
        print >> sys.stderr, "Defining predicted value range:",
        sentenceElements = [sentence[0].sentenceElement for sentence in sentences]
        self.definePredictedValueRange(sentenceElements, "entity")
        print >> sys.stderr, self.getPredictedValueRange()
class ExampleBuilder:
    """
    ExampleBuilder is the abstract base class for specialized example builders.
    Example builders take some data and convert it to examples usable by e.g. SVMs.
    An example builder writes three files, an example-file (in extended Joachim's
    SVM format) and .class_names and .feature_names files, which contain the names
    for the class and feature id-numbers. An example builder can also be given
    pre-existing sets of class and feature ids (optionally in files) so that the
    generated examples are consistent with other, previously generated examples.

    Subclasses must implement buildExamplesFromGraph().
    """
    structureAnalyzer = None

    def __init__(self, classSet=None, featureSet=None):
        """
        @param classSet: an id set for class names, or the name of a file from
        which an IdSet is constructed
        @param featureSet: an id set for feature names, or the name of a file
        from which an IdSet is constructed
        """
        if isinstance(classSet, types.StringTypes):
            self.classSet = IdSet(filename=classSet)
        else:
            self.classSet = classSet

        if isinstance(featureSet, types.StringTypes):
            self.featureSet = IdSet(filename=featureSet)
        else:
            self.featureSet = featureSet

        self.featureTag = ""  # prefix added to every feature name in setFeature
        self.exampleStats = ExampleStats()
        self.parse = None
        self.tokenization = None
        self.classIdFilename = None
        self.featureIdFilename = None
        # No styles until something (presumably a subclass) assigns them;
        # the methods below treat None like an empty style set
        self.styles = None
        self._defaultParameters = None
        self._parameterValueLimits = None
        self._setDefaultParameters(["sentenceLimit"])
        self.debug = False

    def _setDefaultParameters(self, defaults=None, valueLimits=None):
        """
        Add parameters (and optional value limits) to the defaults used by
        getParameters(). Repeated calls accumulate.
        """
        # Initialize
        if self._defaultParameters is None:
            self._defaultParameters = {}
        if self._parameterValueLimits is None:
            self._parameterValueLimits = {}
        newParameters = Utils.Parameters.get({}, defaults, valueLimits=valueLimits)
        self._defaultParameters.update(newParameters)
        if valueLimits is not None:
            self._parameterValueLimits.update(valueLimits)

    def getParameters(self, parameters):
        """
        Process parameters with Utils.Parameters.get, using this builder's
        accumulated defaults and value limits.
        """
        return Utils.Parameters.get(parameters,
                                    defaults=self._defaultParameters,
                                    valueLimits=self._parameterValueLimits)

    def setFeature(self, name, value):
        """
        Store value in self.features under the id of the tagged feature name.
        NOTE(review): self.features is not created in __init__; presumably
        the subclass sets it up per example -- confirm.
        """
        self.features[self.featureSet.getId(self.featureTag + name)] = value

    def getElementCounts(self, filename):
        """
        Count the lines of a (possibly gzipped) file that contain "<document"
        or "<sentence". A line containing both is counted as a document only.

        @return: a dictionary with the keys "documents" and "sentences"
        """
        print >> sys.stderr, "Counting elements:",
        if filename.endswith(".gz"):
            f = gzip.open(filename, "rt")
        else:
            f = open(filename, "rt")
        counts = {"documents": 0, "sentences": 0}
        try:
            for line in f:
                if "<document" in line:
                    counts["documents"] += 1
                elif "<sentence" in line:
                    counts["sentences"] += 1
        finally:
            # Close the file also when reading fails
            f.close()
        print >> sys.stderr, counts
        return counts

    def saveIds(self):
        """
        Write the class and feature id sets to their files, for those which
        have a filename defined.
        """
        if self.classIdFilename is not None:
            print >> sys.stderr, "Saving class names to", self.classIdFilename
            self.classSet.write(self.classIdFilename)
        else:
            print >> sys.stderr, "Class names not saved"
        if self.featureIdFilename is not None:
            print >> sys.stderr, "Saving feature names to", self.featureIdFilename
            self.featureSet.write(self.featureIdFilename)
        else:
            print >> sys.stderr, "Feature names not saved"

    def processCorpus(self, input, output, gold=None, append=False, allowNewIds=True, structureAnalyzer=None):
        """
        Build examples for all documents of the input corpus and write them
        to the output file.

        @param input: the input corpus; when it is a string, it is treated as a
        filename whose elements are counted for the progress display
        @param output: the name of the example file (gzipped if it ends with ".gz")
        @param gold: an optional gold corpus, iterated in parallel with the input
        @param append: append to the output file instead of overwriting it
        @param allowNewIds: if True, the id sets are saved after processing
        @param structureAnalyzer: passed through to buildExamplesFromGraph
        """
        # Create intermediate paths if needed
        outDir = os.path.dirname(output)
        if outDir != "" and not os.path.exists(outDir):
            os.makedirs(outDir)
        # Open output file
        openStyle = "wt"
        if append:
            openStyle = "at"
        if output.endswith(".gz"):
            outfile = gzip.open(output, openStyle)
        else:
            outfile = open(output, openStyle)
        # self.styles is None until it has been assigned, see __init__
        styles = self.styles or {}
        try:
            # Build examples
            self.exampleCount = 0
            self.elementCounts = None
            if type(input) in types.StringTypes:
                self.elementCounts = self.getElementCounts(input)
                if self.elementCounts["sentences"] <= 0:
                    self.elementCounts = None
            if self.elementCounts is not None:
                self.progress = ProgressCounter(self.elementCounts["sentences"], "Build examples")
            else:
                self.progress = ProgressCounter(None, "Build examples")

            self.calculatePredictedRange(self.getSentences(input, self.parse, self.tokenization))

            removeIntersentenceInteractions = True
            if styles.get("keep_intersentence"):
                print >> sys.stderr, "Keeping intersentence interactions for input corpus"
                removeIntersentenceInteractions = False
            inputIterator = getCorpusIterator(
                input, None, self.parse, self.tokenization,
                removeIntersentenceInteractions=removeIntersentenceInteractions)

            if gold is not None:
                removeGoldIntersentenceInteractions = True
                if styles.get("keep_intersentence_gold"):
                    print >> sys.stderr, "Keeping intersentence interactions for gold corpus"
                    removeGoldIntersentenceInteractions = False
                goldIterator = getCorpusIterator(
                    gold, None, self.parse, self.tokenization,
                    removeIntersentenceInteractions=removeGoldIntersentenceInteractions)
                # izip_longest pads the shorter iterator with None, so the
                # assertions catch corpora with different document counts
                for inputSentences, goldSentences in itertools.izip_longest(
                        inputIterator, goldIterator, fillvalue=None):
                    assert inputSentences is not None, "gold corpus has more documents than input"
                    assert goldSentences is not None, "input corpus has more documents than gold"
                    self.processDocument(inputSentences, goldSentences, outfile,
                                         structureAnalyzer=structureAnalyzer)
            else:
                for inputSentences in inputIterator:
                    self.processDocument(inputSentences, None, outfile,
                                         structureAnalyzer=structureAnalyzer)
        finally:
            # Do not leak the output file if example building fails
            outfile.close()
        self.progress.endUpdate()

        # Show statistics
        print >> sys.stderr, "Examples built:", self.exampleCount
        print >> sys.stderr, "Features:", len(self.featureSet.getNames())
        print >> sys.stderr, "Style:", Utils.Parameters.toString(self.getParameters(self.styles))
        if self.exampleStats.getExampleCount() > 0:
            self.exampleStats.printStats()

        # Save Ids
        if allowNewIds:
            self.saveIds()

    def processDocument(self, sentences, goldSentences, outfile, structureAnalyzer=None):
        """
        Build examples for each sentence of a document. If gold sentences are
        given, they are matched with the input sentences by index.
        """
        for i, sentence in enumerate(sentences):
            goldSentence = None
            if goldSentences is not None:
                goldSentence = goldSentences[i]
            self.progress.update(1, "Building examples (" + sentence.sentence.get("id") + "): ")
            self.processSentence(sentence, outfile, goldSentence, structureAnalyzer=structureAnalyzer)

    def processSentence(self, sentence, outfile, goldSentence=None, structureAnalyzer=None):
        """
        Build examples for one sentence, unless it is discarded by the
        "sentenceLimit" style. Sentences without a sentence graph are skipped.

        @param sentence: an object whose "sentence" attribute is the sentence
        element and whose "sentenceGraph" attribute is the graph built from it
        @param goldSentence: optional gold counterpart of sentence
        """
        # Process filtering rules. The style may be undefined (or self.styles
        # still None), in which case no sentence is filtered.
        limitRules = (self.styles or {}).get("sentenceLimit")
        if limitRules:  # Rules for limiting which sentences to process
            if isinstance(limitRules, types.StringTypes):
                limitRules = [limitRules]
            # Get the list of sentence element attribute names
            sentenceElement = sentence.sentence
            sentenceAttributes = sorted(sentenceElement.attrib.keys())
            # Filter sentences based on matching rules to their attribute values
            for rule in limitRules:
                for sentAttr in sentenceAttributes:
                    # Rules are of the form "attr.value" where "attr" is the name
                    # of the attribute to match, and "value" a substring within
                    # that attribute
                    if rule.startswith(sentAttr + "."):  # rule matches the attribute
                        value = rule.split(".", 1)[-1]  # get the value part of the rule
                        if value not in sentenceElement.get(sentAttr):
                            return  # discard all sentences that do not match all rules
        # Process the sentence
        if sentence.sentenceGraph is not None:
            goldGraph = None
            if goldSentence is not None:
                goldGraph = goldSentence.sentenceGraph
            self.exampleCount += self.buildExamplesFromGraph(
                sentence.sentenceGraph, outfile, goldGraph, structureAnalyzer=structureAnalyzer)

    @classmethod
    def run(cls, input, output, parse, tokenization, style, classIds=None, featureIds=None,
            gold=None, append=False, allowNewIds=True, structureAnalyzer=None, debug=False):
        """
        Construct a builder of class cls and process a corpus with it.

        The builder is constructed with a "style" keyword argument, which the
        constructor of this base class does not accept, so run() must be called
        on a subclass whose constructor does.

        @return: the builder used for processing
        """
        print >> sys.stderr, "Running", cls.__name__
        print >> sys.stderr, "  input:", input
        if gold is not None:
            print >> sys.stderr, "  gold:", gold
        print >> sys.stderr, "  output:", output, "(append:", str(append) + ")"
        print >> sys.stderr, "  add new class/feature ids:", allowNewIds
        if not isinstance(style, types.StringTypes):
            style = Utils.Parameters.toString(style)
        print >> sys.stderr, "  style:", style
        if tokenization is None:
            print >> sys.stderr, "  parse:", parse
        else:
            print >> sys.stderr, "  parse:", parse + ", tokenization:", tokenization
        classSet, featureSet = cls.getIdSets(classIds, featureIds, allowNewIds)
        builder = cls(style=style, classSet=classSet, featureSet=featureSet)
        builder.debug = debug
        builder.classIdFilename = classIds
        builder.featureIdFilename = featureIds
        builder.parse = parse
        builder.tokenization = tokenization
        builder.processCorpus(input, output, gold, append=append, allowNewIds=allowNewIds,
                              structureAnalyzer=structureAnalyzer)
        return builder

    def buildExamplesFromGraph(self, sentenceGraph, outfile, goldGraph=None, structureAnalyzer=None):
        """
        Build the examples for one sentence graph. Must be implemented by
        subclasses; the return value is added to self.exampleCount by
        processSentence, which always passes the structureAnalyzer keyword.
        """
        raise NotImplementedError

    def definePredictedValueRange(self, sentences, elementName):
        """Hook for subclasses, does nothing by default."""
        pass

    def getPredictedValueRange(self):
        """Hook for subclasses, returns None by default."""
        return None

    @classmethod
    def getIdSets(cls, classIds=None, featureIds=None, allowNewIds=True):
        """
        Load the class and feature id sets from files.

        @return: a tuple (classSet, featureSet), where each item is None if the
        corresponding filename is None or the file does not exist
        """
        # Class ids
        if classIds is not None and os.path.exists(classIds):
            print >> sys.stderr, "Using predefined class names from", classIds
            classSet = IdSet(allowNewIds=allowNewIds)
            classSet.load(classIds)
        else:
            print >> sys.stderr, "No predefined class names"
            classSet = None
        # Feature ids
        if featureIds is not None and os.path.exists(featureIds):
            print >> sys.stderr, "Using predefined feature names from", featureIds
            featureSet = IdSet(allowNewIds=allowNewIds)
            featureSet.load(featureIds)
        else:
            print >> sys.stderr, "No predefined feature names"
            featureSet = None
        return classSet, featureSet

    def getSentences(self, input, parse, tokenization, removeNameInfo=False):
        """
        Return the sentences of the input as a list of [sentenceGraph, None]
        items. If input is already a list, it is returned as is, otherwise the
        corpus is loaded with Core.SentenceGraph.loadCorpus and sentences
        without a sentence graph are left out.
        """
        if isinstance(input, list):
            # assume input is already a list of sentences
            assert removeNameInfo == False
            return input
        # Load corpus and make sentence graphs
        corpusElements = Core.SentenceGraph.loadCorpus(
            input, parse, tokenization, removeNameInfo=removeNameInfo)
        # The sentence graph is required for event detection
        return [[sentence.sentenceGraph, None]
                for sentence in corpusElements.sentences
                if sentence.sentenceGraph is not None]

    def calculatePredictedRange(self, sentences):
        """
        Call definePredictedValueRange for the "entity" elements with the
        sentence elements of the given [sentenceGraph, ...] items.
        """
        print >> sys.stderr, "Defining predicted value range:",
        sentenceElements = [sentence[0].sentenceElement for sentence in sentences]
        self.definePredictedValueRange(sentenceElements, "entity")
        print >> sys.stderr, self.getPredictedValueRange()
def test(cls, examples, modelPath, output=None, parameters=None, forceInternal=False, classIds=None):  # , timeout=None):
    """
    Classify examples with a pre-trained model.

    The model file lists one "class-name model-file threshold" triple per
    line. The classifier binary is run once for each class (except "neg" and
    names containing "---") and the per-class predictions are merged with
    cls.addPredictions.

    @type examples: string (filename) or list of examples
    @param examples: a list or file containing examples in SVM-format
    @type modelPath: string
    @param modelPath: filename of the pre-trained model file
    @type parameters: a dictionary or string
    @param parameters: parameters for the classifier
    @type output: string
    @param output: the name of the predictions file to be written, if None
    the predictions are only returned
    @type forceInternal: Boolean
    @param forceInternal: not used by this implementation
    @type classIds: IdSet or string (filename)
    @param classIds: the class ids, required in practice although it defaults
    to None
    @return: the merged predictions, with the class labels joined into a
    comma-separated string and the second item converted to a string
    """
    if isinstance(parameters, types.StringTypes):
        parameters = splitParameters(parameters)
    timer = Timer()
    if isinstance(examples, list):
        print >> sys.stderr, "Classifying", len(examples), "with SVM-MultiClass model", modelPath
        # The first parameter of this function is cls, there is no self here.
        # The second item returned by filterClassificationSet is not needed.
        examples, _ = cls.filterClassificationSet(examples, False)
        testPath = cls.tempDir + "/test.dat"
        Example.writeExamples(examples, testPath)
    else:
        print >> sys.stderr, "Classifying file", examples, "with SVM-MultiClass model", modelPath
        testPath = examples
        examples = Example.readExamples(examples, False)
    if parameters is not None:
        # Work on a copy so that the caller's parameters are not modified
        parameters = copy.copy(parameters)
        if "c" in parameters:
            del parameters["c"]
        if "predefined" in parameters:
            modelPath = os.path.join(parameters["predefined"][0], "classifier/model")
            del parameters["predefined"]
    # Read model
    if modelPath is None:
        modelPath = "model-multilabel"
    classModels = {}
    thresholds = {}
    if modelPath.endswith(".gz"):
        f = gzip.open(modelPath, "rt")
    else:
        f = open(modelPath, "rt")
    try:
        for line in f:
            if line.strip() == "":
                continue  # tolerate empty lines, e.g. at the end of the file
            key, value, threshold = line.split()
            classModels[key] = value
            if threshold != "None":
                thresholds[key] = float(threshold)
            else:
                thresholds[key] = 0.0
    finally:
        f.close()

    mergedPredictions = []
    if isinstance(classIds, types.StringTypes):
        classIds = IdSet(filename=classIds)
    print >> sys.stdout, "Thresholds", thresholds
    classifierBin = Settings.SVMMultiClassDir + "/svm_multiclass_classify"
    print >> sys.stdout, parameters
    # parameters is None when the caller gave none
    if parameters is not None and "classifier" in parameters and "svmperf" in parameters["classifier"]:
        classifierBin = Settings.SVMPerfDir + "/svm_perf_classify"
        del parameters["classifier"]  # already a private copy, see above
    for className in classIds.getNames():
        if className == "neg" or "---" in className:
            continue
        classId = classIds.getId(className)
        classKey = str(className)
        classThreshold = thresholds[classKey]
        if classThreshold != 0.0:
            print >> sys.stderr, "Classifying", className, "with threshold", classThreshold
        else:
            print >> sys.stderr, "Classifying", className
        classOutput = "predictions" + ".cls-" + className
        args = [classifierBin, testPath, classModels[classKey], classOutput]
        print >> sys.stdout, args
        logFile = open("svmmulticlass" + ".cls-" + className + ".log", "at")
        try:
            subprocess.call(args, stdout=logFile, stderr=logFile)
        finally:
            logFile.close()  # one log file per class, do not leave them open
        cls.addPredictions(classOutput, mergedPredictions, classId, len(classIds.Ids),
                           threshold=classThreshold)
    print >> sys.stderr, timer.toString()

    # Convert the merged predictions into strings
    for mergedPred in mergedPredictions:
        # The label "1" is dropped when the example has also other labels
        # NOTE(review): "1" is presumably the id of the negative class -- confirm
        if len(mergedPred[0]) > 1 and "1" in mergedPred[0]:
            mergedPred[0].remove("1")
        mergedPred[1] = str(mergedPred[1])
        mergedPred[0] = ",".join(sorted(mergedPred[0]))
    if output is not None:
        f = open(output, "wt")
        try:
            for mergedPred in mergedPredictions:
                f.write(" ".join(mergedPred) + "\n")
        finally:
            f.close()
    return mergedPredictions
def test(cls, examples, modelPath, output=None, parameters=None, forceInternal=False, classIds=None):  # , timeout=None):
    """
    Classify examples with a pre-trained model.

    The model file lists one "class-name model-file threshold" triple per
    line. The classifier binary is run once for each class (except "neg" and
    names containing "---") and the per-class predictions are merged with
    cls.addPredictions.

    @type examples: string (filename) or list of examples
    @param examples: a list or file containing examples in SVM-format
    @type modelPath: string
    @param modelPath: filename of the pre-trained model file
    @type parameters: a dictionary or string
    @param parameters: parameters for the classifier
    @type output: string
    @param output: the name of the predictions file to be written, if None
    the predictions are only returned
    @type forceInternal: Boolean
    @param forceInternal: not used by this implementation
    @type classIds: IdSet or string (filename)
    @param classIds: the class ids, required in practice although it defaults
    to None
    @return: the merged predictions, with the class labels joined into a
    comma-separated string and the second item converted to a string
    """
    if isinstance(parameters, types.StringTypes):
        parameters = splitParameters(parameters)
    timer = Timer()
    if isinstance(examples, list):
        print >> sys.stderr, "Classifying", len(examples), "with SVM-MultiClass model", modelPath
        # The first parameter of this function is cls, there is no self here.
        # The second item returned by filterClassificationSet is not needed.
        examples, _ = cls.filterClassificationSet(examples, False)
        testPath = cls.tempDir + "/test.dat"
        Example.writeExamples(examples, testPath)
    else:
        print >> sys.stderr, "Classifying file", examples, "with SVM-MultiClass model", modelPath
        testPath = examples
        examples = Example.readExamples(examples, False)
    if parameters is not None:
        # Work on a copy so that the caller's parameters are not modified
        parameters = copy.copy(parameters)
        if "c" in parameters:
            del parameters["c"]
        if "predefined" in parameters:
            modelPath = os.path.join(parameters["predefined"][0], "classifier/model")
            del parameters["predefined"]
    # Read model
    if modelPath is None:
        modelPath = "model-multilabel"
    classModels = {}
    thresholds = {}
    if modelPath.endswith(".gz"):
        f = gzip.open(modelPath, "rt")
    else:
        f = open(modelPath, "rt")
    try:
        for line in f:
            if line.strip() == "":
                continue  # tolerate empty lines, e.g. at the end of the file
            key, value, threshold = line.split()
            classModels[key] = value
            if threshold != "None":
                thresholds[key] = float(threshold)
            else:
                thresholds[key] = 0.0
    finally:
        f.close()

    mergedPredictions = []
    if isinstance(classIds, types.StringTypes):
        classIds = IdSet(filename=classIds)
    print >> sys.stdout, "Thresholds", thresholds
    classifierBin = Settings.SVMMultiClassDir + "/svm_multiclass_classify"
    print >> sys.stdout, parameters
    # parameters is None when the caller gave none
    if parameters is not None and "classifier" in parameters and "svmperf" in parameters["classifier"]:
        classifierBin = Settings.SVMPerfDir + "/svm_perf_classify"
        del parameters["classifier"]  # already a private copy, see above
    for className in classIds.getNames():
        if className == "neg" or "---" in className:
            continue
        classId = classIds.getId(className)
        classKey = str(className)
        classThreshold = thresholds[classKey]
        if classThreshold != 0.0:
            print >> sys.stderr, "Classifying", className, "with threshold", classThreshold
        else:
            print >> sys.stderr, "Classifying", className
        classOutput = "predictions" + ".cls-" + className
        args = [classifierBin, testPath, classModels[classKey], classOutput]
        print >> sys.stdout, args
        logFile = open("svmmulticlass" + ".cls-" + className + ".log", "at")
        try:
            subprocess.call(args, stdout=logFile, stderr=logFile)
        finally:
            logFile.close()  # one log file per class, do not leave them open
        cls.addPredictions(classOutput, mergedPredictions, classId, len(classIds.Ids),
                           threshold=classThreshold)
    print >> sys.stderr, timer.toString()

    # Convert the merged predictions into strings
    for mergedPred in mergedPredictions:
        # The label "1" is dropped when the example has also other labels
        # NOTE(review): "1" is presumably the id of the negative class -- confirm
        if len(mergedPred[0]) > 1 and "1" in mergedPred[0]:
            mergedPred[0].remove("1")
        mergedPred[1] = str(mergedPred[1])
        mergedPred[0] = ",".join(sorted(mergedPred[0]))
    if output is not None:
        f = open(output, "wt")
        try:
            for mergedPred in mergedPredictions:
                f.write(" ".join(mergedPred) + "\n")
        finally:
            f.close()
    return mergedPredictions