def __init__(self, connection=None):
    """Initialize external-classifier wrapper state.

    Args:
        connection: connection object used to run commands/jobs; when None,
            a local UnixConnection is created.
    """
    self.defaultEvaluator = None
    if connection is None:  # identity test is the idiomatic None check
        self.connection = UnixConnection()  # A local connection
    else:
        self.connection = connection
    # Job-tracking state
    self.parameterGrid = None
    self.state = None
    self._job = None
    self._prevJobStatus = None
    self._filesToRelease = []
    # Training/classification artifacts
    self.parameters = None
    self.model = None
    self.predictions = None
    # External command configuration (%k/%v are replaced per parameter)
    self.parameterFormat = "-%k %v"
    self.parameterDefaults = {"train": None, "classify": None}
    self.parameterAllowNew = {"train": True, "classify": True}
    self.parameterValueListKey = {"train": None, "classify": None}
    self.parameterValueLimits = {"train": None, "classify": None}
    self.parameterValueTypes = {"train": None, "classify": None}
    self.trainDirSetting = None
    self.trainCommand = None
    self.classifyDirSetting = None
    self.classifyCommand = None
def __init__(self, connection=None):
    """Initialize Keras-classifier wrapper state.

    Args:
        connection: connection object used to run commands/jobs; when None,
            a local UnixConnection is created.
    """
    self.defaultEvaluator = None
    if connection is None:  # identity test is the idiomatic None check
        self.connection = UnixConnection()  # A local connection
    else:
        self.connection = connection
    self._filesToRelease = []
    self.parameters = None
    self.model = None
    self.predictions = None
    self.numFeatures = None
def __init__(self, connection=None):
    """Set up the external-classifier wrapper; defaults to a local connection."""
    self.defaultEvaluator = None
    if connection == None:
        self.connection = UnixConnection()  # A local connection
    else:
        self.connection = connection
    # Job bookkeeping
    self.parameterGrid = None
    self.state = None
    self._job = None
    self._prevJobStatus = None
    self._filesToRelease = []
    # Results of training/classification
    self.parameters = None
    self.model = None
    self.predictions = None
    # Command-line parameter formatting and validation tables
    self.parameterFormat = "-%k %v"
    self.parameterAllowNew = {"train": True, "classify": True}
    for tableName in ("parameterDefaults", "parameterValueListKey",
                      "parameterValueLimits", "parameterValueTypes"):
        setattr(self, tableName, {"train": None, "classify": None})
    # External executable configuration
    self.trainDirSetting = None
    self.trainCommand = None
    self.classifyDirSetting = None
    self.classifyCommand = None
def __init__(self, connection=None):
    """Set up classifier wrapper state; defaults to a local connection."""
    self.defaultEvaluator = None
    # A local connection is used when none is supplied
    self.connection = UnixConnection() if connection == None else connection
    # Job bookkeeping
    self.parameterGrid = None
    self.state = None
    self._job = None
    self._prevJobStatus = None
    self._filesToRelease = []
    # Results of training/classification
    self.parameters = None
    self.model = None
    self.predictions = None
    # External executable configuration
    self.parameterFormat = "-%k %v"
    self.trainDirSetting = None
    self.trainCommand = None
    self.classifyDirSetting = None
    self.classifyCommand = None
# NOTE(review): the physical lines below are collapsed (many statements per
# line) and are kept byte-identical; only reviewer comment lines are inserted
# between them. KerasClassifier mirrors the ExternalClassifier API
# (classify/optimize/train/downloadModel) but runs a Keras feed-forward model
# on svmlight-format example files.
# classify(): loads the saved model, predicts via a batch generator and writes
# "predictedClass prob1 prob2 ..." lines (classes are 1-based via +1).
# NOTE(review): classify() never returns the `classifier` copy it builds —
# callers of the parallel ExternalClassifier.classify() get one; confirm.
class KerasClassifier(Classifier): def __init__(self, connection=None): self.defaultEvaluator = None if connection == None: self.connection = UnixConnection() # A local connection else: self.connection = connection self._filesToRelease = [] self.parameters = None self.model = None self.predictions = None self.numFeatures = None # def saveModel(self, teesModel, tag=""): # Classifier.saveModel(self, teesModel, tag) # if hasattr(self, "numFeatures") and self.numFeatures != None: # teesModel.addStr(tag+"numFeatures", str(self.numFeatures)) def classify(self, examples, output, model=None, finishBeforeReturn=False, replaceRemoteFiles=True): print >> sys.stderr, "Predicting devel examples" output = os.path.abspath(output) # Return a new classifier instance for following the training process and using the model classifier = copy.copy(self) if model == None: classifier.model = model = self.model model = os.path.abspath(model) model = self.connection.upload(model, uncompress=True, replace=replaceRemoteFiles) classifier.predictions = self.connection.getRemotePath(output, True) examples = self.getExampleFile(examples, replaceRemote=replaceRemoteFiles) classifier._filesToRelease = [examples] self.kerasModel = load_model(model) numFeatures = self.kerasModel.layers[0].get_input_shape_at(0)[1] features, classes = datasets.load_svmlight_file(examples, numFeatures) #features = features.toarray() #predictions = self.kerasModel.predict(features, 128, 1) predictions = self.kerasModel.predict_generator(predict_batch_generator(features, 1), features.shape[0] / 1) predClasses = predictions.argmax(axis=-1) predictionsPath = self.connection.getRemotePath(output, False) with open(predictionsPath, "wt") as f: for i in range(predictions.shape[0]): f.write(str(predClasses[i] + 1) + " " + " ".join([str(x) for x in predictions[i]]) + "\n") def optimize(self, examples, outDir, parameters, classifyExamples, classIds, step="BOTH", evaluator=None, determineThreshold=False, timeout=None, 
# optimize() continuation: for step "RESULTS" it only reconnects to an
# existing model.hdf5, otherwise it delegates to train().
# train(): loads svmlight data, one-hot encodes classes with LabelBinarizer,
# seeds numpy, defines and fits the model.
# NOTE(review): train() builds `classifier` but never returns it, and passes
# develFeatures/develClasses to _defineModel even when classifyExamples is
# None, in which case those names are unbound (NameError) — confirm intent.
downloadAllModels=False): assert step in ["BOTH", "SUBMIT", "RESULTS"], step if step == "RESULTS": # Return already classifier = copy.copy(self) classifier.parameters = parameters classifier.model = self.connection.getRemotePath(outDir + "/model.hdf5", True) return classifier return self.train(examples, outDir, parameters, classifyExamples) def train(self, examples, outDir, parameters, classifyExamples=None, dummy=False): outDir = os.path.abspath(outDir) examples = self.getExampleFile(examples, dummy=dummy) classifyExamples = self.getExampleFile(classifyExamples, dummy=dummy) # Return a new classifier instance for following the training process and using the model classifier = copy.copy(self) classifier.parameters = parameters classifier._filesToRelease = [examples, classifyExamples] if not os.path.exists(outDir): os.makedirs(outDir) trainFeatures, trainClasses = datasets.load_svmlight_file(examples) if classifyExamples != None: develFeatures, develClasses = datasets.load_svmlight_file(classifyExamples, trainFeatures.shape[1]) binarizer = preprocessing.LabelBinarizer() binarizer.fit(trainClasses) trainClasses = binarizer.transform(trainClasses) if classifyExamples != None: develClasses = binarizer.transform(develClasses) print >> sys.stderr, "Training Keras model with parameters:", parameters parameters = Parameters.get(parameters, {"TEES.classifier":"KerasClassifier", "layers":5, "lr":0.001, "epochs":1, "batch_size":64, "patience":10}) np.random.seed(10) classifier.kerasModel = classifier._defineModel(outDir, parameters, trainFeatures, trainClasses, develFeatures, develClasses) classifier._fitModel(outDir, parameters, trainFeatures, trainClasses, develFeatures, develClasses) def _defineModel(self, outDir, parameters, trainFeatures, trainClasses, develFeatures, develClasses): x = inputLayer = Input(shape=(trainFeatures.shape[1],)) layers = parameters["layers"] if type(layers) not in [types.ListType, types.TupleType]: layers = [layers] for layer in layers: x = 
# _defineModel() continuation: stack of relu Dense layers, softmax output
# sized to the number of classes, layer config dumped to layers.json, model
# compiled with Adam and categorical crossentropy.
# _fitModel(): early stopping on val_loss plus best-only model checkpointing
# to model.hdf5, then fit via a batch generator.
Dense(int(layer), activation='relu')(x) x = Dense(trainClasses.shape[1], activation='softmax')(x) kerasModel = Model(inputLayer, x) layersPath = self.connection.getRemotePath(outDir + "/layers.json", False) print >> sys.stderr, "Saving layers to", layersPath self._serializeLayers(kerasModel, layersPath) learningRate = float(parameters["lr"]) #0.001 #float(self.styles.get("lr", 0.001)) print >> sys.stderr, "Using learning rate", learningRate optimizer = Adam(lr=learningRate) print >> sys.stderr, "Compiling model" metrics = ["accuracy"] kerasModel.compile(optimizer=optimizer, loss="categorical_crossentropy", metrics=metrics) #, metrics=['accuracy']) kerasModel.summary() return kerasModel def _fitModel(self, outDir, parameters, trainFeatures, trainClasses, develFeatures, develClasses): """ Fits the compiled Keras model to the adjacency matrix examples. The model is trained on the train set, validated on the devel set and finally the devel set is predicted using the model. """ print >> sys.stderr, "Fitting model" patience = int(parameters["patience"]) #10 #int(self.styles.get("patience", 10)) print >> sys.stderr, "Early stopping patience:", patience es_cb = EarlyStopping(monitor='val_loss', patience=patience, verbose=1) self.model = self.connection.getRemotePath(outDir + "/model.hdf5", True) modelPath = self.connection.getRemotePath(outDir + "/model.hdf5", False) cp_cb = ModelCheckpoint(filepath=modelPath, save_best_only=True, verbose=1) #self.numFeatures = trainFeatures.shape[1] # #print "SHAPE", trainFeatures.shape, trainClasses.shape, develFeatures.shape, develClasses.shape # self.kerasModel.fit(trainFeatures, trainClasses, # epochs=100, #100 if not "epochs" in self.styles else int(self.styles["epochs"]), # batch_size=64, # shuffle=True, # validation_data=(develFeatures, develClasses), # #sample_weight=self.arrays["train"]["mask"], # callbacks=[es_cb, cp_cb]) self.kerasModel.fit_generator(generator=batch_generator(trainFeatures, trainClasses, 
# fit_generator() argument continuation (epochs/steps from `parameters`;
# NOTE(review): `/` here is Python-2 integer division — confirm before any
# Python 3 port) and _serializeLayers(): dumps each layer's class name and
# get_config() dict as indented JSON, optionally echoing it to stderr.
int(parameters["batch_size"])), epochs=int(parameters["epochs"]), samples_per_epoch=trainFeatures.shape[0], validation_data=batch_generator(develFeatures, develClasses, int(parameters["batch_size"])), validation_steps=develFeatures.shape[0] / int(parameters["batch_size"]), callbacks=[es_cb, cp_cb]) def _serializeLayers(self, kerasModel, filePath, verbose=False): layers = [] for layer in kerasModel.layers: layers.append({'class_name': layer.__class__.__name__, 'config': layer.get_config()}) if verbose: print >> sys.stderr, "Layer configuration:" print >> sys.stderr, "_________________________________________________________________" for layer in layers: print >> sys.stderr, layer print >> sys.stderr, "_________________________________________________________________" with open(filePath, "wt") as f: json.dump(layers, f, indent=2)
# downloadModel(): requires a FINISHED job and a known model path, downloads
# the model and optionally swaps the connection for a local one.
def downloadModel(self, outPath=None, breakConnection=True): assert self.getStatus() == "FINISHED" and self.model != None self.model = self.connection.download(self.model, outPath) if breakConnection: self.connection = UnixConnection() # A local connection return self.model
# NOTE(review): the physical lines below are collapsed (many statements per
# line) and are kept byte-identical; only reviewer comment lines are inserted
# between them. ExternalClassifier wraps an external classifier executable:
# it expands command templates (%d dir, %p params, %e examples, %m model,
# %c predictions) and submits them as jobs through self.connection.
# getStatus() releases temp-file reference counts once a job finishes or
# fails; setState() resets per-run state.
class ExternalClassifier(Classifier): """ A wrapper for external classifier executables. """ def __init__(self, connection=None): self.defaultEvaluator = None if connection == None: self.connection = UnixConnection() # A local connection else: self.connection = connection self.parameterGrid = None self.state = None self._job = None self._prevJobStatus = None self._filesToRelease = [] self.parameters = None self.model = None self.predictions = None self.parameterFormat = "-%k %v" self.parameterDefaults = {"train": None, "classify": None} self.parameterAllowNew = {"train": True, "classify": True} self.parameterValueListKey = {"train": None, "classify": None} self.parameterValueLimits = {"train": None, "classify": None} self.parameterValueTypes = {"train": None, "classify": None} self.trainDirSetting = None self.trainCommand = None self.classifyDirSetting = None self.classifyCommand = None def getJob(self): return self._job def getStatus(self): if self._job != None: self._prevJobStatus = self.connection.getJobStatus(self._job) if self._prevJobStatus in ["FINISHED", "FAILED"]: self.state = None self._job = None for filename in self._filesToRelease: ExternalClassifier.getFileCounter(filename, add=-1, createIfNotExist=False) self._filesToRelease = [] if self._prevJobStatus == None: return "FINISHED" else: return self._prevJobStatus def setState(self, stateName): assert self.getStatus() in ["FINISHED", "FAILED"] self.state = stateName self._job = None self._prevJobStatus = None if stateName == "TRAIN" or stateName == "OPTIMIZE": self.model = None self.parameters = None # for all states self.predictions = None #self.optimizeJobs = [] def _getParameterString(self, parameters): paramKeys = sorted(parameters.keys()) idStr = "" paramString = "" for key in paramKeys: if key.startswith("TEES."): continue if len(paramString) > 0 and not paramString.endswith(" "): paramString += " " if parameters[key] != None: paramString += self.parameterFormat.replace("%k", key).replace( "%v", 
# _getParameterString() continuation: builds the command-line parameter
# string plus a filesystem-safe id suffix (":" -> ".", spaces -> "_", only
# alphanumerics/./_/- kept). train(): uploads example files, expands the
# train command from self.trainCommand, validates parameters via
# Parameters.get with the per-mode tables, and queues the train command on
# the connection; the returned classifier copy tracks the job.
str(parameters[key])).strip() idStr += "-" + str(key) + "_" + str(parameters[key]) else: paramString += self.parameterFormat.replace("%k", key).replace( "%v", "").strip() idStr += "-" + str(key) # sanitize id idStr = idStr.replace(":", ".") idStr = idStr.replace(" ", "_") idStr = "".join([ c for c in idStr if c.isalnum() or c in ('.', '_', "-") ]).rstrip() return paramString, idStr def train(self, examples, outDir, parameters, classifyExamples=None, finishBeforeReturn=False, replaceRemoteExamples=True, dummy=False): outDir = os.path.abspath(outDir) examples = self.getExampleFile(examples, replaceRemote=replaceRemoteExamples, dummy=dummy) classifyExamples = self.getExampleFile( classifyExamples, replaceRemote=replaceRemoteExamples, dummy=dummy) #parameters = Parameters.get(parameters, valueListKey="c") trainDir = os.path.normpath( self.connection.getSetting(self.trainDirSetting)) + os.path.sep # Return a new classifier instance for following the training process and using the model classifier = copy.copy(self) classifier.setState("TRAIN") classifier.parameters = parameters classifier._filesToRelease = [examples, classifyExamples] # Train if not os.path.exists(outDir): os.makedirs(outDir) #trainCommand = os.path.join(trainDir, self.trainCommand) trainCommand = self.trainCommand.replace("%d", trainDir) parameters = Parameters.get(parameters, self.parameterDefaults["train"], self.parameterAllowNew["train"], self.parameterValueListKey["train"], self.parameterValueLimits["train"], self.parameterValueTypes["train"]) paramString, idStr = self._getParameterString(parameters) classifier.parameterIdStr = idStr classifier.model = self.connection.getRemotePath( outDir + "/model" + idStr, True) modelPath = self.connection.getRemotePath(outDir + "/model" + idStr, False) trainCommand = trainCommand.replace("%p", paramString).replace( "%e", examples).replace("%m", modelPath).strip() self.connection.addCommand(trainCommand) # Classify with the trained model (optional) if 
# train() continuation: optionally queues classification of classifyExamples
# with the new model, then submits the job (or, with dummy=True, reconnects
# to an existing job without submitting) and can block until finished.
# downloadModel()/downloadPredictions(): fetch remote results after FINISHED.
# classify(): uploads the model, starts a classify job for `examples`.
classifyExamples != None: classifier.predictions = self.connection.getRemotePath( outDir + "/predictions" + idStr, True) predictionsPath = self.connection.getRemotePath( outDir + "/predictions" + idStr, False) classifyDir = os.path.normpath( self.connection.getSetting( self.classifyDirSetting)) + os.path.sep classifyCommand = self.classifyCommand.replace( "%d", classifyDir).replace("%e", classifyExamples).replace( "%m", modelPath).replace("%c", predictionsPath).strip() self.connection.addCommand(classifyCommand) # Run the process jobName = self.trainCommand.split()[0].replace("%d", "") + idStr logPath = outDir + "/" + jobName if dummy: # return a classifier that connects to an existing job self.connection.clearCommands() classifier._job = self.connection.getJob(jobDir=outDir, jobName=jobName) else: # submit the job classifier._job = self.connection.submit(jobDir=outDir, jobName=jobName, stdout=logPath + ".stdout") if finishBeforeReturn: self.connection.waitForJob(classifier._job) self.getStatus() return classifier def downloadModel(self, outPath=None, breakConnection=True): assert self.getStatus() == "FINISHED" and self.model != None self.model = self.connection.download(self.model, outPath) if breakConnection: self.connection = UnixConnection() # A local connection return self.model def downloadPredictions(self, outPath=None): assert self.getStatus() == "FINISHED" and self.predictions != None self.predictions = self.connection.download(self.predictions, outPath) return self.predictions def classify(self, examples, output, model=None, finishBeforeReturn=False, replaceRemoteFiles=True): output = os.path.abspath(output) # Return a new classifier instance for following the training process and using the model classifier = copy.copy(self) classifier.setState("CLASSIFY") # Classify if model == None: classifier.model = model = self.model model = os.path.abspath(model) model = self.connection.upload(model, uncompress=True, replace=replaceRemoteFiles) 
# classify() continuation: queues the classify command, submits the job
# (optionally waiting and downloading predictions). optimize(): grid search —
# one train job per parameter combination (step "SUBMIT" returns early,
# "RESULTS" reconnects via dummy jobs), then waits for all jobs and
# evaluates each combination's predictions.
classifier.predictions = self.connection.getRemotePath(output, True) predictionsPath = self.connection.getRemotePath(output, False) examples = self.getExampleFile(examples, replaceRemote=replaceRemoteFiles) classifier._filesToRelease = [examples] self.connection.clearCommands() classifyDir = os.path.normpath( self.connection.getSetting(self.classifyDirSetting)) + os.path.sep classifyCommand = self.classifyCommand.replace( "%d", classifyDir).replace("%e", examples).replace("%m", model).replace( "%c", predictionsPath).strip() self.connection.addCommand(classifyCommand) classifier._job = self.connection.submit( jobDir=os.path.abspath(os.path.dirname(output)), jobName=self.classifyCommand.split()[0].replace("%d", "") + "-" + os.path.basename(model)) if finishBeforeReturn: self.connection.waitForJob(classifier._job) classifier.downloadPredictions() return classifier def optimize(self, examples, outDir, parameters, classifyExamples, classIds, step="BOTH", evaluator=None, determineThreshold=False, timeout=None, downloadAllModels=False): assert step in ["BOTH", "SUBMIT", "RESULTS"], step outDir = os.path.abspath(outDir) # Initialize training (or reconnect to existing jobs) combinations = Parameters.getCombinations( Parameters.get(parameters, valueListKey="c") ) #Core.OptimizeParameters.getParameterCombinations(parameters) trained = [] for combination in combinations: trained.append( self.train(examples, outDir, combination, classifyExamples, replaceRemoteExamples=(len(trained) == 0), dummy=(step == "RESULTS"))) if step == "SUBMIT": # Return already classifier = copy.copy(self) classifier.setState("OPTIMIZE") return classifier # Wait for the training to finish finalJobStatus = self.connection.waitForJobs( [x.getJob() for x in trained]) # Evaluate the results print >> sys.stderr, "Evaluating results" #Stream.setIndent(" ") bestResult = None if evaluator == None: evaluator = self.defaultEvaluator for i in range(len(combinations)): id = trained[i].parameterIdStr 
# optimize() continuation: per combination, download predictions (skip if
# the job did not finish), optionally determine a decision threshold, keep
# the best combination by evaluation.compare(), delete remote predictions
# to save space, raise if nothing produced results, then return a classifier
# copy for the winner with its threshold and downloaded model.
#Stream.setIndent(" ") # Get predictions predictions = None if trained[i].getStatus() == "FINISHED": predictions = trained[i].downloadPredictions() else: print >> sys.stderr, "No results for combination" + id continue if downloadAllModels: trained[i].downloadModel() # Compare to other results print >> sys.stderr, "*** Evaluating results for combination" + id + " ***" threshold = None if determineThreshold: print >> sys.stderr, "Thresholding, original micro =", evaluation = evaluator.evaluate( classifyExamples, predictions, classIds, os.path.join(outDir, "evaluation-before-threshold" + id + ".csv"), verbose=False) print >> sys.stderr, evaluation.microF.toStringConcise() threshold, bestF = evaluator.threshold(classifyExamples, predictions) print >> sys.stderr, "threshold =", threshold, "at binary fscore", str( bestF)[0:6] evaluation = evaluator.evaluate( classifyExamples, ExampleUtils.loadPredictions(predictions, threshold=threshold), classIds, os.path.join(outDir, "evaluation" + id + ".csv")) if bestResult == None or evaluation.compare( bestResult[0] ) > 0: #: averageResult.fScore > bestResult[1].fScore: bestResult = [ evaluation, trained[i], combinations[i], threshold ] if not self.connection.isLocal(): os.remove(predictions) # remove predictions to save space #Stream.setIndent() if bestResult == None: raise Exception("No results for any parameter combination") print >> sys.stderr, "*** Evaluation complete", finalJobStatus, "***" print >> sys.stderr, "Selected parameters", bestResult[2] classifier = copy.copy(bestResult[1]) classifier.threshold = bestResult[3] classifier.downloadModel() return classifier
# NOTE(review): this is a second, older revision of ExternalClassifier
# duplicated in the same file (at module level it would shadow the class
# above). Lines are collapsed and kept byte-identical; only reviewer
# comment lines are inserted between them. Differences from the newer
# revision: parameter strings are built inline in train() (no
# _getParameterString), command dirs are joined with os.path.join rather
# than a %d placeholder, and this revision carries the getUnzipped /
# getFileCounter / getExampleFile helpers the other revision calls.
class ExternalClassifier(Classifier): """ A wrapper for external classifier executables. """ def __init__(self, connection=None): self.defaultEvaluator = None if connection == None: self.connection = UnixConnection() # A local connection else: self.connection = connection self.parameterGrid = None self.state = None self._job = None self._prevJobStatus = None self._filesToRelease = [] self.parameters = None self.model = None self.predictions = None self.parameterFormat = "-%k %v" self.trainDirSetting = None self.trainCommand = None self.classifyDirSetting = None self.classifyCommand = None def getJob(self): return self._job def getStatus(self): if self._job != None: self._prevJobStatus = self.connection.getJobStatus(self._job) if self._prevJobStatus in ["FINISHED", "FAILED"]: self.state = None self._job = None for filename in self._filesToRelease: ExternalClassifier.getFileCounter(filename, add=-1, createIfNotExist=False) self._filesToRelease = [] if self._prevJobStatus == None: return "FINISHED" else: return self._prevJobStatus def setState(self, stateName): assert self.getStatus() in ["FINISHED", "FAILED"] self.state = stateName self._job = None self._prevJobStatus = None if stateName == "TRAIN" or stateName == "OPTIMIZE": self.model = None self.parameters = None # for all states self.predictions = None #self.optimizeJobs = [] @classmethod def getUnzipped(cls, filename): """ Temporarily uncompress a file, usually a compressed example file. The uncompressed file appears in the same location as the original file. The /tmp directory is as these examples are usually used by a classifier that is run in separate process, which on clusters might end up on a different node, where the local /tmp is no longer accessible. 
# getUnzipped() continuation: re-gunzips when the .gz is newer than the
# cached "-unzipped-temp" copy and registers the temp file for deletion at
# exit. getFileCounter(): reference count kept in a "-counter" sidecar file
# (add/remove/createIfNotExist/removeIfZero). getExampleFile(): resolves an
# example file path, uploading or locally uncompressing as needed.
""" if not filename.endswith(".gz"): return filename tempfilename = filename[:-3] + "-unzipped-temp" # Determine if the uncompressed file does not exist, or needs to be updated uncompress = False if os.path.exists(tempfilename): if os.path.getmtime(filename) > os.path.getmtime(tempfilename): # compressed file has changed uncompress = True else: uncompress = True # Uncompress if needed if uncompress: print >> sys.stderr, "Uncompressing example file", filename subprocess.call("gunzip -cfv " + filename + " > " + tempfilename, shell=True) assert os.path.exists(filename) assert os.path.exists(tempfilename) atexit.register(removeTempUnzipped, tempfilename) # mark for deletion return tempfilename @classmethod def getFileCounter(cls, filename, add=0, createIfNotExist=False, removeIfZero=False): """ Keep track of the number of users on a temporary file """ filename += "-counter" count = 0 if os.path.exists(filename): f = open(filename, "rt") lines = f.readlines() f.close() assert len(lines) == 1, filename count = int(lines[0]) elif not createIfNotExist: return None count += add if count < 0: count = 0 if removeIfZero and count == 0 and os.path.exists(filename): os.remove(filename) else: f = open(filename, "wt") f.write(str(count)) f.close() return count def getExampleFile(self, examples, upload=True, replaceRemote=True, dummy=False): # If examples are in a list, they will be written to a file for SVM-multiclass if examples == None: return None if dummy: return "DUMMY" elif type(examples) == types.ListType: assert False #ExampleUtils.writeExamples(examples, trainPath + "/") else: examplesPath = os.path.normpath(os.path.abspath(examples)) localPath = examplesPath if upload: examplesPath = self.connection.upload(examplesPath, uncompress=True, replace=replaceRemote) if examplesPath == localPath and examplesPath.endswith(".gz"): # no upload happened examplesPath = ExternalClassifier.getUnzipped(examplesPath) # uncompress if not yet uncompressed 
# getExampleFile() continuation (bumps the file's user counter). train():
# builds the parameter string and id suffix inline, expands the train
# command with os.path.join(trainDir, ...) and queues it on the connection.
ExternalClassifier.getFileCounter(examplesPath, 1, createIfNotExist=True) # increase user counter in any case print >> sys.stderr, self.__class__.__name__, "using example file", examples, "as", examplesPath return examplesPath def train(self, examples, outDir, parameters, classifyExamples=None, finishBeforeReturn=False, replaceRemoteExamples=True, dummy=False): outDir = os.path.abspath(outDir) examples = self.getExampleFile(examples, replaceRemote=replaceRemoteExamples, dummy=dummy) classifyExamples = self.getExampleFile(classifyExamples, replaceRemote=replaceRemoteExamples, dummy=dummy) parameters = Parameters.get(parameters, valueListKey="c") trainDir = self.connection.getSetting(self.trainDirSetting) # Return a new classifier instance for following the training process and using the model classifier = copy.copy(self) classifier.setState("TRAIN") classifier.parameters = parameters classifier._filesToRelease = [examples, classifyExamples] # Train if not os.path.exists(outDir): os.makedirs(outDir) trainCommand = os.path.join(trainDir, self.trainCommand) paramKeys = sorted(parameters.keys()) idStr = "" paramString = "" for key in paramKeys: if key.startswith("TEES."): continue if len(paramString) > 0 and not paramString.endswith(" "): paramString += " " if parameters[key] != None: paramString += self.parameterFormat.replace("%k", key).replace("%v", str(parameters[key])).strip() idStr += "-" + str(key) + "_" + str(parameters[key]) else: paramString += self.parameterFormat.replace("%k", key).replace("%v", "").strip() idStr += "-" + str(key) classifier.parameterIdStr = idStr classifier.model = self.connection.getRemotePath(outDir + "/model" + idStr, True) modelPath = self.connection.getRemotePath(outDir + "/model" + idStr, False) trainCommand = trainCommand.replace("%p", paramString).replace("%e", examples).replace("%m", modelPath).strip() self.connection.addCommand(trainCommand) # Classify with the trained model (optional) if classifyExamples != None: 
# train() continuation: optional classification of classifyExamples, job
# submission or dummy reconnection, optional blocking wait.
# downloadModel()/downloadPredictions(): fetch results after FINISHED.
# classify(): uploads the model and starts a classify job.
classifier.predictions = self.connection.getRemotePath(outDir + "/predictions" + idStr, True) predictionsPath = self.connection.getRemotePath(outDir + "/predictions" + idStr, False) classifyDir = self.connection.getSetting(self.classifyDirSetting) classifyCommand = os.path.join(classifyDir, self.classifyCommand).replace("%e", classifyExamples).replace("%m", modelPath).replace("%c", predictionsPath).strip() self.connection.addCommand(classifyCommand) # Run the process jobName = self.trainCommand.split()[0] + idStr logPath = outDir + "/" + jobName if dummy: # return a classifier that connects to an existing job self.connection.clearCommands() classifier._job = self.connection.getJob(jobDir=outDir, jobName=jobName) else: # submit the job classifier._job = self.connection.submit(jobDir=outDir, jobName=jobName, stdout=logPath+".stdout") if finishBeforeReturn: self.connection.waitForJob(classifier._job) self.getStatus() return classifier def downloadModel(self, outPath=None, breakConnection=True): assert self.getStatus() == "FINISHED" and self.model != None self.model = self.connection.download(self.model, outPath) if breakConnection: self.connection = UnixConnection() # A local connection return self.model def downloadPredictions(self, outPath=None): assert self.getStatus() == "FINISHED" and self.predictions != None self.predictions = self.connection.download(self.predictions, outPath) return self.predictions def classify(self, examples, output, model=None, finishBeforeReturn=False, replaceRemoteFiles=True): output = os.path.abspath(output) # Return a new classifier instance for following the training process and using the model classifier = copy.copy(self) classifier.setState("CLASSIFY") # Classify if model == None: classifier.model = model = self.model model = os.path.abspath(model) model = self.connection.upload(model, uncompress=True, replace=replaceRemoteFiles) classifier.predictions = self.connection.getRemotePath(output, True) predictionsPath = 
# classify() continuation and optimize(): same grid-search flow as the newer
# revision — train one job per parameter combination, wait, evaluate each.
self.connection.getRemotePath(output, False) examples = self.getExampleFile(examples, replaceRemote=replaceRemoteFiles) classifier._filesToRelease = [examples] self.connection.clearCommands() classifyDir = self.connection.getSetting(self.classifyDirSetting) classifyCommand = os.path.join(classifyDir, self.classifyCommand).replace("%e", examples).replace("%m", model).replace("%c", predictionsPath).strip() self.connection.addCommand(classifyCommand) classifier._job = self.connection.submit(jobDir=os.path.abspath(os.path.dirname(output)), jobName=self.classifyCommand.split()[0] + "-" + os.path.basename(model)) if finishBeforeReturn: self.connection.waitForJob(classifier._job) classifier.downloadPredictions() return classifier def optimize(self, examples, outDir, parameters, classifyExamples, classIds, step="BOTH", evaluator=None, determineThreshold=False, timeout=None, downloadAllModels=False): assert step in ["BOTH", "SUBMIT", "RESULTS"], step outDir = os.path.abspath(outDir) # Initialize training (or reconnect to existing jobs) combinations = Parameters.getCombinations(Parameters.get(parameters, valueListKey="c")) #Core.OptimizeParameters.getParameterCombinations(parameters) trained = [] for combination in combinations: trained.append( self.train(examples, outDir, combination, classifyExamples, replaceRemoteExamples=(len(trained) == 0), dummy=(step == "RESULTS")) ) if step == "SUBMIT": # Return already classifier = copy.copy(self) classifier.setState("OPTIMIZE") return classifier # Wait for the training to finish finalJobStatus = self.connection.waitForJobs([x.getJob() for x in trained]) # Evaluate the results print >> sys.stderr, "Evaluating results" #Stream.setIndent(" ") bestResult = None if evaluator == None: evaluator = self.defaultEvaluator for i in range(len(combinations)): id = trained[i].parameterIdStr #Stream.setIndent(" ") # Get predictions predictions = None if trained[i].getStatus() == "FINISHED": predictions = trained[i].downloadPredictions() else: 
# optimize() continuation: evaluation, optional thresholding, best-result
# selection by evaluation.compare(), remote-prediction cleanup, and return
# of the winning classifier. NOTE(review): unlike the newer revision, this
# one has no "if bestResult == None: raise ..." guard — when no combination
# finished, bestResult[2] fails with a TypeError instead of a clear error.
print >> sys.stderr, "No results for combination" + id continue if downloadAllModels: trained[i].downloadModel() # Compare to other results print >> sys.stderr, "*** Evaluating results for combination" + id + " ***" threshold = None if determineThreshold: print >> sys.stderr, "Thresholding, original micro =", evaluation = evaluator.evaluate(classifyExamples, predictions, classIds, os.path.join(outDir, "evaluation-before-threshold" + id + ".csv"), verbose=False) print >> sys.stderr, evaluation.microF.toStringConcise() threshold, bestF = evaluator.threshold(classifyExamples, predictions) print >> sys.stderr, "threshold =", threshold, "at binary fscore", str(bestF)[0:6] evaluation = evaluator.evaluate(classifyExamples, ExampleUtils.loadPredictions(predictions, threshold=threshold), classIds, os.path.join(outDir, "evaluation" + id + ".csv")) if bestResult == None or evaluation.compare(bestResult[0]) > 0: #: averageResult.fScore > bestResult[1].fScore: bestResult = [evaluation, trained[i], combinations[i], threshold] if not self.connection.isLocal(): os.remove(predictions) # remove predictions to save space #Stream.setIndent() print >> sys.stderr, "*** Evaluation complete", finalJobStatus, "***" print >> sys.stderr, "Selected parameters", bestResult[2] classifier = copy.copy(bestResult[1]) classifier.threshold = bestResult[3] classifier.downloadModel() return classifier
class ExternalClassifier(Classifier):
    """
    A wrapper for external classifier executables.

    Commands are executed through a Connection object (a local UnixConnection
    or a remote one), so training and classification can run as batch jobs.
    Command templates use placeholders: %d = program directory, %p = parameter
    string, %e = example file, %m = model file, %c = predictions file.
    """
    def __init__(self, connection=None):
        # Evaluator used by optimize() when none is passed explicitly
        self.defaultEvaluator = None
        if connection == None:
            self.connection = UnixConnection() # A local connection
        else:
            self.connection = connection
        self.parameterGrid = None
        self.state = None
        # Currently tracked batch job and its last polled status
        self._job = None
        self._prevJobStatus = None
        # Example files whose reference counts are decreased once the
        # tracked job ends (see getStatus)
        self._filesToRelease = []
        self.parameters = None
        self.model = None
        self.predictions = None
        # How a single key/value pair is rendered on the command line
        self.parameterFormat = "-%k %v"
        # Per-action (train/classify) parameter handling settings passed
        # to Parameters.get; subclasses may override these
        self.parameterDefaults = {"train":None, "classify":None}
        self.parameterAllowNew = {"train":True, "classify":True}
        self.parameterValueListKey = {"train":None, "classify":None}
        self.parameterValueLimits = {"train":None, "classify":None}
        self.parameterValueTypes = {"train":None, "classify":None}
        # Command templates and settings keys for the external programs;
        # presumably filled in by subclasses — None here means "not set"
        self.trainDirSetting = None
        self.trainCommand = None
        self.classifyDirSetting = None
        self.classifyCommand = None

    def getJob(self):
        """Return the currently tracked batch job, or None."""
        return self._job

    def getStatus(self):
        """
        Poll the status of the tracked job.

        Once the job has ended (FINISHED or FAILED), clears the job-tracking
        state and decreases the reference counts of the uploaded example
        files. Returns "FINISHED" when no job has ever been tracked,
        otherwise the last polled job status.
        """
        if self._job != None:
            self._prevJobStatus = self.connection.getJobStatus(self._job)
        if self._prevJobStatus in ["FINISHED", "FAILED"]:
            self.state = None
            self._job = None
            for filename in self._filesToRelease:
                # Release the shared example files reserved for this job
                ExternalClassifier.getFileCounter(filename, add=-1, createIfNotExist=False)
            self._filesToRelease = []
        if self._prevJobStatus == None:
            return "FINISHED"
        else:
            return self._prevJobStatus

    def setState(self, stateName):
        """
        Begin a new action ("TRAIN", "CLASSIFY" or "OPTIMIZE").

        Requires that no job is currently running, and resets the result
        attributes that the new action will produce.
        """
        assert self.getStatus() in ["FINISHED", "FAILED"]
        self.state = stateName
        self._job = None
        self._prevJobStatus = None
        if stateName == "TRAIN" or stateName == "OPTIMIZE":
            self.model = None
        self.parameters = None # for all states
        self.predictions = None
        #self.optimizeJobs = []

    def _getParameterString(self, parameters):
        """
        Render a parameter dictionary as a command-line string plus a
        filesystem-safe identifier.

        Keys are processed in sorted order; keys starting with "TEES." are
        internal and skipped. A key mapped to None is rendered as a bare
        flag. Returns the tuple (paramString, idStr).
        """
        paramKeys = sorted(parameters.keys())
        idStr = ""
        paramString = ""
        for key in paramKeys:
            if key.startswith("TEES."):
                continue
            if len(paramString) > 0 and not paramString.endswith(" "):
                paramString += " "
            if parameters[key] != None:
                paramString += self.parameterFormat.replace("%k", key).replace("%v", str(parameters[key])).strip()
                idStr += "-" + str(key) + "_" + str(parameters[key])
            else:
                # Value-less key: render the flag only
                paramString += self.parameterFormat.replace("%k", key).replace("%v", "").strip()
                idStr += "-" + str(key)
        # sanitize id
        idStr = idStr.replace(":", ".")
        idStr = idStr.replace(" ", "_")
        idStr = "".join([c for c in idStr if c.isalnum() or c in ('.','_',"-")]).rstrip()
        return paramString, idStr

    def train(self, examples, outDir, parameters, classifyExamples=None, finishBeforeReturn=False, replaceRemoteExamples=True, dummy=False):
        """
        Submit a training job (and optionally a follow-up classification of
        classifyExamples with the new model).

        With dummy=True no job is submitted; instead the returned classifier
        reconnects to an existing job in outDir. Returns a new classifier
        copy that tracks the job and knows the model/predictions paths.
        """
        outDir = os.path.abspath(outDir)
        examples = self.getExampleFile(examples, replaceRemote=replaceRemoteExamples, dummy=dummy)
        classifyExamples = self.getExampleFile(classifyExamples, replaceRemote=replaceRemoteExamples, dummy=dummy)
        #parameters = Parameters.get(parameters, valueListKey="c")
        trainDir = ""
        if self.trainDirSetting:
            trainDir = os.path.normpath(self.connection.getSetting(self.trainDirSetting)) + os.path.sep
        # Return a new classifier instance for following the training process and using the model
        classifier = copy.copy(self)
        classifier.setState("TRAIN")
        classifier.parameters = parameters
        classifier._filesToRelease = [examples, classifyExamples]
        # Train
        if not os.path.exists(outDir):
            os.makedirs(outDir)
        #trainCommand = os.path.join(trainDir, self.trainCommand)
        trainCommand = self.trainCommand.replace("%d", trainDir)
        # Validate/complete the parameters with the "train" action settings
        parameters = Parameters.get(parameters, self.parameterDefaults["train"], self.parameterAllowNew["train"], self.parameterValueListKey["train"], self.parameterValueLimits["train"], self.parameterValueTypes["train"])
        paramString, idStr = self._getParameterString(parameters)
        classifier.parameterIdStr = idStr
        classifier.model = self.connection.getRemotePath(outDir + "/model" + idStr, True)
        modelPath = self.connection.getRemotePath(outDir + "/model" + idStr, False)
        trainCommand = trainCommand.replace("%p", paramString).replace("%e", examples).replace("%m", modelPath).strip()
        self.connection.addCommand(trainCommand)
        # Classify with the trained model (optional)
        if classifyExamples != None:
            classifier.predictions = self.connection.getRemotePath(outDir + "/predictions" + idStr, True)
            predictionsPath = self.connection.getRemotePath(outDir + "/predictions" + idStr, False)
            classifyDir = ""
            if self.classifyDirSetting:
                classifyDir = os.path.normpath(self.connection.getSetting(self.classifyDirSetting)) + os.path.sep
            classifyCommand = self.classifyCommand.replace("%d", classifyDir).replace("%e", classifyExamples).replace("%m", modelPath).replace("%c", predictionsPath).strip()
            self.connection.addCommand(classifyCommand)
        # Run the process
        jobName = self.trainCommand.split()[0].replace("%d", "") + idStr
        logPath = outDir + "/" + jobName
        if dummy: # return a classifier that connects to an existing job
            self.connection.clearCommands()
            classifier._job = self.connection.getJob(jobDir=outDir, jobName=jobName)
        else: # submit the job
            classifier._job = self.connection.submit(jobDir=outDir, jobName=jobName, stdout=logPath+".stdout")
            if finishBeforeReturn:
                self.connection.waitForJob(classifier._job)
                self.getStatus()
        return classifier

    def downloadModel(self, outPath=None, breakConnection=True):
        """
        Download the trained model to outPath (requires a finished job).

        By default the connection is then replaced with a local one, so
        subsequent classification uses the downloaded model locally.
        Returns the local model path.
        """
        assert self.getStatus() == "FINISHED" and self.model != None
        self.model = self.connection.download(self.model, outPath)
        if breakConnection:
            self.connection = UnixConnection() # A local connection
        return self.model

    def downloadPredictions(self, outPath=None):
        """Download the predictions file (requires a finished job) and return its local path."""
        assert self.getStatus() == "FINISHED" and self.predictions != None
        self.predictions = self.connection.download(self.predictions, outPath)
        return self.predictions

    def classify(self, examples, output, model=None, finishBeforeReturn=False, replaceRemoteFiles=True):
        """
        Submit a classification job for the examples using the given model
        (defaults to self.model). The predictions are written to 'output'.
        Returns a new classifier copy that tracks the job.
        """
        output = os.path.abspath(output)
        # Return a new classifier instance for following the training process and using the model
        classifier = copy.copy(self)
        classifier.setState("CLASSIFY")
        # Classify
        if model == None:
            classifier.model = model = self.model
        model = os.path.abspath(model)
        model = self.connection.upload(model, uncompress=True, replace=replaceRemoteFiles)
        classifier.predictions = self.connection.getRemotePath(output, True)
        predictionsPath = self.connection.getRemotePath(output, False)
        examples = self.getExampleFile(examples, replaceRemote=replaceRemoteFiles)
        classifier._filesToRelease = [examples]
        self.connection.clearCommands()
        classifyDir = ""
        if self.classifyDirSetting:
            classifyDir = os.path.normpath(self.connection.getSetting(self.classifyDirSetting)) + os.path.sep
        classifyCommand = self.classifyCommand.replace("%d", classifyDir).replace("%e", examples).replace("%m", model).replace("%c", predictionsPath).strip()
        self.connection.addCommand(classifyCommand)
        classifier._job = self.connection.submit(jobDir=os.path.abspath(os.path.dirname(output)), jobName=self.classifyCommand.split()[0].replace("%d", "") + "-" + os.path.basename(model))
        if finishBeforeReturn:
            self.connection.waitForJob(classifier._job)
            classifier.downloadPredictions()
        return classifier

    def optimize(self, examples, outDir, parameters, classifyExamples, classIds, step="BOTH", evaluator=None, determineThreshold=False, timeout=None, downloadAllModels=False):
        """
        Grid search over parameter combinations.

        Submits one training job per combination (steps SUBMIT/BOTH), waits
        for the jobs, evaluates each result against classifyExamples and
        returns a classifier copy holding the best model (steps RESULTS/BOTH).
        With step == "SUBMIT" returns right after submitting; with step ==
        "RESULTS" reconnects to previously submitted jobs. Raises if no
        parameter combination produced a result.
        """
        assert step in ["BOTH", "SUBMIT", "RESULTS"], step
        outDir = os.path.abspath(outDir)
        # Initialize training (or reconnect to existing jobs)
        combinations = Parameters.getCombinations(Parameters.get(parameters, valueListKey="c")) #Core.OptimizeParameters.getParameterCombinations(parameters)
        trained = []
        for combination in combinations:
            # Example files are uploaded only for the first job; dummy mode
            # (step == "RESULTS") reconnects to already submitted jobs
            trained.append(self.train(examples, outDir, combination, classifyExamples, replaceRemoteExamples=(len(trained) == 0), dummy=(step == "RESULTS")))
        if step == "SUBMIT": # Return already
            classifier = copy.copy(self)
            classifier.setState("OPTIMIZE")
            return classifier
        # Wait for the training to finish
        finalJobStatus = self.connection.waitForJobs([x.getJob() for x in trained])
        # Evaluate the results
        print >> sys.stderr, "Evaluating results"
        #Stream.setIndent(" ")
        bestResult = None
        if evaluator == None:
            evaluator = self.defaultEvaluator
        for i in range(len(combinations)):
            id = trained[i].parameterIdStr
            #Stream.setIndent(" ")
            # Get predictions
            predictions = None
            if trained[i].getStatus() == "FINISHED":
                predictions = trained[i].downloadPredictions()
            else:
                print >> sys.stderr, "No results for combination" + id
                continue
            if downloadAllModels:
                trained[i].downloadModel()
            # Compare to other results
            print >> sys.stderr, "*** Evaluating results for combination" + id + " ***"
            threshold = None
            if determineThreshold:
                print >> sys.stderr, "Thresholding, original micro =",
                evaluation = evaluator.evaluate(classifyExamples, predictions, classIds, os.path.join(outDir, "evaluation-before-threshold" + id + ".csv"), verbose=False)
                print >> sys.stderr, evaluation.microF.toStringConcise()
                threshold, bestF = evaluator.threshold(classifyExamples, predictions)
                print >> sys.stderr, "threshold =", threshold, "at binary fscore", str(bestF)[0:6]
            evaluation = evaluator.evaluate(classifyExamples, ExampleUtils.loadPredictions(predictions, threshold=threshold), classIds, os.path.join(outDir, "evaluation" + id + ".csv"))
            if bestResult == None or evaluation.compare(bestResult[0]) > 0: #: averageResult.fScore > bestResult[1].fScore:
                bestResult = [evaluation, trained[i], combinations[i], threshold]
            if not self.connection.isLocal():
                os.remove(predictions) # remove predictions to save space
        #Stream.setIndent()
        if bestResult == None:
            raise Exception("No results for any parameter combination")
        print >> sys.stderr, "*** Evaluation complete", finalJobStatus, "***"
        print >> sys.stderr, "Selected parameters", bestResult[2]
        classifier = copy.copy(bestResult[1])
        classifier.threshold = bestResult[3]
        classifier.downloadModel()
        return classifier
class KerasClassifier(Classifier):
    """
    A Keras-based dense neural network classifier for svmlight-format
    example files.

    Unlike ExternalClassifier, training and classification run in-process;
    no batch jobs are submitted through the connection.
    """
    def __init__(self, connection=None):
        self.defaultEvaluator = None
        if connection == None:
            self.connection = UnixConnection() # A local connection
        else:
            self.connection = connection
        self._filesToRelease = []
        self.parameters = None
        self.model = None
        self.predictions = None
        self.numFeatures = None

#     def saveModel(self, teesModel, tag=""):
#         Classifier.saveModel(self, teesModel, tag)
#         if hasattr(self, "numFeatures") and self.numFeatures != None:
#             teesModel.addStr(tag+"numFeatures", str(self.numFeatures))

    def classify(self, examples, output, model=None, finishBeforeReturn=False, replaceRemoteFiles=True):
        """
        Predict the examples with a saved Keras model and write the
        predictions to 'output' (one line per example: 1-based predicted
        class id followed by the per-class confidences).

        Returns a classifier copy whose 'predictions' attribute points to
        the output file.
        """
        print >> sys.stderr, "Predicting devel examples"
        output = os.path.abspath(output)
        # Return a new classifier instance for following the training process and using the model
        classifier = copy.copy(self)
        if model == None:
            classifier.model = model = self.model
        model = os.path.abspath(model)
        model = self.connection.upload(model, uncompress=True, replace=replaceRemoteFiles)
        classifier.predictions = self.connection.getRemotePath(output, True)
        examples = self.getExampleFile(examples, replaceRemote=replaceRemoteFiles)
        classifier._filesToRelease = [examples]
        self.kerasModel = load_model(model)
        # The model's input layer defines the expected feature vector size
        numFeatures = self.kerasModel.layers[0].get_input_shape_at(0)[1]
        features, classes = datasets.load_svmlight_file(examples, numFeatures)
        #features = features.toarray()
        #predictions = self.kerasModel.predict(features, 128, 1)
        # Predict one example per step (batch size 1)
        predictions = self.kerasModel.predict_generator(predict_batch_generator(features, 1), features.shape[0] / 1)
        predClasses = predictions.argmax(axis=-1)
        predictionsPath = self.connection.getRemotePath(output, False)
        with open(predictionsPath, "wt") as f:
            for i in range(predictions.shape[0]):
                # Class ids in the predictions file are 1-based
                f.write(str(predClasses[i] + 1) + " " + " ".join([str(x) for x in predictions[i]]) + "\n")
        # Fix: return the classifier copy, consistent with
        # ExternalClassifier.classify, so callers can use classifier.predictions
        return classifier

    def optimize(self, examples, outDir, parameters, classifyExamples, classIds, step="BOTH", evaluator=None, determineThreshold=False, timeout=None, downloadAllModels=False):
        """
        No parameter grid search is performed for the Keras classifier: for
        steps BOTH/SUBMIT the model is simply trained, and for step RESULTS
        a classifier copy pointing to the previously saved model is returned.
        """
        assert step in ["BOTH", "SUBMIT", "RESULTS"], step
        if step == "RESULTS": # Return already
            classifier = copy.copy(self)
            classifier.parameters = parameters
            classifier.model = self.connection.getRemotePath(outDir + "/model.hdf5", True)
            return classifier
        return self.train(examples, outDir, parameters, classifyExamples)

    def train(self, examples, outDir, parameters, classifyExamples=None, dummy=False):
        """
        Train the Keras model on 'examples', validating on the optional
        'classifyExamples'. The best model is checkpointed to
        outDir/model.hdf5. Returns a classifier copy holding the model.
        """
        outDir = os.path.abspath(outDir)
        examples = self.getExampleFile(examples, dummy=dummy)
        classifyExamples = self.getExampleFile(classifyExamples, dummy=dummy)
        # Return a new classifier instance for following the training process and using the model
        classifier = copy.copy(self)
        classifier.parameters = parameters
        classifier._filesToRelease = [examples, classifyExamples]
        if not os.path.exists(outDir):
            os.makedirs(outDir)
        trainFeatures, trainClasses = datasets.load_svmlight_file(examples)
        if classifyExamples != None:
            # Load the devel set in the same feature space as the train set
            develFeatures, develClasses = datasets.load_svmlight_file(classifyExamples, trainFeatures.shape[1])
        # One-hot encode the class labels
        binarizer = preprocessing.LabelBinarizer()
        binarizer.fit(trainClasses)
        trainClasses = binarizer.transform(trainClasses)
        if classifyExamples != None:
            develClasses = binarizer.transform(develClasses)
        # NOTE(review): develFeatures/develClasses are undefined below when
        # classifyExamples == None — a validation set currently appears to be
        # required; confirm against callers
        print >> sys.stderr, "Training Keras model with parameters:", parameters
        parameters = Parameters.get(parameters, {"TEES.classifier":"KerasClassifier", "layers":5, "lr":0.001, "epochs":1, "batch_size":64, "patience":10})
        np.random.seed(10) # fixed seed for reproducible weight initialization
        classifier.kerasModel = classifier._defineModel(outDir, parameters, trainFeatures, trainClasses, develFeatures, develClasses)
        classifier._fitModel(outDir, parameters, trainFeatures, trainClasses, develFeatures, develClasses)
        # Fix: return the trained classifier copy. optimize() returns this
        # value directly, and ExternalClassifier.train also returns the copy;
        # previously this method fell off the end and returned None.
        return classifier

    def _defineModel(self, outDir, parameters, trainFeatures, trainClasses, develFeatures, develClasses):
        """
        Build and compile the dense network: one relu hidden layer per entry
        in parameters["layers"] (a single value is treated as one layer),
        followed by a softmax output layer. The layer configuration is also
        serialized to outDir/layers.json. Returns the compiled model.
        """
        x = inputLayer = Input(shape=(trainFeatures.shape[1], ))
        layers = parameters["layers"]
        if type(layers) not in [types.ListType, types.TupleType]:
            layers = [layers]
        for layer in layers:
            x = Dense(int(layer), activation='relu')(x)
        x = Dense(trainClasses.shape[1], activation='softmax')(x)
        kerasModel = Model(inputLayer, x)
        layersPath = self.connection.getRemotePath(outDir + "/layers.json", False)
        print >> sys.stderr, "Saving layers to", layersPath
        self._serializeLayers(kerasModel, layersPath)
        learningRate = float(parameters["lr"]) #0.001 #float(self.styles.get("lr", 0.001))
        print >> sys.stderr, "Using learning rate", learningRate
        optimizer = Adam(lr=learningRate)
        print >> sys.stderr, "Compiling model"
        metrics = ["accuracy"]
        kerasModel.compile(optimizer=optimizer, loss="categorical_crossentropy", metrics=metrics) #, metrics=['accuracy'])
        kerasModel.summary()
        return kerasModel

    def _fitModel(self, outDir, parameters, trainFeatures, trainClasses, develFeatures, develClasses):
        """
        Fits the compiled Keras model to the adjacency matrix examples. The model is
        trained on the train set, validated on the devel set and finally the devel
        set is predicted using the model.
        """
        print >> sys.stderr, "Fitting model"
        patience = int(parameters["patience"]) #10 #int(self.styles.get("patience", 10))
        print >> sys.stderr, "Early stopping patience:", patience
        es_cb = EarlyStopping(monitor='val_loss', patience=patience, verbose=1)
        # Only the best (lowest val_loss) model is kept on disk
        self.model = self.connection.getRemotePath(outDir + "/model.hdf5", True)
        modelPath = self.connection.getRemotePath(outDir + "/model.hdf5", False)
        cp_cb = ModelCheckpoint(filepath=modelPath, save_best_only=True, verbose=1)
        #self.numFeatures = trainFeatures.shape[1]
        # #print "SHAPE", trainFeatures.shape, trainClasses.shape, develFeatures.shape, develClasses.shape
        # self.kerasModel.fit(trainFeatures, trainClasses,
        #     epochs=100, #100 if not "epochs" in self.styles else int(self.styles["epochs"]),
        #     batch_size=64,
        #     shuffle=True,
        #     validation_data=(develFeatures, develClasses),
        #     #sample_weight=self.arrays["train"]["mask"],
        #     callbacks=[es_cb, cp_cb])
        self.kerasModel.fit_generator(
            generator=batch_generator(trainFeatures, trainClasses, int(parameters["batch_size"])),
            epochs=int(parameters["epochs"]),
            samples_per_epoch=trainFeatures.shape[0],
            validation_data=batch_generator(develFeatures, develClasses, int(parameters["batch_size"])),
            validation_steps=develFeatures.shape[0] / int(parameters["batch_size"]),
            callbacks=[es_cb, cp_cb])

    def _serializeLayers(self, kerasModel, filePath, verbose=False):
        """Write the model's layer class names and configurations to filePath as JSON."""
        layers = []
        for layer in kerasModel.layers:
            layers.append({
                'class_name': layer.__class__.__name__,
                'config': layer.get_config()
            })
        if verbose:
            print >> sys.stderr, "Layer configuration:"
            print >> sys.stderr, "_________________________________________________________________"
            for layer in layers:
                print >> sys.stderr, layer
            print >> sys.stderr, "_________________________________________________________________"
        with open(filePath, "wt") as f:
            json.dump(layers, f, indent=2)