def load_model(self, model_fname, train_fname):
    """
    Load a previously saved model into this object.

    Three things are restored:
      * self.Cinv   -- the matrix read with genfromtxt from
                       "<model_fname>.matrix".
      * self.D, self.t -- rebuilt by passing the vectors read from
                       train_fname to get_train_data.
      * self.beta and self.kernel -- parsed from the parameter file
                       model_fname.

    Layout of the parameter file, as parsed here:
      line 1: a name followed by the beta value (whitespace separated).
      line 2: a name, a TAB, then the kernel type.
      next 4 lines: "<theta_k> <value>" pairs. These are read only when
                    the kernel type is GAUSSIAN_QUADRATIC_KERNEL.

    For any other kernel type self.kernel is left untouched.

    Raises IndexError / ValueError if the parameter file does not follow
    the layout above.
    """
    self.Cinv = genfromtxt("%s.matrix" % model_fname)

    train_file = SEQUENTIAL_FILE_READER(train_fname)
    try:
        train_vects = train_file.read()
    finally:
        # Release the reader even if reading fails.
        train_file.close()
    (self.D, self.t) = get_train_data(train_vects)

    theta_names = ("theta_0", "theta_1", "theta_2", "theta_3")
    # The with-block closes the parameter file even when parsing raises.
    with open(model_fname) as para_file:
        self.beta = float(para_file.readline().split()[1])
        kernel_type = para_file.readline().strip().split("\t")[1]
        if kernel_type == "GAUSSIAN_QUADRATIC_KERNEL":
            self.kernel = GAUSSIAN_QUADRATIC_KERNEL()
            for _ in range(4):
                (para, val) = para_file.readline().strip().split()
                # Unknown parameter names are skipped, as before.
                if para in theta_names:
                    setattr(self.kernel, para, float(val))
def train_GPR(train_fname, model_fname, verbose=True, beta=1,
              theta_0=None, theta_1=None, theta_2=None, theta_3=None):
    """
    Train a Gaussian Process regression model and save it.

    train_fname -- file read through SEQUENTIAL_FILE_READER to obtain
                   the training vectors.
    model_fname -- name passed to the learner's save_model.
    verbose     -- copied onto the learner's `verbose` attribute.
    beta        -- copied onto the learner's `beta` attribute.
    theta_0 .. theta_3 -- optional overrides for the attributes of the
                   same name on GAUSSIAN_QUADRATIC_KERNEL. A value of
                   None keeps the kernel's own default. Any other value,
                   including 0 or 0.0, is applied.
    """
    train_file = SEQUENTIAL_FILE_READER(train_fname)
    try:
        train_vects = train_file.read()
    finally:
        # Release the reader even if reading fails.
        train_file.close()

    learner = GPR()
    learner.verbose = verbose
    learner.beta = beta

    kernel = GAUSSIAN_QUADRATIC_KERNEL()
    overrides = (
        ("theta_0", theta_0),
        ("theta_1", theta_1),
        ("theta_2", theta_2),
        ("theta_3", theta_3),
    )
    for (name, value) in overrides:
        # Compare against None, not truthiness: an explicit 0 is a
        # legitimate hyper-parameter value and must not be dropped.
        if value is not None:
            setattr(kernel, name, value)

    learner.set_kernel(kernel)
    learner.train(train_vects)
    learner.save_model(model_fname)
def predict_GPR(test_fname, train_fname, model_fname, output_fname=None, accuracy=False):
    """
    Predict the outputs for the test instances.

    test_fname   -- file with the test vectors (SEQUENTIAL_FILE_READER).
    train_fname  -- training file, handed to the learner's load_model.
    model_fname  -- model file, handed to the learner's load_model.
    output_fname -- where to write predictions. When not given, the
                    writer is created in "STDOUT" mode.
    accuracy     -- when true, the squared error against each vector's
                    `label` is accumulated and a final "RMSE = ..." line
                    is written.

    One "<mean>\t<variance>" line is written per test vector. The RMSE is
    the square root of the mean squared error. It is skipped when no
    instance was evaluated, so an empty test set or accuracy=False cannot
    cause a division by zero.
    """
    test_file = SEQUENTIAL_FILE_READER(test_fname)
    try:
        test_vects = test_file.read()
    finally:
        test_file.close()

    learner = GPR()
    learner.load_model(model_fname, train_fname)

    if output_fname:
        output_file = SEQUENTIAL_FILE_WRITER(output_fname)
    else:
        output_file = SEQUENTIAL_FILE_WRITER(None, "STDOUT")

    count = 0
    squared_error = 0.0
    try:
        for v in test_vects["vects"]:
            (mean, variance) = learner.predict(v)
            output_file.writeLine("%f\t%f\n" % (mean, variance))
            if accuracy:
                squared_error += (v.label - mean) ** 2
                count += 1
        if accuracy and count > 0:
            # Root of the *mean* squared error: divide before the sqrt.
            rmse = sqrt(squared_error / float(count))
            output_file.writeLine("RMSE = %f\n" % rmse)
    finally:
        # Close the writer even if a prediction fails midway.
        output_file.close()
def load_model(self, model_fname, train_fname):
    """
    Load a previously saved model into this object.

    Three things are restored:
      * self.Cinv   -- the matrix read with genfromtxt from
                       "<model_fname>.matrix".
      * self.D, self.t -- rebuilt by passing the vectors read from
                       train_fname to get_train_data.
      * self.beta and self.kernel -- parsed from the parameter file
                       model_fname.

    Layout of the parameter file, as parsed here:
      line 1: a name followed by the beta value (whitespace separated).
      line 2: a name, a TAB, then the kernel type.
      next 4 lines: "<theta_k> <value>" pairs. These are read only when
                    the kernel type is GAUSSIAN_QUADRATIC_KERNEL.

    For any other kernel type self.kernel is left untouched.

    Raises IndexError / ValueError if the parameter file does not follow
    the layout above.
    """
    self.Cinv = genfromtxt("%s.matrix" % model_fname)

    train_file = SEQUENTIAL_FILE_READER(train_fname)
    try:
        train_vects = train_file.read()
    finally:
        # Release the reader even if reading fails.
        train_file.close()
    (self.D, self.t) = get_train_data(train_vects)

    theta_names = ("theta_0", "theta_1", "theta_2", "theta_3")
    # The with-block closes the parameter file even when parsing raises.
    with open(model_fname) as para_file:
        self.beta = float(para_file.readline().split()[1])
        kernel_type = para_file.readline().strip().split("\t")[1]
        if kernel_type == "GAUSSIAN_QUADRATIC_KERNEL":
            self.kernel = GAUSSIAN_QUADRATIC_KERNEL()
            for _ in range(4):
                (para, val) = para_file.readline().strip().split()
                # Unknown parameter names are skipped, as before.
                if para in theta_names:
                    setattr(self.kernel, para, float(val))
def predict_GPR(test_fname, train_fname, model_fname, output_fname=None, accuracy=False):
    """
    Predict the outputs for the test instances.

    test_fname   -- file with the test vectors (SEQUENTIAL_FILE_READER).
    train_fname  -- training file, handed to the learner's load_model.
    model_fname  -- model file, handed to the learner's load_model.
    output_fname -- where to write predictions. When not given, the
                    writer is created in "STDOUT" mode.
    accuracy     -- when true, the squared error against each vector's
                    `label` is accumulated and a final "RMSE = ..." line
                    is written.

    One "<mean>\t<variance>" line is written per test vector. The RMSE is
    the square root of the mean squared error. It is skipped when no
    instance was evaluated, so an empty test set or accuracy=False cannot
    cause a division by zero.
    """
    test_file = SEQUENTIAL_FILE_READER(test_fname)
    try:
        test_vects = test_file.read()
    finally:
        test_file.close()

    learner = GPR()
    learner.load_model(model_fname, train_fname)

    if output_fname:
        output_file = SEQUENTIAL_FILE_WRITER(output_fname)
    else:
        output_file = SEQUENTIAL_FILE_WRITER(None, "STDOUT")

    count = 0
    squared_error = 0.0
    try:
        for v in test_vects["vects"]:
            (mean, variance) = learner.predict(v)
            output_file.writeLine("%f\t%f\n" % (mean, variance))
            if accuracy:
                squared_error += (v.label - mean) ** 2
                count += 1
        if accuracy and count > 0:
            # Root of the *mean* squared error: divide before the sqrt.
            rmse = sqrt(squared_error / float(count))
            output_file.writeLine("RMSE = %f\n" % rmse)
    finally:
        # Close the writer even if a prediction fails midway.
        output_file.close()
def test_logreg(model_fname, test_fname, output_fname=None, prob=True, acc=True):
    """
    Predict labels for the test instances using the trained model.

    model_fname  -- model file loaded through PREDICTOR.loadModel.
    test_fname   -- test instances, iterated via SEQUENTIAL_FILE_READER.
    output_fname -- if given, predictions are written to this file
                    instead of the terminal (sys.stdout).
    prob         -- if True, each output line is "<label>\t<probability>".
                    If False, only the predicted label is written.
    acc          -- if True, the metrics collected by the EVALUATOR for
                    the labelled test instances are shown at the end.

    For a two-class model a true label of -1 is passed to the evaluator
    as 0. Instances whose label is None are predicted but not evaluated.
    """
    pred = PREDICTOR()
    pred.loadModel(model_fname)
    testFile = SEQUENTIAL_FILE_READER(test_fname)
    E = EVALUATOR(pred.n)
    output = None
    try:
        output = open(output_fname, "w") if output_fname else sys.stdout
        for mv in testFile:
            v = mv["vect"]
            # Use a separate name so the `prob` flag is not overwritten.
            (lbl, probability) = pred.predictVect(v)
            if prob:
                output.write("%d\t%s\n" % (lbl, str(probability)))
            else:
                output.write("%d\n" % lbl)
            if v.label is None:
                continue
            if pred.n == 2 and v.label == -1:
                trueLabel = 0
            else:
                trueLabel = v.label
            E.add(trueLabel, lbl)
    finally:
        testFile.close()
        # Close only a file opened here; never close sys.stdout.
        if output_fname and output is not None:
            output.close()
    if acc:
        result = E.getMetrics()
        E.show(result)
def train_TG(classes, train_fname, model_fname, iterations=2, L1=0,
             heldout_fname=None, crossValidation=None, verbose=False):
    """
    Train a model with the TruncatedGradient learner and write it out.

    classes         -- number of classes. Passed to TruncatedGradient and
                       used to size the feature space.
    train_fname     -- training file read via SEQUENTIAL_FILE_READER.
    model_fname     -- destination passed to the learner's writeModel.
    iterations      -- stored on the learner as `total_iterations`.
    L1              -- stored on the learner as `c` (presumably the L1
                       regularisation coefficient; confirm in the learner).
    heldout_fname   -- optional held-out file. Its "vects" entry is given
                       to the learner as `heldoutVects`.
    crossValidation -- optional fold count, stored on the learner as
                       `folds` (e.g. 5 for five-fold cross-validation).
    verbose         -- stored on the learner as `verbose`.

    NOTE: this function does not itself reject the combination of
    cross-validation and held-out evaluation. Any such check would have
    to live in the learner.
    """
    train_reader = SEQUENTIAL_FILE_READER(train_fname)
    train_data = train_reader.read()
    train_reader.close()

    use_heldout = bool(heldout_fname)
    if use_heldout:
        heldout_reader = SEQUENTIAL_FILE_READER(heldout_fname)
        heldout_data = heldout_reader.read()
        heldout_reader.close()

    tg = TruncatedGradient(classes)
    tg.total_iterations = iterations
    tg.c = L1
    tg.verbose = verbose
    if crossValidation:
        tg.folds = crossValidation
    if use_heldout:
        tg.heldoutVects = heldout_data["vects"]

    # One weight per (class, feature) pair.
    feature_count = classes * len(train_data["featIDs"])
    tg.train(train_data["vects"], feature_count)
    tg.writeModel(feature_count, model_fname)
def train_SGD(classes, train_fname, model_fname, iterations=2, L2=0,
              heldout_fname=None, crossValidation=None, verbose=False):
    """
    Train a model with the SGD learner (stochastic gradient descent) and
    write it out.

    classes         -- number of classes. Passed to SGD and used to size
                       the feature space.
    train_fname     -- training file read via SEQUENTIAL_FILE_READER.
    model_fname     -- destination passed to the learner's writeModel.
    iterations      -- stored on the learner as `total_iterations`.
    L2              -- stored on the learner as `c` (presumably the L2
                       regularisation coefficient; confirm in the learner).
    heldout_fname   -- optional held-out file. Its "vects" entry is given
                       to the learner as `heldoutVects`.
    crossValidation -- optional fold count, stored on the learner as
                       `folds` (e.g. 5 for five-fold cross-validation).
    verbose         -- stored on the learner as `verbose`.

    NOTE: this function does not itself reject the combination of
    cross-validation and held-out evaluation. Any such check would have
    to live in the learner.
    """
    TrainSeqFileReader = SEQUENTIAL_FILE_READER(train_fname)
    try:
        trainVects = TrainSeqFileReader.read()
    finally:
        # Release the reader even if reading fails.
        TrainSeqFileReader.close()

    heldoutVects = None
    if heldout_fname:
        HeldoutSeqFileReader = SEQUENTIAL_FILE_READER(heldout_fname)
        try:
            heldoutVects = HeldoutSeqFileReader.read()
        finally:
            HeldoutSeqFileReader.close()

    Learner = SGD(classes)
    Learner.total_iterations = iterations
    Learner.c = L2
    Learner.verbose = verbose
    if crossValidation:
        Learner.folds = crossValidation
    if heldoutVects is not None:
        Learner.heldoutVects = heldoutVects["vects"]

    # One weight per (class, feature) pair.
    no_features = classes * len(trainVects["featIDs"])
    Learner.train(trainVects["vects"], no_features)
    # Parenthesised single-argument form prints the same text on
    # Python 2 and is also valid syntax on Python 3.
    print("Writing the model... %s" % model_fname)
    Learner.writeModel(no_features, model_fname)
def test_logreg(model_fname, test_fname, output_fname=None, prob=True, acc=True):
    """
    Predict labels for the test instances using the trained model.

    model_fname  -- model file loaded through PREDICTOR.loadModel.
    test_fname   -- test instances, iterated via SEQUENTIAL_FILE_READER.
    output_fname -- if given, predictions are written to this file
                    instead of the terminal (sys.stdout).
    prob         -- if True, each output line is "<label>\t<probability>".
                    If False, only the predicted label is written.
    acc          -- if True, the metrics collected by the EVALUATOR for
                    the labelled test instances are shown at the end.

    For a two-class model a true label of -1 is passed to the evaluator
    as 0. Instances whose label is None are predicted but not evaluated.
    """
    pred = PREDICTOR()
    pred.loadModel(model_fname)
    testFile = SEQUENTIAL_FILE_READER(test_fname)
    E = EVALUATOR(pred.n)
    output = None
    try:
        output = open(output_fname, "w") if output_fname else sys.stdout
        for mv in testFile:
            v = mv["vect"]
            # Use a separate name so the `prob` flag is not overwritten.
            (lbl, probability) = pred.predictVect(v)
            if prob:
                output.write("%d\t%s\n" % (lbl, str(probability)))
            else:
                output.write("%d\n" % lbl)
            if v.label is None:
                continue
            if pred.n == 2 and v.label == -1:
                trueLabel = 0
            else:
                trueLabel = v.label
            E.add(trueLabel, lbl)
    finally:
        testFile.close()
        # Close only a file opened here; never close sys.stdout.
        if output_fname and output is not None:
            output.close()
    if acc:
        result = E.getMetrics()
        E.show(result)