def train(self, sentences, labels, cross_validation=False):
    x = []
    y = []
    for i in range(len(sentences)):
        sentence = sentences[i]
        prev = []
        j = 0
        for word in sentence:
            body = word.lower()
            featurespace = self._construct_featurespace(body, prev)
            prev.append((body, labels[i][j]))
            if len(prev) > self.chain_len:
                del prev[0]
            x.append(featurespace.featureset)
            j += 1
        y.extend(labels[i])
    prob = svm.problem(y, x)
    if cross_validation:
        param = svm.parameter('-c 1 -v 4 -s 4')
        svm.train(prob, param)
    else:
        param = svm.parameter('-c 1 -s 4')
        self._svm_model = svm.train(prob, param)

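# A minimal, hypothetical sketch of the two liblinearutil calling conventions used
# throughout these snippets: the problem/parameter-object form (as in train() above)
# and the shorthand train(y, x, options) form. The toy data below is illustrative
# only; it is not from any of the original projects.
import liblinearutil

toy_y = [1, -1, 1, -1]
toy_x = [{1: 0.7, 2: 1.0}, {1: -0.4}, {1: 0.9, 3: 0.5}, {2: -1.2}]

# object form: build the problem and parameter explicitly
prob = liblinearutil.problem(toy_y, toy_x)
param = liblinearutil.parameter('-c 1 -s 4 -q')
model_a = liblinearutil.train(prob, param)

# shorthand form: pass labels, features and an option string directly
model_b = liblinearutil.train(toy_y, toy_x, '-c 1 -s 4 -q')
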
def solve(train_X, train_Y, test_X, test_Y):
    best_lambda, test_accuracy = train(train_X, train_Y, train_X, train_Y)
    print("Answer for problem 16 is {0}".format(best_lambda))
    best_lambda, test_accuracy = train(train_X, train_Y, test_X, test_Y)
    print("Answer for problem 17 is {0}".format(best_lambda))
    train_x, val_x, train_y, val_y = split_data(train_X, train_Y, 120, 200)
    best_lambda, test_accuracy = train(train_x, train_y, val_x, val_y)
    print("best lambda is: {0}".format(best_lambda))
    model = liblinearutil.train(train_y, train_x, '-s 0 -c 50 -e 0.000001')
    label, accuracy, value = liblinearutil.predict(test_Y, test_X, model)
    print("Answer for problem 18 is {0}".format((100 - accuracy[0]) / 100))
    model = liblinearutil.train(train_Y, train_X, '-s 0 -c 50 -e 0.000001')
    label, accuracy, value = liblinearutil.predict(test_Y, test_X, model)
    print("Answer for problem 19 is {0}".format((100 - accuracy[0]) / 100))
    accuracy = []
    for i in range(5):
        train_x, val_x, train_y, val_y = split_data(train_X, train_Y, 40 * i, 40 * (i + 1))
        best_lambda, test_accuracy = train(train_x, train_y, val_x, val_y)
        accuracy.append(test_accuracy)
    mean_accuracy = np.mean(accuracy, axis=0)
    print("Answer for problem 20 is {0}".format(min(mean_accuracy)))

def _complete_training(self, debug=False):
    """ Forward data to external training and extract classifier information """
    if self.str_label_function is not None:
        self.label_function = eval(self.str_label_function)
        self.labels = self.label_function()
    options = "-c %.42f -e %.42f -s %d -B %d" % \
        (self.complexity, self.tolerance, self.alg_num, self.offset)
    for i, w in enumerate(self.weight):
        options += " -w%d %.42f" % (i, w)
    if not self.debug:
        options += " -q"
        self._log("Liblinear is now quiet!")
    import liblinearutil
    param = liblinearutil.parameter(options)
    problem = liblinearutil.problem(self.labels, self.samples)
    model = liblinearutil.train(problem, param)
    self.calculate_classification_vector(model)
    if self.debug:
        print self.print_w
        print self.b

def train_liblinear(args):
    model_name, gold_dir, dirs = args[0], args[1], args[2:]
    vectors, predicates = get_data(gold_dir, dirs)
    prob = problem(map(num_to_class, predicates), vectors)
    param = parameter('-s 0')
    model = train(prob, param)
    save_model(model_name, model)

def tuneParameters(weightString="-w-1 1.0 -w1 1.0 ",
                   trainFile='models/type2_fc_10_11.train',
                   CList=[0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0],
                   BList=[0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0],
                   v=10):
    labels = []
    features = []
    bestAccuracy = 0
    bestParams = None
    with open(trainFile) as f:
        for line in f:
            if line == '':
                continue
            data = line.rstrip().split()
            labels.append(float(data[0]))
            data = [float(x.split(':')[1]) for x in data[1:]]
            features.append(data)
    for C in CList:
        for B in BList:
            myOptions = weightString + "-s 0 -e 0.000001 -c " + str(C) + \
                " -B " + str(B) + " -v " + str(v)
            accuracy = liblinearutil.train(labels, features, myOptions)
            print "C: " + str(C) + " B: " + str(B) + " " + str(accuracy)
            if accuracy > bestAccuracy:
                bestAccuracy = accuracy
                bestParams = (C, B)
    return bestAccuracy, bestParams[0], bestParams[1]

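# A hedged usage sketch for tuneParameters above; the file path and grid values are
# assumptions, not values from the original project. With -v set, liblinearutil.train
# returns the cross-validation accuracy instead of a model, which is what the grid
# search compares.
if __name__ == '__main__':
    acc, best_C, best_B = tuneParameters(trainFile='models/type2_fc_10_11.train',
                                         CList=[0.1, 1.0, 10.0],
                                         BList=[0.1, 1.0],
                                         v=5)
    print "best accuracy %f with C=%f, B=%f" % (acc, best_C, best_B)
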
def _train(self, folder):
    # train classifier on all .txt files in <folder>
    entities = dict()
    files = filter(lambda x: x.endswith('.txt'), os.listdir(folder))
    for f in files:
        # assign a number to the current language
        print('LogR adds file ' + f)
        for tweet in LogR._read_from_file(os.path.join(folder, f)):
            # get features
            features = self._extract_features(tweet)
            # push the tweet into the training set
            ind = LogR._get_dict_and_update(self.languages, f[:-4])
            entities[ind] = entities.get(ind, []) + [features]
    target = []
    objects = []
    for lang in entities.keys():
        for obj in entities[lang]:
            target.append(lang)
            objects.append(obj)
    print('LogR started training...')
    self.model = liblinearutil.train(target, objects, '-s 0')
    print('Finished!')

def best_C(self, x, y):
    """ Search for the best C, training with y=list, x=dict.
    parameter = string of parameters """
    prob = lu.problem(y, x)
    para = "-s 2 -C -B %f -p %f -e %f" % (self.bias, self.p, self.eps)
    print para
    para1 = lu.parameter(para)
    # with -C, liblinearutil.train performs a parameter search and returns
    # (best_C, best_rate) rather than a model
    self.model = lu.train(prob, para1)
    best_C, best_rate = lu.train(y, x, para)
    return best_C, best_rate

def build(self, images, targets, extra):
    shapes, mean_shape, i_stage = extra
    n_landmarks = mean_shape.n_points
    feature_extractor = self.feature_extractor_builder.build(
        images, shapes, targets, (mean_shape, i_stage))
    print("Extracting local binary features for each image.\n")
    features = [list(feature_extractor.apply(images[i], shapes[i]))
                for i in xrange(len(images))]
    print("Features extracted.\n")
    w = np.zeros(shape=(2 * n_landmarks, len(features[0])))
    for lmark in xrange(2 * n_landmarks):
        print_dynamic("Learning linear regression coefficients for landmark "
                      "coordinate {}/{}.\n".format(lmark, 2 * n_landmarks))
        linreg = liblinearutil.train(
            list(targets[:, lmark]), features,
            "-s 12 -p 0 -c {}".format(1 / float(len(features))))
        w_list = linreg.get_decfun()[0]
        w[lmark][0:len(w_list)] = w_list
    return GlobalRegression(feature_extractor, w, mean_shape)

def train(self, data_train, data_dev):
    """Trains Minitagger on the given data."""
    start_time = time.time()
    assert self.__feature_extractor.is_training  # Assert untrained
    # Extract features (only labeled instances) and pass them to liblinear.
    [label_list, features_list, _] = \
        self.__feature_extractor.extract_features(data_train, False, [])
    if not self.quiet:
        print("{0} labeled instances (out of {1})".format(
            len(label_list), data_train.num_instances))
        print("{0} label types".format(len(data_train.label_count)))
        print("{0} observation types".format(
            len(data_train.observation_count)))
        print("\"{0}\" feature template".format(
            self.__feature_extractor.feature_template))
        print("{0} feature types".format(
            self.__feature_extractor.num_feature_types()))
    problem = liblinearutil.problem(label_list, features_list)
    self.__liblinear_model = \
        liblinearutil.train(problem, liblinearutil.parameter("-q"))
    self.__feature_extractor.is_training = False
    if not self.quiet:
        num_seconds = int(math.ceil(time.time() - start_time))
        print("Training time: {0}".format(
            str(datetime.timedelta(seconds=num_seconds))))
        if data_dev is not None:
            quiet_value = self.quiet
            self.quiet = True
            _, acc = self.predict(data_dev)
            self.quiet = quiet_value
            print("Dev accuracy: {0:.3f}%".format(acc))

def _lib_train_liblinear(user_tfidf, num_pos, num_neg, ignore):
    param = parameter("-s 0")
    sparse_user_tfidf, num_pos, num_neg = _convert_to_sparse_matrix(
        user_tfidf, num_pos, num_neg, ignore)
    labels = ([1] * num_pos) + ([-1] * num_neg)
    prob = problem(labels, sparse_user_tfidf)
    modellog = train(prob, param)
    return modellog

def svm(name):
    '''Trains a linear classifier (liblinear -s 1: L2-regularized L2-loss dual SVC)
    on the feature-extracted data. Name is the data set name, e.g. who_won_1031.
    '''
    data = load_dataset(name)
    return llb.train(data.Y, data.X, '-s 1')

def train(self):
    sys.stderr.write('creating training problem...')
    prob = problem(self.labels, self.contexts)
    sys.stderr.write('done\ntraining with option(s) "' + self.parameters + '"...')
    self.model = train(prob, parameter(self.parameters))
    sys.stderr.write('done\n')

def LDA_SVM(matrix, test_matrix, n_authors, doc_authors, vocab, stopwords):
    # set parameters
    num_topics = 20
    burn_in = 1000  # 0
    alpha = 0.1
    beta = 0.1
    samples = 8
    spacing = 100
    num_test_docs = test_matrix.shape[0]

    sampler = lda.LDA(num_topics, alpha, beta)
    print('Starting!')
    theta, phi, likelihood = sampler.train(matrix, burn_in, samples, spacing)
    print('likelihood: ', likelihood)
    theta_test, likelihood = sampler.classify(test_matrix, phi, burn_in,
                                              samples, spacing)
    print('likelihood: ', likelihood)

    # normalize topic proportions to sum to 1 per document
    theta = theta / np.sum(theta, 1)[:, None]
    theta_test = theta_test / np.sum(theta_test, 1)[:, None]

    svm_model = ll.train(sum(doc_authors, []), theta.tolist(), '-c 4')
    # test labels are unknown, so dummy values are passed to predict
    p_label, p_acc, p_val = ll.predict(np.random.rand(num_test_docs),
                                       theta_test.tolist(), svm_model)
    author_probs = np.zeros((num_test_docs, n_authors))
    for doc, author in enumerate(p_label):
        author_probs[doc, int(author)] = 1
    return author_probs

def train(self, data_train, data_test):
    """
    Trains Minitagger on the given train data. If test data is given, it reports
    the accuracy of the trained model and the F1 score (macro average of the
    F1 score of each label).

    @type data_train: SequenceData
    @param data_train: the training data set
    @type data_test: SequenceData
    @param data_test: the test data set
    """
    # keep the training start timestamp
    start_time = time.time()
    assert self.__feature_extractor.is_training, \
        "In order to train, is_training flag should be True"
    # Extract features only for labeled instances from data_train
    [label_list, features_list, _] = \
        self.__feature_extractor.extract_features(data_train, False, [])
    # print some useful information about the data
    if not self.quiet:
        print("{0} labeled words (out of {1})".format(
            len(label_list), data_train.num_of_words))
        print("{0} label types".format(len(data_train.label_count)))
        print("{0} word types".format(len(data_train.word_count)))
        print("\"{0}\" feature template".format(
            self.__feature_extractor.feature_template))
        print("{0} feature types".format(
            self.__feature_extractor.num_feature_types()))
    # define the problem to be trained using the parameters received from the
    # feature_extractor
    problem = liblinearutil.problem(label_list, features_list)
    # train the model (-q stands for quiet = True in liblinearutil)
    self.__liblinear_model = liblinearutil.train(
        problem, liblinearutil.parameter("-q"))
    # training is done; set is_training to False so that prediction can be done
    self.__feature_extractor.is_training = False
    # print some useful information
    if not self.quiet:
        # how long the training took
        num_seconds = int(math.ceil(time.time() - start_time))
        print("Training time: {0}".format(
            str(datetime.timedelta(seconds=num_seconds))))
    # perform prediction on data_test and report accuracy
    if data_test is not None:
        quiet_value = self.quiet
        self.quiet = True
        pred_labels, acc = self.predict(data_test)
        self.quiet = quiet_value
        self.__save_prediction_to_file(data_test, pred_labels)
        f1score, precision, recall = report_fscore(
            self.prediction_path + "/predictions.txt", wikiner=self.wikiner)
        print("Accuracy: ", acc)
        # create some files useful for debugging
        if self.debug:
            self.__debug(data_test, pred_labels)
        return f1score, precision, recall

def alternating_train(x, y, lands, c1, c2=1, params='-s 2 -B 1 -q'):
    nb_views = lands.shape[2]
    L = np.hstack([lands[:, :, v] for v in range(nb_views)])
    M = np.hstack([x[:, :, v] for v in range(nb_views)])
    l = len(lands)
    m = len(x)
    r0, mask = missing_lstsq(L, M)
    s0 = np.dot(r0, L)
    sample = s0.copy()
    R = r0.copy()
    y_list = y.tolist()
    svm = liblin.train(y.tolist(), sample.tolist(), '-c {} '.format(c1) + params)
    it = 0
    while True:
        it += 1
        res = minimize(r_obj_function, R.flatten(),
                       args=(M, y, L, svm, mask, c1, c2), options={'disp': True})
        # Per-sample alternative, kept for reference:
        # for i in range(len(x)):
        #     r_i = minimize(r_obj_function, R[i, :],
        #                    args=(i, M, y, L, svm, mask, c1, c2),
        #                    options={'disp': False})
        #     R[i] = r_i.x
        #     cost += r_i.fun
        print(res.fun)
        R = res.x.reshape(m, l)
        sample = np.dot(R, L)
        svm = liblin.train(y.tolist(), sample.tolist(), '-c {} '.format(c1) + params)
        _, p_acc, _ = liblin.predict(y, sample.tolist(), svm, "-q")
        print(p_acc)
        if it == 1:
            break
    return svm

def train_liblinear(features, labels, C=1, s=1, folds=10, threads=4):
    print('Training model...')
    start = time.time()
    # the s parameter was previously ignored (a hard-coded 2 was passed instead);
    # note that -n (thread count) requires the multi-core build of liblinear
    model = liblinearutil.train(
        labels, features,
        '-c {0} -s {1} -v {2} -n {3}'.format(C, s, folds, threads))
    end = time.time()
    print('Model trained in ' + str(end - start) + ' seconds')
    return model

def train_regression(self, x, y):
    data = []
    for sample in x:
        data.append(dict([(self._features.setId(d), sample[d]) for d in sample]))
    self._regression = True
    # -s 0: L2-regularized logistic regression
    param = liblinear.parameter("-c 1 -s 0")
    prob = liblinear.problem(y, data)
    self._model = liblinear.train(prob, param)

def train(self):
    y = list(self.training[0])
    x = list(self.training[1])
    y.append(-1)
    x.append({1: 1})
    if len(x) <= 1:
        message('unable to recommend because you have not read any feed.',
                'Error', die=True)
    return liblinearutil.train(y, x, '-q')

def rank_pooling(data):
    # mean = time_varying_mean(data)
    # non_linear_mean = non_linearity(mean)
    normalized_data = normalize(data, axis=1, norm='l2')
    total_frames = normalized_data.shape[0]
    labels = list(range(1, total_frames + 1))
    data = normalized_data.tolist()
    # -s 11: L2-regularized L2-loss SVR; the learned weight vector encodes
    # the temporal order of the frames
    model = train(labels, data, '-s 11 -q')
    return np.array(model.get_decfun()[0])

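# A small sketch of calling rank_pooling on synthetic per-frame features (the
# 20x128 matrix is made up for illustration; sklearn's normalize and
# liblinearutil's train are assumed imported, as in the snippet above). The
# returned vector has one weight per feature dimension and can serve as a
# video-level descriptor.
import numpy as np

if __name__ == '__main__':
    frames = np.random.rand(20, 128)  # 20 frames, 128-dim features (synthetic)
    video_descriptor = rank_pooling(frames)
    print(video_descriptor.shape)  # (128,)
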
def train_log_regr_liblinear(features, responses):
    echo('Training with Liblinear')
    prob = liblinearutil.problem(responses, features)
    # -s 0: L2-regularized logistic regression (primal)
    # -B 1: fit a bias term
    # -q: quiet mode
    param = liblinearutil.parameter('-s 0 -B 1 -q')
    return liblinearutil.train(prob, param)

def train_SVR_liblinear(features, responses):
    echo('Training with Liblinear')
    prob = liblinearutil.problem(responses, features)
    # -s 11: L2-regularized L2-loss support vector regression (primal)
    # -B 1: fit a bias term
    # -q: quiet mode
    param = liblinearutil.parameter('-s 11 -B 1 -q')
    return liblinearutil.train(prob, param)

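# A hedged example of the two helpers above on toy data; the feature dicts use
# liblinear's 1-based sparse indices, and echo() is assumed to be the project's
# logger. The values are invented for illustration.
features = [{1: 0.5, 2: 1.0}, {1: 1.5}, {2: -0.5}, {1: -1.0, 2: 0.25}]
cls_responses = [1, 1, -1, -1]           # class labels for logistic regression
reg_responses = [0.9, 1.2, -0.7, -1.1]   # real-valued targets for SVR

logreg_model = train_log_regr_liblinear(features, cls_responses)
svr_model = train_SVR_liblinear(features, reg_responses)
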
def train(C, Y_train, X_train, x_lines):
    """
    Takes the training labels and features, trains a model, and saves it.

    :param C: regularization parameter C
    :param X_train: training features
    :param Y_train: training labels
    :return: None
    """
    param = '-s 2 -c ' + str(C)
    model = lu.train(Y_train, X_train, param)
    lu.save_model("model/lmods2_tamper" + str(round(C, 2)) + "_" +
                  str(x_lines) + "l.model", model)

def parallel_train_predict(args):
    print("A process begins.")
    x_train, y_train, x_test, y_test = args
    problem = liblinearutil.problem(y_train, x_train)
    parameter = liblinearutil.parameter('-s 0 -c 1')
    time_start = time.clock()
    model = liblinearutil.train(problem, parameter)
    print("A process training finished in %f." % (time.clock() - time_start))
    time_start = time.clock()
    p_label, p_acc, p_val = liblinearutil.predict(y_test, x_test, model, '-b 0')
    print("A process predicting finished in %f." % (time.clock() - time_start))
    return p_val

def validation(k, data_x, data_y, s, e, C):
    accuracies = []
    params = get_params(s, e, C)
    print('s = {}, e = {}, C = {}'.format(s, e, C))
    for fold in range(k):
        train_x, test_x = get_k_fold(k, fold, data_x)
        train_y, test_y = get_k_fold(k, fold, data_y)
        m = liblinearutil.train(train_y, train_x, params)
        _, p_acc, __ = liblinearutil.predict(test_y, test_x, m)
        accuracies.append(p_acc[0])
    return accuracies

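# A self-contained k-fold sketch in the same spirit as validation() above,
# inlining a simple contiguous split since get_params and get_k_fold are project
# helpers not shown here. This is an illustrative assumption, not the original
# helpers' behavior.
import liblinearutil

def kfold_accuracy(k, data_x, data_y, params='-s 2 -c 1 -q'):
    n = len(data_y)
    fold_size = n // k
    accs = []
    for fold in range(k):
        lo, hi = fold * fold_size, (fold + 1) * fold_size
        train_x = data_x[:lo] + data_x[hi:]
        train_y = data_y[:lo] + data_y[hi:]
        m = liblinearutil.train(train_y, train_x, params)
        _, p_acc, _ = liblinearutil.predict(data_y[lo:hi], data_x[lo:hi], m, '-q')
        accs.append(p_acc[0])
    return accs
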
def train(c, Y_train, X_train):
    """
    Takes the training labels and features, trains a model, and saves it.

    :param c: regularization parameter C
    :param X_train: training features
    :param Y_train: training labels
    :return: None
    """
    param = '-s 2 -c ' + str(c)
    model = lu.train(Y_train, X_train, param)
    lu.save_model("model/lmods2_" + str(round(c, 2)) + ".model", model)

def h_step(features, codes, verbose=True):
    N, D = features.shape
    models = []
    for (y, i) in zip(codes.T, range(codes.shape[1])):
        t_start = timeit.default_timer()
        models.append(liblinearutil.train(y.tolist(), features.tolist(),
                                          str('-s 0 -c 4 -q')))
        t_end = timeit.default_timer()
        if verbose:
            print('[H] {:3d}th bit, {:.4f} seconds elapsed'.format(
                i, t_end - t_start))
    return models

def train(train_data, features, c):
    x = []
    y = []
    for key in train_data:
        y.append(train_data[key]['class'])
        x.append(features[key])
    prob = liblinearutil.problem(y, x)
    param = liblinearutil.parameter('-q -c ' + str(c))
    model = liblinearutil.train(prob, param)
    return model

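# A hedged example of calling the train() helper above; the dict shapes are
# assumptions about the project's data layout (a 'class' entry per key in
# train_data, and a sparse feature dict per key in features).
train_data = {'doc1': {'class': 1}, 'doc2': {'class': -1}}
features = {'doc1': {1: 1.0, 2: 0.5}, 'doc2': {1: -0.3}}
model = train(train_data, features, c=1.0)
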
def train(self, x, y, biased=False):
    data = []
    for sample in x:
        data.append(dict([(self._features.setId(d), sample[d]) for d in sample]))
    labels = [self._labels.setId(C) for C in y]
    if self._labels.count() == 2:
        labels = [1 if label == 1 else -1 for label in labels]
        param = liblinear.parameter(
            "-c 1 -s 2 -q" + (" -B {0}".format(biased) if biased else ""))
    else:
        param = liblinear.parameter(
            "-c 1 -s 4 -q" + (" -B {0}".format(biased) if biased else ""))
    prob = liblinear.problem(labels, data)
    self._model = liblinear.train(prob, param)

def unimodalPredDev(gs, feats, nDim):
    parts = ['dev']
    [cccs, preds] = [{} for i in range(2)]
    for s in parts:
        cccs[s] = -1.0
    warnings.filterwarnings('ignore', category=ConvergenceWarning)
    # Liblinear
    for comp in v.C:
        # Options for liblinear
        options = "-s " + str(v.sVal) + " -c " + str(comp) + " -B 1 -q"
        # We learn the model on train
        model = train(gs['train'][nDim], feats['train'], options)
        # We predict on data
        for s in parts:
            pred = np.array(predict(gs[s][nDim], feats[s], model, "-q"))[0]
            # We calculate the correlation and store it
            ccc = cccCalc(np.array(pred), gs[s][nDim])
            if (ccc > cccs[s]):
                preds[s] = pred
                cccs[s] = ccc
                function = "SVR"
                alpha = comp
    if (v.fullMode == True):
        # We see if we can do better with sklearn
        for nbFunc in range(len(v.lFunc)):
            for c in v.parFunc[nbFunc]:
                func = v.lFunc[nbFunc]
                reg = func[0](alpha=c)
                # One-task prediction
                if (func[1] == 0):
                    reg.fit(feats['train'], gs['train'][nDim])
                    for s in parts:
                        p = reg.predict(feats['dev'])
                        ccc = cccCalc(p, gs[s][nDim])
                        if (ccc > cccs[s]):
                            preds[s] = p
                            cccs[s] = ccc
                            function = func[2]
                            alpha = c
                # Multi-task prediction
                else:
                    reg.fit(feats['train'], np.transpose(gs['train']))
                    for s in parts:
                        p = reg.predict(feats['dev'])[:, nDim]
                        ccc = cccCalc(p, gs[s][nDim])
                        if (ccc > cccs[s]):
                            preds[s] = p
                            cccs[s] = ccc
                            function = func[2]
                            alpha = c
    return cccs, preds, function, alpha

def TOKEN_SVM(matrix, test_matrix, n_authors, doc_authors, vocab, stopwords):
    n_docs = matrix.shape[0]
    n_test_docs = test_matrix.shape[0]
    matrix = matrix / np.sum(matrix, 1)[:, None]
    test_matrix = test_matrix / np.sum(test_matrix, 1)[:, None]
    svm_model = ll.train(sum(doc_authors, []), matrix.tolist(), '-c 4')
    # test labels are unknown, so dummy values are passed to predict
    p_label, p_acc, p_val = ll.predict(np.random.rand(n_test_docs),
                                       test_matrix.tolist(), svm_model)
    author_probs = np.zeros((n_test_docs, n_authors))
    for doc, author in enumerate(p_label):
        author_probs[doc, int(author)] = 1
    return author_probs

def train(train_X, train_Y, test_X, test_Y):
    test_accuracy = []
    # each C value corresponds to an entry of the global lambda_set; test_accuracy
    # actually stores error rates, and the lambda with the smallest error wins
    for c in ('5000', '50', '0.5', '0.005', '0.00005'):
        model = liblinearutil.train(train_Y, train_X, '-s 0 -c ' + c + ' -e 0.000001')
        label, accuracy, value = liblinearutil.predict(test_Y, test_X, model)
        test_accuracy.append((100 - accuracy[0]) / 100)
    return lambda_set[test_accuracy.index(min(test_accuracy))], test_accuracy

def eval_SVM(X, y, Xhat, yhat):
    # create classification problem
    problem = liblinearutil.problem(y, X)
    # set SVM parameters
    svm_param = liblinearutil.parameter('-s 3 -c 10 -q -B 1')
    # train SVM
    model = liblinearutil.train(problem, svm_param)
    # predict and evaluate
    p_label, p_acc, p_val = liblinearutil.predict(yhat, Xhat, model, '-q')
    # compute accuracy
    acc, mse, scc = liblinearutil.evaluations(yhat, p_label)
    return acc

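# A toy invocation of eval_SVM; X/y are the training split and Xhat/yhat the
# held-out split. The data here is synthetic, just to show the expected shapes
# (sparse feature dicts with 1-based indices, integer labels).
X = [{1: 0.2}, {1: 0.9}, {1: 1.7}, {1: 2.4}]
y = [0, 0, 1, 1]
Xhat = [{1: 0.4}, {1: 2.0}]
yhat = [0, 1]
print(eval_SVM(X, y, Xhat, yhat))
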
def AT_FA_SVM(matrix, test_matrix, n_authors, doc_authors, vocab, stopwords):
    # set parameters
    num_topics = 4
    burn_in = 1000  # 0
    alpha = 0.1
    beta = 0.1
    samples = 8
    spacing = 100
    num_test_docs = test_matrix.shape[0]

    doc_authors_new, n_authors_new = add_fic_authors(doc_authors, n_authors)
    sampler = at.AtSampler(num_topics, n_authors_new, alpha, beta)
    print('Starting!')
    theta, phi, likelihood = sampler.train(doc_authors_new, matrix, burn_in,
                                           samples, spacing)
    print('theta:', theta.shape)
    print('phi:', phi.shape)
    print('likelihood:', likelihood)
    sampler.n_authors = num_test_docs
    theta_test = sampler.classify(test_matrix, phi, burn_in, samples, spacing)
    print('theta test:', theta_test.shape)
    training_matrix = concatenate_fic_authors(doc_authors, num_topics)
    test_matrix = np.concatenate((theta_test, theta_test), axis=1)
    training_matrix = training_matrix / np.sum(training_matrix, 1)[:, None]
    test_matrix = test_matrix / np.sum(test_matrix, 1)[:, None]
    svm_model = ll.train(sum(doc_authors, []), training_matrix.tolist(), '-c 4')
    # test labels are unknown, so dummy values are passed to predict
    p_label, p_acc, p_val = ll.predict(np.random.rand(num_test_docs),
                                       test_matrix.tolist(), svm_model)
    author_probs = np.zeros((num_test_docs, n_authors))
    for doc, author in enumerate(p_label):
        author_probs[doc, int(author)] = 1
    return author_probs

def learn_embedding(self, G, **kwargs):
    print('l2svm G')
    print(nx.info(G))
    npairs = G.number_of_nodes() * (G.number_of_nodes() - 1)
    self.mapping = {}
    self.feature_vecs = np.zeros((npairs, len(G.nodes[0]['fingerprint']) * 3))
    labels = np.zeros(npairs) - 1
    k = 0
    nnodes = G.number_of_nodes()
    for i in range(nnodes - 1):
        fpi = G.nodes[i]['fingerprint']
        for j in range(i + 1, nnodes):
            fpj = G.nodes[j]['fingerprint']
            self.feature_vecs[k] = self._make_feature_vecs(fpi, fpj)
            labels[k] = labels[k + 1] = int(G.has_edge(i, j))
            self.mapping[(i, j)] = k
            k += 1
            self.feature_vecs[k] = self._make_feature_vecs(fpj, fpi)
            self.mapping[(j, i)] = k
            k += 1
    assert np.all(labels >= 0)
    print('%d training instances' % (len(labels)))
    """
    self.svm = LinearSVC(loss='hinge', C=self.C, tol=0.1,
                         random_state=self.random_seed, verbose=1)
    self.svm.fit(self.feature_vecs, labels)
    print('Training completed')
    print('Accuracy on training set',
          self.svm.score(self.feature_vecs[:10], labels[:10]))
    """
    # -s 3 is L2-regularized L1-loss function
    params = '-s 2 -C'  # parameter selection
    params = '-s %d -c %f' % (self.s, self.C)
    print('SVM param:', params)
    self.svm = train(labels, self.feature_vecs, params)
    print('SVM instance', self.svm)
    test_true_edges = list(G.edges())[:10]
    test_neg_edges = list(nx.complement(G).edges())[:10]
    yscore = self.get_edge_scores(test_true_edges + test_neg_edges)
    ylabel = np.concatenate((np.ones(10), np.zeros(10)))
    auc = roc_auc_score(ylabel, yscore)
    print('AUC on training set', auc)
    self.G = G

def main():
    y, x = svm_read_problem(feature_file, return_scipy=True)
    # train:test = 7:3
    train_X = x[:14000]
    train_y = y[:14000]
    test_X = x[14000:]
    test_y = y[14000:]
    prob = problem(train_y, train_X)
    param = parameter("-c 1 -s 2")
    model = train(prob, param)
    p_labs, p_acc, p_vals = predict(test_y, test_X, model)
    accuracy, precision, recall = metrics_result(test_y, p_labs)
    print
    print "accuracy: ", accuracy
    print "precision: ", precision
    print "recall: ", recall


if __name__ == "__main__":
    main()

def train(self, x, y):
    """ Train with y=list, x=dict.
    parameter = string of parameters """
    prob = lu.problem(y, x)
    para = "-s %d -c %f -B %f -p %f -e %f" % (self.L, self.c, self.bias,
                                              self.p, self.eps)
    if (self.v != 0):
        para += " -v %d" % self.v
    if (self.q != 0):
        para += " -q"
    print para
    para1 = lu.parameter(para)
    self.model = lu.train(prob, para1)
    return True

def run_classifier(train_file, test_file):
    y_train, x_train = svm_read_problem(train_file)
    # fraction of negative (-1) examples, used to weight the classes
    count_one = 0
    for label in y_train:
        if label == -1:
            count_one += 1
    w1 = count_one / float(len(y_train))
    # w1 = 0.95  # Extra credit
    param = '-s 0 -w1 ' + str(w1) + ' -w-1 ' + str(1 - w1)
    # param = '-s 0'  # Extra credit
    model = train(y_train, x_train, param)
    y_test, x_test = svm_read_problem(test_file)
    p_labels, p_acc, p_vals = predict(y_test, x_test, model, '-b 1')
    accuracy = p_acc[0]
    # find the column of p_vals that corresponds to the +1 class
    index = 0
    if model.label[0] == 1:
        index = 0
    elif model.label[1] == 1:
        index = 1
    prob_list = [vals[index] for vals in p_vals]
    output_tup = (p_labels, y_test, prob_list)
    return output_tup

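# A self-contained sketch of the probability-column lookup used in run_classifier:
# with '-b 1', predict returns one probability column per class, ordered as in
# model.get_labels(), so the column for class +1 must be located by index. The toy
# data below is illustrative only.
from liblinearutil import train, predict

toy_y = [1, -1, 1, -1]
toy_x = [{1: 1.0}, {1: -1.0}, {1: 0.8}, {1: -0.6}]
toy_model = train(toy_y, toy_x, '-s 0 -q')
labs, acc, vals = predict(toy_y, toy_x, toy_model, '-b 1')
pos_col = toy_model.get_labels().index(1)  # column holding P(y = +1)
probs = [v[pos_col] for v in vals]
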
def unimodalPredTest(gs, feats, nDim, func, c):
    [cccs, preds] = [{} for i in range(2)]
    for s in v.aPart:
        cccs[s] = -1.0
    warnings.filterwarnings('ignore', category=ConvergenceWarning)
    if (func == "SVR"):
        # Options for liblinear
        options = "-s " + str(v.sVal) + " -c " + str(c) + " -B 1 -q"
        # We learn the model on train
        model = train(gs['train'][nDim], feats['train'], options)
        # We predict on data
        for s in v.aPart:
            pred = np.array(predict(gs[s][nDim], feats[s], model, "-q"))[0]
            # We calculate the correlation and store it
            ccc = cccCalc(np.array(pred), gs[s][nDim])
            if (ccc > cccs[s]):
                preds[s] = pred
                cccs[s] = ccc
    else:
        for f in v.lFunc:
            if (f[2] == func):
                fun = f
        reg = fun[0](alpha=c)
        if (fun[1] == 0):
            reg.fit(feats['train'], gs['train'][nDim])
            for s in v.aPart:
                p = reg.predict(feats[s])
                ccc = cccCalc(p, gs[s][nDim])
                if (ccc > cccs[s]):
                    preds[s] = p
                    cccs[s] = ccc
        else:
            reg.fit(feats['train'], np.transpose(gs['train']))
            for s in v.aPart:
                p = reg.predict(feats[s])[:, nDim]
                ccc = cccCalc(p, gs[s][nDim])
                if (ccc > cccs[s]):
                    preds[s] = p
                    cccs[s] = ccc
    return cccs, preds, func, c

def _svm_test_attr_unit(worker_idx, idx_attr_rng, feat_train, feat_test,
                        label_train, label_test, attr_entry, cache_dir):
    idx_list = range(idx_attr_rng[0], idx_attr_rng[1])
    c_list = [0.1, 1., 10.]
    pred = np.zeros((label_test.shape[0], len(idx_list)), dtype=np.float32)
    for i, idx in enumerate(idx_list):
        t = time.time()
        l_train = label_train[:, idx].astype(np.int)
        l_test = label_test[:, idx].astype(np.int)
        w1 = l_train.size / l_train.sum() - 1
        # w1 = 1.
        # if param_C_by_CV:
        #     c, _ = liblinear.train(l_train, feat_train,
        #                            '-s 0 -B 1. -C -w1 %f -q' % w1)
        #     c = max(0.1, c)
        # else:
        #     c = 512.
        best_acc = -1.
        for c in c_list:
            svm_model = liblinear.train(l_train, feat_train,
                                        '-s 0 -B 1. -c %f -w1 %f -q' % (c, w1))
            svm_out = liblinear.predict(l_test, feat_test, svm_model, '-b 1 -q')
            acc = svm_out[1][0]
            if acc > best_acc:
                best_acc = acc
                best_c = c
                k = svm_model.get_labels().index(1)
                prob = np.array(svm_out[2])[:, k]
                pred[:, i] = prob
        print('worker [%d]: "%s(%d)" [%d/%d], acc: %f, c: %f, time cost: %.2f sec'
              % (worker_idx, attr_entry[idx]['entry'], idx, i, len(idx_list),
                 best_acc, best_c, time.time() - t))
    io.save_data(pred, os.path.join(cache_dir, '%d.pkl' % worker_idx))

def Train(self):
    # Check classifier type
    if (self.classifierType == "SVM"):
        if (self.packageType == "liblinear"):
            from liblinearutil import train
            self.cParam = 4  # Best cross validation accuracy
            self.nFoldsParam = 10
            self.classifierModel = train(self.trainTargets, self.trainFeatures,
                                         '-c ' + str(self.cParam))
            train(self.trainTargets, self.trainFeatures,
                  '-c ' + str(self.cParam) + ' -v ' + str(self.nFoldsParam))
        if (self.packageType == "libsvm"):
            from svmutil import svm_train
            self.cParam = 32  # Best cross validation accuracy
            self.nFoldsParam = 10
            self.classifierModel = svm_train(self.trainTargets, self.trainFeatures,
                                             '-c ' + str(self.cParam))
            svm_train(self.trainTargets, self.trainFeatures,
                      '-c ' + str(self.cParam) + ' -v ' + str(self.nFoldsParam))
    elif (self.classifierType == "DecisionTree"):
        if (self.packageType == "nltk"):
            import nltk
            train_set = []
            i = 0
            weights = []
            for fet in self.trainFeatures:
                train_set.append((self.trainFeatures[i], self.trainTargets[i]))
                weights.append(i * 0.5)
                i += 1
            self.classifierModel = nltk.DecisionTreeClassifier.train(
                train_set, entropy_cutoff=.01, depth_cutoff=300,
                binary=True, verbose=True)
            '''
            self.classifierModel = AdaBoostClassifier(
                DecisionTreeClassifier(criterion="entropy", splitter="best",
                                       max_depth=1000),
                algorithm="SAMME", n_estimators=200)
            '''
            '''
            self.classifierModel = AdaBoostClassifier(
                DecisionTreeClassifier(max_depth=1000),
                algorithm="SAMME", n_estimators=200)
            '''
            # self.classifierModel.fit(train_set)
            # sorted(self.classifierModel.labels())
            # print(self.classifierModel)
        elif (self.packageType == "sklearn"):
            import sklearn.tree
            self.classifierModel = sklearn.tree.DecisionTreeClassifier(
                criterion="entropy", splitter="best", max_depth=1000)
            # Convert into array, not dictionary
            trainFeatures = []
            for feature in self.trainFeatures:
                trainFeatures.append(list(feature.values()))
            self.classifierModel.fit(trainFeatures, self.trainTargets)
    elif (self.classifierType == "AdaBoost"):
        if (self.packageType == "sklearn"):
            import sklearn.ensemble
            if (self.baseClassifierType == "DecisionTree"):
                import sklearn.tree
                self.classifierModel = sklearn.ensemble.AdaBoostClassifier(
                    sklearn.tree.DecisionTreeClassifier(
                        criterion="gini", splitter="best", max_depth=1000),
                    algorithm="SAMME", n_estimators=200)
                # Convert into array, not dictionary
                trainFeatures = []
                for feature in self.trainFeatures:
                    trainFeatures.append(list(feature.values()))
                self.classifierModel.fit(trainFeatures, self.trainTargets)
            else:
                print("Only DecisionTree is supported as base classifier")
        else:
            print("Only sklearn is supported for AdaBoost")
    else:
        print("Not supported classifier type")

def train_model():
    """Train the model."""
    y, x = svm_read_problem(TRAIN_INPUT_FILE)
    m = train(y, x, "-c 4")
    save_model(SVM_MODEL_FILE, m)

def svm_test_single_attr():
    # config
    tar_attr_idx = 1
    train_on_val_set = True
    reduced_dim = 512
    whiten = True
    num_attr = 1000

    opt = TestAttributeOptions().parse()

    # extract feature
    feat_data = extract_feature(opt)
    feat_train = feat_data['feat_train']
    feat_test = feat_data['feat_test']
    print('extract feature done!')

    # load attribute label
    attr_label = io.load_data('datasets/DeepFashion/Fashion_design/' + opt.fn_label)
    attr_entry = io.load_json('datasets/DeepFashion/Fashion_design/' + opt.fn_entry)
    label_train = np.array([attr_label[s_id] for s_id in feat_data['id_list_train']])
    label_test = np.array([attr_label[s_id] for s_id in feat_data['id_list_test']])
    label_train = label_train[:, 0:num_attr]
    label_test = label_test[:, 0:num_attr]
    # label_train = np.random.choice([0, 1], size=(feat_train.shape[0], num_attr))
    # label_test = np.random.choice([0, 1], size=(feat_test.shape[0], num_attr))

    # get validation feature and label
    id_list_val = io.load_json(
        'datasets/DeepFashion/Fashion_design/Split/ca_split.json')['val']
    id2idx = {s_id: idx for idx, s_id in enumerate(feat_data['id_list_train'])}
    idx_list_val = [id2idx[s_id] for s_id in id_list_val]
    feat_val = feat_train[idx_list_val, :]
    label_val = label_train[idx_list_val, :]
    if train_on_val_set:
        feat_train = feat_val
        label_train = label_val

    print('PCA reduction and whitening...')
    t = time.time()
    pca = PCA(n_components=reduced_dim, whiten=whiten)
    pca.fit(feat_train)
    feat_train = pca.transform(feat_train)
    feat_test = pca.transform(feat_test)
    print('PCA done! (%f sec)' % (time.time() - t))

    t = time.time()
    print('selected attribute: %s(%d)' % (attr_entry[tar_attr_idx]['entry'],
                                          attr_entry[tar_attr_idx]['type']))
    label_train = label_train[:, tar_attr_idx].astype(np.int)
    label_test = label_test[:, tar_attr_idx].astype(np.int)
    # w1 = label_train.size / label_train.sum() - 1
    w1 = 1.
    print('w1: %f' % w1)
    # best_c, _ = liblinear.train(label_train, feat_train,
    #                             '-s 0 -B 1. -C -w1 %f -q' % w1)
    for c in [0.1, 1., 10.]:
        svm_model = liblinear.train(label_train, feat_train,
                                    '-s 0 -B 1. -c %f -w1 %f -q' % (c, w1))
        svm_out = liblinear.predict(label_test, feat_test, svm_model, '-b 1 -q')
        print('c = %f, acc = %f' % (c, svm_out[1][0]))
    k = svm_model.get_labels().index(1)
    prob = np.array(svm_out[2])[:, k]
    print('SVM training time: %f sec' % (time.time() - t))

    crit_ap = MeanAP()
    crit_ap.add(prob.reshape(-1, 1), label_test.reshape(-1, 1))
    ap, _ = crit_ap.compute_mean_ap()
    print('AP: %f' % ap)

def train(self):
    if os.path.isfile("svm.model") and self.useModel:
        self.model = llu.load_model("svm.model")
    else:
        self.model = llu.train(self.ys, self.xs, self.train_param)
        llu.save_model("svm.model", self.model)

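# A hedged round-trip sketch of the caching pattern above: save_model writes a
# plain-text model file that load_model can restore later, which is what the
# useModel flag short-circuits to. Toy data for illustration only.
import liblinearutil as llu

ys = [1, -1, 1, -1]
xs = [{1: 0.5}, {1: -0.5}, {1: 1.5}, {1: -1.5}]
m = llu.train(ys, xs, '-s 2 -q')
llu.save_model('svm.model', m)
m2 = llu.load_model('svm.model')
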
def train(word_dict):
    get_feature(word_dict, "data/train.dat", "data/train.format")
    get_feature(word_dict, "data/test.dat", "data/test.format")
    train_y, train_x = linear.svm_read_problem("data/train.format")
    model = linear.train(train_y, train_x)
    linear.save_model("model.dat", model)

def train(x, y, c, params='-s 2 -B 1 -q'):
    return liblin.train(y.tolist(), x.tolist(), '-c {} '.format(c) + params)

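# Example call for the thin wrapper above, assuming numpy arrays as inputs (the
# wrapper converts them with tolist()); c is the SVM cost and params keeps the
# solver/bias/quiet defaults. Synthetic data for illustration.
import numpy as np

x = np.random.rand(10, 4)
y = np.array([1, -1] * 5)
model = train(x, y, c=0.5)
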
def classify(ds_cur=None):
    from os import chdir, system
    chdir('./liblinear-2.1/python/')
    from liblinearutil import problem, parameter, train, predict
    chdir('../../')
    from pdb import set_trace
    from tqdm import tqdm
    from pymongo import MongoClient
    from json import dumps
    from bson.objectid import ObjectId

    set_trace()
    dont_include = {'_id': 0}
    print 'List of variables:\n'
    for key in variable_lookup:
        print key[1]
    ch1 = raw_input('Input "s" to select custom fields (default selection - all fields):')
    if ch1 == 's':
        print 'Please input 0 for fields you would like to exclude; any other input includes the field.'
        for key in variable_lookup:
            if key[0] == 'class':
                continue
            ch2 = raw_input(key[1] + ':')
            if ch2 == '0':
                dont_include[key[0]] = 0
    if ds_cur is None:
        conn = MongoClient('mongodb://localhost:27017')
        dataset = conn['rmpdb']['dataset_profs_ten_over']
        ds_cur = dataset.find(filter={}, projection=dont_include)
        dataset2 = conn['rmpdb']['dataset_profs_five_over_less_ten']
        ds_cur2 = dataset2.find(filter={}, projection=dont_include)

    X = []     # Variables
    Y = []     # Classes
    ids = []   # Keep track of professor IDs
    X2 = []    # Variables
    Y2 = []    # Classes
    ids2 = []  # Keep track of professor IDs

    print 'Building training set according to selection..'
    for row in tqdm(ds_cur):
        x_dict = dict()
        for key in row:
            if key == 'class':
                Y.append(int(row[key]))
            elif key == 'prof_id':
                ids.append(row[key])
            elif isNan(row[key]):
                continue
            else:
                x_dict[int(key)] = float(row[key])
        X.append(x_dict)
    for row in tqdm(ds_cur2):
        x_dict2 = dict()
        for key in row:
            if key == 'class':
                Y2.append(int(row[key]))
            elif key == 'prof_id':
                ids2.append(row[key])
            elif isNan(row[key]):
                continue
            else:
                x_dict2[int(key)] = float(row[key])
        X2.append(x_dict2)

    ch = raw_input('Include top words for males and females as features? (y/n) [n]: ')
    if ch == 'y':
        from glob import glob
        from json import loads
        vec_files = glob('../logs/*.vec')
        if not len(vec_files) == 0:
            print 'Word vector files found in ../logs: \n'
            print vec_files
            fch = raw_input('Enter name of file without extension. [../logs/trial0.vec] Enter 0 to skip. ../logs/')
            if fch == '0':
                male_vector, female_vector = build_vector()
            else:
                try:
                    f = open('../logs/' + fch + '.vec', 'r')
                    male_vector, female_vector = loads(f.read())
                except:
                    f = open('../logs/trial0.vec', 'r')
                    male_vector, female_vector = loads(f.read())
        else:
            male_vector, female_vector = build_vector()
        print 'Male vectors as (word, count)'
        print male_vector
        print "============================================="
        print 'Female vectors as (word, count)'
        print female_vector
        print "============================================="
        print 'Calculating word features for all professors in the dataset. This will take some time.'
        print 'Depending on your cutoff, this can take from 4 - 6 hours. Probably a good idea to get some other stuff done..'
        male_words = [tup[0] for tup in male_vector]
        female_words = [tup[0] for tup in female_vector]
        union_words = list(set(male_words).union(set(female_words)))
        final_words = list()
        print 'Select words you want to remove by entering "x".'
        for word in union_words:
            wch = raw_input(word + ':')
            if wch == 'x':
                continue
            else:
                final_words.append(word)
        from string import punctuation
        exclude = set(punctuation)
        rmpdb = MongoClient('mongodb://localhost:27017')['rmpdb']
        for i in tqdm(range(len(ids))):
            prof_id = ids[i]
            vec_dict = dict()
            for word in final_words:
                vec_dict[word] = 0
            prof_comments = rmpdb['profs'].find_one(
                {'_id': ObjectId(prof_id)},
                {'_id': 0, 'all comments.rComments': 1})
            for comment in prof_comments['all comments']:
                text = comment['rComments']
                no_punc_text = ''.join(ch for ch in text if ch not in exclude)
                toks = no_punc_text.split()
                for tok in toks:
                    if tok.lower() in vec_dict:
                        vec_dict[tok.lower()] += 1
            feature_counter = 53  # starts right after variable_lookup['53']
            for j in range(len(final_words)):
                feature_counter += 1
                word = final_words[j]
                if not vec_dict[word] == 0:
                    X[i][feature_counter] = vec_dict[word]
        print "Building test set.."
        for i in tqdm(range(len(ids2))):
            prof_id = ids2[i]
            vec_dict = dict()
            for word in final_words:
                vec_dict[word] = 0
            prof_comments = rmpdb['profs'].find_one(
                {'_id': ObjectId(prof_id)},
                {'_id': 0, 'all comments.rComments': 1})
            for comment in prof_comments['all comments']:
                text = comment['rComments']
                no_punc_text = ''.join(ch for ch in text if ch not in exclude)
                toks = no_punc_text.split()
                for tok in toks:
                    if tok.lower() in vec_dict:
                        vec_dict[tok.lower()] += 1
            feature_counter = 53  # starts right after variable_lookup['53']
            for j in range(len(final_words)):
                feature_counter += 1
                word = final_words[j]
                if not vec_dict[word] == 0:
                    X2[i][feature_counter] = vec_dict[word]
        print 'Words used:'
        print final_words
    else:
        pass

    print 'Writing temp files for AUC calculation..'
    build_svm_file(X, Y)
    print 'Temp file written..'
    print 'Features used:'
    fstr = list()
    for key in variable_lookup:
        if key[0] in dont_include or key[0] == 'class':
            continue
        else:
            fstr.append(key[1])
    print dumps(fstr)
    print '======================================\n'
    prob = problem(Y, X)
    param = parameter('-s 6 -v 10')
    m = train(prob, param)
    print 'Evaluating..\n'
    system('liblinear-2.1/train -s 6 -v 10 liblinear-2.1/temp_ds')
    model = train(prob, parameter('-s 6 -q'))
    # system('rm liblinear-2.1/temp_ds')
    print 'Testing model on test set..'
    p_Y2, p_acc, p_vals = predict(Y2, X2, model)
    contingency_mat = [[0, 0], [0, 0]]
    for i in range(len(Y2)):
        if (Y2[i] == 0) and (p_Y2[i] == 0):
            contingency_mat[0][0] += 1
        elif (Y2[i] == 0) and (p_Y2[i] == 1):
            contingency_mat[0][1] += 1
        elif (Y2[i] == 1) and (p_Y2[i] == 0):
            contingency_mat[1][0] += 1
        else:
            contingency_mat[1][1] += 1
    return (model, p_acc, contingency_mat)

def train(y, x, params):
    """ Trains on y, x and returns the model. """
    print "Training on data of size: ", len(y)
    m = llu.train(y, x, params)
    return m

# coding: utf-8
import liblinearutil
import outputLIBSVMformat

train_label, train_data = liblinearutil.svm_read_problem("./train_libsvmFormat.txt")
# the kernel is linear
model = liblinearutil.train(train_label, train_data, "-s 3")
test_label, test_data = liblinearutil.svm_read_problem("./test_libsvmFormat.txt")
p_label, p_acc, p_val = liblinearutil.predict(test_label, test_data, model)

def train(instance_file, model_file, param):
    y, x = ll.svm_read_problem(instance_file)
    prob = ll.problem(y, x)
    m = ll.train(prob, param)
    ll.save_model(model_file, m)
    print 'done training', model_file

def simpleLibLinear(X_train, Y_train, c=1.0):
    # c was previously read from an undefined enclosing scope; it is now an
    # explicit parameter with a default
    prob = ll.problem(Y_train, X_train)
    param = ll.parameter('-c ' + str(c))
    m = ll.train(prob, param)
    return m

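# A toy call of simpleLibLinear with invented data, showing the sparse-dict
# feature format and the now-explicit cost keyword.
X_train = [{1: 0.3, 2: 1.1}, {1: -0.8}, {2: 0.6}, {1: 1.2, 2: -0.4}]
Y_train = [1, -1, -1, 1]
m = simpleLibLinear(X_train, Y_train, c=0.5)
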