class CRFTrainer(object):
    """Small wrapper that trains and applies a chain CRF via FrankWolfeSSVM."""

    def __init__(self, c_value, classifier_name='ChainCRF'):
        """Remember the settings and build the learner.

        Raises TypeError when ``classifier_name`` is not 'ChainCRF'.
        """
        self.c_value = c_value
        self.classifier_name = classifier_name

        # Guard clause: only the chain CRF is supported.
        if self.classifier_name != 'ChainCRF':
            raise TypeError('Invalid classifier type')

        chain_model = ChainCRF()
        self.clf = FrankWolfeSSVM(model=chain_model, C=self.c_value,
                                  max_iter=50)

    def load_data(self):
        """Return ``(X, y, folds)`` taken from ``load_letters()``.

        X is a numpy array of samples where each sample has the shape
        (n_letters, n_features).
        """
        dataset = load_letters()
        raw_x = dataset['data']
        raw_y = dataset['labels']
        fold_ids = dataset['folds']
        return np.array(raw_x), np.array(raw_y), fold_ids

    def train(self, X_train, y_train):
        """Fit the wrapped learner on the training sequences."""
        self.clf.fit(X_train, y_train)

    def evaluate(self, X_test, y_test):
        """Return the learner's score on the given test data."""
        return self.clf.score(X_test, y_test)

    def classify(self, input_data):
        """Run the classifier on input data; return the first prediction."""
        predictions = self.clf.predict(input_data)
        return predictions[0]
def fit_predict(train_docs, test_docs, dataset, C, class_weight, constraints,
                compat_features, second_order, coparents, grandparents,
                siblings, exact_test=False):
    """Fit an ArgumentGraphCRF on ``train_docs`` and predict ``test_docs``.

    All vectorizers are built from the training documents only. When
    ``exact_test`` is true, ``clf.model.exact`` is set to True before
    predicting.

    Returns a tuple ``(clf, Y_te, Y_pred)``: the fitted learner, the test
    labels (``doc.label`` of each test document) and the predictions.
    """
    stats = stats_train(train_docs)

    prop_vect, _ = prop_vectorizer(train_docs,
                                   which=dataset,
                                   stats=stats,
                                   n_most_common_tok=None,
                                   n_most_common_dep=2000,
                                   return_transf=True)
    link_vect = link_vectorizer(train_docs, stats, n_most_common=500)

    if second_order:
        sec_ord_vect = second_order_vectorizer(train_docs)
    else:
        sec_ord_vect = None

    # Only the last two of the five stats entries are needed here.
    _, _, _, pmi_in, pmi_out = stats

    def _to_features_and_labels(docs):
        features = []
        for doc in docs:
            features.append(_vectorize(doc, pmi_in, pmi_out, prop_vect,
                                       link_vect, sec_ord_vect))
        targets = [doc.label for doc in docs]
        return features, targets

    X_tr, Y_tr = _to_features_and_labels(train_docs)
    X_te, Y_te = _to_features_and_labels(test_docs)

    graph_model = ArgumentGraphCRF(class_weight=class_weight,
                                   constraints=constraints,
                                   compat_features=compat_features,
                                   coparents=coparents,
                                   grandparents=grandparents,
                                   siblings=siblings)
    clf = FrankWolfeSSVM(graph_model, C=C, random_state=0, verbose=1,
                         check_dual_every=25, show_loss_every=25,
                         max_iter=100, tol=0)
    clf.fit(X_tr, Y_tr)

    if exact_test:
        clf.model.exact = True

    return clf, Y_te, clf.predict(X_te)
class CRFTrainer(object):
    """Trains, evaluates and runs a chain CRF learned with FrankWolfeSSVM."""

    def __init__(self, c_value, classifier_name='ChainCRF'):
        """Keep the configuration and create the structured learner.

        Raises TypeError for any ``classifier_name`` other than 'ChainCRF'.
        """
        self.c_value = c_value
        self.classifier_name = classifier_name

        supported = self.classifier_name == 'ChainCRF'
        if not supported:
            raise TypeError('Invalid classifier type')

        self.clf = FrankWolfeSSVM(model=ChainCRF(),
                                  C=self.c_value,
                                  max_iter=50)

    def load_data(self):
        """Load the letters data and return ``(X, y, folds)``.

        X is a numpy array made up of samples; each sample is
        (letters, values).
        """
        letters = load_letters()
        data, labels, folds = (letters['data'], letters['labels'],
                               letters['folds'])
        return np.array(data), np.array(labels), folds

    def train(self, X_train, y_train):
        """Fit the learner on the training data."""
        self.clf.fit(X_train, y_train)

    def evaluate(self, X_test, y_test):
        """Score the learner on the test data."""
        return self.clf.score(X_test, y_test)

    def classify(self, input_data):
        """Run the classifier on the input data; return the first result."""
        return self.clf.predict(input_data)[0]
def structraining(self, bags, mentions, retweets, labels):
    """Build one star-shaped graph per labelled user, then train and score
    an EdgeFeatureGraphCRF.

    For every user in ``bags`` that has a label, node 0 is the user and each
    labelled mentioned / retweeted account becomes a further node connected
    to node 0. Mention edges carry the edge feature 0, retweet edges the
    edge feature 1. Neighbours without an entry in ``bags`` get an all-zero
    feature row of length ``self.top_seq``.

    The first 70% of the graphs are used for training, the rest for
    scoring; the score is printed.

    Args:
        bags: mapping user_id -> feature row for that user.
        mentions: mapping user_id -> iterable of mentioned user ids.
        retweets: mapping user_id -> iterable of retweeted user ids.
        labels: mapping user_id -> class label.
    """
    total_datas = []
    total_labels = []
    print('num_user', len(bags.keys()))

    for user_id, bag in bags.items():
        if user_id not in labels:
            continue

        node_features = [bag]
        node_labels = [labels[user_id]]
        edge_pairs = []
        edge_kinds = []

        # The original iterated ``enumerate(mentioned_ids)``, so every
        # "id" was an (index, id) tuple that never matched ``labels`` and
        # all mention edges were dropped. Iterate the ids themselves.
        neighbour_groups = ((0, mentions[user_id]), (1, retweets[user_id]))
        for edge_kind, neighbour_ids in neighbour_groups:
            for neighbour_id in neighbour_ids:
                if neighbour_id not in labels:
                    continue
                node_labels.append(labels[neighbour_id])
                if neighbour_id in bags:
                    node_features.append(bags[neighbour_id])
                else:
                    node_features.append(np.zeros(self.top_seq))
                # The node just added sits at the end of the node list.
                edge_pairs.append([0, len(node_labels) - 1])
                edge_kinds.append([edge_kind])

        # The empty leading block keeps the float dtype and the
        # (n, top_seq) shape even for users without neighbours.
        features = np.vstack([np.empty((0, self.top_seq))] + node_features)
        edge_nodes = np.array(edge_pairs, dtype=int).reshape(-1, 2)
        edge_features = np.array(edge_kinds, dtype=float).reshape(-1, 1)

        total_datas.append((features, edge_nodes, edge_features))
        total_labels.append(np.array(node_labels))

    ratio = int(len(total_datas) * 0.7)
    print(ratio)
    X_train, y_train = total_datas[:ratio], total_labels[:ratio]
    X_test, y_test = total_datas[ratio:], total_labels[ratio:]

    model = EdgeFeatureGraphCRF(inference_method="max-product")
    ssvm = FrankWolfeSSVM(model=model, C=0.1, max_iter=10)
    ssvm.fit(X_train, y_train)
    result = ssvm.score(X_test, y_test)
    print(result)
def n_cross_valid_crf(X, Y, K, command):
    """Run K-fold cross validation of a linear-chain CRF.

    ``command`` selects the per-fold report: 'metrics_F1',
    'confusion_matrix' or 'write_results'. For 'write_results' the rows
    gathered from all folds are sorted by their first item and printed as
    tab-separated pairs after the last fold.
    """
    collecting = command == 'write_results'
    if collecting:
        list_write = list()

    separator = '------------------------------------------------------'
    folds = KFold(len(X), K, shuffle=True, random_state=0)
    for train_idx, test_idx in folds:
        x_train, y_train = X[train_idx], Y[train_idx]
        x_test, y_test = X[test_idx], Y[test_idx]

        chain = ChainCRF(inference_method='max-product', directed=False,
                         class_weight=None)
        ssvm = FrankWolfeSSVM(model=chain, C=1.0, max_iter=100)
        ssvm.fit(x_train, y_train)
        y_pred = ssvm.predict(x_test)

        print('Accuracy of linear-crf %f:' % ssvm.score(x_test, y_test))
        if command == 'metrics_F1':
            metrics_crf(y_test, y_pred)
        elif command == 'confusion_matrix':
            confusion_matrix_CRF(y_test, y_pred)
        elif collecting:
            list_write += write_results_CRF(test_idx, y_test, y_pred)

        print(separator)
        print(separator)

    if collecting:
        # Sort the collected rows by their index (first item).
        for row in sorted(list_write, key=itemgetter(0)):
            pred_list = row[1]
            test_list = row[2]
            for pos, predicted in enumerate(pred_list):
                print(str(predicted) + '\t' + str(test_list[pos]))
def pick_best_C_value(train_sentences, sentence_labels, test_SF,
                      test_sentences, test_sentence_labels):
    """Scan C values for a chain CRF and return the one with the best F1.

    Starting at 0.10 and stepping by 0.05, 19 values of C are tried. For
    each one a FrankWolfeSSVM is trained, its predictions are stored in
    ``test_SF['predicted_labels']`` and ``evaluating_ote(test_SF)`` supplies
    the F1 score. Ties go to the later (larger) C.

    Side effect: the model of every iteration is pickled to
    'models/ote/otemodel.sav', so the file finally holds the model of the
    LAST C tried, not necessarily of the returned best C.

    ``test_sentence_labels`` is accepted but not used here.
    """
    i = 0.10
    best_C = i
    f_old = 0
    for z in range(1, 20):
        print("----------------- Training on C-value %f" % i)
        modelCRF = ChainCRF()
        ssvm = FrankWolfeSSVM(model=modelCRF, C=i, max_iter=20,
                              random_state=5)
        ssvm.fit(train_sentences, sentence_labels)
        print("\n")
        print("-------- Training complete --------")

        predictions = ssvm.predict(test_sentences)
        test_SF['predicted_labels'] = predictions

        # Saving model. The context manager closes the file handle, which
        # the original left open on every iteration.
        print("Saving model....")
        with open('models/ote/otemodel.sav', 'wb') as model_file:
            pickle.dump(ssvm, model_file)

        # Evaluating trained CRF model
        p, r, f1, common, retrieved, relevant = evaluating_ote(test_SF)
        if f1 >= f_old:
            # save value of 'C'
            f_old = f1
            best_C = i
        i = i + 0.05
    return best_C
def train(trainSetX, trainSetY, testSetX, testSetY):
    """Train an EdgeFeatureGraphCRF with FrankWolfeSSVM and report results.

    The learner logs to 'imagesegmentation-horse-hog_96_lbp_test.model' via
    a SaveLogger with ``save_every=1``. After fitting, the elapsed time is
    printed and ``evaluatePerformance`` is called for the training set and
    then for the test set.
    """
    modelLogger = SaveLogger(
        'imagesegmentation-horse-hog_96_lbp_test.model', save_every=1)

    # Loading a previously trained model is disabled; the CRF below is
    # always trained from scratch. Use ``modelLogger.load()`` to reload.
    print('Loading trained model for CRF')

    print('Training CRF...')
    start_time = time.time()
    crf = EdgeFeatureGraphCRF()
    clf = FrankWolfeSSVM(model=crf, C=10., tol=.1, verbose=3,
                         show_loss_every=1, logger=modelLogger)
    clf.fit(numpy.array(trainSetX), numpy.array(trainSetY))
    elapsed = time.time() - start_time
    print('Training CRF took ' + str(elapsed) + ' seconds')

    rule = ('-----------------------------------------------------------'
            '------------')
    print('SUPERPIXELWISE ACCURACY')
    print(rule)
    print('')
    print('TRAINING SET RESULTS')
    evaluatePerformance(clf, numpy.array(trainSetX), numpy.array(trainSetY))
    print('')
    print('TEST SET RESULTS')
    evaluatePerformance(clf, numpy.array(testSetX), numpy.array(testSetY))
    print(rule)
class CRFModel(object):
    """Chain CRF trained with FrankWolfeSSVM (at most 50 iterations)."""

    def __init__(self, c_val=1.0):
        """Create the learner with regularisation constant ``c_val``."""
        chain_model = ChainCRF()
        self.clf = FrankWolfeSSVM(model=chain_model, C=c_val, max_iter=50)

    def load_data(self):
        """Return ``(X, y, folds)`` from ``load_letters()``."""
        dataset = load_letters()
        samples = np.array(dataset['data'])
        targets = np.array(dataset['labels'])
        return samples, targets, dataset['folds']

    def train(self, X_train, y_train):
        """Fit the learner on the training data."""
        self.clf.fit(X_train, y_train)

    def evaluate(self, X_test, y_test):
        """Return the learner's score on the test data."""
        return self.clf.score(X_test, y_test)

    def classify(self, input_data):
        """Predict for ``input_data`` and return the first prediction."""
        predicted = self.clf.predict(input_data)
        return predicted[0]


def convert_to_letters(indices):
    """Map 0-based alphabet indices to a lowercase string (0 -> 'a')."""
    lowercase = np.array(list(string.ascii_lowercase))
    return ''.join(np.take(lowercase, indices))
def train_SSVM(X_train, y_train):
    """Fit a linear-chain CRF with FrankWolfeSSVM and return the learner."""
    # Train using a linear chain CRF, see
    # https://groups.google.com/forum/#!topic/pystruct/KIkF7fzCyDI
    #
    # Notes kept from earlier experiments: NSlackSSVM (C=.1, max_iter=11)
    # behaved almost like FrankWolfeSSVM, OneSlackSSVM did not work as well,
    # and "c=0.2 -> 62.86 % accuracy <==> c=0.1".
    learner = FrankWolfeSSVM(model=ChainCRF(), C=0.001, max_iter=11)
    learner.fit(X_train, y_train)
    print("Learning complete...")
    return learner
class CRFTrainer(object):
    """Chain CRF trainer built on FrankWolfeSSVM and the featurize helpers."""

    def __init__(self, c_value, classifier_name='ChainCRF'):
        """Store the settings and create the learner (max_iter=100).

        Raises TypeError when ``classifier_name`` is not 'ChainCRF'.
        """
        self.c_value = c_value
        self.classifier_name = classifier_name

        # Only the chain CRF is supported, so reject anything else.
        if self.classifier_name != 'ChainCRF':
            raise TypeError('Invalid classifier type')

        # The classifier that is used together with the CRF model.
        self.clf = FrankWolfeSSVM(model=ChainCRF(), C=self.c_value,
                                  max_iter=100)

    def load_clean_data(self):
        """Run the featurize steps and return ``(df, X, y)``.

        X is a numpy array of samples where each sample has the shape
        (n_letters, n_features).
        """
        df = featurize.get_data()
        feature_steps = (
            featurize.split_words,
            featurize.first_letter_uppercase,
            featurize.has_number,
            featurize.has_slash,
            featurize.spacy_pos_tagger,
            featurize.pos_ngrams,
            featurize.encoding_labels,
        )
        for step in feature_steps:
            step(df)
        X, y = featurize.get_X_and_y(df)
        return df, X, y

    def cross_val(self, X_train, y_train):
        """Conduct 5-fold cross validation and print each fold's score.

        A fresh learner with C=0.5 and max_iter=15 is trained per fold.
        """
        splitter = KFold(len(X_train), n_folds=5, random_state=None,
                         shuffle=False)
        for fit_idx, val_idx in splitter:
            fold_learner = FrankWolfeSSVM(model=ChainCRF(), C=0.5,
                                          max_iter=15)
            fold_learner.fit(X_train[fit_idx], y_train[fit_idx])
            print(fold_learner.score(X_train[val_idx], y_train[val_idx]))

    def train(self, X_train, y_train):
        """Fit the learner on the training data."""
        self.clf.fit(X_train, y_train)

    def evaluate(self, X_test, y_test):
        """Return the learner's score on the test data."""
        return self.clf.score(X_test, y_test)

    def classify(self, input_data):
        """Run the classifier on input data; return the first prediction."""
        return self.clf.predict(input_data)[0]
def create_crf(self):
    """Create the chain-CRF learner, train it and report success.

    Training and evaluation are repeated until the score exceeds 0.90 or
    the attempts run out (10 attempts when the fold was generated, otherwise
    a single one). Exceptions during an attempt are logged and the attempt
    counts as used.

    :return: True when the most recently computed score is above 0.
    """
    # Loading the nltk tagger is time consuming but only needed once.
    self.nltk_tagger = nltk.tag._get_tagger()
    self.crf = FrankWolfeSSVM(model=ChainCRF(), C=1.0, max_iter=50)

    loaded = self.load_training_data()
    self.X, self.y, self.label_code, self.folds, generate_fold = loaded

    score = 0
    # Retrying only makes sense when the fold was generated.
    attempts = 10 if generate_fold else 1
    for _ in range(attempts):
        try:
            X_train, y_train = self.get_train_data()
            self.train(X_train, y_train)
            X_test, y_test = self.get_test_data()
            score = self.evaluate(X_test, y_test)
        except Exception as e:
            current_app.logger.error('Exception: %s' % (str(e)))
            current_app.logger.error(traceback.format_exc())
        if not score <= 0.90:
            break

    return score > 0
def test_multinomial_blocks_frankwolfe_batch():
    """Batch-mode Frank-Wolfe must reproduce the training labels."""
    samples, targets = generate_blocks_multinomial(n_samples=10, noise=0.3,
                                                   seed=0)
    learner = FrankWolfeSSVM(model=GridCRF(inference_method='qpbo'),
                             C=1, max_iter=500, batch_mode=True)
    learner.fit(samples, targets)
    predicted = learner.predict(samples)
    assert_array_equal(targets, predicted)
def test_multinomial_blocks_frankwolfe():
    """Frank-Wolfe (50 iterations, verbose) must reproduce the labels."""
    samples, targets = generate_blocks_multinomial(n_samples=10, noise=0.5,
                                                   seed=0)
    grid_model = GridCRF(inference_method='qpbo')
    learner = FrankWolfeSSVM(model=grid_model, C=1, max_iter=50, verbose=3)
    learner.fit(samples, targets)
    assert_array_equal(targets, learner.predict(samples))
def scope_trainer(sentence_dicts):
    """Train the scope chain CRF.

    Returns ``(scope_ssvm, scope_vec)``: the fitted FrankWolfeSSVM and the
    DictVectorizer that was fitted on the training instances.
    """
    instances, labels, splits = extract_features_scope(sentence_dicts,
                                                       'training')
    scope_vec = DictVectorizer()
    dense_features = scope_vec.fit_transform(instances).toarray()
    X_train, y_train = split_data(dense_features, labels, splits)

    scope_ssvm = FrankWolfeSSVM(model=ChainCRF(), C=0.20, max_iter=10)
    scope_ssvm.fit(X_train, y_train)
    return scope_ssvm, scope_vec
def __init__(self, c_value, classifier_name='ChainCRF'):
    """Store the settings and build the FrankWolfeSSVM chain-CRF learner.

    Raises TypeError when ``classifier_name`` is not 'ChainCRF'.
    """
    self.c_value = c_value
    self.classifier_name = classifier_name

    # Only the chain CRF is supported; bail out early otherwise.
    if self.classifier_name != 'ChainCRF':
        raise TypeError('Invalid classifier type')

    chain_model = ChainCRF()
    self.clf = FrankWolfeSSVM(model=chain_model, C=self.c_value,
                              max_iter=50)
def test_multinomial_blocks_frankwolfe():
    """Block-coordinate Frank-Wolfe with line search must fit the data."""
    samples, targets = generate_blocks_multinomial(n_samples=50, noise=0.5,
                                                   seed=0)
    grid_model = GridCRF(inference_method='qpbo')
    learner = FrankWolfeSSVM(model=grid_model, C=1, line_search=True,
                             batch_mode=False, check_dual_every=500)
    learner.fit(samples, targets)
    predicted = learner.predict(samples)
    assert_array_equal(targets, predicted)
def main():
    """Command-line entry point: train a chain CRF labeller and pickle it.

    Reads the untokenized file and the matching BIO file, prepares the data
    with ``prepdata``, fits a FrankWolfeSSVM chain CRF and, when an output
    file was given, pickles a dict with the keys 'model', 'feats' and
    'labels' into it.
    """
    parser = argparse.ArgumentParser(
        description="learn to segment and tokenize (really, any labeling)",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        "--untokfile", "-u",
        nargs='?',
        type=argparse.FileType('r'),
        default=sys.stdin,
        help="untok file")
    parser.add_argument(
        "--biofile", "-b",
        nargs='?',
        type=argparse.FileType('r'),
        default=sys.stdin,
        help="bio file. must match untok file and be space separated")
    parser.add_argument(
        "--outfile", "-o",
        nargs='?',
        type=argparse.FileType('wb'),
        default=None,
        help="output file")
    parser.add_argument(
        "--debug", "-d",
        action='store_true',
        default=False,
        help="debug mode")

    try:
        args = parser.parse_args()
    except IOError as msg:
        parser.error(str(msg))

    untok_stream = prepfile(args.untokfile, 'r')
    bio_stream = prepfile(args.biofile, 'r')
    data, labels, datamap, labelmap = prepdata(untok_stream, bio_stream,
                                               args.debug)

    ssvm = FrankWolfeSSVM(model=ChainCRF(), max_iter=100, C=.1)
    ssvm.fit(data, labels)

    bundle = {'model': ssvm, 'feats': datamap, 'labels': labelmap}
    if args.outfile is not None:
        pickle.dump(bundle, args.outfile)
def train_scope_learner(sentence_dicts, C_value):
    """Train a scope chain CRF with regularisation ``C_value``.

    Returns ``(scope_ssvm, vectorizer)``: the fitted FrankWolfeSSVM and the
    DictVectorizer fitted on the training instances.
    """
    # The first returned item (the scope sentence dicts) is not needed here.
    _, instances, labels, splits = extract_features_scope(sentence_dicts,
                                                          'training')
    vectorizer = DictVectorizer()
    dense_features = vectorizer.fit_transform(instances).toarray()
    X_train, y_train = make_splits(dense_features, labels, splits)

    scope_ssvm = FrankWolfeSSVM(model=ChainCRF(), C=C_value, max_iter=10)
    scope_ssvm.fit(X_train, y_train)
    return scope_ssvm, vectorizer
def __init__(self, c_value, classifier_name='ChainCRF'):
    """Store the settings and build the chain-CRF learner (max_iter=100).

    Raises TypeError when ``classifier_name`` is not 'ChainCRF'.
    """
    self.c_value = c_value
    self.classifier_name = classifier_name

    # The data is analysed with a chain CRF, so anything else is an error.
    if self.classifier_name != 'ChainCRF':
        raise TypeError('Invalid classifier type')

    # The classifier that is used together with the CRF model.
    chain_model = ChainCRF()
    self.clf = FrankWolfeSSVM(model=chain_model, C=self.c_value,
                              max_iter=100)
def __init__(self):
    """Create one two-step Pipeline ('extract', 'clf') per classifier."""

    def _pipeline(extractor, estimator):
        # Every classifier shares the same extract -> clf layout.
        return Pipeline([('extract', extractor), ('clf', estimator)])

    # Multinomial Naive Bayes
    self.classifierMNB = _pipeline(ExtractFeatures(),
                                   MultinomialNB(alpha=0.5))

    # Maximum Entropy (implemented with logistic regression)
    self.classifierMaxEnt_LogReg = _pipeline(
        ExtractFeatures(), linear_model.LogisticRegression())

    # CRF
    self.classifierCRF = _pipeline(
        ExtractFeaturesToArray(),
        FrankWolfeSSVM(model=ChainCRF(), C=2, max_iter=10, tol=0.01))

    # Support Vector Machine
    self.classifierSVM = _pipeline(ExtractFeatures(), svm.LinearSVC())
def graph_crf():
    """Train a directed GraphCRF on 100 copies of one toy graph and print
    the predictions for the first three copies.
    """
    # Kept from the original flow; this instance is not used further down.
    crf = GraphCRF()

    # Node features, at most two attributes per node. Variables only have
    # one attribute (their assigned value), so the second one is zero. The
    # function has two attributes, flagged by an indicator. The "if" node
    # has a single condition that checks for the value 1.
    features = np.array([
        [1, 0],  # var_1
        [2, 0],  # var_2
        [1, 1],  # function
        [1, 0],  # if
    ])

    # Four edges between the nodes above.
    edges = np.array([
        [0, 1],  # (v1, v2)
        [0, 2],  # (v1, func)
        [1, 2],  # (v2, func)
        [0, 3],  # (v1, if)
    ])
    X_train_sample = (features, edges)

    # Enumerated action per node (variable, function, if, ...).
    y_train_sample = np.array([0, 0, 1, 2])

    # Build a full training set by repeating the single sample.
    n_samples = 100
    X_train = [X_train_sample] * n_samples
    y_train = [y_train_sample] * n_samples

    model = GraphCRF(directed=True, inference_method="max-product")
    ssvm = FrankWolfeSSVM(model=model, C=.1, max_iter=10)
    ssvm.fit(X_train, y_train)

    # predict something
    output = ssvm.predict(X_train[0:3])
    print(output)
def main():
    """Parse the command line, train a chain CRF and pickle the result.

    The pickled object is a dict with the keys "model" (the fitted
    FrankWolfeSSVM), "feats" and "labels" (the maps returned by
    ``prepdata``). Nothing is written when no output file was given.
    """
    parser = argparse.ArgumentParser(
        description="learn to segment and tokenize (really, any labeling)",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    # (flags, keyword arguments) for every option, in declaration order.
    option_specs = [
        (("--untokfile", "-u"),
         dict(nargs="?", type=argparse.FileType("r"), default=sys.stdin,
              help="untok file")),
        (("--biofile", "-b"),
         dict(nargs="?", type=argparse.FileType("r"), default=sys.stdin,
              help="bio file. must match untok file and be space separated")),
        (("--outfile", "-o"),
         dict(nargs="?", type=argparse.FileType("wb"), default=None,
              help="output file")),
        (("--debug", "-d"),
         dict(action="store_true", default=False, help="debug mode")),
    ]
    for flags, options in option_specs:
        parser.add_argument(*flags, **options)

    try:
        args = parser.parse_args()
    except IOError as msg:
        parser.error(str(msg))

    untokfile = prepfile(args.untokfile, "r")
    biofile = prepfile(args.biofile, "r")
    data, labels, datamap, labelmap = prepdata(untokfile, biofile,
                                               args.debug)

    learner = FrankWolfeSSVM(model=ChainCRF(), max_iter=100, C=0.1)
    learner.fit(data, labels)

    ret = dict()
    ret["model"] = learner
    ret["feats"] = datamap
    ret["labels"] = labelmap
    if args.outfile is None:
        return
    pickle.dump(ret, args.outfile)
def test_svm_as_crf_pickling_bcfw():
    """A GraphCRF without edges, trained with FrankWolfeSSVM on iris, must
    score above .97 both directly and after reloading via SaveLogger.
    """
    import os

    iris = load_iris()
    X, y = iris.data, iris.target

    # One node per sample and an empty edge list. ``np.int`` was removed in
    # NumPy 1.24, so the builtin ``int`` is used for the edge dtype.
    X_ = [(np.atleast_2d(x), np.empty((0, 2), dtype=int)) for x in X]
    Y = y.reshape(-1, 1)

    X_train, X_test, y_train, y_test = train_test_split(X_, Y, random_state=1)

    # mkstemp returns an OPEN descriptor; close it right away (only the
    # path is needed) and delete the file when the test is done.
    fd, file_name = mkstemp()
    os.close(fd)
    try:
        pbl = GraphCRF(n_features=4, n_states=3, inference_method='unary')
        logger = SaveLogger(file_name)
        svm = FrankWolfeSSVM(pbl, C=10, logger=logger, max_iter=50)
        svm.fit(X_train, y_train)

        assert_less(.97, svm.score(X_test, y_test))
        assert_less(.97, logger.load().score(X_test, y_test))
    finally:
        os.remove(file_name)
def test_svm_as_crf_pickling_batch():
    """A GraphCRF without edges, trained with FrankWolfeSSVM on iris, must
    score above .97 both directly and after reloading via SaveLogger.

    NOTE(review): the test name says "batch" but the learner is created
    with ``batch_mode=False`` -- confirm which mode is intended.
    """
    import os

    iris = load_iris()
    X, y = iris.data, iris.target

    # One node per sample and an empty edge list. ``np.int`` was removed in
    # NumPy 1.24, so the builtin ``int`` is used for the edge dtype.
    X_ = [(np.atleast_2d(x), np.empty((0, 2), dtype=int)) for x in X]
    Y = y.reshape(-1, 1)

    X_train, X_test, y_train, y_test = train_test_split(X_, Y, random_state=1)

    # mkstemp returns an OPEN descriptor; close it right away (only the
    # path is needed) and delete the file when the test is done.
    fd, file_name = mkstemp()
    os.close(fd)
    try:
        pbl = GraphCRF(n_features=4, n_states=3, inference_method='unary')
        logger = SaveLogger(file_name)
        svm = FrankWolfeSSVM(pbl, C=10, logger=logger, max_iter=50,
                             batch_mode=False)
        svm.fit(X_train, y_train)

        assert_less(.97, svm.score(X_test, y_test))
        assert_less(.97, logger.load().score(X_test, y_test))
    finally:
        os.remove(file_name)
def model_test(k, head, tail):
    """Train the CRF on one slice of the data and evaluate on the rest.

    The ids ``dataId[head:tail]`` form the training set and all remaining
    ids the test set. Precision, recall and F1 are computed on the output
    of ``statistic_result`` and of ``statistic_campaign_result`` and put on
    ``result_Queue`` as one list of six values. Finally the time used for
    this fold is printed.

    Reads the module-level names dataId, X_arr, Y_arr, Camp_arr,
    CLASS_WEIGHT, LEARNER and result_Queue.

    NOTE(review): training on the [head:tail] slice and testing on the
    remainder is the reverse of the usual k-fold layout -- confirm that
    this is intended.

    Args:
        k: zero-based fold number, used only in the timing message.
        head: start index of the training slice in ``dataId``.
        tail: end index (exclusive) of the training slice in ``dataId``.

    Raises:
        ValueError: if LEARNER is neither "OneSlackSSVM" nor
            "FrankWolfeSSVM" (the original fell through and failed later
            with a NameError on ``ssvm``).
    """
    each_fold_time = time.time()  # start timing

    # divide train set and test set
    train_id = dataId[head:tail]
    test_id = dataId[:head] + dataId[tail:]
    X_train = X_arr[train_id, :]
    Y_train = Y_arr[train_id]
    X_test = X_arr[test_id, :]
    Y_test = Y_arr[test_id]
    campTest = Camp_arr[test_id]

    if len(X_train) > 0:
        # instantiate the CRF
        EFGCRF = EdgeFeatureGraphCRF(inference_method='qpbo',
                                     class_weight=CLASS_WEIGHT)
        if LEARNER == "OneSlackSSVM":
            # learn the model parameters with OneSlackSSVM
            ssvm = OneSlackSSVM(EFGCRF, C=.1, tol=.1, max_iter=100,
                                switch_to='ad3')
        elif LEARNER == "FrankWolfeSSVM":
            # learn the model parameters with FrankWolfeSSVM
            ssvm = FrankWolfeSSVM(EFGCRF, C=.1, tol=.1, max_iter=100)
        else:
            # no usable learner selected: stop with a clear error
            raise ValueError("Unsupported LEARNER: {!r}".format(LEARNER))

        ssvm.fit(X_train, Y_train)
        Y_pred = ssvm.predict(X_test)

        df_result = statistic_result(Y_pred, Y_test, campTest)
        V_precision = precision_score(df_result["label"], df_result["pred"])
        V_recall = recall_score(df_result["label"], df_result["pred"])
        V_f1 = f1_score(df_result["label"], df_result["pred"])

        camps_pred, camps_lbl = statistic_campaign_result(Y_pred, Y_test)
        C_precision = precision_score(camps_lbl, camps_pred)
        C_recall = recall_score(camps_lbl, camps_pred)
        C_f1 = f1_score(camps_lbl, camps_pred)

        result_Queue.put(
            [V_precision, V_recall, V_f1, C_precision, C_recall, C_f1])
    else:
        print("TRAIN SET is NULL")

    print("the {}th fold using time: {:.4f} min".format(
        k + 1, (time.time() - each_fold_time) / 60))
    # The original ended with ``del X_train, ..., Y_pred, campTest``. That
    # raised NameError on the empty-training-set path (Y_pred is never
    # bound there) and is unnecessary: locals are released on return.
def n_cross_valid_crf_candidate(list_line, X, Y, K):
    """Cross-validate a chain CRF and write the candidate lists to files.

    Every third entry of ``list_line`` (index 0, 3, 6, ...) is stripped and
    split on tabs to form the text that belongs to one sample. For each of
    the K folds a chain CRF is trained and ``metrics_crf_candidate`` returns
    three lists (svc, road, busstop). These are accumulated over all folds
    and finally written with ``write_file`` as 'good_svc', 'good_road' and
    'good_busstop' under 'd:/'.

    Args:
        list_line: raw text lines, three per sample.
        X: indexable array of feature sequences.
        Y: indexable array of label sequences.
        K: number of folds.
    """
    list_text = np.array([
        list_line[i].strip().split('\t')
        for i in range(0, len(list_line), 3)
    ])

    cv = KFold(len(X), K, shuffle=True, random_state=0)
    list_write = []
    for traincv, testcv in cv:
        x_train, x_test = X[traincv], X[testcv]
        y_train, y_test = Y[traincv], Y[testcv]
        list_text_test = list_text[testcv]

        crf = ChainCRF(inference_method='max-product', directed=False,
                       class_weight=None)
        ssvm = FrankWolfeSSVM(model=crf, C=1.0, max_iter=10)
        ssvm.fit(x_train, y_train)
        y_pred = ssvm.predict(x_test)

        list_wrong = metrics_crf_candidate(list_text_test, y_test, y_pred)
        if len(list_write) == 0:
            list_write = list_wrong
        else:
            # Append each category exactly once per fold. The original
            # repeated the same three appends ``len(list_wrong)`` times,
            # which duplicated the results of every later fold.
            for category in range(3):  # 0: svc, 1: road, 2: busstop
                list_write[category] = (list_write[category]
                                        + list_wrong[category])

    write_file('d:/', 'good_svc', list_write[0])
    write_file('d:/', 'good_road', list_write[1])
    write_file('d:/', 'good_busstop', list_write[2])
def MLfitCRF(data_train, data_test, records, folds):
    """Fit a chain CRF on one training sequence and print its test score.

    ``data_train[0]`` / ``data_train[1]`` are wrapped into arrays holding a
    single sequence of features / labels. ``data_test[0]`` /
    ``data_test[1]`` are converted to arrays as they are and used for
    scoring.

    ``records`` and ``folds`` are accepted but not used here.
    """
    fvector = np.array([data_train[0]])
    labels = np.array([data_train[1]])

    # create CRF model
    CRFmodel = ChainCRF()
    # create ML classifier
    ssvm = FrankWolfeSSVM(model=CRFmodel, C=0.1)
    # training
    ssvm.fit(fvector, labels)

    # model testing; the original scored on the undefined name
    # ``fvector_train`` and therefore always raised NameError.
    fvector_test = np.array(data_test[0])
    labels_test = np.array(data_test[1])
    score = ssvm.score(fvector_test, labels_test)
    print(score)
    return
def results_CRFs(X_training, Y_training, X_testing, Y_testing, command):
    """Train a linear-chain CRF, print its accuracy and run one report.

    ``command`` selects the report: 'metrics_F1', 'confusion_matrix' or
    'write_results'. For 'write_results' the pairs returned by
    ``write_CRFs_compare`` are printed as tab-separated lines.
    """
    chain = ChainCRF(inference_method='max-product', directed=False,
                     class_weight=None)
    ssvm = FrankWolfeSSVM(model=chain, C=1.0, max_iter=100)
    ssvm.fit(X_training, Y_training)
    y_pred = ssvm.predict(X_testing)

    rows = list()
    print('Accuracy of linear-crf %f:' % ssvm.score(X_testing, Y_testing))

    if command == 'metrics_F1':
        metrics_crf(Y_testing, y_pred)
    elif command == 'confusion_matrix':
        confusion_matrix_CRF(Y_testing, y_pred)
    elif command == 'write_results':
        rows = write_CRFs_compare(Y_testing, y_pred)

    # ``rows`` stays empty unless results were requested.
    for row in rows:
        pred_list = row[0]
        test_list = row[1]
        for pos, predicted in enumerate(pred_list):
            print(str(predicted) + '\t' + str(test_list[pos]))
def chaincrf_test():
    """Train a chain CRF on loaded pictures and print the test score.

    3000 pictures are loaded, rearranged according to ``mode`` (0: by
    pixel, 1: by row, 2: by picture) and split at 66% of ``num_pics`` into a
    training part and a disjoint test part.
    """
    num_pics = 3000
    X, Y = load_pictures(num_pics)
    X = np.array(X)
    Y = np.array(Y)

    print(X.shape)
    print(Y.shape)

    # 0: pixel, 1: row, 2: picture
    mode = 0
    outstr = "Test score with data arranged by "

    if mode == 0:
        X, Y = arrange_by_pixel(X, Y)
        outstr += "pixel:"
    elif mode == 1:
        X, Y = arrange_by_row(X, Y)
        outstr += "row:"
    elif mode == 2:
        X, Y = arrange_by_picture(X, Y)
        outstr += "picture:"

    print(X.shape)
    print(Y.shape)

    train_pct = 0.66
    # One integer split point for both halves. The original started the
    # test slice at (1 - train_pct) * num_pics, i.e. inside the training
    # slice, so the model was scored on data it had been trained on. It
    # also indexed with the float that math.floor returns on Python 2.
    # NOTE(review): the split is based on num_pics, not on len(X) after
    # rearranging -- confirm the arrange_* helpers keep the sample count.
    split_at = int(math.floor(train_pct * num_pics))

    X_train, X_test = X[:split_at], X[split_at:]
    Y_train, Y_test = Y[:split_at], Y[split_at:]

    model = ChainCRF()
    ssvm = FrankWolfeSSVM(model=model, C=.1, max_iter=10)
    ssvm.fit(X_train, Y_train)
    results = ssvm.score(X_test, Y_test)
    print(outstr)
    print(results)
def chaincrf_test():
    """Train a chain CRF on loaded pictures and print the test score.

    3000 pictures are loaded, rearranged according to ``mode`` (0: by
    pixel, 1: by row, 2: by picture) and split at 66% of ``num_pics`` into a
    training part and a disjoint test part.
    """
    num_pics = 3000
    X, Y = load_pictures(num_pics)
    X = np.array(X)
    Y = np.array(Y)

    print(X.shape)
    print(Y.shape)

    # 0: pixel, 1: row, 2: picture
    mode = 0
    outstr = "Test score with data arranged by "

    if mode == 0:
        X, Y = arrange_by_pixel(X, Y)
        outstr += "pixel:"
    elif mode == 1:
        X, Y = arrange_by_row(X, Y)
        outstr += "row:"
    elif mode == 2:
        X, Y = arrange_by_picture(X, Y)
        outstr += "picture:"

    print(X.shape)
    print(Y.shape)

    train_pct = 0.66
    # Train and test must not overlap: the original test slice began at
    # (1 - train_pct) * num_pics, which lies inside the training slice.
    # An explicit int also avoids indexing with the float that math.floor
    # returns on Python 2.
    # NOTE(review): the split is based on num_pics, not on len(X) after
    # rearranging -- confirm the arrange_* helpers keep the sample count.
    boundary = int(math.floor(train_pct * num_pics))

    X_train = X[:boundary]
    Y_train = Y[:boundary]
    X_test = X[boundary:]
    Y_test = Y[boundary:]

    model = ChainCRF()
    ssvm = FrankWolfeSSVM(model=model, C=.1, max_iter=10)
    ssvm.fit(X_train, Y_train)
    results = ssvm.score(X_test, Y_test)
    print(outstr)
    print(results)
def learn(train_set):
    """Train an edge-free GraphCRF on ``train_set`` and pickle it.

    For every item of ``train_set`` the feature values and segment classes
    are collected with ``get_features_value`` / ``get_segments_classes``.
    Each feature vector becomes a one-node graph without edges. The fitted
    learner is written to the file "classifier" in the current directory
    and also returned.
    """
    X = []
    y = []
    for num in train_set:
        X += get_features_value(num)
        y += get_segments_classes(num)

    X = np.array(X)
    # ``np.int`` was removed in NumPy 1.24; the builtin int is equivalent.
    X = [(np.atleast_2d(x), np.empty((0, 2), dtype=int)) for x in X]
    y = np.vstack(y)

    pbl = GraphCRF(inference_method='unary')
    svm = FrankWolfeSSVM(pbl, C=10, max_iter=50)
    svm.fit(X, y)

    # Close the file deterministically instead of leaking the handle.
    with open("classifier", "wb+") as model_file:
        cPickle.dump(svm, model_file)

    return svm
def __init__(self, do_train=False, trained_model_name="passage_crf_model",
             algorithm="crf"):
    """Set up either a trainer or a tagger for the chosen algorithm.

    With ``algorithm == "crf"`` a ``Trainer`` (training) or ``Tagger``
    (tagging) is created. For any other value a FrankWolfeSSVM chain CRF
    is used: when training a fresh learner and empty index dicts are
    created, otherwise the pickled model is loaded from
    ``trained_model_name`` together with "ssvm_feat_index.pkl" and
    "ssvm_label_index.pkl" (the latter is stored inverted, index -> label).

    NOTE: ``pickle.load`` executes code contained in the file; only load
    model files from a trusted source.
    """
    self.trained_model_name = trained_model_name
    self.fp = FeatureProcessing()
    self.do_train = do_train
    self.algorithm = algorithm

    if algorithm == "crf":
        if do_train:
            self.trainer = Trainer()
        else:
            self.tagger = Tagger()
    else:
        if do_train:
            model = ChainCRF()
            self.trainer = FrankWolfeSSVM(model=model)
            self.feat_index = {}
            self.label_index = {}
        else:
            # Context managers close the three files; the original left
            # every handle open.
            with open(self.trained_model_name, "rb") as model_file:
                self.tagger = pickle.load(model_file)
            with open("ssvm_feat_index.pkl", "rb") as feat_file:
                self.feat_index = pickle.load(feat_file)
            with open("ssvm_label_index.pkl", "rb") as label_file:
                label_index = pickle.load(label_file)
            self.rev_label_index = {i: x for x, i in label_index.items()}
def Chain_CRF(x, y, x_test, model_args):
    """Fit a chain CRF on fixed-length sequences and predict ``x_test``.

    Only the first 11 feature columns are used. Rows are grouped into
    sequences of 21600 steps each. ``model_args`` supplies 'C' and
    'max_iter' for the FrankWolfeSSVM. The predictions are returned as one
    flat array.
    """
    steps_per_sequence = 21600
    feature_columns = 11

    # Reshape for the CRF: (n_sequences, steps, features).
    x = x[:, :feature_columns]
    x_test = x_test[:, :feature_columns]
    x = x.reshape(-1, steps_per_sequence, x.shape[-1])
    x_test = x_test.reshape(-1, steps_per_sequence, x.shape[-1])
    y = y.reshape(-1, steps_per_sequence)

    learner = FrankWolfeSSVM(model=ChainCRF(directed=False),
                             C=model_args['C'],
                             max_iter=model_args['max_iter'])
    learner.fit(x, y)

    predicted = np.array(learner.predict(x_test))
    return predicted.flatten()
def chain_crf():
    """Train a chain CRF on fold 1 of the letters data, score on the rest.

    Prints the loaded data, the split sizes, a few sample shapes, the
    fitted learner and finally the test score.
    """
    letters = load_letters()
    x, y, folds = letters['data'], letters['labels'], letters['folds']
    print("Letters : ")
    print(letters)

    x = np.array(x)
    y = np.array(y)

    in_train = folds == 1
    in_test = folds != 1
    x_train, y_train = x[in_train], y[in_train]
    x_test, y_test = x[in_test], y[in_test]

    print(len(x_train))
    print(len(x_test))
    print("Done")

    for sample in (0, 10):
        print(x_train[sample].shape)
        print(y_train[sample].shape)

    ssvm = FrankWolfeSSVM(model=ChainCRF(), C=.1, max_iter=10)
    print(ssvm.fit(x_train, y_train))
    print(ssvm.score(x_test, y_test))
def test_ssvm_objectives():
    """Check that each learner's recorded objective matches a recomputation.

    Not a very strong test, but it at least makes sure that the objective
    function is called and that the algorithms provide consistent objective
    curves.
    """
    X, Y = generate_blocks_multinomial(n_samples=10, noise=1.5, seed=0)
    n_labels = len(np.unique(Y))
    crf = GridCRF(n_states=n_labels, inference_method=inference_method)

    # n-slack
    learner = NSlackSSVM(model=crf, max_iter=5, C=1, tol=.1)
    learner.fit(X, Y)
    expected = objective_primal(learner.model, learner.w, X, Y, learner.C)
    assert_almost_equal(learner.primal_objective_curve_[-1], expected)

    # one-slack
    learner = OneSlackSSVM(model=crf, max_iter=5, C=1, tol=.1)
    learner.fit(X, Y)
    expected = objective_primal(learner.model, learner.w, X, Y, learner.C,
                                variant='one_slack')
    assert_almost_equal(learner.primal_objective_curve_[-1], expected)

    # subgradient, which should also work in batch mode
    learner = SubgradientSSVM(model=crf, max_iter=5, C=1, batch_size=-1)
    learner.fit(X, Y)
    expected = objective_primal(learner.model, learner.w, X, Y, learner.C)
    assert_almost_equal(learner.objective_curve_[-1], expected)

    # Frank-Wolfe in batch mode, then block-coordinate Frank-Wolfe
    for use_batch in (True, False):
        learner = FrankWolfeSSVM(model=crf, max_iter=5, C=1,
                                 batch_mode=use_batch)
        learner.fit(X, Y)
        expected = objective_primal(learner.model, learner.w, X, Y,
                                    learner.C)
        assert_almost_equal(learner.primal_objective_curve_[-1], expected)
def build_models(X_train, y_train):
    '''
    PURPOSE: output model objects which have been fitted with training data
    INPUT:
        X_train (np.array) - features matrix
        y_train (np.array) - label matrix
    OUTPUT:
        nmb (MultinomialNB obj) - model trained on X_train, y_train
        svm (LinearSVC obj) - model trained on X_train, y_train
        ssvm (PyStruct chainCRF object) - trained Chain CRF model
    '''
    # The two flat classifiers train on all sequences stacked together.

    # Multinomial Naive Bayes Classifier
    nmb = MultinomialNB()
    nmb.fit(np.vstack(X_train), np.hstack(y_train))

    # Support Vector Machine Classifier
    svm = LinearSVC(dual=False, C=.1)
    svm.fit(np.vstack(X_train), np.hstack(y_train))

    # Chain Conditional Random Field Classifier (works on the sequences)
    ssvm = FrankWolfeSSVM(model=ChainCRF(), C=0.5, max_iter=15)
    ssvm.fit(X_train, y_train)

    return nmb, svm, ssvm
class CRFModel(object):
    """Chain CRF trained with FrankWolfeSSVM (at most 100 iterations)."""

    def __init__(self, c_val=1.0):
        """Create the learner with regularisation constant ``c_val``."""
        chain_model = ChainCRF()
        self.clf = FrankWolfeSSVM(model=chain_model, C=c_val, max_iter=100)

    def load_data(self):
        """Load the training data and return ``(X, y, folds)``."""
        dataset = load_letters()
        samples = np.array(dataset['data'])
        targets = np.array(dataset['labels'])
        fold_ids = dataset['folds']
        return samples, targets, fold_ids

    def train(self, X_train, y_train):
        """Train the CRF."""
        self.clf.fit(X_train, y_train)

    def evaluate(self, X_test, y_test):
        """Evaluate the accuracy of the CRF on the test data."""
        return self.clf.score(X_test, y_test)

    def classify(self, input_data):
        """Run the CRF on unknown data; return the first prediction."""
        predicted = self.clf.predict(input_data)
        return predicted[0]
def main():
    """Command-line entry point: learn a chain-CRF tokenizer and pickle it.

    The untokenized file and the BIO file are prepared with ``prepdata``, a
    FrankWolfeSSVM chain CRF is fitted, and a dict with the keys 'model',
    'feats' and 'labels' is pickled to the output file if one was given.
    """
    parser = argparse.ArgumentParser(
        description="learn to tokenize",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    def add_input(long_flag, short_flag, help_text):
        # Both inputs are optional text files that default to stdin.
        parser.add_argument(long_flag, short_flag, nargs='?',
                            type=argparse.FileType('r'), default=sys.stdin,
                            help=help_text)

    add_input("--untokfile", "-u", "untok file")
    add_input("--biofile", "-b", "bio file")
    parser.add_argument("--outfile", "-o", nargs='?',
                        type=argparse.FileType('wb'), default=None,
                        help="output file")
    parser.add_argument("--debug", "-d", action='store_true', default=False,
                        help="debug mode")

    try:
        args = parser.parse_args()
    except IOError as msg:
        parser.error(str(msg))

    untokfile = prepfile(args.untokfile, 'r')
    biofile = prepfile(args.biofile, 'r')
    data, labels, datamap, labelmap = prepdata(untokfile, biofile,
                                               args.debug)

    chain_model = ChainCRF()
    learner = FrankWolfeSSVM(model=chain_model, max_iter=100, C=.1)
    learner.fit(data, labels)

    result = {
        'model': learner,
        'feats': datamap,
        'labels': labelmap,
    }
    if args.outfile is not None:
        pickle.dump(result, args.outfile)
def CRF_pred_label(X, Y, command):
    """Train a chain CRF on (X, Y) and write predicted labels for a demo set.

    The demo texts and features for ``command`` are loaded, a
    ``FrankWolfeSSVM`` over an undirected max-product ``ChainCRF`` is
    fitted on ``X`` / ``Y``, and one tab-separated line of predicted
    labels per demo sequence is handed to ``write_file``.

    Args:
        X: training feature sequences passed to ``ssvm.fit``.
        Y: training label sequences passed to ``ssvm.fit``.
        command: data source name; one of 'twitter', 'sgforums',
            'facebook'.

    Raises:
        ValueError: if ``command`` is not a supported source. Previously
            this surfaced as a NameError only after the model was trained.
    """
    # Text-cleaning mode handed to filterText_demo for each supported source.
    filter_modes = {
        'twitter': 'removeLink',
        'sgforums': 'removePunc',
        'facebook': 'removeLink',
    }
    if command not in filter_modes:
        raise ValueError('Unsupported command: %r (expected one of %s)'
                         % (command, ', '.join(sorted(filter_modes))))

    texts = load_demo_text(command)
    convert_texts = filterText_demo(texts, filter_modes[command], command)
    X_ftr = load_demo_ftr(command)
    # Single pre-formatted argument prints the same on Python 2 and 3.
    print('%d %d' % (len(convert_texts), len(X_ftr)))

    # The output directory is named after the source.
    path_write = ('D:/Project/Transportation_SMU-NEC_collaboration/'
                  'Data_demo_Dec_2015/' + command)
    name_write = 'pred_label_' + command

    crf = ChainCRF(inference_method='max-product', directed=False,
                   class_weight=None)
    ssvm = FrankWolfeSSVM(model=crf, C=1.0, max_iter=100)
    ssvm.fit(X, Y)
    y_pred = ssvm.predict(X_ftr)

    # One output line per predicted sequence, labels separated by tabs.
    list_write = ['\t'.join(str(label) for label in line) for line in y_pred]
    write_file(path_write, name_write, list_write)
def trainModel_Basic(num_iter=5,inference="qpbo",trainer="NSlack",num_train=2,num_test=1,C=0.1,edges="180x180_dist1_diag0",inputs=[1,1,1,1,1,1],features="all",directed=False,savePred=False):
    """Train a GraphCRF on training slices, evaluate on test folders, log results.

    Training slices come from ``extractSlices2(train_path, ...)`` and test
    slices from ``extractTestSlices2`` for each folder under ``test_path``
    (``train_path`` / ``test_path`` are read from module scope). Per-patient
    results are appended to a CSV under ``<cwd>/CRFResults``, layer-wise
    scores are pickled under ``<cwd>/CRFModel`` and a summary row is appended
    to ``<cwd>/CRFResults/TestResultFinal.csv``. Returns None.

    Parameters:
        num_iter  - max_iter handed to the SSVM learner
        inference - GraphCRF inference method; 'ogm' selects ('ogm', {'alg':'fm'})
        trainer   - "Frank" -> FrankWolfeSSVM, "NSlack" -> NSlackSSVM,
                    anything else -> OneSlackSSVM
        num_train - forwarded to extractSlices2
        num_test  - stop after this many test patients; -1 disables the limit
        C         - regularisation constant handed to the learner
        edges     - base name of the .npy edge file under /home/bmi/CRF/edges/
        inputs    - forwarded to the slice extractors (never modified here)
        features  - label used only in file names / logs
        directed  - forwarded to GraphCRF; also appends '+directed' to features
        savePred  - not used anywhere in this function

    NOTE(review): the nesting inside the test loop was reconstructed; confirm
    it against the intended layout.
    """
    padding=(30,30,30,30)

    if directed==True:
        features +='+directed'

    # The run id is the number of entries already present in the results dir.
    resultsDir = os.getcwd()+'/CRFResults'
    nameLen = len(os.listdir(resultsDir))
    edgeFeature = edges  # keep the edge-file name; `edges` is rebound to the array below
    filename=str(nameLen)+'_CRF_iter_'+str(num_iter)+"_"+inference+"_"+trainer+"_"+features+"_"+str(num_train)+"_"+str(num_test)+"_"+edgeFeature

    print "Loading training slices"

    start = time.clock()
    train =extractSlices2(train_path,num_train,padding,inputs=inputs)
    end= time.clock()
    train_load_time = (end-start)/60.0  # minutes

    [trainLayers,trainTruth,sliceShape] = train
    print "Training slices loaded in %f" % (train_load_time)

    # Feature count is the length of the vector stored at pixel (0, 0).
    n_features= len(trainLayers[0][0,0])
    print "Layer shape is : "
    print trainLayers[0].shape

    print "Training the model"
    edges= np.load("/home/bmi/CRF/edges/"+edges+".npy")

    # Every slice shares the same edge array.
    G = [edges for x in trainLayers]

    print trainLayers[0].shape

    # Flatten each slice to (rows*cols, n_features) nodes and (rows*cols,) int labels.
    trainLayers = np.array( [x.reshape((sliceShape[0]*sliceShape[1],n_features)) for x in trainLayers] )
    trainTruth = np.array( [x.reshape((sliceShape[0]*sliceShape[1],)).astype(int) for x in trainTruth] )

    if inference=='ogm':
        crf = GraphCRF(inference_method=('ogm',{'alg':'fm'}),directed=directed)
    else:
        crf = GraphCRF(inference_method=inference,directed=directed)

    if trainer=="Frank":
        svm = FrankWolfeSSVM(model = crf,max_iter=num_iter,C=C,n_jobs=6,verbose=1)
    elif trainer=="NSlack":
        svm = NSlackSSVM(model = crf,max_iter=num_iter,C=C,n_jobs=-1,verbose=1)
    else:
        svm = OneSlackSSVM(model = crf,max_iter=num_iter,C=C,n_jobs=-1,verbose=1)

    start = time.clock()
    # (features, edges) pairs; reused below for prediction, which relies on
    # zip() returning a list (Python 2).
    asdf = zip(trainLayers,G)
    svm.fit(asdf,trainTruth)
    end = time.clock()
    train_time = (end-start)/60.0  # minutes
    print "The training took %f" % (train_time)
    print "Model parameter size :"
    print svm.w.shape

    print "making predictions on train data"
    predTrain = svm.predict(asdf)
    trainDice=[]
    for i in range(len(trainLayers)):
        diceScore = accuracy(predTrain[i],trainTruth[i])
        trainDice.append(diceScore)
    meanTrainDice = sum(trainDice)/len(trainLayers)

    del trainLayers,trainTruth

    ################################################################################################
    overallDicePerPatient=[] # For overall test Dice
    extDicePerPatient=[]
    PatientTruthLayers=[]
    PatientPredLayers=[]
    PREC=[]
    RECALL=[]
    F1=[]
    LayerwiseDiceTotal=[]

    testResultFile = open(os.getcwd()+"/CRFResults/"+filename+".csv",'a')
    # NOTE(review): the header names the last column "F1", but the rows
    # written below put extDice in that position.
    testResultFile.write("folderName,numLayers, Overall Dice, precision , recall, F1"+"\n")

    counter=0
    print "Loading the test slices"
    for folder in os.listdir(test_path):
        path = test_path + "/" + folder
        layerDiceScores=''
        # print path
        data = extractTestSlices2(path,padding,inputs=inputs)
        # extractTestSlices2 results equal to 0 are skipped.
        if data!=0:
            [testLayers,testTruth,sliceShape,startSlice,endSlice] = data

            # trueTestLayers=testLayers
            GTest = [edges for x in testLayers]
            testLayers = np.array( [x.reshape((sliceShape[0]*sliceShape[1],n_features)) for x in testLayers] )
            testTruth = np.array( [x.reshape((sliceShape[0]*sliceShape[1],)).astype(int) for x in testTruth] )

            asdfTest = zip(testLayers,GTest)
            predTest = svm.predict(asdfTest)

            LayerwiseDice=[]

            for i in range(len(testLayers)):
                diceScore = accuracy(predTest[i],testTruth[i])
                layerDiceScores+=","+str(diceScore)
                # A NaN score on a layer where prediction and truth both sum
                # to zero is recorded as a perfect 1.0.
                # NOTE(review): whether `continue` belongs to the inner or the
                # outer `if` could not be determined; confirm.
                if math.isnan(diceScore):
                    if sum(predTest[i])==0 and sum(testTruth[i])==0:
                        LayerwiseDice.append(1.0)
                    continue
                LayerwiseDice.append(diceScore)

            LayerwiseDiceTotal.append(LayerwiseDice)

            overallTestDice = accuracy(np.hstack(predTest),np.hstack(testTruth))
            # Mean over the first 10 and last 10 layer scores; list `+` on
            # range() results requires Python 2.
            extDice = np.mean ( np.array(LayerwiseDice)[ range(10) + range(len(LayerwiseDice)-10, len(LayerwiseDice)) ] )
            prec,recall,f1 = precision_score(np.hstack(testTruth),np.hstack(predTest)) , recall_score(np.hstack(testTruth),np.hstack(predTest)) , f1_score(np.hstack(testTruth),np.hstack(predTest))
            print "Patient %d : Overall test DICE for %s is : %f and extDice is %f"%(counter,folder,overallTestDice,extDice)
            print "Precision : %f Recall : %f F1 : %f " %(prec,recall,f1)
            print "__________________________________________"

            # testResultFile.write(folder+","+str(len(testLayers))+","+str(meanTestDice)+","+str(overallTestDice) ","+str(np.max(testDice)) +","+ str(np.min(testDice))+"\n" )
            testResultFile.write(folder+","+str(len(testLayers)) + ","+ str(overallTestDice) + ","+str(prec)+","+str(recall)+","+str(extDice)+layerDiceScores+"\n" )
            overallDicePerPatient.append(overallTestDice)
            extDicePerPatient.append(extDice)
            PREC.append(prec), RECALL.append(recall) , F1.append(f1)

            PatientTruthLayers.append(testTruth)
            PatientPredLayers.append(predTest)

            counter+=1
            # num_test == -1 means "no limit".
            if counter==num_test and num_test!=-1:
                break

    ######################################################################################################
    print "Done testing slices"
    # Averages divide by the number of processed patients; with none processed
    # these divisions raise ZeroDivisionError.
    overallDice = sum(overallDicePerPatient)/len(PatientTruthLayers)
    overallPrec = sum(PREC)/len(PatientTruthLayers)
    overallRecall = sum(RECALL)/len(PatientTruthLayers)
    overallExtDice = np.mean(extDicePerPatient)
    print "Overall DICE : %f Precision : %f Recall : %f extDice : %f "%(overallDice,overallPrec,overallRecall,overallExtDice)
    print "############################################"

    # testOutput=np.array([PatientPredLayers,PatientTruthLayers,trueTestLayers])
    # Only consumed by the commented-out prediction dump below.
    testOutput=np.array([PatientPredLayers,PatientTruthLayers])

    ########### Saving the models ######################################################################

    # print "Saving the model"
    # modelDir = os.getcwd()+"/CRFModel/"
    # svmModel = open(modelDir+filename+"_model"+".pkl",'wb')
    # cPickle.dump(svm,svmModel,protocol=cPickle.HIGHEST_PROTOCOL)
    # svmModel.close()
    #
    # print "saving the predictions"
    # predFileTest = open(os.getcwd()+"/CRFPred/"+filename+"_pred.pkl",'wb')
    # cPickle.dump(testOutput,predFileTest,protocol=cPickle.HIGHEST_PROTOCOL)
    # predFileTest.close()

    # Persist the per-layer scores of every processed patient.
    layerDataLog = open(os.getcwd()+"/CRFModel/"+filename+"_layer.pkl",'wb')
    cPickle.dump(LayerwiseDiceTotal,layerDataLog,protocol = cPickle.HIGHEST_PROTOCOL)
    layerDataLog.close()

    # Append one summary row for this run to the shared results file.
    resultLog = os.getcwd()+"/CRFResults/TestResultFinal.csv"
    resultFile = open(resultLog,'a')
    resultFile.write(time.ctime()+","+str(num_iter)+","+str(num_train)+","+str(num_test)+","+inference+","+
    trainer+","+str(C)+","+str(train_time)+","+str(meanTrainDice)+","+str(overallDice)+","+
    str(np.std(overallDicePerPatient))+","+edgeFeature+","+"None"+","+features+","+filename +","+ str(overallPrec) +","+ str(overallRecall) +","+ str(overallExtDice)+","+"Flair(5)+T2(9)-Without last 4 train Layers"+"\n")

    resultFile.close()
    testResultFile.close()

    return
# Directory holding the pickled predictions of the pre-trained network.
net_base_path = '/media/ohadsh/sheard/googleDrive/Master/courses/probabilistic_graphical_models/outputs/part_3/training_2016_06_11/'

# Load pre-trained network
train_name = 'train_pred_-1.pkl'
test_name = 'test_pred_-1.pkl'
# Pickles are binary data: open in 'rb' so loading does not depend on the
# platform's text-mode newline/encoding handling.
with open(os.path.join(net_base_path, train_name), 'rb') as f:
    train_net_pred = cPickle.load(f)
with open(os.path.join(net_base_path, test_name), 'rb') as f:
    test_net_pred = cPickle.load(f)

# Rearrange data for CRF
nn_predictions_train = arrange_letters_in_pred_like(X_train, train_net_pred, size_of_pred=26)
nn_predictions_test = arrange_letters_in_pred_like(X_test, test_net_pred, size_of_pred=26)

# Train LCCRF on the raw inputs
chain_model = ChainCRF(directed=True)
chain_ssvm = FrankWolfeSSVM(model=chain_model, C=.1, max_iter=11)
chain_ssvm.fit(X_train, y_train)

# Train LCCRF+NN on the rearranged network predictions
chain_model = ChainCRF(directed=True)
chain_ssvm_nn = FrankWolfeSSVM(model=chain_model, C=.1, max_iter=11)
chain_ssvm_nn.fit(nn_predictions_train, y_train)

# The first figure is a hard-coded constant, not computed in this script.
print("Test score with linear NN: 84.15%")
print("Test score with LCCRF: %f" % chain_ssvm.score(X_test, y_test))
print("Test score with LCCRF+NN: %f" % chain_ssvm_nn.score(nn_predictions_test, y_test))

# plot some word sequences
n_words = 4
# Fits a pystruct BinaryClf with FrankWolfeSSVM on a feature/label CSV pair
# and scores it on the same data it was trained on.
import loader
import util
from sklearn import preprocessing

directory = "/Users/thijs/dev/boilerplate/src/main/resources/dataset/"
featureset = "features10"  # base name of the feature CSV (without extension)

print("Load files")
features, labels = \
  loader.loadBinary(featureset+'.csv', 'labels.csv', directory)

# print("Shuffle results")
# features, labels = util.shuffle(features, labels)

print("Loaded")
# print(labels)

# features = preprocessing.scale(features)

from pystruct.models import BinaryClf
from pystruct.learners import (NSlackSSVM, OneSlackSSVM,
                               SubgradientSSVM, FrankWolfeSSVM)
clf = FrankWolfeSSVM(BinaryClf(),verbose=True)
# print(clf)
clf.fit(features,labels)
# Training-set score; only reported by the commented-out print below.
trscore = clf.score(features,labels)

# print("Training score: {0}".format(trscore))
# "Klaar" is Dutch for "Done".
print("Klaar")
def run_crf(w2v, words_before, words_after, shallow_parse):
    """Cross-validate a chain CRF and print the words predicted as label 1.

    Tokens and labels come from
    ``parse_summerscales.get_tokens_and_lbls(make_pmids_dict=True, sen=True)``.
    For each of 5 shuffled folds the model is fitted, its test score is
    printed, and for every test item the words whose predicted label is 1
    are printed.

    Args:
        w2v: forwarded to ``abstract2features``.
        words_before: forwarded to ``abstract2features`` for the training call.
        words_after: forwarded to ``abstract2features`` for the test call.
        shallow_parse: forwarded to ``abstract2features``.
    """
    pmids_dict, pmids, abstracts, lbls, vectorizer, groups_map, one_hot, dicts = \
        parse_summerscales.get_tokens_and_lbls(
            make_pmids_dict=True, sen=True)

    # Create model
    model = ChainCRF(directed=False)
    ssvm = FrankWolfeSSVM(model=model, C=.1, max_iter=30)

    # list() so the ids can be indexed by fold position (dict views are not
    # indexable on Python 3).
    all_pmids = list(pmids_dict.keys())
    n = len(all_pmids)
    n_folds = 5
    kf = KFold(n, random_state=1337, shuffle=True, n_folds=n_folds)

    for fold_idx, (train, test) in enumerate(kf):
        print("on fold %s" % fold_idx)
        train_pmids = [all_pmids[pmid_idx] for pmid_idx in train]
        test_pmids = [all_pmids[pmid_idx] for pmid_idx in test]
        print('loading data...')
        # NOTE(review): both calls receive the full pmids_dict rather than the
        # fold's train_pmids / test_pmids, and differ only in
        # words_before vs words_after. This looks like a train/test leak, but
        # abstract2features' signature is not visible here -- confirm.
        train_x, train_y = abstract2features(pmids_dict, words_before, w2v, shallow_parse)
        test_x, test_y = abstract2features(pmids_dict, words_after, w2v, shallow_parse)
        print('loaded data...')
        print('training...')
        ssvm.fit(train_x, train_y)
        print(ssvm.score(test_x, test_y))

        for pmid, x, y in zip(test_pmids, test_x, test_y):
            abstract_words, _, _ = pmids_dict[pmid]
            print(pmid)

            # predict() takes in a list returns another list
            prediction = ssvm.predict([x]).pop(0)

            # Collect the word at each position labelled 1. The position, not
            # the label value, selects the word.
            predicted_words = []
            for word_idx, label in enumerate(prediction):
                if label == 1:
                    word = abstract_words[word_idx]
                    print("word: {}".format(word))
                    predicted_words.append(word)

            if predicted_words:
                output = 'predicted: {}'.format(' '.join(predicted_words))
            else:
                output = 'Predicted nothing!'
            print(output)
# break
# NOTE(review): this statement is presumably the tail of a loop over `i`
# whose header precedes this fragment -- confirm its nesting.
list_y.append(len(y[i]))

print('Shape of targets: %s' % (y.shape,))
print('Max length: %s' % max(list_y))


def _chain_edges(n_nodes):
    """Return the (n_nodes - 1, 2) edge list linking each node to its successor."""
    nodes = np.arange(n_nodes)
    return np.column_stack([nodes[:-1], nodes[1:]])


features_train, features_test = features[folds == 1], features[folds != 1]
y_train, y_test = y[folds == 1], y[folds != 1]

# GraphCRF samples are (node_features, edges) pairs with edges of shape
# (n_edges, 2). The chain must be as long as *this* sequence, not as long as
# the number of training samples.
f_t = features_train
X_train = [(features_i, _chain_edges(features_i.shape[0]))
           for features_i in f_t]
print('Loading X_train')

f_test = features_test
X_test = [(features_i, _chain_edges(features_i.shape[0]))
          for features_i in f_test]
print('Loading X_test')

print('%d %d' % (len(X_train), len(y_train)))
print('%s %s' % (type(X_train), type(y_train)))

for each in X_train:
    print(len(each))

start = time()
model = GraphCRF(directed=True, inference_method="max-product")
ssvm = FrankWolfeSSVM(model=model, C=.1, max_iter=10)
ssvm.fit(X_train, y_train)
#
# print 'accuracy of GraphCRF %f:' % ssvm.score(X_test, y_test), ' time spend: %f' % (time()-start)
# Directory holding the pickled predictions of the pre-trained network.
net_base_path = '/media/ohadsh/sheard/googleDrive/Master/courses/probabilistic_graphical_models/outputs/part_3/training_2016_06_11/'

# Load pre-trained network
train_name = 'train_pred_-2.pkl'
test_name = 'test_pred_-2.pkl'
# Pickles are binary data: open in 'rb' so loading does not depend on the
# platform's text-mode newline/encoding handling.
with open(os.path.join(net_base_path, train_name), 'rb') as f:
    train_net_pred = cPickle.load(f)
with open(os.path.join(net_base_path, test_name), 'rb') as f:
    test_net_pred = cPickle.load(f)

# Rearrange data for CRF
nn_predictions_train = arrange_letters_in_pred_like(X_train, train_net_pred, size_of_pred=26)
nn_predictions_test = arrange_letters_in_pred_like(X_test, test_net_pred, size_of_pred=26)

# Train CRF on single letters: every letter becomes its own length-1 chain.
# The sizes are read from the data instead of being hard-coded
# (previously 5375 letters x 128 features).
letter_features = np.vstack(nn_predictions_train)
letter_labels = np.hstack(y_train)
n_letters, n_letter_features = letter_features.shape
model = ChainCRF(directed=True)
ssvm = FrankWolfeSSVM(model=model, C=.1, max_iter=11)
ssvm.fit(letter_features.reshape((n_letters, 1, n_letter_features)),
         letter_labels.reshape(n_letters, 1))

# Train linear chain CRF on whole words
chain_model = ChainCRF(directed=True)
chain_ssvm = FrankWolfeSSVM(model=chain_model, C=.1, max_iter=11)
chain_ssvm.fit(nn_predictions_train, y_train)

# # Create linear regression object
# regr = LinearRegression()
# # Train the model using the training sets
# regr.fit(np.vstack(nn_predictions_train), np.hstack(y_train))
# print("Test score with linear regression: %f" % regr.score(np.vstack(nn_predictions_test),
#                                                            np.hstack(y_test)))
# Fits the same GridCRF with four different SSVM learners on a generated
# "crosses" dataset.
from pystruct.models import GridCRF
from pystruct.learners import (NSlackSSVM, OneSlackSSVM, SubgradientSSVM,
                               FrankWolfeSSVM)
from pystruct.datasets import generate_crosses_explicit

X, Y = generate_crosses_explicit(n_samples=50, noise=10, size=6, n_crosses=1)
# Number of states is the number of distinct label values in Y.
n_labels = len(np.unique(Y))
crf = GridCRF(n_states=n_labels, inference_method=("ad3", {'branch_and_bound': True}))

# All four learners share the single `crf` model instance.
n_slack_svm = NSlackSSVM(crf, check_constraints=False,
                         max_iter=50, batch_size=1, tol=0.001)
one_slack_svm = OneSlackSSVM(crf, check_constraints=False,
                             max_iter=100, tol=0.001, inference_cache=50)
subgradient_svm = SubgradientSSVM(crf, learning_rate=0.001, max_iter=20,
                                  decay_exponent=0, momentum=0)
bcfw_svm = FrankWolfeSSVM(crf, max_iter=50, check_dual_every=4)

# n-slack cutting plane ssvm
n_slack_svm.fit(X, Y)

# 1-slack cutting plane ssvm
one_slack_svm.fit(X, Y)

# online subgradient ssvm
subgradient_svm.fit(X, Y)

# Block coordinate Frank-Wolfe
bcfw_svm.fit(X, Y)

# don't plot objective from cached inference for 1-slack
# (boolean mask: True where cached_constraint_ is falsy)
inference_run = ~np.array(one_slack_svm.cached_constraint_)
# print x # for value in x: # print value y = [0, 1, 1, 2, 2] y_1 = [0, 1, 1, 2, 2] # print y list_x, list_y = [], [] list_x.append(np.array(x)) list_x.append(np.array(x_1)) list_y.append(y) list_y.append(y_1) # crf = ChainCRF(inference_method='max-product') crf = ChainCRF(inference_method="max-product", directed=False) ssvm = FrankWolfeSSVM(model=crf, C=1.0, max_iter=100) ssvm.fit(np.array(list_x), np.array(list_y)) test_x = np.array(list_x) test_y = np.array(list_y) # print np.array(list_x)[0].shape[1] x_test = [[1, 0, 0, 0], [1, 0, 1, 0]] list_x_test = list() list_x_test.append(x_test) pred = ssvm.predict(np.array(list_x_test)) # for value in pred: # print value
# for value in X: # print value.shape # # print X_train.shape # print y_train.shape # # print type(X_train) # for value in y_train: # print value # # for i in range(0, len(X_train)): # if i == 15: # print X_train[i], len(X_train[i]) # for f in X_train[i]: # print len(f) # break # print y_train[i], len(X_train[i]) # # break # start = time() model = ChainCRF(inference_method='max-product', directed=True) ssvm = FrankWolfeSSVM(model=model, C=1.0, max_iter=10) ssvm.fit(X_train, y_train) print 'accuracy of linear-crf %f:' % ssvm.score(X_test, y_test), ' time spend: %f' %(time()-start)
# Compares a per-letter linear SVM with a chain CRF on the data returned by
# load_letters(), then starts plotting a few randomly selected test words.
letters = load_letters()
X, y, folds = letters["data"], letters["labels"], letters["folds"]
# we convert the lists to object arrays, as that makes slicing much more
# convenient
X, y = np.array(X), np.array(y)
# Fold 1 is the training set; every other fold is used for testing.
X_train, X_test = X[folds == 1], X[folds != 1]
y_train, y_test = y[folds == 1], y[folds != 1]

# Train linear SVM
svm = LinearSVC(dual=False, C=0.1)
# flatten input
svm.fit(np.vstack(X_train), np.hstack(y_train))

# Train linear chain CRF
model = ChainCRF()
ssvm = FrankWolfeSSVM(model=model, C=0.1, max_iter=11)
ssvm.fit(X_train, y_train)

print("Test score with chain CRF: %f" % ssvm.score(X_test, y_test))

print("Test score with linear SVM: %f" % svm.score(np.vstack(X_test),
                                                   np.hstack(y_test)))

# plot some word sequences
n_words = 4
# Fixed seed so the same test words are selected on every run.
rnd = np.random.RandomState(1)
selected = rnd.randint(len(y_test), size=n_words)
# One subplot column per letter of the longest selected word.
max_word_len = max([len(y_) for y_ in y_test[selected]])
fig, axes = plt.subplots(n_words, max_word_len, figsize=(10, 10))
fig.subplots_adjust(wspace=0)
for ind, axes_row in zip(selected, axes):
    y_pred_svm = svm.predict(X_test[ind])
class PassageTagger(object):
    """Tags sequences of clauses with either a CRF or a chain SSVM.

    With ``algorithm == "crf"`` the work is delegated to ``Trainer`` /
    ``Tagger`` objects; any other value uses a pystruct ``FrankWolfeSSVM``
    over a ``ChainCRF`` and keeps explicit feature/label index maps, which
    are pickled next to the model.
    """

    def __init__(self, do_train=False, trained_model_name="passage_crf_model", algorithm="crf"):
        self.trained_model_name = trained_model_name
        self.fp = FeatureProcessing()
        self.do_train = do_train
        self.algorithm = algorithm
        if algorithm == "crf":
            if do_train:
                self.trainer = Trainer()
            else:
                self.tagger = Tagger()
        else:
            if do_train:
                model = ChainCRF()
                self.trainer = FrankWolfeSSVM(model=model)
                self.feat_index = {}
                self.label_index = {}
            else:
                # NOTE: unpickling executes arbitrary code; only load model
                # and index files from a trusted source.
                self.tagger = self._load_pickle(self.trained_model_name)
                self.feat_index = self._load_pickle("ssvm_feat_index.pkl")
                label_index = self._load_pickle("ssvm_label_index.pkl")
                # Predictions are label indices; map them back to label names.
                self.rev_label_index = {i: x for x, i in label_index.items()}

    @staticmethod
    def _load_pickle(path):
        """Unpickle and return the object stored at ``path``, closing the file."""
        with open(path, "rb") as infile:
            return pickle.load(infile)

    @staticmethod
    def _dump_pickle(obj, path):
        """Pickle ``obj`` to ``path``, closing (and thereby flushing) the file."""
        with open(path, "wb") as outfile:
            pickle.dump(obj, outfile)

    def _to_matrices(self, feat_seqs):
        """Turn sequences of feature-count dicts into one 2-d array per sequence.

        Columns follow ``self.feat_index``; features missing from the index
        are ignored.
        """
        n_columns = len(self.feat_index)
        Xs = []
        for fs in feat_seqs:
            X = []
            for feat_dict in fs:
                x = [0] * n_columns
                for f in feat_dict:
                    if f in self.feat_index:
                        x[self.feat_index[f]] = feat_dict[f]
                X.append(x)
            Xs.append(numpy.asarray(X))
        return Xs

    def read_input(self, filename):
        """Read blank-line separated clause sequences from a UTF-8 file.

        Each non-blank line is one clause. When ``self.do_train`` is set the
        line must be ``clause<TAB>label``. Features come from
        ``self.fp.get_features`` and are stored as feature -> count dicts.

        Returns:
            ``(str_seqs, feat_seqs, label_seqs)``: parallel lists with one
            entry per sequence. ``label_seqs`` holds empty lists when not
            training.
        """
        str_seqs, feat_seqs, label_seqs = [], [], []
        str_seq, feat_seq, label_seq = [], [], []
        with codecs.open(filename, "r", "utf-8") as infile:
            for line in infile:
                lnstrp = line.strip()
                if lnstrp == "":
                    # A blank line ends the current sequence, if one was started.
                    if str_seq:
                        str_seqs.append(str_seq)
                        feat_seqs.append(feat_seq)
                        label_seqs.append(label_seq)
                        str_seq, feat_seq, label_seq = [], [], []
                    continue
                if self.do_train:
                    clause, label = lnstrp.split("\t")
                    label_seq.append(label)
                else:
                    clause = lnstrp
                str_seq.append(clause)
                feat_dict = {}
                for f in self.fp.get_features(clause):
                    feat_dict[f] = feat_dict.get(f, 0) + 1
                feat_seq.append(feat_dict)
        # The file may end without a trailing blank line.
        if str_seq:
            str_seqs.append(str_seq)
            feat_seqs.append(feat_seq)
            label_seqs.append(label_seq)
        return str_seqs, feat_seqs, label_seqs

    def predict(self, feat_seqs):
        """Return one list of predicted labels per feature sequence."""
        sys.stderr.write("Tagging %d sequences\n" % len(feat_seqs))
        if self.algorithm == "crf":
            self.tagger.open(self.trained_model_name)
            preds = [self.tagger.tag(ItemSequence(feat_seq)) for feat_seq in feat_seqs]
        else:
            pred_ind_seqs = self.tagger.predict(self._to_matrices(feat_seqs))
            preds = [[self.rev_label_index[pred_ind] for pred_ind in ps]
                     for ps in pred_ind_seqs]
        return preds

    def train(self, feat_seqs, label_seqs):
        """Train on the given sequences and persist the model.

        The CRF path trains straight into ``self.trained_model_name``. The
        SSVM path pickles the learner to ``self.trained_model_name`` and the
        index maps to ``ssvm_feat_index.pkl`` / ``ssvm_label_index.pkl`` in
        the current directory.
        """
        sys.stderr.write("Training on %d sequences\n" % len(feat_seqs))
        if self.algorithm == "crf":
            for feat_seq, label_seq in zip(feat_seqs, label_seqs):
                self.trainer.append(ItemSequence(feat_seq), label_seq)
            self.trainer.train(self.trained_model_name)
        else:
            # Assign a column to every feature in order of first appearance.
            for fs in feat_seqs:
                for feat_dict in fs:
                    for f in feat_dict:
                        if f not in self.feat_index:
                            self.feat_index[f] = len(self.feat_index)
            Xs = self._to_matrices(feat_seqs)
            # Assign an index to every label in order of first appearance.
            for ls in label_seqs:
                for label in ls:
                    if label not in self.label_index:
                        self.label_index[label] = len(self.label_index)
            Ys = [numpy.asarray([self.label_index[label] for label in ls])
                  for ls in label_seqs]
            self.trainer.fit(Xs, Ys)
            self._dump_pickle(self.trainer, self.trained_model_name)
            self._dump_pickle(self.feat_index, "ssvm_feat_index.pkl")
            self._dump_pickle(self.label_index, "ssvm_label_index.pkl")
X = X / 16. #y = y.astype(np.int) - 1 X_train, X_test, y_train, y_test = train_test_split(X, y) # we add a constant 1 feature for the bias X_train_bias = np.hstack([X_train, np.ones((X_train.shape[0], 1))]) X_test_bias = np.hstack([X_test, np.ones((X_test.shape[0], 1))]) model = MultiClassClf(n_features=X_train_bias.shape[1], n_classes=10) n_slack_svm = NSlackSSVM(model, verbose=2, check_constraints=False, C=0.1, batch_size=100, tol=1e-2) one_slack_svm = OneSlackSSVM(model, verbose=2, C=.10, tol=.001) subgradient_svm = SubgradientSSVM(model, C=0.1, learning_rate=0.000001, max_iter=1000, verbose=0) fw_bc_svm = FrankWolfeSSVM(model, C=.1, max_iter=50) fw_batch_svm = FrankWolfeSSVM(model, C=.1, max_iter=50, batch_mode=True) # n-slack cutting plane ssvm start = time() n_slack_svm.fit(X_train_bias, y_train) time_n_slack_svm = time() - start y_pred = np.hstack(n_slack_svm.predict(X_test_bias)) print("Score with pystruct n-slack ssvm: %f (took %f seconds)" % (np.mean(y_pred == y_test), time_n_slack_svm)) ## 1-slack cutting plane ssvm start = time() one_slack_svm.fit(X_train_bias, y_train) time_one_slack_svm = time() - start y_pred = np.hstack(one_slack_svm.predict(X_test_bias))
# Offsets (relative to the current word) whose POS tags are encoded, in the
# order the blocks appear in the feature vector: current, previous,
# 2nd previous, next, 2nd next.
_POS_WINDOW = (0, -1, -2, 1, 2)


def _one_hot(lexicon, value):
    """Return one indicator per lexicon entry: 1 where the entry equals value."""
    return [1 if entry == value else 0 for entry in lexicon]


def _affix_indicators(lexicon, word, size, suffix=False):
    """One-hot encode the word's prefix/suffix of ``size``; all zeros if the word is shorter."""
    if len(word) < size:
        return [0 for _ in lexicon]
    affix = word[-size:] if suffix else word[:size]
    return _one_hot(lexicon, affix)


def _pos_indicators(pos_lexicon, tags, position):
    """One-hot encode the POS tag at ``position``; all zeros when out of bounds."""
    if 0 <= position < len(tags):
        return _one_hot(pos_lexicon, tags[position])
    return [0 for _ in pos_lexicon]


def _morph_features(word):
    """Return the nine binary morphological indicators of a word."""
    capitals = sum(1 for letter in word if letter.isupper())
    lowers = sum(1 for letter in word if letter.islower())
    checks = (
        word[:1].isupper(),                                  # first letter capital
        word[:1].islower() and capitals > 0,                 # capitals, except 1st letter
        capitals == len(word),                               # all capitals
        lowers == len(word),                                 # all lower case
        len(re.findall(r"\d", word)) == len(word),           # all digits
        len(re.findall(r"[a-zA-Z]", word)) == len(word),     # all ASCII letters
        len(re.findall(r"[.]", word)) > 0,                   # contains '.'
        len(re.findall(r"[-]", word)) > 0,                   # contains '-'
        len(re.findall(r'''[][,;"'?():_`]''', word)) > 0,    # other punctuation
    )
    return [1 if check else 0 for check in checks]


def _word_features(word, index, tags, lexicons):
    """Build the flat feature list of the word at ``index``.

    Block order: POS window, morphology, term lexicon, prefixes of length
    1-3, suffixes of length 1-3.
    """
    feats = []
    for offset in _POS_WINDOW:
        feats += _pos_indicators(lexicons['pos'], tags, index + offset)
    feats += _morph_features(word)
    feats += _one_hot(lexicons['term'], word.lower())
    for size in (1, 2, 3):
        feats += _affix_indicators(lexicons['prefix%d' % size], word, size)
    for size in (1, 2, 3):
        feats += _affix_indicators(lexicons['suffix%d' % size], word, size,
                                   suffix=True)
    return feats


def _sentence_features(text, lexicons):
    """Tokenize and POS-tag ``text``; return (words, per-word feature lists)."""
    words = nltk.word_tokenize(text)
    tags = [tag for _, tag in nltk.pos_tag(words)]
    rows = [_word_features(word, index, tags, lexicons)
            for index, word in enumerate(words)]
    return words, rows


def _iob_label(word, aspect_terms, previous_label):
    """Return the IOB label of ``word``: 1 = "B", 2 = "I", 0 = "O".

    The first aspect-term token equal to the lower-cased word decides: a
    term-initial token gives "B"; a later token gives "I" only when the
    previous word was labelled "B" or "I".
    """
    lowered = word.lower()
    for aspect_term in aspect_terms:
        for term_index, term in enumerate(aspect_term.split()):
            if lowered != term:
                continue
            if term_index == 0:
                return 1
            if previous_label in (1, 2):
                return 2
    return 0


def _emit_aspect_term(instance, lowered_text, term):
    """Attach ``term`` with its character offsets to ``instance``; skip empty terms."""
    if not term:
        return
    start, end = find_offsets(lowered_text, term)
    instance.add_aspect_term(term=term,
                             offsets={'from': str(start), 'to': str(end)})


def _decode_aspect_terms(instance, sentence_predictions):
    """Convert per-word IOB predictions into aspect terms on ``instance``."""
    lowered_text = instance.text.lower()
    current_term = ""
    previous_label = None
    for word_index, label in enumerate(sentence_predictions):
        if label == 1:
            # "B" closes any open term and starts a new one.
            _emit_aspect_term(instance, lowered_text, current_term)
            current_term = find_term(lowered_text, word_index)
            previous_label = 1
        elif label == 2:
            # "I" extends a term only when it follows "B" or "I".
            if previous_label in (1, 2):
                piece = find_term(lowered_text, word_index)
                current_term = (current_term + " " + piece) if current_term else piece
            previous_label = 2
        elif label == 0:
            # "O" closes the open term; reset it so it cannot be emitted twice.
            _emit_aspect_term(instance, lowered_text, current_term)
            current_term = ""
            previous_label = 0
    # A term that runs up to the last word has no following "O" to close it.
    _emit_aspect_term(instance, lowered_text, current_term)


def classify(traincorpus, testcorpus):
    """Train a chain CRF aspect-term tagger and annotate the test corpus.

    Every word of every training sentence is encoded with POS-window,
    morphological, term-lexicon and prefix/suffix lexicon indicators and
    labelled with the IOB scheme (1 = "B", 2 = "I", 0 = "O"). The fitted
    model then tags the test sentences and the predicted terms are stored
    on each test instance through ``add_aspect_term``.

    Sentences that tokenize to no words are skipped during training and
    receive an empty ``aspect_terms`` list at test time.

    Args:
        traincorpus: object whose ``corpus`` instances provide ``text`` and
            ``get_aspect_terms()``.
        testcorpus: object whose ``corpus`` instances provide ``text`` and
            ``add_aspect_term(term=..., offsets=...)``.

    Returns:
        ``testcorpus.corpus`` with ``aspect_terms`` replaced by the predictions.
    """
    model = ChainCRF()
    ssvm = FrankWolfeSSVM(model=model, C=.1, max_iter=10)

    lexicons = {
        'pos': load_lexicon("lexica/restaurants/ote/pos"),
        'term': load_lexicon("lexica/restaurants/ote/term"),
        'prefix1': load_lexicon("lexica/restaurants/ote/prefix1"),
        'prefix2': load_lexicon("lexica/restaurants/ote/prefix2"),
        'prefix3': load_lexicon("lexica/restaurants/ote/prefix3"),
        'suffix1': load_lexicon("lexica/restaurants/ote/suffix1"),
        'suffix2': load_lexicon("lexica/restaurants/ote/suffix2"),
        'suffix3': load_lexicon("lexica/restaurants/ote/suffix3"),
    }

    train_sentences = []   # one (n_words, n_features) array per sentence
    sentence_labels = []   # one int array of IOB labels per sentence

    print('Creating train feature vectors...')
    for instance in traincorpus.corpus:
        words, rows = _sentence_features(instance.text, lexicons)
        if not rows:
            continue
        aspect_terms = set(instance.get_aspect_terms())
        labels = []
        previous_label = None
        for word in words:
            previous_label = _iob_label(word, aspect_terms, previous_label)
            labels.append(previous_label)
        train_sentences.append(np.array(rows, dtype=np.float64))
        sentence_labels.append(np.array(labels, dtype=np.int64))

    # the chain-crf needs a list (representing the sentences), that
    # contains a 2d-array(n_words, n_features), which in turn contains the
    # features extracted from each word. the sentence labels must be
    # an array of type int
    ssvm.fit(train_sentences, sentence_labels)
    print('Done!')

    print('Creating test feature vectors...')
    test_sentences = []
    tagged_instances = []   # instances aligned with test_sentences
    for instance in testcorpus.corpus:
        instance.aspect_terms = []
        _, rows = _sentence_features(instance.text, lexicons)
        if not rows:
            continue
        test_sentences.append(np.array(rows, dtype=np.float64))
        tagged_instances.append(instance)
    print('Done!')

    print('Predicting aspect terms...')
    # predict returns one sequence of word labels per sentence
    predictions = ssvm.predict(test_sentences) if test_sentences else []
    for instance, sentence_predictions in zip(tagged_instances, predictions):
        _decode_aspect_terms(instance, sentence_predictions)
    print('Done!')
    return testcorpus.corpus
# Script section: split the data, fit a chain CRF with a Frank-Wolfe
# structured SVM, and report the training score.

# Optional: shuffle the samples before splitting.
# print("Shuffle results")
# features, labels = util.shuffle(features, labels)

# Hold out the last 30% of the samples. The slices start at index 0 and share
# the boundary `trsize`, so every sample lands in exactly one of the splits
# (Python slices are 0-based and end-exclusive).
trsize = int(0.7 * len(labels))
X_train = features[:trsize]
y_train = labels[:trsize]
X_test = features[trsize:]
y_test = labels[trsize:]

# Alternative: train and evaluate on the full data set.
# X_train = X_test = features
# y_train = y_test = labels
# trsize = len(labels)

# Evaluate the chain
model = ChainCRF()
C = 0.0001
max_iter = 50
ssvm = FrankWolfeSSVM(model=model, C=C, max_iter=max_iter, verbose=True)
print(ssvm)
print(ssvm.fit(X_train, y_train))
print(ssvm.w)  # learned weight vector
trscore = ssvm.score(X_train, y_train)
# testscore = ssvm.score(X_test, y_test)
print("Training score: {0}".format(trscore))
# print("Test score: {0}".format(testscore))

# Save the result
# util.saveToSQL(featureset, C, max_iter, trsize, trscore, 2)