def _train_ner_model():
    """Fit a StructuredPerceptron on the default training data.

    The data source comes from ``_download_training_data()`` and is parsed by
    ``load_conll`` using the module's ``_features`` extractor.

    Returns:
        The fitted ``StructuredPerceptron`` instance.
    """
    samples, labels, seq_lengths = load_conll(
        _download_training_data(), _features)
    model = StructuredPerceptron()
    model.fit(samples, labels, seq_lengths)
    return model
def simulation(individual):
    """Train a perceptron on the module-level data and return AUC label pairs.

    Args:
        individual: Not used by this function; kept for the caller's
            interface.

    Returns:
        A ``(test_labels, predicted_labels)`` tuple, both converted with
        ``To_AUC_label``.
    """
    model = StructuredPerceptron(lr_exponent=0.01, max_iter=100,
                                 random_state=2)
    model.fit(x_train, y_train, lengths)

    # The second value returned by predict() is not needed here.
    predicted, _scores = predict(model, x_test, lengths_test)

    truth_as_auc = To_AUC_label(y_true)
    predicted_as_auc = To_AUC_label(predicted)
    return (truth_as_auc, predicted_as_auc)
def _train_ner_model():
    """Fit a StructuredPerceptron, using local test data when nose is loaded.

    If the ``nose`` module is present in ``sys.modules`` the data comes from
    ``_load_test_data()``; otherwise it comes from
    ``_download_training_data()``.

    Returns:
        The fitted ``StructuredPerceptron`` instance.
    """
    import sys

    running_under_nose = 'nose' in sys.modules
    if running_under_nose:
        corpus = _load_test_data()
    else:
        corpus = _download_training_data()

    samples, labels, seq_lengths = load_conll(corpus, _features)
    model = StructuredPerceptron()
    model.fit(samples, labels, seq_lengths)
    return model
def test_perceptron():
    """Check that the perceptron reproduces a small training sequence."""
    features = [
        [0, 1, 0],
        [0, 1, 0],
        [1, 0, 0],
        [0, 1, 0],
        [1, 0, 0],
        [0, 0, 1],
        [0, 0, 1],
        [0, 1, 0],
        [1, 0, 0],
        [1, 0, 0],
    ]
    labels = [0, 0, 0, 0, 0, 1, 1, 0, 2, 2]
    n_samples = len(labels)

    model = StructuredPerceptron(verbose=False, random_state=37, max_iter=15)
    model.fit(features, labels, [n_samples])
    assert_array_equal(labels, model.predict(features))

    # Repeat with string labels and sparse input matrices.
    label_names = np.array(["eggs", "ham", "spam"])[labels]
    model = clone(model)
    model.fit(csc_matrix(features), label_names, [len(label_names)])
    assert_array_equal(label_names, model.predict(coo_matrix(features)))

    # Predict on two stacked copies of the data, given as two sequences.
    stacked_features = np.vstack([features, features])
    stacked_names = np.hstack([label_names, label_names])
    assert_array_equal(
        stacked_names,
        model.predict(stacked_features, lengths=[n_samples, n_samples]))
def test_perceptron():
    """Check that a verbose perceptron reproduces its training labels."""
    features = [
        [0, 1, 0],
        [0, 1, 0],
        [1, 0, 0],
        [0, 1, 0],
        [0, 0, 1],
        [0, 0, 1],
        [0, 1, 0],
        [1, 0, 0],
        [1, 0, 0],
    ]
    labels = [0, 0, 0, 0, 1, 1, 0, 2, 2]

    # fit() hands back the estimator used for prediction below.
    fitted = StructuredPerceptron(verbose=True).fit(
        features, labels, [len(labels)])
    assert_array_equal(labels, fitted.predict(features))
def train_model():
    """Train a StructuredPerceptron on the CoNLL files and report scores.

    Reads ``./../data/train.conll`` and ``./../data/test.conll`` relative to
    ``current_dir``, fits the model on the training portion, then prints the
    accuracy and CoNLL F1 measured on the test portion.

    Returns:
        The fitted ``StructuredPerceptron`` instance.
    """
    # print() with a single argument behaves the same on Python 2 and 3,
    # whereas the bare ``print "..."`` statement is a SyntaxError on Python 3.
    print("Loading training data...")
    X_train, y_train, lengths_train = load_conll(
        os.path.join(current_dir, "./../data/train.conll"), features)
    clf = StructuredPerceptron(verbose=True, max_iter=10)
    describe(X_train, lengths_train)

    print("Loading test data...")
    X_test, y_test, lengths_test = load_conll(
        os.path.join(current_dir, "./../data/test.conll"), features)
    describe(X_test, lengths_test)

    print("Training %s" % clf)
    clf.fit(X_train, y_train, lengths_train)

    y_pred = clf.predict(X_test, lengths_test)
    print("Accuracy: %.3f" % (100 * accuracy_score(y_test, y_pred)))
    print("CoNLL F1: %.3f" % (100 * bio_f_score(y_test, y_pred)))
    return clf
def main():
    """Prepare the data files, train a perceptron and write its predictions.

    The first command-line argument names the input file passed to
    ``addcol``. The test set is then read from ``test.txt`` and the training
    set from ``load_dat()``.
    """
    print("Loading data")

    source_path = sys.argv[1]
    # Use a context manager so the input handle is closed once addcol() is
    # done with it; the original left the file open until process exit.
    with open(source_path) as dat:
        addcol(dat, source_path)

    # NOTE(review): the original comment says test.txt is produced by
    # addcol() - confirm against addcol's implementation.
    X_test, y_test, l_test = load_conll("test.txt", features)

    data = load_dat()
    X_train, y_train, l_train = load_con(data, features)

    # Per the original author, trial and error found a learning-rate exponent
    # of .35 with 20 iterations worked best.
    per = StructuredPerceptron(lr_exponent=0.35, max_iter=20, verbose=1)

    print("Fitting")
    per.fit(X_train, y_train, l_train)
    y_p = per.predict(X_test, l_test)
    create_eval_file(y_p)
    print("Done!")
def createModel(data, labels, cv_flag=False):
    """Evaluate a StructuredPerceptron by cross-validation or a 70/30 split.

    Args:
        data: Samples passed to ``evaluateModel`` / ``train_test_split``.
        labels: Targets matching ``data``.
        cv_flag: When true, only ``evaluateModel(clf, data, labels, True)``
            is run; otherwise the model is fitted on a train split and
            evaluated on both splits.

    Returns:
        A list of whatever ``evaluateModel`` returned: one entry in
        cross-validation mode, two (train, then test) otherwise.
    """
    model = StructuredPerceptron(verbose=1, max_iter=1000, random_state=1)
    print("Structured Perceptron 2")

    if cv_flag:
        print("Cross-Validation")
        return [evaluateModel(model, data, labels, True)]

    X_train, X_test, Y_train, Y_test = train_test_split(
        data, labels, test_size=0.3, random_state=1)

    started = time()
    # NOTE(review): lengths is hard-coded to a single sequence of 30 samples;
    # confirm this matches the size of the training split.
    model = model.fit(X_train, Y_train, lengths=(30,))
    elapsed_minutes = (time() - started) / 60
    print("Training took " + str(elapsed_minutes) + " minutes to complete\n")

    print("Results\n")
    results = []
    print("Train")
    results.append(evaluateModel(model, X_train, Y_train))
    print("Test")
    results.append(evaluateModel(model, X_test, Y_test))
    return results
def train_model():
    """Train a StructuredPerceptron on the CoNLL files and report scores.

    Reads ``./../data/train.conll`` and ``./../data/test.conll`` relative to
    ``current_dir``, fits the model on the training portion, then prints the
    accuracy and CoNLL F1 measured on the test portion.

    Returns:
        The fitted ``StructuredPerceptron`` instance.
    """
    # print() with a single argument behaves the same on Python 2 and 3,
    # whereas the bare ``print "..."`` statement is a SyntaxError on Python 3.
    print("Loading training data...")
    X_train, y_train, lengths_train = load_conll(
        os.path.join(current_dir, "./../data/train.conll"), features)
    clf = StructuredPerceptron(verbose=True, max_iter=10)
    describe(X_train, lengths_train)

    print("Loading test data...")
    X_test, y_test, lengths_test = load_conll(
        os.path.join(current_dir, "./../data/test.conll"), features)
    describe(X_test, lengths_test)

    print("Training %s" % clf)
    clf.fit(X_train, y_train, lengths_train)

    y_pred = clf.predict(X_test, lengths_test)
    print("Accuracy: %.3f" % (100 * accuracy_score(y_test, y_pred)))
    print("CoNLL F1: %.3f" % (100 * bio_f_score(y_test, y_pred)))
    return clf
def main():
    """Train on the gene corpus, tag the test file, write predictions.txt.

    Each non-blank line of the test file yields an output line of the form
    ``<position>\\t<second column of the line>\\t<predicted tag>``. A blank
    line is copied through and restarts the position counter at 1.
    """
    # Load the training data and pass it through the feature function.
    samples, labels, sentence_lengths = load_conll(
        "data/gene-trainF18.txt", features, split=True)

    # Train the model with our features.
    clf = StructuredPerceptron()
    clf.fit(samples, labels, sentence_lengths)

    # Predict tags for the test file.
    test_path = "data/F18-assgn4-test.txt"
    test_samples, test_labels, test_sentence_lengths = load_conll(
        test_path, features, split=True)
    prediction = clf.predict(test_samples, test_sentence_lengths)

    # Build the output lines, pairing each token line with its prediction.
    output = []
    token_index = 0  # index into the flat prediction array
    position = 1     # 1-based token position within the current sentence
    # Read the test file inside a context manager so the handle is closed;
    # the original iterated over a bare open() and never closed it.
    with open(test_path) as test_file:
        for line in test_file:
            if line == "\n":
                output.append("\n")
                position = 1
                continue
            item = (str(position) + "\t" + line.split()[1] + "\t"
                    + prediction[token_index] + "\n")
            output.append(item)
            print(item)
            token_index += 1
            position += 1

    with open('predictions.txt', 'w') as f:
        f.writelines(output)
def perceptronTest(input_data, test, actual, actual8):
    """Cross-validate and test a StructuredPerceptron, printing accuracies.

    Args:
        input_data: Table indexed with ``.iloc``; every column but the last
            is used as features and the last column as the label.
        test: Samples passed to ``model.predict``.
        actual: Expected labels compared element-wise with the predictions.
        actual8: Expected labels compared with ``mergePredictions(pred, 8)``.
    """
    model = StructuredPerceptron(verbose=False, random_state=37,
                                 max_iter=1000)
    features = input_data.iloc[:, :-1]
    labels = input_data.iloc[:, -1]

    # Floor division keeps the sequence length an integer on Python 3 as
    # well; plain ``/`` would yield a float there.
    l = 9 * len(input_data) // 10 - 1
    scores = cross_val_score(model, features, labels, cv=10,
                             fit_params={'lengths': [l]})
    print('Cross Validation Accuracy = ' + str(scores.mean()))

    model.fit(features, labels, [len(input_data)])
    pred = model.predict(test)
    accuracy = sum(pred == actual) / float(len(actual))

    spred = smoothening(pred)
    saccuracy = sum(spred == actual) / float(len(actual))

    pred8 = mergePredictions(pred, 8)
    accuracy8 = sum(pred8 == actual8) / float(len(actual8))

    print('Test Accuracy for the subject is = ' + str(accuracy))
    print('Test Accuracy after smoothening for the subject is = '
          + str(saccuracy))
    print('Test Accuracy for the subject at step 8 is = ' + str(accuracy8))
def test_perceptron_mask():
    """Check transition constraints: a satisfiable set and an impossible one."""
    features = [
        [0, 1, 0],
        [0, 1, 0],
        [1, 0, 0],
        [0, 1, 0],
        [1, 0, 0],
        [0, 0, 1],
        [0, 0, 1],
        [0, 1, 0],
        [1, 0, 0],
        [1, 0, 0],
    ]
    labels = [0, 0, 0, 0, 0, 1, 1, 0, 2, 2]
    # String labels with sparse input.
    label_names = np.array(["eggs", "ham", "spam"])[labels]
    seq_lengths = [len(label_names)]

    allowed = [('spam', 'eggs'), ('spam', 'ham')]
    constrained = StructuredPerceptron(verbose=True, random_state=42,
                                       max_iter=15,
                                       trans_constraints=allowed)
    constrained.fit(csc_matrix(features), label_names, seq_lengths)

    # The model still fits the data.
    assert_array_equal(label_names,
                       constrained.predict(coo_matrix(features)))

    # The constrained transition weights are overridden.
    constrained_weights = [constrained.intercept_trans_[2, 0],
                           constrained.intercept_trans_[2, 1]]
    assert_array_equal(constrained_weights,
                       [constrained.CONSTRAINT_VALUE] * 2)

    # With impossible constraints the model should fail to converge.
    impossible = [('spam', 'eggs'), ('eggs', 'ham')]
    failing = StructuredPerceptron(verbose=True, random_state=12,
                                   max_iter=15,
                                   trans_constraints=impossible)
    failing.fit(csc_matrix(features), label_names, seq_lengths)

    # Comparing its predictions to the truth must raise AssertionError.
    assert_raises(AssertionError, assert_array_equal, label_names,
                  failing.predict(coo_matrix(features)))
# next word's length yield "next_len=" + str(get_word_len(next)) if i < len(sequence) - 1: next = sequence[i + 1].split("\t")[1] # last letters of the next word yield "next_last_letters=" + (next[-3:] if len(next) > 3 else next) if i < len(sequence) - 1: next = sequence[i + 1].split("\t")[1] yield "next_short_word_shape=" + get_short_word_shape(next) # читаем обучающее множество X_train, y_train, lengths_train = load_conll(open("../resources/train.data", "r"), features) clf = StructuredPerceptron(decode="viterbi", lr_exponent=.05, max_iter=30) print("Fitting model " + str(clf)) clf.fit(X_train, y_train, lengths_train) print("\nPredictions on dev set") # читаем отладочное множество X_dev, y_dev, lengths_dev = load_conll(open("../resources/dev.data", "r"), features) y_pred = clf.predict(X_dev, lengths_dev) print("Whole seq accuracy ", whole_sequence_accuracy(y_dev, y_pred, lengths_dev)) print("Element-wise accuracy ", accuracy_score(y_dev, y_pred)) print("Mean F1-score macro ", f1_score(y_dev, y_pred, average="macro")) print("\nPredictions on test set")
class MLVSA(object):
    """Train one of four sequence labellers and evaluate it on a test split.

    ``model_option`` selects the learner used by ``fit``: 0 = SVM, 1 = random
    forest, 2 = StructuredPerceptron, 3 = CRF (pycrfsuite).
    """

    def __init__(self, inst_train, label_train, inst_test, label_test,
                 seq_len, model_option):
        """Reshape the flat train/test arrays into rows of ``seq_len`` items.

        Note that ``label_train`` and ``label_test`` are remapped in place by
        ``list2np``.
        """
        self.seq_len = seq_len
        self.model_option = model_option
        self.X_train, self.Y_train, self.n_class = self.list2np(
            inst_train, label_train, seq_len)
        self.X_test, self.Y_test, _ = self.list2np(
            inst_test, label_test, seq_len)

    def predict_classes(self, proba):
        """Turn probabilities into class ids.

        Uses argmax over the last axis when that axis has more than one
        entry, otherwise thresholds at 0.5 and returns int32 values.
        """
        if proba.shape[-1] > 1:
            return proba.argmax(axis=-1)
        return (proba > 0.5).astype('int32')

    def list2np(self, inst, label, seq_len):
        """Remap labels 10/20/30/40 to 0..3 and reshape into sequences.

        ``label`` is modified in place. Trailing items that do not fill a
        complete sequence of ``seq_len`` are dropped.

        Returns:
            ``(X, Y, n_class)`` where ``X`` and ``Y`` have shape
            ``(num_sample, seq_len)`` and ``n_class`` is 4.
        """
        for raw_value, class_id in ((10, 0), (20, 1), (30, 2), (40, 3)):
            label[label == raw_value] = class_id
        n_class = 4
        # Floor division: the count must be an integer to be usable as a
        # slice bound and reshape dimension (``/`` is a float on Python 3).
        num_sample = inst.shape[0] // seq_len
        usable = num_sample * seq_len
        X = inst[0:usable, ].reshape(num_sample, seq_len)
        Y = label[0:usable, ].reshape(num_sample, seq_len)
        return X, Y, n_class

    def _fit_predict_per_position(self):
        """Refit ``self.model`` on each sequence position and predict it."""
        self.y_pred = np.zeros_like(self.Y_test)
        for i in range(self.seq_len):
            self.model.fit(self.X_train[:, i].reshape(-1, 1),
                           self.Y_train[:, i].reshape(-1, 1))
            self.y_pred[:, i] = self.model.predict(
                self.X_test[:, i].reshape(-1, 1))

    def _print_scores(self, labels):
        """Print weighted precision/recall/F1 restricted to ``labels``."""
        precision, recall, f1, _ = precision_recall_fscore_support(
            self.Y_test.flatten(),
            self.y_pred.flatten(),
            labels=labels,
            average='weighted')
        print("Precision: %s Recall: %s F1: %s" % (precision, recall, f1))
        print('================================================')

    def fit(self):
        """Train the selected model, predict the test split, print metrics.

        Returns:
            The predictions stored in ``self.y_pred``.
        """
        print('================================================')
        print("Data shape...")
        print(self.X_train.shape)
        print(self.Y_train.shape)
        print(self.X_test.shape)
        print(self.Y_test.shape)
        print("Counting the number of data in each category...")
        print(collections.Counter(self.Y_train.flatten()))
        print(collections.Counter(self.Y_test.flatten()))
        print('================================================')

        if self.model_option == 0:
            print("Using SVM >>>>>>>>>>>>>>>>>>>>>>>")
            self.model = svm.SVC(kernel='rbf', decision_function_shape='ovo')
            self._fit_predict_per_position()
        elif self.model_option == 1:
            print("Using RF >>>>>>>>>>>>>>>>>>>>>>>>")
            self.model = rf(n_estimators=100)
            self._fit_predict_per_position()
        elif self.model_option == 2:
            print("Using HMM >>>>>>>>>>>>>>>>>>>>>>>")
            # Flatten to one long sample axis; the sequence boundaries are
            # passed to the perceptron separately as lengths.
            self.X_train = self.X_train.flatten()
            self.Y_train = self.Y_train.flatten()
            self.X_train = self.X_train.reshape(self.X_train.shape[0], 1)
            self.X_test = self.X_test.flatten()
            self.Y_test = self.Y_test.flatten()
            self.X_test = self.X_test.reshape(self.X_test.shape[0], 1)
            self.model = StructuredPerceptron()
            oh = onehot()
            self.X_train = oh.fit_transform(self.X_train)
            self.X_test = oh.transform(self.X_test)
            # Integer sequence counts are required for list repetition.
            num_seq = self.X_train.shape[0] // self.seq_len
            length_train = [self.seq_len] * num_seq
            self.model.fit(self.X_train, self.Y_train, length_train)
            num_seq = self.X_test.shape[0] // self.seq_len
            length_test = [self.seq_len] * num_seq
            self.y_pred = self.model.predict(self.X_test, length_test)
        elif self.model_option == 3:
            print("Using CRF >>>>>>>>>>>>>>>>>>>>>>>")
            trainer = pycrfsuite.Trainer(verbose=True)
            for i in range(self.X_train.shape[0]):
                trainer.append(self.X_train[i, ].astype('string'),
                               self.Y_train[i, ].astype('string'))
            trainer.set_params({
                'c1': 0.1,
                'c2': 0.01,
                'max_iterations': 2000,
                'feature.possible_transitions': True
            })
            trainer.train('crf.model')
            tagger = pycrfsuite.Tagger()
            tagger.open('crf.model')
            self.y_pred = []
            for i in range(self.X_test.shape[0]):
                self.y_pred.append(
                    np.array(tagger.tag(
                        self.X_test[i, ].astype('string'))).astype('int'))
            self.y_pred = np.array(self.y_pred)

        print('Evaluating testing results')
        self._print_scores([0, 1, 2, 3])
        for i in range(4):
            print('Evaluating testing results of positive labels at region '
                  + str(i))
            self._print_scores([i])
        return self.y_pred
def fit(self):
    """Train the selected model, predict the test split, print metrics.

    ``self.model_option`` selects the learner: 0 = SVM, 1 = random forest,
    2 = StructuredPerceptron, 3 = CRF (pycrfsuite).

    Returns:
        The predictions stored in ``self.y_pred``.
    """
    print('================================================')
    print("Data shape...")
    print(self.X_train.shape)
    print(self.Y_train.shape)
    print(self.X_test.shape)
    print(self.Y_test.shape)
    print("Counting the number of data in each category...")
    print(collections.Counter(self.Y_train.flatten()))
    print(collections.Counter(self.Y_test.flatten()))
    print('================================================')

    if self.model_option in (0, 1):
        if self.model_option == 0:
            print("Using SVM >>>>>>>>>>>>>>>>>>>>>>>")
            self.model = svm.SVC(kernel='rbf', decision_function_shape='ovo')
        else:
            print("Using RF >>>>>>>>>>>>>>>>>>>>>>>>")
            self.model = rf(n_estimators=100)
        # Both learners are refitted independently per sequence position.
        self.y_pred = np.zeros_like(self.Y_test)
        for i in range(self.seq_len):
            self.model.fit(self.X_train[:, i].reshape(-1, 1),
                           self.Y_train[:, i].reshape(-1, 1))
            self.y_pred[:, i] = self.model.predict(
                self.X_test[:, i].reshape(-1, 1))
    elif self.model_option == 2:
        print("Using HMM >>>>>>>>>>>>>>>>>>>>>>>")
        # Flatten to one long sample axis; the sequence boundaries are
        # passed to the perceptron separately as lengths.
        self.X_train = self.X_train.flatten()
        self.Y_train = self.Y_train.flatten()
        self.X_train = self.X_train.reshape(self.X_train.shape[0], 1)
        self.X_test = self.X_test.flatten()
        self.Y_test = self.Y_test.flatten()
        self.X_test = self.X_test.reshape(self.X_test.shape[0], 1)
        self.model = StructuredPerceptron()
        oh = onehot()
        self.X_train = oh.fit_transform(self.X_train)
        self.X_test = oh.transform(self.X_test)
        # Floor division: list repetition needs an integer count, and ``/``
        # produces a float on Python 3.
        num_seq = self.X_train.shape[0] // self.seq_len
        length_train = [self.seq_len] * num_seq
        self.model.fit(self.X_train, self.Y_train, length_train)
        num_seq = self.X_test.shape[0] // self.seq_len
        length_test = [self.seq_len] * num_seq
        self.y_pred = self.model.predict(self.X_test, length_test)
    elif self.model_option == 3:
        print("Using CRF >>>>>>>>>>>>>>>>>>>>>>>")
        trainer = pycrfsuite.Trainer(verbose=True)
        for i in range(self.X_train.shape[0]):
            trainer.append(self.X_train[i, ].astype('string'),
                           self.Y_train[i, ].astype('string'))
        trainer.set_params({
            'c1': 0.1,
            'c2': 0.01,
            'max_iterations': 2000,
            'feature.possible_transitions': True
        })
        trainer.train('crf.model')
        tagger = pycrfsuite.Tagger()
        tagger.open('crf.model')
        self.y_pred = []
        for i in range(self.X_test.shape[0]):
            self.y_pred.append(
                np.array(tagger.tag(
                    self.X_test[i, ].astype('string'))).astype('int'))
        self.y_pred = np.array(self.y_pred)

    print('Evaluating testing results')
    precision, recall, f1, _ = precision_recall_fscore_support(
        self.Y_test.flatten(),
        self.y_pred.flatten(),
        labels=[0, 1, 2, 3],
        average='weighted')
    print("Precision: %s Recall: %s F1: %s" % (precision, recall, f1))
    print('================================================')
    for i in range(4):
        print('Evaluating testing results of positive labels at region '
              + str(i))
        precision, recall, f1, _ = precision_recall_fscore_support(
            self.Y_test.flatten(),
            self.y_pred.flatten(),
            labels=[i],
            average='weighted')
        print("Precision: %s Recall: %s F1: %s" % (precision, recall, f1))
        print('================================================')
    return self.y_pred
yield "folUpper" if re.search(r"\d", nnp.lower()): yield "folNumber" yield "folword=" + nnp.lower() if p.isupper() and len(p) == 3: yield "Uppercase" if re.search(r"\d", p.lower()): yield "Number" if len(p) > 8: # check if current word is unusually long yield "Long" if __name__ == '__main__': train_path = "../Data/bio-ner/train" dev_path = "../Data/bio-ner/dev" # create_file(train_path, "train") # create_file(dev_path, "dev") X_train, y_train, l_train = load_conll("train", features) X_test, y_test, l_test = load_conll("dev", features) per = StructuredPerceptron(lr_exponent=0.15, max_iter=300, verbose=1) per.fit(X_train, y_train, l_train) y_p = per.predict(X_test, l_test) # for x in zip(y_p, y_test): # print(x) print(bio_f_score(y_test, y_p))
if len(token_tag) >= 3: f.write(token_tag[1] + '\t' + token_tag[2]) ''' if __name__ == "__main__": print(__doc__) load_data() print("Loading training data...", end=" ") X_train, y_train, lengths_train = load_conll('input_train.txt', features) describe(X_train, lengths_train) print("Loading test data...", end=" ") X_test, y_test, lengths_test = load_conll('input_test.txt', features) describe(X_test, lengths_test) clf = StructuredPerceptron(verbose=True, lr_exponent=0.1, max_iter=30) print("Training %s" % clf) clf.fit(X_train, y_train, lengths_train) y_pred = clf.predict(X_test, lengths_test) ''' f = open('input_test_key.txt', 'w') for i in y_pred: f.write(str(i)) ''' f = open('Li-Zhenqi-assgn4-output.txt', 'w') i=0 for line in open('F18-assgn4-test.txt', 'r'): if line == '\n': f.write('\n') else:
def test_perceptron():
    """Check fitting, sparse/string inputs, stacking and best-first decoding."""
    features = [
        [0, 1, 0],
        [0, 1, 0],
        [1, 0, 0],
        [0, 1, 0],
        [1, 0, 0],
        [0, 0, 1],
        [0, 0, 1],
        [0, 1, 0],
        [1, 0, 0],
        [1, 0, 0],
    ]
    labels = [0, 0, 0, 0, 0, 1, 1, 0, 2, 2]
    n_samples = len(labels)

    model = StructuredPerceptron(verbose=False, random_state=37, max_iter=15)
    model.fit(features, labels, [n_samples])
    assert_array_equal(labels, model.predict(features))

    # Repeat with string labels and sparse input matrices.
    label_names = np.array(["eggs", "ham", "spam"])[labels]
    model = clone(model)
    model.fit(csc_matrix(features), label_names, [len(label_names)])
    assert_array_equal(label_names, model.predict(coo_matrix(features)))

    # Predict on two stacked copies of the data, given as two sequences.
    stacked_features = np.vstack([features, features])
    stacked_names = np.hstack([label_names, label_names])
    assert_array_equal(
        stacked_names,
        model.predict(stacked_features, lengths=[n_samples, n_samples]))

    # Train with Viterbi, then decode with best-first so the perceptron
    # behaves a bit more like a plain linear model.
    model.fit(features, labels, [n_samples])
    model.set_params(decode="bestfirst")
    linear_prediction = np.dot(features, model.coef_.T).argmax(axis=1)
    assert_array_equal(model.predict(features), linear_prediction)
return train, test if __name__ == "__main__": training = "gene-trainF18.txt" testing = "test-run-test.txt" outputFile = "assign4-output.txt" train, test = load_data(training, testing) X_train, y_train, lengths_train = train X_test, y_test, lengths_test = test score = 0 for i in range(30): # train data sequence with perceptron in example clf = StructuredPerceptron(verbose=True, max_iter=10) print("Training %s" % clf) clf.fit(X_train, y_train, lengths_train) # predicted IOB tags sequence from test data y_pred = clf.predict(X_test, lengths_test) outputCopy = open(outputFile, "w+") counter = 0 with open(testing) as file: for line in file: if line != "\n": line = line.rstrip() outputCopy.write(line + "\t" + y_pred[counter] + "\n") counter += 1 else:
test_files = [f for i, f in enumerate(files) if i % 5 == 0] test = load_conll(fileinput.input(test_files), features) X_test, _, lengths_test = test describe(X_test, lengths_test) return train, test if __name__ == "__main__": print(__doc__) #print("Loading training data...", end=" ") #X_train, y_train, lengths_train = load_conll(sys.argv[1], features) #describe(X_train, lengths_train) train, test = load_data() X_train, y_train, lengths_train = train X_test, y_test, lengths_test = test #print("Loading test data...", end=" ") #X_test, y_test, lengths_test = load_conll(sys.argv[2], features) #describe(X_test, lengths_test) clf = StructuredPerceptron(verbose=True, max_iter=10) print("Training %s" % clf) clf.fit(X_train, y_train, lengths_train) y_pred = clf.predict(X_test, lengths_test) print("Accuracy: %.3f" % (100 * accuracy_score(y_test, y_pred))) print("CoNLL F1: %.3f" % (100 * bio_f_score(y_test, y_pred)))
if __name__ == "__main__": print(__doc__) #print("Loading training data...", end=" ") #X_train, y_train, lengths_train = load_conll(sys.argv[1], features) #describe(X_train, lengths_train) train = load_data() X_train, y_train, lengths_train = train #X_test, y_test, lengths_test = test #print("Loading test data...", end=" ") #X_test, y_test, lengths_test = load_conll(sys.argv[2], features) #describe(X_test, lengths_test) clf = StructuredPerceptron(verbose=True, max_iter=10) print("Training %s" % clf) #print(X_train) #print(y_train) #print(y_train.shape) clf.fit(X_train, y_train, lengths_train) joblib.dump(clf, 'model/seq_labeler.pkl') #clf1 = joblib.load('model/seq_labeler.pkl') #y_pred = clf1.predict(X_test, lengths_test) #y_pred = clf.predict(X_test, lengths_test) #target = codecs.open("../training data/testres", "w", "utf-8")
break else: if s.isalpha(): training_seq.append(s) file.close() file = open("training_seq.txt", "w") for ch in training_seq: c = ch.join(' ') file.write(c) file.write('\n') file.close() clf = StructuredPerceptron() def features(sequence, i): yield "word=" + sequence[i].lower() if sequence[i].isupper(): yield "Uppercase" X_train, y_train, lengths_train = load_conll("training_seq.txt", features) clf = StructuredPerceptron() clf.fit(X_train, y_train, lengths_train) ''' names of the files that wanted to be test on '''
if i < len(sequence) - 1: next_ = sequence[i + 1].split("\t")[0] # next word's length yield "next_len=" + str(get_word_len(next_)) # last letters of the next word yield "next_last_letters=" + (next_[-4:] if len(next_) > 4 else next_) yield "next_word_shape=" + get_word_shape(next_) yield "next_short_word_shape=" + get_short_word_shape(next_) # читаем обучающее множество X_train, y_train, lengths_train = load_conll( open("ftb1u-v1/ftb1u_train.tsv", "r"), features) clf = StructuredPerceptron(decode="viterbi", verbose=1) print("Fitting model " + str(clf)) clf.fit(X_train, y_train, lengths_train) print("\nPredictions on test set") # читаем тестовое множество X_test, y_test, lengths_test = load_conll(open("ftb1u-v1/ftb1u_test.tsv", "r"), features) y_pred = clf.predict(X_test, lengths_test) print("Whole seq accuracy ", whole_sequence_accuracy(y_test, y_pred, lengths_test)) print("Element-wise accuracy ", accuracy_score(y_test, y_pred)) print("Mean F1-score macro ", f1_score(y_test, y_pred, average="macro")) print(classification_report(y_test, y_pred))
def test_perceptron_single_iter():
    """Fitting with ``max_iter=1`` must complete without raising."""
    single_pass = StructuredPerceptron(max_iter=1)
    sample, label, seq_lengths = [[1, 2, 3]], [1], [1]
    # Passes as long as no exception is raised (averaging after one pass).
    single_pass.fit(sample, label, seq_lengths)
yield "next_len=" + str(get_word_len(next_)) # last letters of the next word yield "next_last_letters=" + (next_[-4:] if len(next_) > 4 else next_) yield "next_short_word_shape=" + get_short_word_shape(next_) if i < len(sequence) - 2: nnext = sequence[i + 2].split("\t")[0] yield "nnext_short_word_shape=" + get_short_word_shape(nnext) # читаем обучающее множество X_train, y_train, lengths_train = load_conll( open("finer-data/data/digitoday.2014.train.csv", "r"), features) clf = StructuredPerceptron(decode="bestfirst", verbose=1, random_state=0) print("Fitting model " + str(clf)) clf.fit(X_train, y_train, lengths_train) print("\nPredictions on dev set") # читаем отладочное множество X_dev, y_dev, lengths_dev = load_conll( open("finer-data/data/digitoday.2014.dev.csv", "r"), features) y_pred = clf.predict(X_dev, lengths_dev) print("Whole seq accuracy ", whole_sequence_accuracy(y_dev, y_pred, lengths_dev)) print("Element-wise accuracy ", accuracy_score(y_dev, y_pred)) print("Mean F1-score macro ", f1_score(y_dev, y_pred, average="macro"))
def __init__(self, input_encoding, conversion_key, n_iter_seq):
    """Store the encoding settings and create the sequence model.

    Args:
        input_encoding: Stored unchanged on ``self.input_encoding``.
        conversion_key: Stored unchanged on ``self.conversion_key``.
        n_iter_seq: Passed to ``StructuredPerceptron`` as ``max_iter``.
    """
    # A MultinomialHMM was used here previously (left disabled upstream).
    self.model = StructuredPerceptron(max_iter=n_iter_seq)
    self.conversion_key = conversion_key
    self.input_encoding = input_encoding
def testStructuredPerceptron(data, classes, seq_lengths, n_folds, metric=''):
    """Run ``baseSeqClassifierTest`` with a 10-iteration StructuredPerceptron.

    All arguments are forwarded unchanged, after the classifier and its
    display name "Structured Perceptron".
    """
    baseSeqClassifierTest(
        StructuredPerceptron(max_iter=10),
        "Structured Perceptron",
        data,
        classes,
        seq_lengths,
        n_folds,
        metric,
    )
sub1_1=scipy.io.loadmat('train_subject1_psd01.mat') sub1_X1=sub1_1['X'] sub1_Y1= sub1_1['Y'] sub1_2=scipy.io.loadmat('train_subject1_psd02.mat') sub1_X2=sub1_2['X'] sub1_Y2= sub1_2['Y'] sub1_3=scipy.io.loadmat('train_subject1_psd03.mat') sub1_X3=sub1_3['X'] sub1_Y3= sub1_3['Y'] sub1_X=np.concatenate((sub1_X1, sub1_X2, sub1_X3), axis=0) sub1_Y=np.concatenate((sub1_Y1, sub1_Y2,sub1_Y3), axis=0) sub1_clf = StructuredPerceptron(decode='viterbi', lr_exponent=0.1, max_iter=10000, random_state=None, trans_features=False, verbose=0) sub1_clf.fit(sub1_X, sub1_Y,[len(sub1_Y)]) sub1_test=scipy.io.loadmat('test_subject1_psd04.mat') sub1_X4=sub1_test['X'] sub1_predicted=sub1_clf.predict(sub1_X4) sub1_Y4=np.loadtxt('test_subject1_true_label.csv',delimiter=",") print 'subject-1',accuracy_score(sub1_predicted, sub1_Y4) print confusion_matrix(sub1_Y4, sub1_predicted) #3017/3504 : subject 1 accuracy start subject-2 sub2_1=scipy.io.loadmat('train_subject2_psd01.mat') sub2_X1=sub2_1['X'] sub2_Y1= sub2_1['Y']
def trainStructuredPerceptron(data, classes, seq_lengths, dump_file):
    """Run ``baseSeqClassifierTrain`` with a 10-iteration StructuredPerceptron.

    All arguments are forwarded unchanged, after the classifier and its
    display name "Structured Perceptron".
    """
    baseSeqClassifierTrain(
        StructuredPerceptron(max_iter=10),
        "Structured Perceptron",
        data,
        classes,
        seq_lengths,
        dump_file,
    )
te_label_1y["label_numbers"] = te_label_1y["label"].apply(type_to_numbers1) """ alldata = [np.array([])] * 63 for i in range(0, 63): col = test_df.iloc[:, i].values col_uni = np.unique(col[~np.isnan(col)]) alldata[i] = np.concatenate((alldata[i], col_uni), axis=0) ipdb.set_trace() np.savetxt("test_random.txt", alldata) assert 0 """ ''' for 5 year prediction ''' # hmm = MultinomialHMM() hmm = StructuredPerceptron() hmm.fit(train_df, tr_label_5y["label_numbers"], train_seqlength) pred = hmm.predict(test_df, test_seqlength) # print(roc_auc_score(te_label_5y["label_numbers"], pred, average="macro")) onehot_encoder = OneHotEncoder(sparse=False) pred = pred.reshape(len(pred), 1) pred = onehot_encoder.fit_transform(pred) label = te_label_5y["label_numbers"].values.reshape( len(te_label_5y["label_numbers"]), 1) label = onehot_encoder.fit_transform(label) auc_per_class = np.zeros((5, )) for i in range(5): pred_temp = pred[:, i] label_temp = label[:, i]
class SeqModel():
    """Sequence model wrapping a StructuredPerceptron for word prediction."""

    def __init__(self, input_encoding, conversion_key, n_iter_seq):
        """Store the encoding settings and create the perceptron.

        ``n_iter_seq`` is passed to ``StructuredPerceptron`` as ``max_iter``.
        """
        self.input_encoding = input_encoding
        self.conversion_key = conversion_key
        # A MultinomialHMM was used here previously (left disabled upstream).
        self.model = StructuredPerceptron(max_iter=n_iter_seq)

    def train(self, trainset):
        """Fit the model on ``trainset.get_set()`` and print the duration.

        Returns:
            Two empty lists ``(plot_losses, plot_distances)``; nothing is
            recorded in them by this method.
        """
        print("Training ...")
        began = time.time()

        features, targets, seq_lengths = trainset.get_set()
        self.model.fit(features, targets, lengths=seq_lengths)

        elapsed = time.time() - began
        print("Duration = {0:.2f}".format(elapsed))
        return [], []

    def _compute_distance(self, X, Y, predictions=None, word_lengths=None):
        """Split flat sequences into words and compute Levenshtein distances.

        Tokens equal to "." are removed before distances are computed.

        Returns:
            With ``predictions`` given: ``(input_words, target_words,
            predicted_words, distances_t_p, distances_s_t)``.
            Without predictions: the mean source-target distance.
        """
        def without_dots(tokens):
            return [tok for tok in tokens if tok != "."]

        have_predictions = predictions is not None

        # X, Y and predictions arrive as long flat lists; split them back
        # into per-word chunks.
        if have_predictions:
            predictions = data.split_predictions(predictions, word_lengths)
        X = data.split_predictions(X, word_lengths)
        Y = data.split_predictions(Y, word_lengths)

        input_words, target_words, predicted_words = [], [], []
        distances_t_p, distances_s_t = [], []

        for idx in range(len(X)):
            # X is encoded and must be decoded to its surface tokens.
            _, source_tokens = data.word_surface(
                X[idx], self.conversion_key[0], self.input_encoding)
            source = without_dots(source_tokens)
            # Y is already in surface form.
            target = without_dots(Y[idx])

            if have_predictions:
                # Predictions are already in surface form.
                predicted = without_dots(predictions[idx])
                distances_t_p.append(
                    utility.calculate_levenshtein(target, predicted))
                predicted_words.append(predicted)

            distances_s_t.append(
                utility.calculate_levenshtein(source, target))
            input_words.append(source)
            target_words.append(target)

        if not have_predictions:
            return np.mean(distances_s_t)
        return (input_words, target_words, predicted_words, distances_t_p,
                distances_s_t)

    def predict(self, testset, print_output=True):
        """Predict the test set and tabulate per-word results.

        Args:
            testset: Provides ``get_set()`` and ``get_datafile_record(i)``.
            print_output: When true, a formatted text table is printed.

        Returns:
            ``(avg_distance, results_table)``: the average target-prediction
            distance and a DataFrame with one row per word.
        """
        report_lines = []
        if print_output:
            header_template = "{0:20} {1:20} {2:20} {3:8}"
            row_template = "{0:20} {1:20} {2:20} {3:.2f}"
            report_lines.append(header_template.format(
                "INPUT", "TARGET", "PREDICTION", "DISTANCE"))

        # Fetch the whole test set in the format the sequence model expects.
        X_test, Y_test, lengths_test = testset.get_set()
        predictions = self.model.predict(X_test, lengths=lengths_test)
        (input_words, target_words, predicted_words,
         distances_t_p, distances_s_t) = self._compute_distance(
            X_test, Y_test, predictions, lengths_test)

        row_dict = defaultdict(list)
        per_word = zip(input_words, target_words, predicted_words,
                       distances_t_p, distances_s_t)
        for i, (source, target, predicted, dist_t_p, dist_s_t) in zip(
                np.arange(len(input_words)), per_word):
            if print_output:
                report_lines.append(row_template.format(
                    "".join(source), "".join(target), "".join(predicted),
                    dist_t_p))

            row_dict["INPUT"].append(" ".join(source))
            row_dict["TARGET"].append(" ".join(target))
            row_dict["PREDICTION"].append(" ".join(predicted))
            row_dict["DISTANCE_T_P"].append(dist_t_p)
            row_dict["DISTANCE_S_T"].append(dist_s_t)

            # Pull extra information from the data file records.
            records = testset.get_datafile_record(i)
            row_dict["CONCEPT"].append(records[0].iloc[0]["CONCEPT"])
            # Cognate judgment columns for both words, where present.
            for ix in [0, 1]:
                for column in ("COGNATES_LEXSTAT", "COGNATES_IELEX"):
                    if column in records[ix]:
                        row_dict[column + str(ix)].append(
                            records[ix].iloc[0][column])

        avg_distance = np.average(distances_t_p)
        if print_output:
            report_lines.append("Average distance: " + str(avg_distance))
            print("\n".join(report_lines) + "\n")

        results_table = pd.DataFrame(row_dict)
        return avg_distance, results_table