def load_data(trainingPath, testPath):
    """Load the training and test sets through ``load_conll``.

    The test file is first copied to ``newTest.txt`` with a placeholder
    label appended to every non-blank line: per the original author's note,
    ``load_conll`` wants three columns and the test data only has two.

    Args:
        trainingPath: Path of the training file, read via ``fileinput``.
        testPath: Path of the two-column test file.

    Returns:
        A ``(train, test)`` pair. Each element is the value returned by
        ``load_conll`` (unpacked here as samples, labels, lengths).
    """
    print("Loading training data...", end=" ")
    train = load_conll(fileinput.input(trainingPath), features)
    X_train, _, lengths_train = train
    describe(X_train, lengths_train)

    newTest = "newTest.txt"
    # Both handles live in one ``with`` so the padded copy is flushed and
    # closed before it is re-read, even when reading ``testPath`` fails.
    # NOTE(review): blank lines are dropped rather than copied, so the padded
    # file contains no separator lines -- confirm that this is intended.
    with open(newTest, "w+") as padded, open(testPath) as source:
        for line in source:
            if line != "\n":
                # Strip the trailing newline, then append the filler column.
                padded.write(line.rstrip() + "\t NonApplicable\n")

    print("Loading test data...", end=" ")
    test = load_conll(fileinput.input(newTest), features)
    X_test, _, lengths_test = test
    describe(X_test, lengths_test)
    return train, test
def _write_conll_sentences(path, sentences, with_pos):
    """Write *sentences* to *path*: one token per line, blank line after each sentence."""
    # Mode "w" truncates an existing file, which replaces the former
    # exists()/remove()/open("a") sequence without the check-then-act race.
    with open(path, "w") as f:
        for s in sentences:
            if with_pos:
                pos = nl.pos_tag([w[0] for w in s])
                for w, tagged in zip(s, pos):
                    f.write("{}\t{}\t{}\n".format(w[0], tagged[1], w[1]))
            else:
                for w in s:
                    f.write("{}\t{}\n".format(w[0], w[1]))
            f.write("\n")


def load_con(data, features, tts=False):
    """Write POS-augmented CoNLL files for *data* and load them with ``load_conll``.

    ``data`` is shuffled in place first. Each sentence is indexed as a
    sequence of items whose element 0 is the word and element 1 the label.

    Args:
        data: Sentences to load; shuffled in place by ``np.random.shuffle``.
        features: Feature function forwarded to ``load_conll``.
        tts: Falsy to train on everything. Otherwise the fraction of the
            shuffled data kept for training; the remainder becomes the test
            set, written to ``test.txt`` plus a two-column gold-standard copy
            in ``gs.txt``.

    Returns:
        ``(X_train, y_train, l_train)`` when ``tts`` is falsy, otherwise
        ``(X_train, X_test, y_train, y_test, l_train, l_test)``.
    """
    np.random.shuffle(data)
    if tts:
        split = int(len(data) * tts)
        train, test = data[:split], data[split:]
        _write_conll_sentences("test.txt", test, with_pos=True)
        # Gold-standard file used for comparison: word and label only.
        _write_conll_sentences("gs.txt", test, with_pos=False)
        X_test, y_test, l_test = load_conll("test.txt", features)
    else:
        train = data

    _write_conll_sentences("train.txt", train, with_pos=True)
    X_train, y_train, l_train = load_conll("train.txt", features)

    if tts:
        return X_train, X_test, y_train, y_test, l_train, l_test
    return X_train, y_train, l_train
def gesture_to_conll(train_data, test_data, train_labels, test_labels):
    """Write both gesture splits as CoNLL files and load them back.

    The splits are written with ``write_conll_file`` to
    ``train_gestures.txt`` and ``test_gestures.txt`` and then read with
    ``load_conll`` using ``extract_features``.

    Returns:
        ``(x_train, y_train, x_test, y_test, train_lengths, test_lengths)``.
    """
    train_file = write_conll_file("train_gestures.txt", train_data,
                                  train_labels)
    test_file = write_conll_file("test_gestures.txt", test_data, test_labels)

    samples_tr, labels_tr, lens_tr = load_conll(train_file, extract_features)
    samples_te, labels_te, lens_te = load_conll(test_file, extract_features)

    return samples_tr, labels_tr, samples_te, labels_te, lens_tr, lens_te
def _train_ner_model():
    """Fit and return a ``StructuredPerceptron`` on CoNLL data.

    When the ``nose`` module is present in ``sys.modules`` the data comes
    from ``_load_test_data()``; otherwise from ``_download_training_data()``.
    """
    import sys

    nose_loaded = 'nose' in sys.modules
    source = _load_test_data() if nose_loaded else _download_training_data()
    samples, labels, lengths = load_conll(source, _features)

    model = StructuredPerceptron()
    model.fit(samples, labels, lengths)
    return model
def load_data():
    """Load the data sets that are handed to the learning algorithm.

    Reads ``gene-trainF18.txt`` and ``testset.txt`` through ``load_conll``
    and prints a short description of each.

    Returns:
        A ``(train, test)`` pair of ``load_conll`` results.
    """
    sources = (
        ("Loading training data...", "gene-trainF18.txt"),
        ("Loading test data...", "testset.txt"),
    )
    loaded = []
    for banner, path in sources:
        print(banner, end=" ")
        dataset = load_conll(fileinput.input(path), features)
        samples, _, lengths = dataset
        describe(samples, lengths)
        loaded.append(dataset)
    return tuple(loaded)
def predict_cars(clf, sentence):
    """Return the car mentions that *clf* tags in *sentence*.

    The sentence is run through ``preprocessor_text`` and ``word_tokenize``,
    written to a scratch file (one token per line with a dummy ``O`` label),
    loaded with ``load_conll`` and tagged with ``clf.predict``. The ``B`` /
    ``I`` tags are then decoded into space-joined token spans.

    Args:
        clf: Fitted model exposing ``predict(X, lengths)``.
        sentence: Raw input text.

    Returns:
        List of mention strings in order of appearance.
    """
    test_f_name = os.path.join(current_dir, './../data/test_ann')
    tokens = word_tokenize(preprocessor_text(sentence))

    # Leaving the ``with`` block flushes and closes the file, so no explicit
    # flush is needed before it is read back.
    with codecs.open(test_f_name, 'w', 'utf-8') as f:
        for t in tokens:
            f.write(t + u' ' + u'O' + u'\n')

    X, _, lengths = load_conll(test_f_name, features)
    y_pred = clf.predict(X, lengths)

    found_cars = []
    current_car = []
    for idx, token in enumerate(y_pred):
        tag = str(token)
        if tag == 'B':
            # A new mention starts: close any mention still open so that
            # adjacent mentions ("B B") are not glued into one.
            if current_car:
                found_cars.append(u' '.join(current_car))
            current_car = [tokens[idx]]
        elif tag == 'I':
            # Continues the open mention; an "I" with no preceding "B"
            # starts a mention of its own instead of leaking into the next.
            current_car.append(tokens[idx])
        elif current_car:
            found_cars.append(u' '.join(current_car))
            current_car = []
    if current_car:
        found_cars.append(u' '.join(current_car))
    return found_cars
def predict_cars(clf, sentence):
    """Extract the car mentions predicted by *clf* for *sentence*.

    Steps: preprocess and tokenize the text, dump the tokens to a scratch
    CoNLL file with a placeholder ``O`` label, load that file through
    ``load_conll``, predict one tag per token, and decode ``B`` / ``I`` tags
    into mentions.

    Args:
        clf: Fitted model exposing ``predict(X, lengths)``.
        sentence: Raw input text.

    Returns:
        List of mentions, each the space-joined tokens of one span.
    """
    test_f_name = os.path.join(current_dir, './../data/test_ann')
    sentence = preprocessor_text(sentence)
    tokens = word_tokenize(sentence)

    # The context manager closes (and therefore flushes) the scratch file
    # before ``load_conll`` opens it again.
    with codecs.open(test_f_name, 'w', 'utf-8') as f:
        for t in tokens:
            f.write(t + u' ' + u'O' + u'\n')

    X, _, lengths = load_conll(test_f_name, features)
    y_pred = clf.predict(X, lengths)

    found_cars = []
    current_car = []

    def close_current():
        # Emit the open mention, if any, and start a fresh buffer.
        if current_car:
            found_cars.append(u' '.join(current_car))
            del current_car[:]

    for idx, token in enumerate(y_pred):
        tag = str(token)
        if tag == 'B':
            # "B" always opens a new mention, so consecutive "B" tags yield
            # separate mentions rather than one merged span.
            close_current()
            current_car.append(tokens[idx])
        elif tag == 'I':
            # An "I" without a preceding "B" simply opens a mention; it is
            # closed at the next non-"I" tag instead of lingering.
            current_car.append(tokens[idx])
        else:
            close_current()
    close_current()
    return found_cars
def _train_ner_model():
    """Fit a ``StructuredPerceptron`` on the downloaded training data and return it."""
    training_source = _download_training_data()
    samples, labels, lengths = load_conll(training_source, _features)

    model = StructuredPerceptron()
    model.fit(samples, labels, lengths)
    return model
def load_data():
    """Load every training file matched by the glob pattern.

    All matched files are used for training; no test set is loaded.

    Returns:
        The value returned by ``load_conll`` for the training files.

    Raises:
        IOError: If no file matches the pattern.
    """
    pattern = "../training data/new training/new ent 2/*.txt"
    train_files = glob(pattern)

    print("Loading training data...", end=" ")
    print(train_files)
    if not train_files:
        # fileinput.input() falls back to sys.argv[1:] / stdin when handed an
        # empty list, which would silently read the wrong input.
        raise IOError("no training files match %r" % pattern)

    train = load_conll(fileinput.input(train_files), features)
    X_train, _, lengths_train = train
    describe(X_train, lengths_train)

    # Test loading is disabled; the banner is kept so the console output is
    # unchanged.
    print("Loading test data...", end=" ")
    return train
def load_data():
    """Load ``nerdata/*.bio`` and split it 80/20 by file.

    Every fifth matched file (indices 0, 5, 10, ...) goes to the test set;
    the others form the training set.

    Returns:
        A ``(train, test)`` pair of ``load_conll`` results.

    Raises:
        IOError: If fewer than two files match, leaving one split empty.
    """
    pattern = 'nerdata/*.bio'
    files = glob(pattern)
    train_files = [f for i, f in enumerate(files) if i % 5 != 0]
    test_files = files[::5]
    if not train_files or not test_files:
        # fileinput.input() reads sys.argv[1:] / stdin when given an empty
        # list, so an empty split must be rejected explicitly.
        raise IOError("need at least two files matching %r, found %d"
                      % (pattern, len(files)))

    print("Loading training data...", end=" ")
    train = load_conll(fileinput.input(train_files), features)
    X_train, _, lengths_train = train
    describe(X_train, lengths_train)

    print("Loading test data...", end=" ")
    test = load_conll(fileinput.input(test_files), features)
    X_test, _, lengths_test = test
    describe(X_test, lengths_test)

    return train, test
def testHMM(clf, data):
    """Validate a trained model on *data*.

    Loads *data* with ``load_conll``, prints the predicted labels and then
    the whole-sequence accuracy.

    Args:
        clf: Fitted model exposing ``predict(X, lengths)``.
        data: Input accepted by ``load_conll``.
    """
    X_test, y_test, lengths_test = load_conll(data, features)
    y_pred = clf.predict(X_test, lengths_test)
    # Call form prints the same text on Python 2 and also parses on Python 3.
    print(y_pred)

    # Final score
    print(whole_sequence_accuracy(y_test, y_pred, lengths_test))
def train_model():
    """Train a ``StructuredPerceptron`` and report its test scores.

    Loads ``train.conll`` and ``test.conll`` from the data directory next to
    ``current_dir``, fits the model, prints accuracy and CoNLL F1 on the
    test set, and returns the fitted model.
    """
    # Single-argument print calls behave identically on Python 2 and also
    # parse on Python 3, unlike the print statement.
    print("Loading training data...")
    X_train, y_train, lengths_train = load_conll(
        os.path.join(current_dir, "./../data/train.conll"), features)
    clf = StructuredPerceptron(verbose=True, max_iter=10)
    describe(X_train, lengths_train)

    print("Loading test data...")
    X_test, y_test, lengths_test = load_conll(
        os.path.join(current_dir, "./../data/test.conll"), features)
    describe(X_test, lengths_test)

    print("Training %s" % clf)
    clf.fit(X_train, y_train, lengths_train)

    y_pred = clf.predict(X_test, lengths_test)
    print("Accuracy: %.3f" % (100 * accuracy_score(y_test, y_pred)))
    print("CoNLL F1: %.3f" % (100 * bio_f_score(y_test, y_pred)))
    return clf
def test_load_conll():
    """Check matrix type, row count, labels and lengths for ``TEST_FILE``."""
    nonblank_lines = [ln for ln in TEST_FILE.splitlines() if ln.strip()]
    expected_labels = ["Det", "N", "V", "Pre", "Det", "N", "Punc",
                       "Adv", "Punc"]

    X, y, lengths = load_conll(six.moves.StringIO(TEST_FILE), features)

    assert_true(sp.isspmatrix(X))
    # One row per non-blank input line.
    assert_equal(X.shape[0], len(nonblank_lines))
    assert_equal(list(y), expected_labels)
    assert_array_equal(lengths, [7, 2])
def train_model():
    """Fit a ``StructuredPerceptron`` on ``train.conll`` and score it on ``test.conll``.

    Both files are read from the data directory relative to ``current_dir``.
    Accuracy and CoNLL F1 on the test set are printed.

    Returns:
        The fitted classifier.
    """
    # print() with one argument produces the same output on Python 2 and
    # keeps the function importable on Python 3.
    print("Loading training data...")
    train_path = os.path.join(current_dir, "./../data/train.conll")
    X_train, y_train, lengths_train = load_conll(train_path, features)
    clf = StructuredPerceptron(verbose=True, max_iter=10)
    describe(X_train, lengths_train)

    print("Loading test data...")
    test_path = os.path.join(current_dir, "./../data/test.conll")
    X_test, y_test, lengths_test = load_conll(test_path, features)
    describe(X_test, lengths_test)

    print("Training %s" % clf)
    clf.fit(X_train, y_train, lengths_train)

    y_pred = clf.predict(X_test, lengths_test)
    print("Accuracy: %.3f" % (100 * accuracy_score(y_test, y_pred)))
    print("CoNLL F1: %.3f" % (100 * bio_f_score(y_test, y_pred)))
    return clf
def load_data(trainingPath, testPath):
    """Load the training and test sets through ``load_conll``.

    ``trainingPath`` is expanded with ``glob``. The test file is copied to
    ``newTest.txt`` with a filler label appended to every non-blank line:
    per the original author's note, ``load_conll`` wants three columns and
    the test data only has two.

    Args:
        trainingPath: Path or glob pattern of the training file(s).
        testPath: Path of the two-column test file.

    Returns:
        A ``(train, test)`` pair of ``load_conll`` results.

    Raises:
        IOError: If ``trainingPath`` matches no file.
    """
    print("Training data loaded from {0}".format(trainingPath))
    train_files = glob(trainingPath)
    if not train_files:
        # fileinput.input() reads sys.argv[1:] / stdin when given an empty
        # list, so a pattern without matches must be rejected explicitly.
        raise IOError("no training files match %r" % (trainingPath,))
    train = load_conll(fileinput.input(train_files), features)

    # Training data and description.
    X_train, _, lengths_train = train
    describe(X_train, lengths_train)

    newTest = "newTest.txt"
    # Both handles live in one ``with`` so the padded copy is flushed and
    # closed before it is re-read, even when reading ``testPath`` fails.
    # NOTE(review): blank lines are dropped rather than copied, so the padded
    # file contains no separator lines -- confirm that this is intended.
    with open(newTest, 'w+') as padded, open(testPath) as source:
        for line in source:
            if line != '\n':
                # Strip the trailing newline, then append the filler column.
                padded.write(line.rstrip() + '\t NonApplicable\n')

    print("Test data loaded from {0}".format(testPath))
    # The padded file was just written, so it is passed directly.
    test = load_conll(fileinput.input([newTest]), features)

    # Test data and description.
    X_test, _, lengths_test = test
    describe(X_test, lengths_test)

    return train, test
def main():
    """Train on the gene data, tag the test file and write ``predictions.txt``.

    Each output line is ``<position in sentence>\\t<second column of the
    test line>\\t<predicted label>``; sentences are separated by blank lines.
    Every prediction line is also printed.
    """
    # Load the training data and pass it through the feature function.
    samples, labels, sentence_lengths = load_conll(
        "data/gene-trainF18.txt", features, split=True)

    # Train the model.
    clf = StructuredPerceptron()
    clf.fit(samples, labels, sentence_lengths)

    # Tag the test data.
    test_path = "data/F18-assgn4-test.txt"
    test_samples, _, test_sentence_lengths = load_conll(
        test_path, features, split=True)
    prediction = clf.predict(test_samples, test_sentence_lengths)

    output = []
    i = 0  # index into ``prediction``
    j = 1  # 1-based token position within the current sentence
    # ``with`` closes the test file; iterating a bare open() leaked it.
    with open(test_path) as test_file:
        for line in test_file:
            # A whitespace-only line has no second column, so treat it as a
            # separator instead of letting line.split()[1] raise IndexError.
            if not line.strip():
                output.append("\n")
                j = 1
                continue
            item = (str(j) + "\t" + line.split()[1] + "\t"
                    + prediction[i] + "\n")
            output.append(item)
            print(item)
            i += 1
            j += 1

    with open('predictions.txt', 'w') as f:
        f.writelines(output)
def test_model():
    """Run the saved sequence labeler on the prepared test file.

    Predictions are written one per line to ``test_res.txt``. An
    entity-level precision figure and the element-wise accuracy are printed.

    Raises:
        IOError: If the prepared test file does not exist.
    """
    print("Loading test data...", end=" ")
    test_files = glob("prepard_test_1.txt")
    if not test_files:
        # fileinput.input() reads sys.argv[1:] / stdin when given an empty
        # list, so a missing test file must be rejected explicitly.
        raise IOError("test file 'prepard_test_1.txt' not found")
    test = load_conll(fileinput.input(test_files),
                      sequence_learner_new.features)
    X_test, y_test, lengths_test = test
    sequence_learner_new.describe(X_test, lengths_test)
    Y = sequence_learner_new.get_labels(y_test)

    clf = joblib.load('model/seq_labeler.pkl')
    y_pred = clf.predict(X_test, lengths_test)

    # ``with`` guarantees the results are flushed and the handle closed.
    with codecs.open("test_res.txt", "w", "utf-8") as target:
        for i in range(X_test.shape[0]):
            target.write(y_pred[i] + "\n")

    prec = 0
    count = 0
    entities = [
        "<PER>", "</PER>", "<PER></PER>", "<IPER>",
        "<ORG>", "</ORG>", "<ORG></ORG>", "<IORG>",
    ]
    for i in range(len(y_test)):
        # Keep everything from the first "<" onwards as the entity marker.
        elems = Y[i].partition("<")
        entity = (elems[1] + elems[2]).replace('\n', '')
        yelem = y_pred[i].partition("<")
        yent = yelem[1] + yelem[2]
        if entity in entities and Y[i].replace('\n', '') == y_pred[i]:
            print(entity)
            count += 1
            prec += 1
        # NOTE(review): this comparison does not strip '\n' from Y[i] the
        # way the branch above does -- confirm whether that is intended.
        elif yent in entities and Y[i] != y_pred[i]:
            count += 1

    if count > 0:
        print(count)
        print(" precision = ", (float(prec) / float(count)) * 100, "%")
    else:
        print(" precision = 0", prec)
    print("Accuracy: %.3f" % (100 * accuracy_score(Y, y_pred)))
def main():
    """Train a ``StructuredPerceptron`` and write predictions for the file named in ``sys.argv[1]``.

    The input file is handed to ``addcol``, after which ``test.txt`` is
    loaded as the test set. Training data comes from ``load_dat`` via
    ``load_con``. The predictions are passed to ``create_eval_file``.
    """
    print("Loading data")
    # ``with`` closes the input handle, which was previously leaked.
    with open(sys.argv[1]) as dat:
        addcol(dat, sys.argv[1])
    # Load the test set created by addcol.
    X_test, y_test, l_test = load_conll("test.txt", features)

    data = load_dat()
    X_train, y_train, l_train = load_con(data, features)

    # Per the original author's note, trial and error found that a learning
    # rate exponent of .35 and 20 iterations worked best.
    per = StructuredPerceptron(lr_exponent=0.35, max_iter=20, verbose=1)

    print("Fitting")
    per.fit(X_train, y_train, l_train)
    y_p = per.predict(X_test, l_test)
    create_eval_file(y_p)
    print("Done!")
def test_model():
    """Run the saved sequence labeler on the labelled test resume.

    Predictions are written one per line to the results file and the
    element-wise accuracy is printed.

    Raises:
        IOError: If the labelled test file does not exist.
    """
    print("Loading test data...", end=" ")
    test_path = "../training data/labelled test data/resume0.txt"
    test_files = glob(test_path)
    if not test_files:
        # fileinput.input() reads sys.argv[1:] / stdin when given an empty
        # list, so a missing test file must be rejected explicitly.
        raise IOError("test file %r not found" % test_path)
    test = load_conll(fileinput.input(test_files), sequence_learner.features)
    X_test, y_test, lengths_test = test
    sequence_learner.describe(X_test, lengths_test)

    clf = joblib.load('model/seq_labeler.pkl')
    y_pred = clf.predict(X_test, lengths_test)

    # ``with`` guarantees the results are flushed and the handle closed.
    with codecs.open("../training data/test data/test results/resume_res",
                     "w", "utf-8") as target:
        for i in range(X_test.shape[0]):
            target.write(y_pred[i] + "\n")

    print("Accuracy: %.3f" % (100 * accuracy_score(y_test, y_pred)))
def test_load_conll_split():
    """With ``split=True`` the labels loaded from ``TEST_SPLIT`` are O, B, I."""
    stream = six.moves.StringIO(TEST_SPLIT)
    loaded = load_conll(stream, features_split, split=True)
    _, labels, _ = loaded
    assert_equal(list(labels), ["O", "B", "I"])
if i + 1 < len(sentence): yield "word+1:{}" + sentence[i + 1].lower() def describe(X, lengths): print("{0} sequences, {1} tokens.".format(len(lengths), X.shape[0])) if __name__ == "__main__": print(__doc__) if len(sys.argv) < 3: print("Usage: {0} training_file test_file".format(sys.argv[0])) sys.exit(1) print("Loading training data...", end=" ") X_train, y_train, lengths_train = load_conll(sys.argv[1], features) describe(X_train, lengths_train) print("Loading test data...", end=" ") X_test, y_test, lengths_test = load_conll(sys.argv[2], features) describe(X_test, lengths_test) clf = StructuredPerceptron(verbose=True, max_iter=10) print("Training %s" % clf) clf.fit(X_train, y_train, lengths_train) y_pred = clf.predict(X_test, lengths_test) print("Accuracy: %.3f" % (100 * accuracy_score(y_test, y_pred))) print("CoNLL F1: %.3f" % (100 * bio_f_score(y_test, y_pred)))
if i < len(sequence) - 1: next_ = sequence[i + 1].split("\t")[0] # next word's length yield "next_len=" + str(get_word_len(next_)) # last letters of the next word yield "next_last_letters=" + (next_[-4:] if len(next_) > 4 else next_) yield "next_short_word_shape=" + get_short_word_shape(next_) if i < len(sequence) - 2: nnext = sequence[i + 2].split("\t")[0] yield "nnext_short_word_shape=" + get_short_word_shape(nnext) # читаем обучающее множество X_train, y_train, lengths_train = load_conll( open("finer-data/data/digitoday.2014.train.csv", "r"), features) clf = StructuredPerceptron(decode="bestfirst", verbose=1, random_state=0) print("Fitting model " + str(clf)) clf.fit(X_train, y_train, lengths_train) print("\nPredictions on dev set") # читаем отладочное множество X_dev, y_dev, lengths_dev = load_conll( open("finer-data/data/digitoday.2014.dev.csv", "r"), features) y_pred = clf.predict(X_dev, lengths_dev) print("Whole seq accuracy ", whole_sequence_accuracy(y_dev, y_pred, lengths_dev))
f.write(token_tag[1][:-1]+'\t'+'O'+'\n') ''' f = open('input_keys.txt', 'w') for line in open('test-run-test-with-keys.txt', 'r'): if line == '\n': f.write('\n') else: token_tag = line.split('\t') if len(token_tag) >= 3: f.write(token_tag[1] + '\t' + token_tag[2]) ''' if __name__ == "__main__": print(__doc__) load_data() print("Loading training data...", end=" ") X_train, y_train, lengths_train = load_conll('input_train.txt', features) describe(X_train, lengths_train) print("Loading test data...", end=" ") X_test, y_test, lengths_test = load_conll('input_test.txt', features) describe(X_test, lengths_test) clf = StructuredPerceptron(verbose=True, lr_exponent=0.1, max_iter=30) print("Training %s" % clf) clf.fit(X_train, y_train, lengths_train) y_pred = clf.predict(X_test, lengths_test) ''' f = open('input_test_key.txt', 'w') for i in y_pred:
yield "prev_word_shape=" + get_word_shape(prev) yield "prev_short_word_shape=" + get_short_word_shape(prev) if i < len(sequence) - 1: next_ = sequence[i + 1].split("\t")[0] # next word's length yield "next_len=" + str(get_word_len(next_)) # last letters of the next word yield "next_last_letters=" + (next_[-4:] if len(next_) > 4 else next_) yield "next_word_shape=" + get_word_shape(next_) yield "next_short_word_shape=" + get_short_word_shape(next_) # читаем обучающее множество X_train, y_train, lengths_train = load_conll( open("ftb1u-v1/ftb1u_train.tsv", "r"), features) clf = StructuredPerceptron(decode="viterbi", verbose=1) print("Fitting model " + str(clf)) clf.fit(X_train, y_train, lengths_train) print("\nPredictions on test set") # читаем тестовое множество X_test, y_test, lengths_test = load_conll(open("ftb1u-v1/ftb1u_test.tsv", "r"), features) y_pred = clf.predict(X_test, lengths_test) print("Whole seq accuracy ", whole_sequence_accuracy(y_test, y_pred, lengths_test)) print("Element-wise accuracy ", accuracy_score(y_test, y_pred))
c = ch.join(' ') file.write(c) file.write('\n') file.close() clf = StructuredPerceptron() def features(sequence, i): yield "word=" + sequence[i].lower() if sequence[i].isupper(): yield "Uppercase" X_train, y_train, lengths_train = load_conll("training_seq.txt", features) clf = StructuredPerceptron() clf.fit(X_train, y_train, lengths_train) ''' names of the files that wanted to be test on ''' predict_list_name = ["seq1.txt", "seq2.txt"] predict_seq = [] output_list_name = ["prediction1.txt"] count = 0 for name in predict_list_name: file = open(name, "r")
yield "prev_word_shape=" + get_word_shape(prev) yield "prev_short_word_shape=" + get_short_word_shape(prev) if i < len(sequence) - 1: next_ = sequence[i + 1].split("\t")[0] # next word's length yield "next_len=" + str(get_word_len(next_)) # last letters of the next word yield "next_last_letters=" + (next_[-4:] if len(next_) > 4 else next_) yield "next_word_shape=" + get_word_shape(next_) yield "next_short_word_shape=" + get_short_word_shape(next_) # читаем обучающее множество X_train, y_train, lengths_train = load_conll( open("resources/talbanken-stanford-1.2/talbanken-stanford-train.tsv", "r"), features) clf = StructuredPerceptron(decode="viterbi", verbose=1, random_state=0) print("Fitting model " + str(clf)) clf.fit(X_train, y_train, lengths_train) print("\nPredictions on test set") # читаем тестовое множество X_test, y_test, lengths_test = load_conll( open("resources/talbanken-stanford-1.2/talbanken-stanford-test.tsv", "r"), features) y_pred = clf.predict(X_test, lengths_test) print("Whole seq accuracy ", whole_sequence_accuracy(y_test, y_pred, lengths_test)) print("Element-wise accuracy ", accuracy_score(y_test, y_pred)) print("Mean F1-score macro ", f1_score(y_test, y_pred, average="macro"))
if i < len(sequence) - 1: next = sequence[i + 1].split("\t")[1] # next word's length yield "next_len=" + str(get_word_len(next)) if i < len(sequence) - 1: next = sequence[i + 1].split("\t")[1] # last letters of the next word yield "next_last_letters=" + (next[-3:] if len(next) > 3 else next) if i < len(sequence) - 1: next = sequence[i + 1].split("\t")[1] yield "next_short_word_shape=" + get_short_word_shape(next) # читаем обучающее множество X_train, y_train, lengths_train = load_conll(open("../resources/train.data", "r"), features) clf = StructuredPerceptron(decode="viterbi", lr_exponent=.05, max_iter=30) print("Fitting model " + str(clf)) clf.fit(X_train, y_train, lengths_train) print("\nPredictions on dev set") # читаем отладочное множество X_dev, y_dev, lengths_dev = load_conll(open("../resources/dev.data", "r"), features) y_pred = clf.predict(X_dev, lengths_dev) print("Whole seq accuracy ", whole_sequence_accuracy(y_dev, y_pred, lengths_dev)) print("Element-wise accuracy ", accuracy_score(y_dev, y_pred)) print("Mean F1-score macro ", f1_score(y_dev, y_pred, average="macro"))
yield "folUpper" if re.search(r"\d", nnp.lower()): yield "folNumber" yield "folword=" + nnp.lower() if p.isupper() and len(p) == 3: yield "Uppercase" if re.search(r"\d", p.lower()): yield "Number" if len(p) > 8: # check if current word is unusually long yield "Long" if __name__ == '__main__': train_path = "../Data/bio-ner/train" dev_path = "../Data/bio-ner/dev" # create_file(train_path, "train") # create_file(dev_path, "dev") X_train, y_train, l_train = load_conll("train", features) X_test, y_test, l_test = load_conll("dev", features) per = StructuredPerceptron(lr_exponent=0.15, max_iter=300, verbose=1) per.fit(X_train, y_train, l_train) y_p = per.predict(X_test, l_test) # for x in zip(y_p, y_test): # print(x) print(bio_f_score(y_test, y_p))