def _word_seg(text, word_seg_crf_model):
    """Segment raw *text* into words with a trained CRF IOB tagger.

    Parameters
    ----------
    text : str
        Raw input text; tokenized via the project ``tokenize`` helper.
    word_seg_crf_model :
        A fitted CRF model exposing ``predict`` (e.g. sklearn-crfsuite CRF).

    Returns
    -------
    list of str
        Segmented words; syllables of a multi-syllable word are joined
        with a single space.
    """
    sentence = tokenize(text).split()
    X_test = [sent2features(sentence, 'raw')]

    start = time.time()
    IOBtag = word_seg_crf_model.predict(X_test)
    print("Executed time for segmentating: " + str(time.time() - start))

    output = []
    for tag, token in zip(IOBtag[0], sentence):
        # "I_W" continues the current word; anything else starts a new one.
        # The extra `output` check guards against a malformed leading "I_W"
        # tag, which previously raised IndexError on the empty list.
        if tag == "I_W" and output:
            output[-1] = output[-1] + u" " + token
        else:
            output.append(token)
    return output
# Word-segmentation evaluation: load a pickled CRF model and score it on
# an IOB-formatted test file ("token\ttag" lines, blank line between sents).
# args = parse_argument()
# MODEL_NAME = args.n + pkl
MODEL_NAME = "ws.pkl"
model_path = "./models/"

print("=======================================")
print("Reading testing data ....")
test_sents = []
filename_test = "test.txt"
with open(filename_test, "r") as ftrue:
    raw_data = ftrue.read().strip("\n").strip(" ").split("\n\n")
    for sent in raw_data:
        parsed = []
        for line in sent.split("\n"):
            token, sep, tag = line.partition("\t")
            # Normalize tag variants (B-W / I-W / o) to B_W / I_W / O.
            # BUGFIX: the original applied .replace("o", "O") to the whole
            # "token\ttag" line, corrupting every token that contains a
            # lowercase 'o'; normalization now touches the tag field only.
            tag = tag.replace("B-W", "B_W").replace("I-W", "I_W").replace("o", "O")
            parsed.append([token, tag] if sep else [token])
        test_sents.append(parsed)

X_test = [sent2features(sent, 'test') for sent in test_sents
          if len(sent) >= 1 and len(sent[0]) >= 1 and sent[0][0]]
y_test = [sent2labels(sent) for sent in test_sents
          if len(sent) >= 1 and len(sent[0]) >= 1 and sent[0][0]]
print("Done!")

print("=======================================")
print("Load the model ...")
# BUGFIX: use a context manager so the model file handle is closed
# (the original pickle.load(file=open(...)) leaked it).
with open(os.path.join(model_path, MODEL_NAME), "rb") as fmodel:
    model = pickle.load(fmodel)
print(model)
print("Done loading", MODEL_NAME)

print("=======================================")
print("Testing ...")
score = model.score(X_test, y_test)
y_pred = model.predict(X_test)
sum_accuracy = 0
if __name__ == "__main__":
    args = parse_argument()

    # Model name, duplication check
    MODEL_NAME = checkModelFileExistAndCreateNewModelName(
        args.m.replace(".pkl", "") + ".pkl")

    reader = Reader("/data/train")
    reader_test = Reader("/data/test")
    test_sents = reader_test.read('10000')    # 10000 is the dataset
    train_sents = reader.read('10000')        # 10000 is the dataset

    def _usable(sent):
        """True when the sentence has at least one non-empty first token.

        Extracted because the same predicate was duplicated verbatim in
        all four comprehensions below.
        """
        return len(sent) >= 1 and len(sent[0]) >= 1 and sent[0][0]

    X_train = [sent2features(sent, "train") for sent in train_sents if _usable(sent)]
    y_train = [sent2labels(sent) for sent in train_sents if _usable(sent)]
    X_test = [sent2features(sent, "test") for sent in test_sents if _usable(sent)]
    y_test = [sent2labels(sent) for sent in test_sents if _usable(sent)]
# Build the training set: load train + dev sentences and fold dev into
# train so the model sees both.
filename_true = "/home/enamoria/Desktop/workspace/evaluation/ws/data/train_with_dict.txt"
train_sents = read_data(filename_true)
dev_sents = read_data(
    "/home/enamoria/Desktop/workspace/evaluation/ws/data/dev.txt")

print(len(train_sents), len(dev_sents))
train_sents.extend(dev_sents)
print(len(train_sents))

# Feature extracting from templates
X_train = [sent2features(sent, mode="train") for sent in train_sents]
y_train = [sent2labels(sent) for sent in train_sents]
X_dev = [sent2features(sent, mode="dev") for sent in dev_sents]
y_dev = [sent2labels(sent) for sent in dev_sents]

# Training ...
return [y[1] for y in sent] def sent2tokens(sent): return [y[0] for y in sent] if __name__ == "__main__": reader = Reader("/data/test") test_sents = reader.read('10000') print(test_sents[1]) MODEL_NAME = "pos_8.pkl" X_test = [ sent2features(sent, 'test') for sent in test_sents if len(sent) >= 1 and len(sent[0]) >= 1 and sent[0][0] ] y_test = [ sent2labels(sent) for sent in test_sents if len(sent) >= 1 and len(sent[0]) >= 1 and sent[0][0] ] loaded_model = pickle.load( open(MODEL_NAME.replace(".pkl", "") + ".pkl", 'rb')) print("Loaded", MODEL_NAME, loaded_model) print(loaded_model.classes_) sum = 0.0 y_pred = loaded_model.predict(X_test)
if __name__ == "__main__":
    filename_true = "train.txt"
    # fielname_pred = "pred.iob"

    # Refuse to clobber an existing model file; MODEL_NAME is expected to
    # be defined earlier in this file — confirm against the full source.
    MODEL_NAME = checkModelFileExistAndCreateNewModelName(MODEL_NAME)

    # Read data: sentences are separated by blank lines, one
    # "token\ttag" entry per line.
    with open(filename_true, "r") as ftrue:
        raw_data = ftrue.read().strip("\n").strip(" ").split("\n\n")
        train_sents = [
            [line.split("\t") for line in sent.split("\n")]
            for sent in raw_data
        ]

    # Feature extracting from templates
    X_train = [sent2features(sent) for sent in train_sents]
    y_train = [sent2labels(sent) for sent in train_sents]

    # Training ...
    print("=======================================")