                    default=0,
                    help='write the test result, set 1 for writing result to file')
parser.add_argument('--set', type=int, default=0, help='test set or valid set')
parser.add_argument('--flag', default="PER", help='entity type (PER/LOC/ORG/MISC)')
parser.add_argument('--lr_rate', type=float, default=1e-4, help='learning rate')
parser.add_argument('--dataset', default="conll2003", help='name of the dataset')
parser.add_argument('--type', default="bnpu", help='pu learning type (bnpu/bpu/upu)')
args = parser.parse_args()

dp = DataPrepare(args.dataset)
mutils = FeaturedDetectionModelUtils(dp)

trainSet, validSet, testSet, prior = mutils.load_dataset(args.flag, args.dataset, 1.0)
setIter = [testSet, validSet]
detectionSet = setIter[args.set]
fileNameIter = [
    "data/" + args.dataset + "/test.txt",
    "data/" + args.dataset + "/valid.txt"
]
fileName = fileNameIter[args.set]

charcnn = CharCNN(dp.char2Idx)
wordnet = WordNet(dp.wordEmbeddings, dp.word2Idx)
casenet = CaseNet(dp.caseEmbeddings, dp.case2Idx)
featurenet = FeatureNet()
parser.add_argument('--lr', type=float, default=1e-4)
parser.add_argument('--beta', type=float, default=0.0)
parser.add_argument('--gamma', type=float, default=1.0)
parser.add_argument('--drop_out', type=float, default=0.5)
parser.add_argument('--m', type=float, default=0.3)
parser.add_argument('--p', type=float, default=0.046)
parser.add_argument('--flag', default="PER")
parser.add_argument('--dataset', default="conll2003")
parser.add_argument('--unlabel_size', type=int, default=10000)
parser.add_argument('--iter', type=int, default=1)
parser.add_argument('--model', default="")
args = parser.parse_args()

dp = DataPrepare(args.dataset)
mutils = AddUnlabeledModelUtils(dp)

# used for training
dataset = mutils.load_dataset_(args.flag, args.dataset, args.iter)
trainSet, validSet, testSet, prior = mutils.load_origin_dataset(dataset, args.p)
train_sentences_X, train_sentences_Y, train_sentences_LF = zip(*trainSet)

trainSize = len(trainSet)
validSize = len(validSet)
testSize = len(testSet)
print("train set size: {}, valid set size: {}, test set size: {}".format(
    trainSize, validSize, testSize))

charcnn = CharCNN(dp.char2Idx)
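# Note on the zip(*trainSet) call above: it transposes the list of
# per-sentence (X, Y, LF) triples into three parallel tuples. A toy
# illustration (hypothetical values, not the repo's real data):
#
#     triples = [(["EU", "rejects"], [1, 0], [1, 0]),
#                (["Peter"], [1], [0])]
#     xs, ys, lfs = zip(*triples)
#     # xs  == (["EU", "rejects"], ["Peter"])
#     # ys  == ([1, 0], [1])
#     # lfs == ([1, 0], [0])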
    os.makedirs(fdir)
    fname = os.path.join(fdir, "train.txt")
    allSentences = []
    for sent in trainSentences:
        allSentences.append(sent)
    for sent in unlabeledSentences:
        allSentences.append(sent)
    shuffle_indices = np.random.permutation(np.arange(len(allSentences)))
    allSentences = np.array(allSentences)[shuffle_indices]
    with open(fname, "w") as fw:
        for sentence in allSentences:
            for word, label, tagIdxList in sentence:
                labeled = 0
                fw.write(word + " " + str(label) + " " + str(labeled) + "\n")
            fw.write("\n")


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="PU NER")
    # data
    parser.add_argument('--dataset', default="conll2003")
    args = parser.parse_args()

    dp = DataPrepare(args.dataset)
    make_unlabeled_data(dp, "data/" + args.dataset + "/train.txt",
                        "data/eng.raw", args.dataset)
    print("Data set written successfully")
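# The file written above has one token per line in the form
# "word entity_label labeled_flag", with a blank line between sentences.
# For reference, a minimal sketch of a reader for that format (a hypothetical
# helper, not part of the repo):
def read_unlabeled_file(fname):
    sentences, sent = [], []
    with open(fname) as f:
        for line in f:
            line = line.strip()
            if not line:  # a blank line closes the current sentence
                if sent:
                    sentences.append(sent)
                    sent = []
                continue
            word, label, labeled = line.split(" ")
            sent.append((word, int(label), int(labeled)))
    if sent:
        sentences.append(sent)
    return sentences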
def main_feature():
    parser = argparse.ArgumentParser(description="PU NER")
    # data
    parser.add_argument('--lr', type=float, default=1e-4, help='learning rate')
    parser.add_argument('--beta', type=float, default=0.0,
                        help='beta of pu learning (default 0.0)')
    parser.add_argument('--gamma', type=float, default=1.0,
                        help='gamma of pu learning (default 1.0)')
    parser.add_argument('--drop_out', type=float, default=0.5, help='dropout rate')
    parser.add_argument('--m', type=float, default=0.3, help='class balance rate')
    parser.add_argument('--flag', default="Disease",
                        help='entity type (PER/LOC/ORG/MISC/Disease)')
    parser.add_argument('--dataset', default="disease", help='name of the dataset')
    parser.add_argument('--batch_size', type=int, default=100,
                        help='batch size for training and testing')
    parser.add_argument('--print_time', type=int, default=1,
                        help='epochs for printing result')
    parser.add_argument('--pert', type=float, default=1.0,
                        help='percentage of data used for training')
    parser.add_argument('--type', type=str, default='bnpu',
                        help='pu learning type (bnpu/bpu/upu)')
    parser.add_argument('--num', type=str, default='0', help='file number')
    args = parser.parse_args()

    dp = DataPrepare(args.dataset)
    mutils = FeaturedDetectionModelUtils(dp)

    trainSet, validSet, testSet, prior = mutils.load_dataset(
        args.flag, args.dataset, args.pert, args.num)
    f1_best_train = 0
    trainSize = len(trainSet)
    validSize = len(validSet)
    testSize = len(testSet)
    print("train set size: {}, valid set size: {}, test set size: {}".format(
        trainSize, validSize, testSize))

    charcnn = CharCNN(dp.char2Idx)
    wordnet = WordNet(dp.wordEmbeddings, dp.word2Idx)
    casenet = CaseNet(dp.caseEmbeddings, dp.case2Idx)
    featurenet = FeatureNet()
    pulstmcnn = PULSTMCNN(dp, charcnn, wordnet, casenet, featurenet, 150, 200, 1,
                          args.drop_out)

    if torch.cuda.is_available():
        charcnn.cuda()
        wordnet.cuda()
        casenet.cuda()
        featurenet.cuda()
        pulstmcnn.cuda()
        torch.cuda.manual_seed(1013)

    trainer = Trainer(pulstmcnn, prior, args.beta, args.gamma, args.lr, args.m)

    time = 0
    bar = ProgressBar(maxval=int((len(trainSet) - 1) / args.batch_size))

    train_sentences = dp.read_origin_file("data/" + args.dataset + "/train.txt")
    trainSize = int(len(train_sentences) * args.pert)
    train_sentences = train_sentences[:trainSize]
    train_words = []
    train_efs = []
    for s in train_sentences:
        temp = []
        temp2 = []
        for word, ef, lf in s:
            temp.append(word)
            temp2.append(ef)
        train_words.append(temp)
        train_efs.append(temp2)

    valid_sentences = dp.read_origin_file("data/" + args.dataset + "/valid.txt")
    valid_words = []
    valid_efs = []
    for s in valid_sentences:
        temp = []
        temp2 = []
        for word, ef, lf in s:
            temp.append(word)
            temp2.append(ef)
        valid_words.append(temp)
        valid_efs.append(temp2)

    test_sentences = dp.read_origin_file("data/" + args.dataset + "/test.txt")
    test_words = []
    test_efs = []
    for s in test_sentences:
        temp = []
        temp2 = []
        for word, ef, lf in s:
            temp.append(word)
            temp2.append(ef)
        test_words.append(temp)
        test_efs.append(temp2)

    for e in range(1, 1000):
        print("Epoch: {}".format(e))
        bar.start()
        risks = []
        prisks = []
        nrisks = []
        for step, (x_word_batch, x_case_batch, x_char_batch, x_feature_batch,
                   y_batch, flag_batch) in enumerate(
                       mutils.iterateSet(trainSet, batchSize=args.batch_size,
                                         mode="TRAIN")):
            bar.update(step)
            batch = [
                x_word_batch, x_case_batch, x_char_batch, x_feature_batch,
                y_batch, flag_batch
            ]
            acc, risk, prisk, nrisk = trainer.train_mini_batch(batch, args)
            risks.append(risk)
            prisks.append(prisk)
            nrisks.append(nrisk)
        meanRisk = np.mean(np.array(risks))
        meanRisk2 = np.mean(np.array(prisks))
        meanRisk3 = np.mean(np.array(nrisks))
        print("risk: {}, prisk: {}, nrisk: {}".format(meanRisk, meanRisk2,
                                                      meanRisk3))
        if e % 5 == 0:
            trainer.decay_learning_rate(e, args.lr)

        if e % args.print_time == 0:
            # train set
            pred_train = []
            corr_train = []
            for step, (x_word_train_batch, x_case_train_batch,
                       x_char_train_batch, x_feature_train_batch,
                       y_train_batch) in enumerate(
                           mutils.iterateSet(trainSet, batchSize=100,
                                             mode="TEST", shuffle=False)):
                trainBatch = [
                    x_word_train_batch, x_case_train_batch, x_char_train_batch,
                    x_feature_train_batch
                ]
                correcLabels = []
                for x in y_train_batch:
                    for xi in x:
                        correcLabels.append(xi)
                lengths = [len(x) for x in x_word_train_batch]
                predLabels, _ = trainer.test(trainBatch, lengths)
                correcLabels = np.array(correcLabels)
                assert len(predLabels) == len(correcLabels)

                start = 0
                for i, l in enumerate(lengths):
                    end = start + l
                    p = predLabels[start:end]
                    c = correcLabels[start:end]
                    pred_train.append(p)
                    corr_train.append(c)
                    start = end

            # prec, rec, f1 = dp.evaluate_ner_tagging(
            #     "data/" + args.dataset + "/train.txt", pred_train)

            newSentences = []
            for i, s in enumerate(train_words):
                sent = []
                assert len(s) == len(train_efs[i]) == len(pred_train[i])
                for j, item in enumerate(s):
                    sent.append([item, train_efs[i][j], pred_train[i][j]])
                newSentences.append(sent)

            newSentences_, newLabels, newPreds = dp.wordLevelGeneration(
                newSentences)
            print("newlabels : ", len(newLabels))
            print("newpreds : ", len(newPreds))
            print(newLabels[0])
            p_train, r_train, f1_train = dp.compute_precision_recall_f1(
                newLabels, newPreds, args.flag, 1)
            print("Precision: {}, Recall: {}, F1: {}".format(
                p_train, r_train, f1_train))
            print(trainer.bestResult)
            if f1_train > f1_best_train:
                f1_best_train = f1_train
                p_best_train = p_train
                r_best_train = r_train
                (p_train_partial, r_train_partial, f1_train_partial,
                 partial_entity_train,
                 partial_ratio_train) = dp.compute_precision_recall_f1_partial(
                     newLabels, newPreds, args.flag, 1, True)

            # valid set
            pred_valid = []
            corr_valid = []
            for step, (x_word_test_batch, x_case_test_batch, x_char_test_batch,
                       x_feature_test_batch, y_test_batch) in enumerate(
                           mutils.iterateSet(validSet, batchSize=100,
                                             mode="TEST", shuffle=False)):
                validBatch = [
                    x_word_test_batch, x_case_test_batch, x_char_test_batch,
                    x_feature_test_batch
                ]
                correcLabels = []
                for x in y_test_batch:
                    for xi in x:
                        correcLabels.append(xi)
                lengths = [len(x) for x in x_word_test_batch]
                predLabels, _ = trainer.test(validBatch, lengths)
                correcLabels = np.array(correcLabels)
                assert len(predLabels) == len(correcLabels)

                start = 0
                for i, l in enumerate(lengths):
                    end = start + l
                    p = predLabels[start:end]
                    c = correcLabels[start:end]
                    pred_valid.append(p)
                    corr_valid.append(c)
                    start = end

            newSentencesValid = []
            for i, s in enumerate(valid_words):
                sent = []
                assert len(s) == len(valid_efs[i]) == len(pred_valid[i])
                for j, item in enumerate(s):
                    sent.append([item, valid_efs[i][j], pred_valid[i][j]])
                newSentencesValid.append(sent)

            newSentencesValid_, newLabelsValid, newPredsValid = \
                dp.wordLevelGeneration(newSentencesValid)
            p_valid, r_valid, f1_valid = dp.compute_precision_recall_f1(
                newLabelsValid, newPredsValid, args.flag, 1)
            print("Precision: {}, Recall: {}, F1: {}".format(
                p_valid, r_valid, f1_valid))

            if f1_valid <= trainer.bestResult:
                time += 1
            else:
                trainer.bestResult = f1_valid
                time = 0
                best_precision_valid = p_valid
                best_rappel_valid = r_valid
                (p_valid_partial, r_valid_partial, f1_valid_partial,
                 partial_entity_valid,
                 partial_ratio_valid) = dp.compute_precision_recall_f1_partial(
                     newLabelsValid, newPredsValid, args.flag, 1, True)
                trainer.save((
                    "saved_model/{}_{}_{}_{}_lr_{}_prior_{}_beta_{}_gamma_{}_percent_{}"
                ).format(args.num, args.type, args.dataset, args.flag,
                         trainer.learningRate, trainer.m, trainer.beta,
                         trainer.gamma, args.pert))

            if time > 5:
                f_result_valid = open('result_val.txt', 'a')
                print("BEST RESULT ON VALIDATE DATA:{}".format(
                    trainer.bestResult), file=f_result_valid)
                break

    pulstmcnn.load_state_dict(
        torch.load(
            "saved_model/{}_{}_{}_{}_lr_{}_prior_{}_beta_{}_gamma_{}_percent_{}"
            .format(args.num, args.type, args.dataset, args.flag,
                    trainer.learningRate, trainer.m, trainer.beta,
                    trainer.gamma, args.pert)))

    pred_test = []
    corr_test = []
    for step, (x_word_test_batch, x_case_test_batch, x_char_test_batch,
               x_feature_test_batch, y_test_batch) in enumerate(
                   mutils.iterateSet(testSet, batchSize=100, mode="TEST",
                                     shuffle=False)):
        testBatch = [
            x_word_test_batch, x_case_test_batch, x_char_test_batch,
            x_feature_test_batch
        ]
        correcLabels = []
        for x in y_test_batch:
            for xi in x:
                correcLabels.append(xi)
        lengths = [len(x) for x in x_word_test_batch]
        predLabels, _ = trainer.test(testBatch, lengths)
        correcLabels = np.array(correcLabels)
        assert len(predLabels) == len(correcLabels)

        start = 0
        for i, l in enumerate(lengths):
            end = start + l
            p = predLabels[start:end]
            c = correcLabels[start:end]
            pred_test.append(p)
            corr_test.append(c)
            start = end

    newSentencesTest = []
    for i, s in enumerate(test_words):
        sent = []
        assert len(s) == len(test_efs[i]) == len(pred_test[i])
        for j, item in enumerate(s):
            sent.append([item, test_efs[i][j], pred_test[i][j]])
        newSentencesTest.append(sent)

    # note: the *Valid names below are reused to hold the test-set results
    newSentencesValid_, newLabelsValid, newPredsValid = dp.wordLevelGeneration(
        newSentencesTest)
    p_valid, r_valid, f1_valid = dp.compute_precision_recall_f1(
        newLabelsValid, newPredsValid, args.flag, 1)
    (p_test_partial, r_test_partial, f1_test_partial, partial_entity_test,
     partial_ratio_test) = dp.compute_precision_recall_f1_partial(
         newLabelsValid, newPredsValid, args.flag, 1, True)

    dutils = utils.dict_utils.DictUtils()
    inDico = get_dico(dp, dutils, "data/disease/test.txt", "fulldico.txt",
                      "Disease", "Train", "disease")
    (p_test_gene, r_test_gene, f1_test_gene, partial_test_gene,
     rpartial_test_gene, f1partial_test_gene,
     nb_notindico) = dp.compute_metrics_generalize(
         newLabelsValid, newPredsValid, args.flag, 1, inDico)

    result_file = "results/" + args.num + "_final_results.txt"
    with open(result_file, "a+") as fw:
        print(args.lr, args.beta, args.gamma, args.drop_out, args.m, file=fw)
        print(p_best_train, r_best_train, f1_best_train, file=fw)
        print(p_train_partial, r_train_partial, f1_train_partial,
              partial_entity_train, partial_ratio_train, file=fw)
        print(best_precision_valid, best_rappel_valid, trainer.bestResult,
              file=fw)
        print(p_valid_partial, r_valid_partial, f1_valid_partial,
              partial_entity_valid, partial_ratio_valid, file=fw)
        print(p_valid, r_valid, f1_valid, file=fw)
        print(p_test_partial, r_test_partial, f1_test_partial,
              partial_entity_test, partial_ratio_test, file=fw)
        print(p_test_gene, r_test_gene, f1_test_gene, file=fw)
        print(partial_test_gene, rpartial_test_gene, f1partial_test_gene,
              file=fw)
        print(nb_notindico, file=fw)
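# The risk/prisk/nrisk values logged during training come out of the PU risk
# estimator inside Trainer.train_mini_batch. For orientation, here is a
# minimal sketch of the standard non-negative PU risk (Kiryo et al., 2017)
# that the bnpu setting appears to correspond to; this function is
# illustrative only and is not the repo's actual implementation.
import torch


def nnpu_risk_sketch(scores, flags, prior, beta=0.0, gamma=1.0):
    """scores: model outputs in (0, 1); flags: 1 = labeled positive, 0 = unlabeled."""
    pos = scores[flags == 1]
    unl = scores[flags == 0]
    # sigmoid-style surrogate losses: l(s, +1) = 1 - s, l(s, -1) = s
    risk_p_pos = (1.0 - pos).mean()  # positives predicted positive
    risk_p_neg = pos.mean()          # positives predicted negative
    risk_u_neg = unl.mean()          # unlabeled predicted negative
    neg_risk = risk_u_neg - prior * risk_p_neg
    if neg_risk < -beta:
        # negative-risk correction: step against the overfitted negative part
        return -gamma * neg_risk
    return prior * risk_p_pos + neg_risk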
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="PU NER EVL")
    parser.add_argument('--dataset', default="conll2003")
    parser.add_argument('--type', default="bnpu")
    args = parser.parse_args()

    filenames = [
        'result/' + args.type + '_feature_pu_' + args.dataset + '_PER_0.txt',
        'result/' + args.type + '_feature_pu_' + args.dataset + '_LOC_0.txt',
        'result/' + args.type + '_feature_pu_' + args.dataset + '_ORG_0.txt',
        'result/' + args.type + '_feature_pu_' + args.dataset + '_MISC_0.txt'
    ]
    origin_file = "data/" + args.dataset + "/test.txt"

    dp = DataPrepare(args.dataset)
    test_sentences = dp.read_origin_file(origin_file)
    test_words = []
    test_efs = []
    lens = []
    for s in test_sentences:
        temp = []
        temp2 = []
        for word, ef, lf in s:
            temp.append(word)
            temp2.append(ef)
        test_words.append(temp)
        test_efs.append(temp2)
        lens.append(len(s))
from sub_model import CharCNN, CaseNet, WordNet
import numpy as np

torch.manual_seed(10)

parser = argparse.ArgumentParser(description="PU NER")
parser.add_argument('--model', default="")
parser.add_argument('--output', default=0)
parser.add_argument('--set', type=int, default=0)
parser.add_argument('--flag', default="PER")
parser.add_argument('--lr_rate', type=float, default=1e-4)
parser.add_argument('--dataset', default="conll2003")
parser.add_argument('--pert', type=float, default=1.0)
args = parser.parse_args()

dp = DataPrepare(args.dataset)
mutils = SupervisedModelUtils(dp)

trainSet, validSet, testSet = mutils.load_dataset(args.flag, args.dataset,
                                                  args.pert)
setIter = [testSet, validSet]
detectionSet = setIter[args.set]
fileNameIter = [
    "data/" + args.dataset + "/test.txt",
    "data/" + args.dataset + "/valid.txt"
]
fileName = fileNameIter[args.set]

charcnn = CharCNN(dp.char2Idx)
wordnet = WordNet(dp.wordEmbeddings, dp.word2Idx)
casenet = CaseNet(dp.caseEmbeddings, dp.case2Idx)
if __name__ == "__main__":
    from utils.add_unlabeled_data_model_utils import AddUnlabeledModelUtils
    from utils.data_utils import DataPrepare

    dp = DataPrepare("conll2003")
    temp = AddUnlabeledModelUtils(dp)
    temp.load_dataset_("PER", "conll2003", 10000)
    temp.load_dataset_("PER", "conll2003", 20000)
parser.add_argument('--gamma', type=float, default=1.0,
                    help='gamma of pu learning (default 1.0)')
parser.add_argument('--drop_out', type=float, default=0.5, help='dropout rate')
parser.add_argument('--m', type=float, default=1.5, help='class balance rate')
parser.add_argument('--p', type=float, default=0.1,
                    help='estimated value of the prior')
parser.add_argument('--flag', default="Disease",
                    help='entity type (PER/LOC/ORG/MISC/Disease)')
parser.add_argument('--dataset', default="disease", help='name of the dataset')
parser.add_argument('--lr', type=float, default=1e-4, help='learning rate')
parser.add_argument('--batch_size', type=int, default=100,
                    help='batch size for training and testing')
parser.add_argument('--output', default=0,
                    help='write the test result, set 1 for writing result to file')
parser.add_argument('--num', type=str, default='', help='file number')
parser.add_argument('--model', default="", help='saved model name')
parser.add_argument('--iter', type=int, default=1, help='iteration time')
args = parser.parse_args()

dp = DataPrepare(args.dataset)
mutils = AdaptivePUUtils(dp)
dutils = DictUtils()

trainSet, validSet, testSet, prior = mutils.load_new_dataset(
    args.flag, args.dataset, args.iter, args.p, args.num)
print(prior)
trainSize = len(trainSet)
validSize = len(validSet)
testSize = len(testSet)
print("train set size: {}, valid set size: {}, test set size: {}".format(
    trainSize, validSize, testSize))

charcnn = CharCNN(dp.char2Idx)
wordnet = WordNet(dp.wordEmbeddings, dp.word2Idx)
casenet = CaseNet(dp.caseEmbeddings, dp.case2Idx)
featurenet = FeatureNet()
pulstmcnn = AdaPULSTMCNN2(dp, charcnn, wordnet, casenet, featurenet, 150, 200,
                          1, args.drop_out)
        for word, ef, lf in sentence:
            alltokens += 1
            if ef == 1:
                labeledtokens += 1
            else:
                unlabeledtokens += 1
    prior = float(labeledtokens / alltokens)
    assert alltokens == labeledtokens + unlabeledtokens
    print(prior)


if __name__ == "__main__":
    dataset = "muc"
    flag = "PER"
    dp = DataPrepare(dataset)

    filename = "data/" + dataset + "/train." + flag + ".txt"
    sentences = dp.read_processed_file(filename, flag)
    compute_all_prior(sentences)

    flag = "LOC"
    filename = "data/" + dataset + "/train." + flag + ".txt"
    sentences = dp.read_processed_file(filename, flag)
    compute_all_prior(sentences)

    flag = "ORG"
    filename = "data/" + dataset + "/train." + flag + ".txt"
    sentences = dp.read_processed_file(filename, flag)
    compute_all_prior(sentences)

    # flag = "MISC"
    # filename = "data/" + dataset + "/train." + flag + ".txt"
    # sentences = dp.read_processed_file(filename, flag)
    # compute_all_prior(sentences)
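# A quick self-contained check of the same computation (toy sentences, not the
# real data): with 2 labeled tokens out of 5 in total, the printed prior is
# 2 / 5 = 0.4.
if __name__ == "__main__":
    toy_sentences = [[("EU", 1, 1), ("rejects", 0, 0), ("German", 1, 1)],
                     [("call", 0, 0), ("it", 0, 0)]]
    compute_all_prior(toy_sentences)  # prints 0.4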