def main():
    parser = argparse.ArgumentParser(description="LSTM CRF implementation")
    opt = parse_arguments(parser)
    conf = Config(opt)
    reader = Reader(conf.digit2zero)
    set_seed(opt, conf.seed)

    trains = reader.read_txt(conf.train_file, conf.train_num)
    devs = reader.read_txt(conf.dev_file, conf.dev_num)
    tests = reader.read_txt(conf.test_file, conf.test_num)

    if conf.context_emb != ContextEmb.none:
        print('Loading the ELMo vectors for all datasets.')
        conf.context_emb_size = load_elmo_vec(conf.train_file + "." + conf.context_emb.name + ".vec", trains)
        load_elmo_vec(conf.dev_file + "." + conf.context_emb.name + ".vec", devs)
        load_elmo_vec(conf.test_file + "." + conf.context_emb.name + ".vec", tests)

    conf.build_label_idx(trains)
    conf.build_word_idx(trains, devs, tests)
    conf.build_emb_table()
    conf.map_insts_ids(trains)
    conf.map_insts_ids(devs)
    conf.map_insts_ids(tests)

    print("num chars: " + str(conf.num_char))
    print("num words: " + str(len(conf.word2idx)))

    train_model(conf, conf.num_epochs, trains, devs, tests)

def main():
    parser = argparse.ArgumentParser()
    opt = parse_arguments(parser)
    conf = Config(opt)
    reader = Reader(conf.digit2zero)

    dataset, max_length, label_length = reader.read_trigger_txt(conf.trigger_file, -1)
    reader.merge_labels(dataset)
    trains = reader.read_txt(conf.train_all_file, conf.train_num)
    devs = reader.read_txt(conf.dev_file, conf.dev_num)
    tests = reader.read_txt(conf.test_file, conf.test_num)
    print(len(dataset))

    if conf.context_emb == ContextEmb.bert:
        print('Loading the BERT vectors for all datasets.')
        conf.context_emb_size = load_bert_vec(conf.trigger_file + "." + conf.context_emb.name + ".vec", dataset)

    # setting for data
    conf.use_iobes(trains)
    conf.use_iobes(dataset)
    conf.use_iobes(devs)
    conf.use_iobes(tests)
    conf.optimizer = opt.trig_optimizer
    conf.build_label_idx(dataset)
    conf.build_word_idx(trains, devs, tests)
    conf.build_emb_table()
    conf.map_insts_ids(dataset)
    conf.map_insts_ids(trains)
    conf.map_insts_ids(devs)
    conf.map_insts_ids(tests)

    dataset = reader.trigger_percentage(dataset, conf.percentage)
    encoder = SoftMatcher(conf, label_length)
    trainer = SoftMatcherTrainer(encoder, conf, devs, tests)

    # matching module training
    random.shuffle(dataset)
    trainer.train_model(conf.num_epochs_soft, dataset)
    # returns all the trigger vectors, trigger types, and string names of the triggers
    logits, predicted, triggers = trainer.get_triggervec(dataset)
    triggers_remove = remove_duplicates(logits, predicted, triggers, dataset)

    numbers = int(len(trains) * (1 - opt.unlabeled_percentage))
    print("number of train instances : ", numbers)
    initial_trains = trains[:numbers]
    unlabeled_x = trains[numbers:]
    for data in unlabeled_x:
        data.output_ids = None

    # sequence labeling module self-training
    random.shuffle(dataset)
    inference = SoftSequence(conf, encoder)
    sequence_trainer = SoftSequenceTrainer(inference, conf, devs, tests, triggers_remove)
    sequence_trainer.self_training(conf.num_epochs, dataset, unlabeled_x)

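# Illustrative aside (toy numbers, not part of the pipeline above): the split in
# main() keeps the first (1 - unlabeled_percentage) fraction of the training set
# as labeled data and discards the gold label ids of the rest, which the
# self-training loop later fills in with predictions. A minimal sketch:
insts = list(range(100))        # stand-in for 100 training instances
unlabeled_percentage = 0.75     # hypothetical value of opt.unlabeled_percentage
numbers = int(len(insts) * (1 - unlabeled_percentage))
labeled, unlabeled = insts[:numbers], insts[numbers:]
print(len(labeled), len(unlabeled))  # 25 75
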
def main():
    parser = argparse.ArgumentParser(description="LSTM CRF implementation")
    opt = parse_arguments(parser)
    conf = Config(opt)
    reader = Reader(conf.digit2zero)
    set_seed(opt, conf.seed)

    trains = reader.read_txt(conf.train_file, conf.train_num)
    devs = reader.read_txt(conf.dev_file, conf.dev_num)
    tests = reader.read_txt(conf.test_file, conf.test_num)

    if conf.context_emb != ContextEmb.none:
        logging.info('[Data Info] Loading the ELMo vectors for all datasets.')
        conf.context_emb_size = load_elmo_vec(conf.train_file + "." + conf.context_emb.name + ".vec", trains)
        load_elmo_vec(conf.dev_file + "." + conf.context_emb.name + ".vec", devs)
        load_elmo_vec(conf.test_file + "." + conf.context_emb.name + ".vec", tests)

    # conf.use_iobes(trains + devs + tests)
    conf.build_label_idx(trains + devs + tests)
    conf.build_word_idx(trains, devs, tests)
    conf.build_emb_table()
    conf.map_insts_ids(devs + tests)

    logging.info("[Data Info] num chars: " + str(conf.num_char))
    logging.info("[Data Info] num words: " + str(len(conf.word2idx)))

    logging.info(f"[Data Info] Keeping {conf.entity_keep_ratio * 100}% of entities in the training set")
    logging.info("[Data Info] Removing the entities")
    # remove_entites returns the set of removed entities (for debugging purposes)
    _ = remove_entites(trains, conf)
    # logging.info(f"entities removed: {span_set}")
    conf.map_insts_ids(trains)
    random.shuffle(trains)

    # O-labeled positions are treated as unannotated: their labels may be re-estimated
    for inst in trains:
        inst.is_prediction = [False] * len(inst.input)
        if conf.variant == "soft":
            inst.marginals = np.full((len(inst.input), conf.label_size), -1e10)
        for pos, label in enumerate(inst.output):
            if label == conf.O:
                inst.is_prediction[pos] = True
                if conf.variant == "soft":
                    inst.marginals[pos, conf.label2idx[label]] = 0

    # split the training data into num_folds chunks for cross-fold labeling
    num_insts_in_fold = math.ceil(len(trains) / conf.num_folds)
    trains = [trains[i * num_insts_in_fold:(i + 1) * num_insts_in_fold] for i in range(conf.num_folds)]

    train_model(config=conf, train_insts=trains, dev_insts=devs, test_insts=tests)

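# Illustrative aside (self-contained, hypothetical sizes): in the "soft" variant
# above, each row of inst.marginals holds log-probabilities. Filling a row with
# -1e10 and setting the O column to 0 means a softmax over that row places
# essentially all probability mass on O, i.e. the position is softly labeled O.
import numpy as np

label_size = 5                     # hypothetical number of labels
o_idx = 0                          # hypothetical index of the O label
row = np.full(label_size, -1e10)
row[o_idx] = 0.0
probs = np.exp(row - row.max())    # numerically stable softmax
probs /= probs.sum()
print(probs[o_idx])                # ~1.0; all other entries are ~0.0
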
def main():
    parser = argparse.ArgumentParser(description="LSTM CRF implementation")
    opt = parse_arguments(parser)
    conf = Config(opt)
    reader = Reader(conf.digit2zero)
    set_seed(opt, conf.seed)

    if "ontonotes" in conf.train_file:
        trains = reader.read_conll(conf.train_file, conf.train_num)
        devs = reader.read_conll(conf.dev_file, conf.dev_num)
        tests = reader.read_conll(conf.test_file, conf.test_num)
    else:
        trains = reader.read_txt(conf.train_file, conf.train_num)
        if conf.typing_model:
            devs = reader.read_txt_with_extraction(conf.dev_file, conf.dev_extraction, conf.dev_num)
            tests = reader.read_txt_with_extraction(conf.test_file, conf.test_extraction, conf.test_num)
        else:
            devs = reader.read_txt(conf.dev_file, conf.dev_num)
            tests = reader.read_txt(conf.test_file, conf.test_num)

    if conf.context_emb != ContextEmb.none:
        print('Loading the ELMo vectors for all datasets.')
        conf.context_emb_size = load_elmo_vec(conf.train_file + "." + conf.context_emb.name + ".vec", trains)
        load_elmo_vec(conf.dev_file + "." + conf.context_emb.name + ".vec", devs)
        load_elmo_vec(conf.test_file + "." + conf.context_emb.name + ".vec", tests)

    conf.use_iobes(trains + devs + tests)
    conf.build_label_idx(trains + devs + tests)
    conf.build_word_idx(trains, devs, tests)
    conf.build_emb_table()
    conf.map_insts_ids(trains + devs + tests)

    if conf.typing_model:
        """
        Build the label mapping, for example:
            {B-per: [B-per, B-org, B-misc], O: O, I-org: [I-per, I-org]}
        It will be used when creating the mask.
        """
        conf.typing_map = build_type_id_mapping(conf)

    print("num chars: " + str(conf.num_char))
    print("num words: " + str(len(conf.word2idx)))

    train_model(conf, conf.num_epochs, trains, devs, tests)

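# Illustrative aside: a minimal sketch of the kind of mapping described in the
# docstring above, assuming IOBES-style labels. The function name and body below
# are hypothetical; the real build_type_id_mapping is defined elsewhere in the
# repo and operates on conf directly.
def build_type_id_mapping_sketch(label2idx):
    # group labels by their position prefix (B-, I-, E-, S-), ignoring the entity
    # type, so the typing model can mask logits to labels sharing the gold prefix
    mapping = {}
    for label in label2idx:
        if label == "O":
            mapping[label] = ["O"]
            continue
        prefix = label.split("-")[0]  # e.g. "B" from "B-per"
        mapping[label] = [l for l in label2idx if l.startswith(prefix + "-")]
    return mapping

# e.g. build_type_id_mapping_sketch({"B-per": 0, "B-org": 1, "I-org": 2, "O": 3})
# -> {"B-per": ["B-per", "B-org"], "B-org": ["B-per", "B-org"], "I-org": ["I-org"], "O": ["O"]}
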
def main():
    parser = argparse.ArgumentParser(description="LSTM CRF implementation")
    opt = parse_arguments(parser)
    conf = Config(opt)
    reader = Reader(conf.digit2zero)
    set_seed(opt, conf.seed)

    trains = reader.read_txt(conf.train_file, conf.train_num)
    devs = reader.read_txt(conf.dev_file, conf.dev_num)
    tests = reader.read_txt(conf.test_file, conf.test_num)

    if conf.static_context_emb != ContextEmb.none:
        print('Loading the static ELMo vectors for all datasets.')
        conf.context_emb_size = load_elmo_vec(conf.train_file + "." + conf.static_context_emb.name + ".vec", trains)
        load_elmo_vec(conf.dev_file + "." + conf.static_context_emb.name + ".vec", devs)
        load_elmo_vec(conf.test_file + "." + conf.static_context_emb.name + ".vec", tests)

    conf.use_iobes(trains + devs + tests)
    conf.build_label_idx(trains + devs + tests)

    if conf.embedder_type == "normal":
        conf.build_word_idx(trains, devs, tests)
        conf.build_emb_table()
        conf.map_insts_ids(trains)
        conf.map_insts_ids(devs)
        conf.map_insts_ids(tests)
        print("[Data Info] num chars: " + str(conf.num_char))
        print("[Data Info] num words: " + str(len(conf.word2idx)))
    else:
        # When using a pretrained model from transformers, we must use the matching pretrained tokenizer.
        print(colored(f"[Data Info] Tokenizing the instances using '{conf.embedder_type}' tokenizer", "red"))
        tokenize_instance(
            context_models[conf.embedder_type]["tokenizer"].from_pretrained(conf.embedder_type),
            trains + devs + tests, conf.label2idx)

    train_model(conf, conf.num_epochs, trains, devs, tests)

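# Illustrative aside: context_models (used above) is assumed to map an
# embedder-type string to its transformers model and tokenizer classes. The
# sketch below is hypothetical and only shows the shape of the table; the real
# definition lives elsewhere in the repo.
from transformers import BertModel, BertTokenizer, RobertaModel, RobertaTokenizer

context_models = {
    "bert-base-cased": {"model": BertModel, "tokenizer": BertTokenizer},
    "roberta-base": {"model": RobertaModel, "tokenizer": RobertaTokenizer},
}

# usage mirroring the call in the else-branch above:
# tokenizer = context_models["bert-base-cased"]["tokenizer"].from_pretrained("bert-base-cased")
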
def main():
    parser = argparse.ArgumentParser(description="LSTM CRF implementation")
    opt = parse_arguments(parser)
    conf = Config(opt)
    reader = Reader(conf.digit2zero)
    # set_seed(opt, conf.seed)

    trains = reader.read_txt(conf.train_file, conf.train_num)
    devs = reader.read_txt(conf.dev_file, conf.dev_num)
    tests = reader.read_txt(conf.test_file, conf.test_num)

    if conf.context_emb != ContextEmb.none:
        print('[Data Info] Loading the ELMo vectors for all datasets.')
        conf.context_emb_size = load_elmo_vec(conf.train_file + "." + conf.context_emb.name + ".vec", trains)
        load_elmo_vec(conf.dev_file + "." + conf.context_emb.name + ".vec", devs)
        load_elmo_vec(conf.test_file + "." + conf.context_emb.name + ".vec", tests)

    conf.use_iobes(trains + devs + tests)
    conf.use_iobes_gold(trains)
    conf.build_label_idx(trains + devs + tests)
    conf.build_word_idx(trains, devs, tests)
    conf.build_emb_table()
    conf.map_insts_ids(devs + tests)

    print("[Data Info] num chars: " + str(conf.num_char))
    print("[Data Info] num words: " + str(len(conf.word2idx)))

    conf.map_insts_ids(trains)
    conf.get_gold_label_ids(trains)
    random.shuffle(trains)

    # set the prediction flag; if is_prediction is False, we will not update this label
    for inst in trains:
        inst.is_prediction = [False] * len(inst.input)
        for pos, label in enumerate(inst.output):
            if label == conf.O:
                inst.is_prediction[pos] = True

    # split the training data into num_folds chunks for cross-fold labeling
    num_insts_in_fold = math.ceil(len(trains) / conf.num_folds)
    trains = [trains[i * num_insts_in_fold:(i + 1) * num_insts_in_fold] for i in range(conf.num_folds)]

    train_model(config=conf, train_insts=trains, dev_insts=devs, test_insts=tests)

def main():
    parser = argparse.ArgumentParser(description="LSTM CRF implementation")
    opt = parse_arguments(parser)
    conf = Config(opt)
    reader = Reader(conf.digit2zero)
    set_seed(opt, conf.seed)

    trains = reader.read_txt(conf.train_file, conf.train_num, conf.category)
    devs = reader.read_txt(conf.dev_file, conf.dev_num, conf.category)
    tests = reader.read_txt(conf.test_file, conf.test_num, conf.category)

    if conf.context_emb not in [ContextEmb.none, ContextEmb.mbert]:
        print('Loading the ELMo vectors for all datasets.')
        conf.context_emb_size = load_elmo_vec(conf.train_file + "." + conf.context_emb.name + ".vec", trains)
        load_elmo_vec(conf.dev_file + "." + conf.context_emb.name + ".vec", devs)
        load_elmo_vec(conf.test_file + "." + conf.context_emb.name + ".vec", tests)

    conf.use_iobes(trains)
    conf.use_iobes(devs)
    conf.use_iobes(tests)
    conf.build_label_idx(trains + devs + tests)

    if conf.context_emb == ContextEmb.mbert:
        from tokenizers import BertWordPieceTokenizer
        conf.bert_path = f'data/{conf.dataset}/distilbert-base-uncased'
        tokenizer = BertWordPieceTokenizer(f'{conf.bert_path}/vocab.txt', lowercase=True)
        conf.map_tokens_ids(trains, tokenizer)
        conf.map_tokens_ids(devs, tokenizer)
        conf.map_tokens_ids(tests, tokenizer)
    else:
        conf.build_word_idx(trains, devs, tests)
        conf.build_emb_table()
        conf.map_insts_ids(trains)
        conf.map_insts_ids(devs)
        conf.map_insts_ids(tests)
        print("num chars: " + str(conf.num_char))
        print("num words: " + str(len(conf.word2idx)))

    train_model(conf, conf.num_epochs, trains, devs, tests)

def main():
    parser = argparse.ArgumentParser(description="LSTM CRF implementation")
    opt = parse_arguments(parser)
    conf = Config(opt)
    reader = Reader(conf.digit2zero)
    setSeed(opt, conf.seed)

    trains = reader.read_txt(conf.train_file, conf.train_num, True)
    devs = reader.read_txt(conf.dev_file, conf.dev_num, False)
    tests = reader.read_txt(conf.test_file, conf.test_num, False)

    if conf.context_emb != ContextEmb.none:
        print('Loading the ELMo vectors for all datasets.')
        conf.context_emb_size = reader.load_elmo_vec(conf.train_file + "." + conf.context_emb.name + ".vec", trains)
        reader.load_elmo_vec(conf.dev_file + "." + conf.context_emb.name + ".vec", devs)
        reader.load_elmo_vec(conf.test_file + "." + conf.context_emb.name + ".vec", tests)

    conf.use_iobes(trains)
    conf.use_iobes(devs)
    conf.use_iobes(tests)
    conf.build_label_idx(trains)
    conf.build_word_idx(trains, devs, tests)
    conf.build_emb_table()

    ids_train = conf.map_insts_ids(trains)
    ids_dev = conf.map_insts_ids(devs)
    ids_test = conf.map_insts_ids(tests)

    print("num chars: " + str(conf.num_char))
    print("num words: " + str(len(conf.word2idx)))

    if opt.mode == "train":
        learn_from_insts(conf, conf.num_epochs, trains, devs, tests)
    else:
        # load the trained model and evaluate it
        test_model(conf, tests)
    print(opt.mode)

def main():
    logging.info("Transformer implementation")
    parser = argparse.ArgumentParser(description="Transformer CRF implementation")
    opt = parse_arguments_t(parser)
    conf = Config(opt)
    conf.train_file = conf.dataset + "/train.txt"
    conf.dev_file = conf.dataset + "/valid.txt"
    os.environ['CUDA_VISIBLE_DEVICES'] = opt.device_num

    # data reader
    reader = Reader(conf.digit2zero)
    set_seed(opt, conf.seed)

    # set up the logger
    utils.set_logger(os.path.join(conf.model_folder, 'train.log'))

    # log the parameters
    for k in opt.__dict__:
        logging.info(k + ": " + str(opt.__dict__[k]))

    # read the train/dev sets
    logging.info("\n")
    logging.info("Loading the datasets...")
    trains = reader.read_txt(conf.train_file, conf.train_num)
    devs = reader.read_txt(conf.dev_file, conf.dev_num)

    # build label2idx and idx2label
    logging.info("Building label idx ...")
    conf.build_label_idx(trains + devs)

    random.shuffle(trains)
    # set the prediction flag; if is_prediction is False, we will not update this label
    for inst in trains:
        inst.is_prediction = [False] * len(inst.input)
        for pos, label in enumerate(inst.output):
            if label == conf.O:
                inst.is_prediction[pos] = True

    # divide the data into num_folds parts (num_folds defaults to 2)
    num_insts_in_fold = math.ceil(len(trains) / conf.num_folds)
    trains = [trains[i * num_insts_in_fold:(i + 1) * num_insts_in_fold] for i in range(conf.num_folds)]

    train_model(config=conf, train_insts=trains, dev_insts=devs)

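# Illustrative aside (self-contained toy check, not part of the pipeline above):
# the fold split used in main() is plain list chunking, e.g. with 10 instances
# and 2 folds:
import math

toy = list(range(10))
num_folds = 2
fold_size = math.ceil(len(toy) / num_folds)
folds = [toy[i * fold_size:(i + 1) * fold_size] for i in range(num_folds)]
print(folds)  # [[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]]
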
    return args


parser = argparse.ArgumentParser()
opt = parse_arguments(parser)
conf = Config(opt)
reader = Reader(conf.digit2zero)

# train_file = 'data/annotated_ner_data/StackOverflow/train.txt'
# dev_file = 'data/annotated_ner_data/StackOverflow/dev.txt'
# test_file = 'data/annotated_ner_data/StackOverflow/test.txt'
# dataset = reader.read_txt(train_file, -1)
# devs = reader.read_txt(dev_file, -1)
# tests = reader.read_txt(test_file, -1)

dataset = reader.read_txt(conf.train_all_file, -1)
# devs = reader.read_txt(conf.dev_file, -1)
tests = reader.read_txt(conf.test_file, -1)
print(len(dataset))

# setting for data
conf.use_iobes(dataset)
# conf.use_iobes(devs)
conf.use_iobes(tests)
conf.build_label_idx(dataset)
conf.build_word_idx(dataset, None, tests)
conf.build_emb_table()
conf.map_insts_ids(dataset)
# conf.map_insts_ids(devs)

    args = parser.parse_args()
    for k in args.__dict__:
        print(k + ": " + str(args.__dict__[k]))
    return args


parser = argparse.ArgumentParser()
opt = parse_arguments(parser)
conf = Config(opt)
reader = Reader(conf.digit2zero)

dataset, max_length, label_length = reader.read_trigger_txt(conf.trigger_file, -1)
reader.merge_labels(dataset)
devs = reader.read_txt(conf.dev_file, conf.dev_num)
tests = reader.read_txt(conf.test_file, conf.test_num)
print(len(dataset))

if conf.context_emb == ContextEmb.bert:
    print('Loading the BERT vectors for all datasets.')
    conf.context_emb_size = load_bert_vec(conf.trigger_file + "." + conf.context_emb.name + ".vec", dataset)

# setting for data
conf.use_iobes(dataset)
conf.use_iobes(devs)
conf.use_iobes(tests)
conf.optimizer = opt.trig_optimizer
conf.build_label_idx(dataset)
conf.build_word_idx(dataset, devs, tests)

    parser.add_argument('--percentage', type=int, default=100,
                        help="how much percentage of training dataset to use")
    args = parser.parse_args()
    for k in args.__dict__:
        print(k + ": " + str(args.__dict__[k]))
    return args


parser = argparse.ArgumentParser()
opt = parse_arguments(parser)
conf = Config(opt)
reader = Reader(conf.digit2zero)

dataset = reader.read_txt(conf.train_file, conf.train_num)
devs = reader.read_txt(conf.dev_file, conf.dev_num)
tests = reader.read_txt(conf.test_file, conf.test_num)
print(len(dataset))

if conf.context_emb == ContextEmb.bert:
    print('Loading the BERT vectors for all datasets.')
    conf.context_emb_size = load_bert_vec(conf.trigger_file + "." + conf.context_emb.name + ".vec", dataset)

# setting for data
conf.use_iobes(dataset)
conf.use_iobes(devs)
conf.use_iobes(tests)
conf.optimizer = opt.trig_optimizer
conf.build_label_idx(dataset)