def main():
    """Train an LSTM-CRF model: parse CLI options, read the three data splits,
    build the vocabularies and embedding table, then hand off to the trainer."""
    cmd_parser = argparse.ArgumentParser(description="LSTM CRF implementation")
    opt = parse_arguments(cmd_parser)
    conf = Config(opt)
    reader = Reader(conf.digit2zero)
    set_seed(opt, conf.seed)

    # Each split read is capped by its configured instance count.
    trains = reader.read_txt(conf.train_file, conf.train_num)
    devs = reader.read_txt(conf.dev_file, conf.dev_num)
    tests = reader.read_txt(conf.test_file, conf.test_num)

    if conf.context_emb != ContextEmb.none:
        # Pre-computed contextual vectors live next to each data file as
        # "<data_file>.<emb_name>.vec".
        print('Loading the ELMo vectors for all datasets.')
        vec_suffix = "." + conf.context_emb.name + ".vec"
        conf.context_emb_size = load_elmo_vec(conf.train_file + vec_suffix, trains)
        load_elmo_vec(conf.dev_file + vec_suffix, devs)
        load_elmo_vec(conf.test_file + vec_suffix, tests)

    # Label vocabulary is built from the training split only here; the word
    # vocabulary is built over all three splits.
    conf.build_label_idx(trains)
    conf.build_word_idx(trains, devs, tests)
    conf.build_emb_table()

    # Convert every instance of every split into id sequences.
    for split in (trains, devs, tests):
        conf.map_insts_ids(split)

    print("num chars: " + str(conf.num_char))
    print("num words: " + str(len(conf.word2idx)))
    train_model(conf, conf.num_epochs, trains, devs, tests)
def main():
    """Partial-annotation training entry point: load the data, remove a
    configured share of gold entities from the training set (per
    conf.entity_keep_ratio), mark "O" positions as to-be-predicted, and split
    the training data into folds before training."""
    cmd_parser = argparse.ArgumentParser(description="LSTM CRF implementation")
    opt = parse_arguments(cmd_parser)
    conf = Config(opt)
    reader = Reader(conf.digit2zero)
    set_seed(opt, conf.seed)

    trains = reader.read_txt(conf.train_file, conf.train_num)
    devs = reader.read_txt(conf.dev_file, conf.dev_num)
    tests = reader.read_txt(conf.test_file, conf.test_num)

    if conf.context_emb != ContextEmb.none:
        logging.info('[Data Info] Loading the ELMo vectors for all datasets.')
        vec_suffix = "." + conf.context_emb.name + ".vec"
        conf.context_emb_size = load_elmo_vec(conf.train_file + vec_suffix, trains)
        load_elmo_vec(conf.dev_file + vec_suffix, devs)
        load_elmo_vec(conf.test_file + vec_suffix, tests)

    # conf.use_iobes(trains + devs + tests)
    conf.build_label_idx(trains + devs + tests)
    conf.build_word_idx(trains, devs, tests)
    conf.build_emb_table()

    # Dev/test are mapped to ids now; train is mapped only after the entity
    # removal below so the removal operates on raw labels.
    conf.map_insts_ids(devs + tests)
    logging.info("[Data Info] num chars: " + str(conf.num_char))
    logging.info("[Data Info] num words: " + str(len(conf.word2idx)))
    logging.info(
        f"[Data Info] Removing {conf.entity_keep_ratio*100}% of entities from the training set"
    )
    logging.info("[Data Info] Removing the entities")
    # remove_entites returns the set of removed entities (debugging aid only).
    _ = remove_entites(trains, conf)
    # logging.info(f"entities removed: {span_set}")
    conf.map_insts_ids(trains)

    random.shuffle(trains)
    for inst in trains:
        inst.is_prediction = [False] * len(inst.input)
        if conf.variant == "soft":
            # Marginal log-scores start effectively at -inf everywhere.
            inst.marginals = np.full((len(inst.input), conf.label_size), -1e10)
        for pos, label in enumerate(inst.output):
            # Positions labelled "O" (conf.O) are treated as unannotated:
            # the model must fill them in during self-training.
            if label == conf.O:
                inst.is_prediction[pos] = True
                if conf.variant == "soft":
                    inst.marginals[pos, conf.label2idx[label]] = 0

    # Partition the (shuffled) training set into num_folds roughly equal chunks.
    num_insts_in_fold = math.ceil(len(trains) / conf.num_folds)
    trains = [
        trains[i * num_insts_in_fold:(i + 1) * num_insts_in_fold]
        for i in range(conf.num_folds)
    ]
    train_model(config=conf, train_insts=trains, dev_insts=devs, test_insts=tests)
def main():
    """Entry point supporting both CoNLL-format OntoNotes data and plain-text
    data, plus an optional typing-model mode whose dev/test sets come paired
    with pre-extracted spans."""
    cmd_parser = argparse.ArgumentParser(description="LSTM CRF implementation")
    opt = parse_arguments(cmd_parser)
    conf = Config(opt)
    reader = Reader(conf.digit2zero)
    set_seed(opt, conf.seed)

    if "ontonotes" in conf.train_file:
        # OntoNotes ships in CoNLL column format.
        trains = reader.read_conll(conf.train_file, conf.train_num)
        devs = reader.read_conll(conf.dev_file, conf.dev_num)
        tests = reader.read_conll(conf.test_file, conf.test_num)
    else:
        trains = reader.read_txt(conf.train_file, conf.train_num)
        if conf.typing_model:
            # Typing mode: dev/test are read together with extraction files.
            devs = reader.read_txt_with_extraction(conf.dev_file, conf.dev_extraction, conf.dev_num)
            tests = reader.read_txt_with_extraction(conf.test_file, conf.test_extraction, conf.test_num)
        else:
            devs = reader.read_txt(conf.dev_file, conf.dev_num)
            tests = reader.read_txt(conf.test_file, conf.test_num)

    if conf.context_emb != ContextEmb.none:
        print('Loading the ELMo vectors for all datasets.')
        vec_suffix = "." + conf.context_emb.name + ".vec"
        conf.context_emb_size = load_elmo_vec(conf.train_file + vec_suffix, trains)
        load_elmo_vec(conf.dev_file + vec_suffix, devs)
        load_elmo_vec(conf.test_file + vec_suffix, tests)

    conf.use_iobes(trains + devs + tests)
    conf.build_label_idx(trains + devs + tests)
    conf.build_word_idx(trains, devs, tests)
    conf.build_emb_table()
    conf.map_insts_ids(trains + devs + tests)

    if conf.typing_model:
        # Build a label-compatibility mapping, for example:
        #   {B-per: [B-per, B-org, B-misc], O: O, I-org: [I-per, I-org]}
        # It will be used when creating the mask.
        conf.typing_map = build_type_id_mapping(conf)

    print("num chars: " + str(conf.num_char))
    print("num words: " + str(len(conf.word2idx)))
    train_model(conf, conf.num_epochs, trains, devs, tests)
def main():
    """Entry point supporting two embedding pipelines: the classic
    word/char-vocabulary pipeline ("normal" embedder) or a pretrained
    transformer whose own tokenizer re-tokenizes every instance."""
    cmd_parser = argparse.ArgumentParser(description="LSTM CRF implementation")
    opt = parse_arguments(cmd_parser)
    conf = Config(opt)
    reader = Reader(conf.digit2zero)
    set_seed(opt, conf.seed)

    trains = reader.read_txt(conf.train_file, conf.train_num)
    devs = reader.read_txt(conf.dev_file, conf.dev_num)
    tests = reader.read_txt(conf.test_file, conf.test_num)

    if conf.static_context_emb != ContextEmb.none:
        # Static contextual vectors are stored as "<data_file>.<emb_name>.vec".
        print('Loading the static ELMo vectors for all datasets.')
        vec_suffix = "." + conf.static_context_emb.name + ".vec"
        conf.context_emb_size = load_elmo_vec(conf.train_file + vec_suffix, trains)
        load_elmo_vec(conf.dev_file + vec_suffix, devs)
        load_elmo_vec(conf.test_file + vec_suffix, tests)

    conf.use_iobes(trains + devs + tests)
    conf.build_label_idx(trains + devs + tests)

    if conf.embedder_type == "normal":
        # Classic pipeline: build vocabularies and a static embedding table,
        # then map each split to id sequences.
        conf.build_word_idx(trains, devs, tests)
        conf.build_emb_table()
        for split in (trains, devs, tests):
            conf.map_insts_ids(split)
        print("[Data Info] num chars: " + str(conf.num_char))
        print("[Data Info] num words: " + str(len(conf.word2idx)))
    else:
        # Pretrained transformer: use its pretrained tokenizer instead of
        # building our own vocabulary.
        print(
            colored(
                f"[Data Info] Tokenizing the instances using '{conf.embedder_type}' tokenizer",
                "red"))
        tokenize_instance(
            context_models[conf.embedder_type]["tokenizer"].from_pretrained(
                conf.embedder_type), trains + devs + tests, conf.label2idx)

    train_model(conf, conf.num_epochs, trains, devs, tests)
def main():
    """Cross-validated self-training entry point: "O"-labelled positions in
    the training data are flagged as to-be-predicted, and the training set is
    shuffled and partitioned into folds before training."""
    cmd_parser = argparse.ArgumentParser(description="LSTM CRF implementation")
    opt = parse_arguments(cmd_parser)
    conf = Config(opt)
    reader = Reader(conf.digit2zero)
    # NOTE(review): seeding is disabled here, so runs are not reproducible —
    # confirm this is intentional.
    #set_seed(opt, conf.seed)

    trains = reader.read_txt(conf.train_file, conf.train_num)
    devs = reader.read_txt(conf.dev_file, conf.dev_num)
    tests = reader.read_txt(conf.test_file, conf.test_num)

    if conf.context_emb != ContextEmb.none:
        print('[Data Info] Loading the ELMo vectors for all datasets.')
        vec_suffix = "." + conf.context_emb.name + ".vec"
        conf.context_emb_size = load_elmo_vec(conf.train_file + vec_suffix, trains)
        load_elmo_vec(conf.dev_file + vec_suffix, devs)
        load_elmo_vec(conf.test_file + vec_suffix, tests)

    conf.use_iobes(trains + devs + tests)
    conf.use_iobes_gold(trains)
    conf.build_label_idx(trains + devs + tests)
    conf.build_word_idx(trains, devs, tests)
    conf.build_emb_table()

    conf.map_insts_ids(devs + tests)
    print("[Data Info] num chars: " + str(conf.num_char))
    print("[Data Info] num words: " + str(len(conf.word2idx)))

    conf.map_insts_ids(trains)
    conf.get_gold_label_ids(trains)

    random.shuffle(trains)
    for inst in trains:
        # "O"-labelled (conf.O) positions are treated as unannotated and must
        # be predicted during self-training.
        inst.is_prediction = [False] * len(inst.input)
        for pos, label in enumerate(inst.output):
            if label == conf.O:
                inst.is_prediction[pos] = True

    # Partition the shuffled training set into num_folds roughly equal chunks.
    num_insts_in_fold = math.ceil(len(trains) / conf.num_folds)
    trains = [
        trains[i * num_insts_in_fold:(i + 1) * num_insts_in_fold]
        for i in range(conf.num_folds)
    ]
    train_model(config=conf, train_insts=trains, dev_insts=devs, test_insts=tests)
def main():
    """Entry point supporting category-filtered data reads and an optional
    multilingual-BERT (mbert) wordpiece pipeline."""
    cmd_parser = argparse.ArgumentParser(description="LSTM CRF implementation")
    opt = parse_arguments(cmd_parser)
    conf = Config(opt)
    reader = Reader(conf.digit2zero)
    set_seed(opt, conf.seed)

    # Every split is filtered by conf.category at read time.
    trains = reader.read_txt(conf.train_file, conf.train_num, conf.category)
    devs = reader.read_txt(conf.dev_file, conf.dev_num, conf.category)
    tests = reader.read_txt(conf.test_file, conf.test_num, conf.category)

    # ELMo-style vector files are only loaded for non-mbert contextual embeddings.
    if conf.context_emb not in [ContextEmb.none, ContextEmb.mbert]:
        print('Loading the ELMo vectors for all datasets.')
        vec_suffix = "." + conf.context_emb.name + ".vec"
        conf.context_emb_size = load_elmo_vec(conf.train_file + vec_suffix, trains)
        load_elmo_vec(conf.dev_file + vec_suffix, devs)
        load_elmo_vec(conf.test_file + vec_suffix, tests)

    for split in (trains, devs, tests):
        conf.use_iobes(split)
    conf.build_label_idx(trains + devs + tests)

    if conf.context_emb == ContextEmb.mbert:
        # Wordpiece pipeline: map tokens using the pretrained BERT vocabulary.
        from tokenizers import BertWordPieceTokenizer
        conf.bert_path = f'data/{conf.dataset}/distilbert-base-uncased'
        tokenizer = BertWordPieceTokenizer(f'{conf.bert_path}/vocab.txt', lowercase=True)
        for split in (trains, devs, tests):
            conf.map_tokens_ids(split, tokenizer)
    else:
        # Classic pipeline: build word/char vocabularies and map to ids.
        conf.build_word_idx(trains, devs, tests)
        conf.build_emb_table()
        for split in (trains, devs, tests):
            conf.map_insts_ids(split)

    # NOTE(review): these counts assume the word/char vocabularies exist; in
    # the mbert branch they may not have been built — confirm against Config.
    print("num chars: " + str(conf.num_char))
    print("num words: " + str(len(conf.word2idx)))
    train_model(conf, conf.num_epochs, trains, devs, tests)