def main():
    """Script entry point: read the three splits, build vocabularies, and train."""
    arg_parser = argparse.ArgumentParser(description="LSTM CRF implementation")
    options = parse_arguments(arg_parser)
    config = Config(options)

    data_reader = Reader(config.digit2zero)
    set_seed(options, config.seed)

    # Load train/dev/test from plain-text files.
    train_insts = data_reader.read_txt(config.train_file, config.train_num)
    dev_insts = data_reader.read_txt(config.dev_file, config.dev_num)
    test_insts = data_reader.read_txt(config.test_file, config.test_num)

    if config.context_emb != ContextEmb.none:
        print('Loading the ELMo vectors for all datasets.')
        # Pre-computed contextual vectors are stored next to each data file.
        config.context_emb_size = load_elmo_vec(
            f"{config.train_file}.{config.context_emb.name}.vec", train_insts)
        load_elmo_vec(f"{config.dev_file}.{config.context_emb.name}.vec", dev_insts)
        load_elmo_vec(f"{config.test_file}.{config.context_emb.name}.vec", test_insts)

    # Label vocabulary is built from the training split only in this variant.
    config.build_label_idx(train_insts)

    config.build_word_idx(train_insts, dev_insts, test_insts)
    config.build_emb_table()

    config.map_insts_ids(train_insts)
    config.map_insts_ids(dev_insts)
    config.map_insts_ids(test_insts)

    print(f"num chars: {config.num_char}")
    print(f"num words: {len(config.word2idx)}")
    train_model(config, config.num_epochs, train_insts, dev_insts, test_insts)
# Example #2
def main():
    """Entry point: load data, remove a portion of training entities, then train on folds."""
    arg_parser = argparse.ArgumentParser(description="LSTM CRF implementation")
    options = parse_arguments(arg_parser)
    config = Config(options)

    data_reader = Reader(config.digit2zero)
    set_seed(options, config.seed)

    train_insts = data_reader.read_txt(config.train_file, config.train_num)
    dev_insts = data_reader.read_txt(config.dev_file, config.dev_num)
    test_insts = data_reader.read_txt(config.test_file, config.test_num)

    if config.context_emb != ContextEmb.none:
        logging.info('[Data Info] Loading the ELMo vectors for all datasets.')
        config.context_emb_size = load_elmo_vec(
            f"{config.train_file}.{config.context_emb.name}.vec", train_insts)
        load_elmo_vec(f"{config.dev_file}.{config.context_emb.name}.vec", dev_insts)
        load_elmo_vec(f"{config.test_file}.{config.context_emb.name}.vec", test_insts)

    # conf.use_iobes(trains + devs + tests)
    config.build_label_idx(train_insts + dev_insts + test_insts)

    config.build_word_idx(train_insts, dev_insts, test_insts)
    config.build_emb_table()
    # Dev/test are mapped now; training data is mapped after entity removal below.
    config.map_insts_ids(dev_insts + test_insts)
    logging.info(f"[Data Info] num chars: {config.num_char}")
    logging.info(f"[Data Info] num words: {len(config.word2idx)}")

    # NOTE(review): if entity_keep_ratio is the fraction KEPT, the percentage
    # reported here may be inverted — confirm against remove_entites().
    logging.info(
        f"[Data Info] Removing {config.entity_keep_ratio*100}% of entities from the training set"
    )

    logging.info("[Data Info] Removing the entities")
    ## it will return the set of removed entities (for debug purpose)
    _ = remove_entites(train_insts, config)
    # logging.info(f"entities removed: {span_set}")
    config.map_insts_ids(train_insts)
    random.shuffle(train_insts)
    for inst in train_insts:
        inst.is_prediction = [False] * len(inst.input)
        if config.variant == "soft":
            # Initialize marginals to a very large negative log-score everywhere,
            # then zero out the observed label at each position.
            inst.marginals = np.full((len(inst.input), config.label_size), -1e10)
        for idx, tag in enumerate(inst.output):
            if tag == config.O:
                inst.is_prediction[idx] = True
            if config.variant == "soft":
                inst.marginals[idx, config.label2idx[tag]] = 0

    # Partition the shuffled training set into num_folds contiguous chunks.
    fold_size = math.ceil(len(train_insts) / config.num_folds)
    train_insts = [
        train_insts[i * fold_size:(i + 1) * fold_size]
        for i in range(config.num_folds)
    ]
    train_model(config=config,
                train_insts=train_insts,
                dev_insts=dev_insts,
                test_insts=test_insts)
# Example #3
def main():
    """Entry point: read data (CoNLL for OntoNotes, txt otherwise), build vocabs, train."""
    arg_parser = argparse.ArgumentParser(description="LSTM CRF implementation")
    options = parse_arguments(arg_parser)
    config = Config(options)

    data_reader = Reader(config.digit2zero)
    set_seed(options, config.seed)

    # OntoNotes ships in CoNLL format; everything else is read as plain text.
    if "ontonotes" in config.train_file:
        train_insts = data_reader.read_conll(config.train_file, config.train_num)
        dev_insts = data_reader.read_conll(config.dev_file, config.dev_num)
        test_insts = data_reader.read_conll(config.test_file, config.test_num)
    else:
        train_insts = data_reader.read_txt(config.train_file, config.train_num)
        if config.typing_model:
            # The typing model evaluates against externally extracted spans.
            dev_insts = data_reader.read_txt_with_extraction(
                config.dev_file, config.dev_extraction, config.dev_num)
            test_insts = data_reader.read_txt_with_extraction(
                config.test_file, config.test_extraction, config.test_num)
        else:
            dev_insts = data_reader.read_txt(config.dev_file, config.dev_num)
            test_insts = data_reader.read_txt(config.test_file, config.test_num)

    if config.context_emb != ContextEmb.none:
        print('Loading the ELMo vectors for all datasets.')
        config.context_emb_size = load_elmo_vec(
            f"{config.train_file}.{config.context_emb.name}.vec", train_insts)
        load_elmo_vec(f"{config.dev_file}.{config.context_emb.name}.vec", dev_insts)
        load_elmo_vec(f"{config.test_file}.{config.context_emb.name}.vec", test_insts)

    config.use_iobes(train_insts + dev_insts + test_insts)
    config.build_label_idx(train_insts + dev_insts + test_insts)

    config.build_word_idx(train_insts, dev_insts, test_insts)
    config.build_emb_table()

    config.map_insts_ids(train_insts + dev_insts + test_insts)

    if config.typing_model:
        # Mapping such as {B-per: [B-per, B-org, B-misc], O: O, I-org: [I-per, I-org]};
        # used later when constructing the label mask.
        config.typing_map = build_type_id_mapping(config)

    print(f"num chars: {config.num_char}")
    print(f"num words: {len(config.word2idx)}")
    train_model(config, config.num_epochs, train_insts, dev_insts, test_insts)
def main():
    """Entry point: supports a classic word embedder or a pretrained transformer tokenizer."""
    arg_parser = argparse.ArgumentParser(description="LSTM CRF implementation")
    options = parse_arguments(arg_parser)
    config = Config(options)

    data_reader = Reader(config.digit2zero)
    set_seed(options, config.seed)

    train_insts = data_reader.read_txt(config.train_file, config.train_num)
    dev_insts = data_reader.read_txt(config.dev_file, config.dev_num)
    test_insts = data_reader.read_txt(config.test_file, config.test_num)

    if config.static_context_emb != ContextEmb.none:
        print('Loading the static ELMo vectors for all datasets.')
        config.context_emb_size = load_elmo_vec(
            f"{config.train_file}.{config.static_context_emb.name}.vec",
            train_insts)
        load_elmo_vec(
            f"{config.dev_file}.{config.static_context_emb.name}.vec", dev_insts)
        load_elmo_vec(
            f"{config.test_file}.{config.static_context_emb.name}.vec", test_insts)

    config.use_iobes(train_insts + dev_insts + test_insts)
    config.build_label_idx(train_insts + dev_insts + test_insts)

    if config.embedder_type == "normal":
        # Classic path: build word/char vocabularies and an embedding table.
        config.build_word_idx(train_insts, dev_insts, test_insts)
        config.build_emb_table()

        config.map_insts_ids(train_insts)
        config.map_insts_ids(dev_insts)
        config.map_insts_ids(test_insts)
        print(f"[Data Info] num chars: {config.num_char}")
        print(f"[Data Info] num words: {len(config.word2idx)}")
    else:
        # A pretrained transformer embedder must use its own pretrained tokenizer.
        print(
            colored(
                f"[Data Info] Tokenizing the instances using '{config.embedder_type}' tokenizer",
                "red"))
        pretrained_tokenizer = context_models[
            config.embedder_type]["tokenizer"].from_pretrained(config.embedder_type)
        tokenize_instance(pretrained_tokenizer,
                          train_insts + dev_insts + test_insts,
                          config.label2idx)

    train_model(config, config.num_epochs, train_insts, dev_insts, test_insts)
def main():
    """Entry point: keep gold label ids aside and train over cross-validation folds."""
    arg_parser = argparse.ArgumentParser(description="LSTM CRF implementation")
    options = parse_arguments(arg_parser)
    config = Config(options)

    data_reader = Reader(config.digit2zero)
    # NOTE: seeding is deliberately disabled in this variant.
    #set_seed(opt, conf.seed)

    train_insts = data_reader.read_txt(config.train_file, config.train_num)
    dev_insts = data_reader.read_txt(config.dev_file, config.dev_num)
    test_insts = data_reader.read_txt(config.test_file, config.test_num)

    if config.context_emb != ContextEmb.none:
        print('[Data Info] Loading the ELMo vectors for all datasets.')
        config.context_emb_size = load_elmo_vec(
            f"{config.train_file}.{config.context_emb.name}.vec", train_insts)

        load_elmo_vec(f"{config.dev_file}.{config.context_emb.name}.vec", dev_insts)
        load_elmo_vec(f"{config.test_file}.{config.context_emb.name}.vec", test_insts)

    config.use_iobes(train_insts + dev_insts + test_insts)
    config.use_iobes_gold(train_insts)
    config.build_label_idx(train_insts + dev_insts + test_insts)

    config.build_word_idx(train_insts, dev_insts, test_insts)
    config.build_emb_table()
    config.map_insts_ids(dev_insts + test_insts)
    print(f"[Data Info] num chars: {config.num_char}")
    print(f"[Data Info] num words: {len(config.word2idx)}")

    config.map_insts_ids(train_insts)
    config.get_gold_label_ids(train_insts)
    random.shuffle(train_insts)

    # Mark O-labelled positions as "prediction" slots.
    for inst in train_insts:
        inst.is_prediction = [False] * len(inst.input)
        for idx, tag in enumerate(inst.output):
            if tag == config.O:
                inst.is_prediction[idx] = True

    # Split the shuffled training set into num_folds contiguous chunks.
    fold_size = math.ceil(len(train_insts) / config.num_folds)
    train_insts = [
        train_insts[i * fold_size:(i + 1) * fold_size]
        for i in range(config.num_folds)
    ]
    train_model(config=config,
                train_insts=train_insts,
                dev_insts=dev_insts,
                test_insts=test_insts)
# Example #6
def main():
    """Entry point: per-category reading plus optional mBERT wordpiece mapping."""
    arg_parser = argparse.ArgumentParser(description="LSTM CRF implementation")
    options = parse_arguments(arg_parser)
    config = Config(options)

    data_reader = Reader(config.digit2zero)
    set_seed(options, config.seed)

    train_insts = data_reader.read_txt(config.train_file, config.train_num,
                                       config.category)
    dev_insts = data_reader.read_txt(config.dev_file, config.dev_num,
                                     config.category)
    test_insts = data_reader.read_txt(config.test_file, config.test_num,
                                      config.category)

    # Static ELMo-style vectors are only loaded when not using none/mbert.
    if config.context_emb not in [ContextEmb.none, ContextEmb.mbert]:
        print('Loading the ELMo vectors for all datasets.')
        config.context_emb_size = load_elmo_vec(
            f"{config.train_file}.{config.context_emb.name}.vec", train_insts)
        load_elmo_vec(f"{config.dev_file}.{config.context_emb.name}.vec", dev_insts)
        load_elmo_vec(f"{config.test_file}.{config.context_emb.name}.vec", test_insts)

    config.use_iobes(train_insts)
    config.use_iobes(dev_insts)
    config.use_iobes(test_insts)
    config.build_label_idx(train_insts + dev_insts + test_insts)

    if config.context_emb == ContextEmb.mbert:
        # Local import: only needed (and installed) on the mBERT path.
        from tokenizers import BertWordPieceTokenizer

        config.bert_path = f'data/{config.dataset}/distilbert-base-uncased'
        wordpiece_tokenizer = BertWordPieceTokenizer(
            f'{config.bert_path}/vocab.txt', lowercase=True)
        config.map_tokens_ids(train_insts, wordpiece_tokenizer)
        config.map_tokens_ids(dev_insts, wordpiece_tokenizer)
        config.map_tokens_ids(test_insts, wordpiece_tokenizer)
    else:
        config.build_word_idx(train_insts, dev_insts, test_insts)
        config.build_emb_table()

        config.map_insts_ids(train_insts)
        config.map_insts_ids(dev_insts)
        config.map_insts_ids(test_insts)

        print(f"num chars: {config.num_char}")
        print(f"num words: {len(config.word2idx)}")
    train_model(config, config.num_epochs, train_insts, dev_insts, test_insts)