Ejemplo n.º 1
0
def main():
    """Read the datasets, prepare the config, and launch training.

    Steps, in order: parse the command-line options into a ``Config``, read
    the train/dev/test files, optionally load contextual vectors from
    ``<data file>.<context_emb name>.vec``, build the label index (from the
    training set only), the word index and the embedding table, map all
    instances to ids, and call ``train_model``.
    """
    arg_parser = argparse.ArgumentParser(description="LSTM CRF implementation")
    opt = parse_arguments(arg_parser)
    conf = Config(opt)

    data_reader = Reader(conf.digit2zero)
    set_seed(opt, conf.seed)

    trains = data_reader.read_txt(conf.train_file, conf.train_num)
    devs = data_reader.read_txt(conf.dev_file, conf.dev_num)
    tests = data_reader.read_txt(conf.test_file, conf.test_num)

    if conf.context_emb != ContextEmb.none:
        print('Loading the ELMo vectors for all datasets.')
        vec_suffix = "." + conf.context_emb.name + ".vec"
        # Only the training-set call's return value is kept as the size.
        conf.context_emb_size = load_elmo_vec(conf.train_file + vec_suffix, trains)
        for data_file, insts in ((conf.dev_file, devs), (conf.test_file, tests)):
            load_elmo_vec(data_file + vec_suffix, insts)

    conf.build_label_idx(trains)

    conf.build_word_idx(trains, devs, tests)
    conf.build_emb_table()

    for insts in (trains, devs, tests):
        conf.map_insts_ids(insts)

    print(f"num chars: {conf.num_char}")
    print(f"num words: {len(conf.word2idx)}")
    train_model(conf, conf.num_epochs, trains, devs, tests)
Ejemplo n.º 2
0
def main():
    """Train the trigger matching module, then self-train the sequence labeler.

    The trigger dataset is read from ``conf.trigger_file``; the regular
    train/dev/test splits are read with ``read_txt``. After index building,
    a ``SoftMatcher`` is trained on the (shuffled) trigger dataset, its
    trigger vectors are de-duplicated, and a ``SoftSequence`` model is
    self-trained on the trigger dataset plus the tail of the training set
    whose ``output_ids`` have been cleared.
    """
    parser = argparse.ArgumentParser()
    opt = parse_arguments(parser)
    conf = Config(opt)
    reader = Reader(conf.digit2zero)
    # The second returned value (a max length) is not needed by this script.
    dataset, _, label_length = reader.read_trigger_txt(conf.trigger_file, -1)

    reader.merge_labels(dataset)

    trains = reader.read_txt(conf.train_all_file, conf.train_num)
    devs = reader.read_txt(conf.dev_file, conf.dev_num)
    tests = reader.read_txt(conf.test_file, conf.test_num)
    print(len(dataset))
    if conf.context_emb == ContextEmb.bert:
        print('Loading the BERT vectors for all datasets.')
        conf.context_emb_size = load_bert_vec(
            conf.trigger_file + "." + conf.context_emb.name + ".vec", dataset)

    # setting for data
    for insts in (trains, dataset, devs, tests):
        conf.use_iobes(insts)

    conf.optimizer = opt.trig_optimizer
    # Labels come from the trigger dataset; words from the regular splits.
    conf.build_label_idx(dataset)
    conf.build_word_idx(trains, devs, tests)
    conf.build_emb_table()
    for insts in (dataset, trains, devs, tests):
        conf.map_insts_ids(insts)

    dataset = reader.trigger_percentage(dataset, conf.percentage)
    encoder = SoftMatcher(conf, label_length)
    trainer = SoftMatcherTrainer(encoder, conf, devs, tests)

    # matching module training
    random.shuffle(dataset)
    trainer.train_model(conf.num_epochs_soft, dataset)
    # all the trigger vectors, trigger type, string name of the trigger
    logits, predicted, triggers = trainer.get_triggervec(dataset)
    triggers_remove = remove_duplicates(logits, predicted, triggers, dataset)

    numbers = int(len(trains) * (1 - opt.unlabeled_percentage))
    print("number of train instances : ", numbers)
    # NOTE(review): the first `numbers` training instances (the labeled
    # portion) are not passed to any trainer below; only the remainder is
    # used, as unlabeled data. Confirm this is intended.
    unlabeled_x = trains[numbers:]

    # Clear the mapped label ids of the instances treated as unlabeled.
    for inst in unlabeled_x:
        inst.output_ids = None

    # sequence labeling module self-training
    random.shuffle(dataset)
    inference = SoftSequence(conf, encoder)
    sequence_trainer = SoftSequenceTrainer(inference, conf, devs, tests,
                                           triggers_remove)
    sequence_trainer.self_training(conf.num_epochs, dataset, unlabeled_x)
Ejemplo n.º 3
0
def main():
    """Prepare partially-annotated training data and train on folds.

    Reads the three splits, optionally loads contextual vectors, builds the
    label/word indices over all splits, calls ``remove_entites`` on the
    training set, flags which training positions carry the ``conf.O`` label,
    splits the shuffled training set into ``conf.num_folds`` consecutive
    chunks, and calls ``train_model`` with that list of folds.
    """
    arg_parser = argparse.ArgumentParser(description="LSTM CRF implementation")
    opt = parse_arguments(arg_parser)
    conf = Config(opt)

    reader = Reader(conf.digit2zero)
    set_seed(opt, conf.seed)

    trains = reader.read_txt(conf.train_file, conf.train_num)
    devs = reader.read_txt(conf.dev_file, conf.dev_num)
    tests = reader.read_txt(conf.test_file, conf.test_num)

    if conf.context_emb != ContextEmb.none:
        logging.info('[Data Info] Loading the ELMo vectors for all datasets.')
        vec_suffix = "." + conf.context_emb.name + ".vec"
        conf.context_emb_size = load_elmo_vec(conf.train_file + vec_suffix,
                                              trains)
        load_elmo_vec(conf.dev_file + vec_suffix, devs)
        load_elmo_vec(conf.test_file + vec_suffix, tests)

    # conf.use_iobes(trains + devs + tests)
    conf.build_label_idx(trains + devs + tests)

    conf.build_word_idx(trains, devs, tests)
    conf.build_emb_table()
    conf.map_insts_ids(devs + tests)
    logging.info(f"[Data Info] num chars: {conf.num_char}")
    logging.info(f"[Data Info] num words: {len(conf.word2idx)}")

    logging.info(
        f"[Data Info] Removing {conf.entity_keep_ratio*100}% of entities from the training set"
    )

    logging.info("[Data Info] Removing the entities")
    # remove_entites returns the set of removed entities (useful for
    # debugging); the return value is deliberately ignored here.
    remove_entites(trains, conf)
    # Training instances are mapped to ids only after the removal step.
    conf.map_insts_ids(trains)
    random.shuffle(trains)

    use_soft = conf.variant == "soft"
    for inst in trains:
        seq_len = len(inst.input)
        inst.is_prediction = [False] * seq_len
        if use_soft:
            # Every (position, label) cell starts at -1e10; the cell of the
            # position's current label is set to 0 below.
            inst.marginals = np.full((seq_len, conf.label_size), -1e10)
        for idx, tag in enumerate(inst.output):
            if tag == conf.O:
                inst.is_prediction[idx] = True
            if use_soft:
                inst.marginals[idx, conf.label2idx[tag]] = 0

    fold_size = math.ceil(len(trains) / conf.num_folds)
    folds = [
        trains[k * fold_size:(k + 1) * fold_size]
        for k in range(conf.num_folds)
    ]
    train_model(config=conf,
                train_insts=folds,
                dev_insts=devs,
                test_insts=tests)
Ejemplo n.º 4
0
def main():
    """Read the data (CoNLL or txt, optionally with extractions) and train.

    The reader method is chosen from ``conf.train_file`` and
    ``conf.typing_model``: files whose path contains "ontonotes" go through
    ``read_conll``; otherwise ``read_txt`` is used, with dev/test read via
    ``read_txt_with_extraction`` when the typing model is enabled.
    """
    arg_parser = argparse.ArgumentParser(description="LSTM CRF implementation")
    opt = parse_arguments(arg_parser)
    conf = Config(opt)

    reader = Reader(conf.digit2zero)
    set_seed(opt, conf.seed)

    if "ontonotes" in conf.train_file:
        trains = reader.read_conll(conf.train_file, conf.train_num)
        devs = reader.read_conll(conf.dev_file, conf.dev_num)
        tests = reader.read_conll(conf.test_file, conf.test_num)
    elif conf.typing_model:
        trains = reader.read_txt(conf.train_file, conf.train_num)
        devs = reader.read_txt_with_extraction(conf.dev_file,
                                               conf.dev_extraction,
                                               conf.dev_num)
        tests = reader.read_txt_with_extraction(conf.test_file,
                                                conf.test_extraction,
                                                conf.test_num)
    else:
        trains = reader.read_txt(conf.train_file, conf.train_num)
        devs = reader.read_txt(conf.dev_file, conf.dev_num)
        tests = reader.read_txt(conf.test_file, conf.test_num)

    if conf.context_emb != ContextEmb.none:
        print('Loading the ELMo vectors for all datasets.')
        vec_suffix = "." + conf.context_emb.name + ".vec"
        conf.context_emb_size = load_elmo_vec(conf.train_file + vec_suffix,
                                              trains)
        load_elmo_vec(conf.dev_file + vec_suffix, devs)
        load_elmo_vec(conf.test_file + vec_suffix, tests)

    # A fresh combined list is built for each call, as in the original.
    conf.use_iobes([*trains, *devs, *tests])
    conf.build_label_idx([*trains, *devs, *tests])

    conf.build_word_idx(trains, devs, tests)
    conf.build_emb_table()

    conf.map_insts_ids([*trains, *devs, *tests])

    if conf.typing_model:
        # Building mapping, for example:
        # {B-per: [B-per, B-org, B-misc], O: O, I-org: [I-per, I-org]}
        # Will be used when creating the mask.
        conf.typing_map = build_type_id_mapping(conf)

    print(f"num chars: {conf.num_char}")
    print(f"num words: {len(conf.word2idx)}")
    train_model(conf, conf.num_epochs, trains, devs, tests)
Ejemplo n.º 5
0
def main():
    """Read the data, prepare inputs for the chosen embedder, and train.

    With ``conf.embedder_type == "normal"`` the word index and embedding
    table are built and instances are mapped to ids. For any other embedder
    type the instances are tokenized with the tokenizer registered under
    that type in ``context_models``.
    """
    arg_parser = argparse.ArgumentParser(description="LSTM CRF implementation")
    opt = parse_arguments(arg_parser)
    conf = Config(opt)

    reader = Reader(conf.digit2zero)
    set_seed(opt, conf.seed)

    trains = reader.read_txt(conf.train_file, conf.train_num)
    devs = reader.read_txt(conf.dev_file, conf.dev_num)
    tests = reader.read_txt(conf.test_file, conf.test_num)

    if conf.static_context_emb != ContextEmb.none:
        print('Loading the static ELMo vectors for all datasets.')
        vec_suffix = "." + conf.static_context_emb.name + ".vec"
        conf.context_emb_size = load_elmo_vec(conf.train_file + vec_suffix,
                                              trains)
        load_elmo_vec(conf.dev_file + vec_suffix, devs)
        load_elmo_vec(conf.test_file + vec_suffix, tests)

    conf.use_iobes([*trains, *devs, *tests])
    conf.build_label_idx([*trains, *devs, *tests])

    if conf.embedder_type != "normal":
        # If we use the pretrained model from transformers
        # we need to use the pretrained tokenizer.
        print(
            colored(
                f"[Data Info] Tokenizing the instances using '{conf.embedder_type}' tokenizer",
                "red"))
        tokenizer_cls = context_models[conf.embedder_type]["tokenizer"]
        tokenizer = tokenizer_cls.from_pretrained(conf.embedder_type)
        tokenize_instance(tokenizer, [*trains, *devs, *tests], conf.label2idx)
    else:
        conf.build_word_idx(trains, devs, tests)
        conf.build_emb_table()

        for insts in (trains, devs, tests):
            conf.map_insts_ids(insts)
        print(f"[Data Info] num chars: {conf.num_char}")
        print(f"[Data Info] num words: {len(conf.word2idx)}")

    train_model(conf, conf.num_epochs, trains, devs, tests)
def main():
    """Prepare the data with gold labels kept aside and train on folds.

    Reads the three splits, optionally loads contextual vectors, applies
    ``use_iobes`` to all splits and ``use_iobes_gold`` to the training set,
    builds the indices, maps instances to ids, flags the training positions
    labeled ``conf.O``, and trains on ``conf.num_folds`` consecutive chunks
    of the shuffled training set.

    NOTE: ``set_seed(opt, conf.seed)`` is not called in this entry point
    (the call is commented out in the source), so ``random.shuffle`` below
    is not seeded here.
    """
    arg_parser = argparse.ArgumentParser(description="LSTM CRF implementation")
    opt = parse_arguments(arg_parser)
    conf = Config(opt)

    reader = Reader(conf.digit2zero)
    # set_seed(opt, conf.seed)

    trains = reader.read_txt(conf.train_file, conf.train_num)
    devs = reader.read_txt(conf.dev_file, conf.dev_num)
    tests = reader.read_txt(conf.test_file, conf.test_num)

    if conf.context_emb != ContextEmb.none:
        print('[Data Info] Loading the ELMo vectors for all datasets.')
        vec_suffix = "." + conf.context_emb.name + ".vec"
        conf.context_emb_size = load_elmo_vec(conf.train_file + vec_suffix,
                                              trains)
        load_elmo_vec(conf.dev_file + vec_suffix, devs)
        load_elmo_vec(conf.test_file + vec_suffix, tests)

    conf.use_iobes(trains + devs + tests)
    conf.use_iobes_gold(trains)
    conf.build_label_idx(trains + devs + tests)

    conf.build_word_idx(trains, devs, tests)
    conf.build_emb_table()
    conf.map_insts_ids(devs + tests)
    print(f"[Data Info] num chars: {conf.num_char}")
    print(f"[Data Info] num words: {len(conf.word2idx)}")

    conf.map_insts_ids(trains)
    conf.get_gold_label_ids(trains)
    random.shuffle(trains)

    # Mark every position whose current label is conf.O.
    for inst in trains:
        inst.is_prediction = [False] * len(inst.input)
        for idx, tag in enumerate(inst.output):
            if tag == conf.O:
                inst.is_prediction[idx] = True

    fold_size = math.ceil(len(trains) / conf.num_folds)
    folds = [
        trains[k * fold_size:(k + 1) * fold_size]
        for k in range(conf.num_folds)
    ]
    train_model(config=conf,
                train_insts=folds,
                dev_insts=devs,
                test_insts=tests)
Ejemplo n.º 7
0
def main():
    """Read category-filtered data and train with word or mBERT inputs.

    When ``conf.context_emb`` is ``ContextEmb.mbert`` the instances are
    mapped with a ``BertWordPieceTokenizer`` loaded from
    ``data/<dataset>/distilbert-base-uncased/vocab.txt``; otherwise the word
    index and embedding table are built and instances are mapped to ids.
    Vector files are loaded only for context embeddings other than ``none``
    and ``mbert``.
    """
    arg_parser = argparse.ArgumentParser(description="LSTM CRF implementation")
    opt = parse_arguments(arg_parser)
    conf = Config(opt)

    reader = Reader(conf.digit2zero)
    set_seed(opt, conf.seed)

    trains = reader.read_txt(conf.train_file, conf.train_num, conf.category)
    devs = reader.read_txt(conf.dev_file, conf.dev_num, conf.category)
    tests = reader.read_txt(conf.test_file, conf.test_num, conf.category)

    if conf.context_emb not in (ContextEmb.none, ContextEmb.mbert):
        print('Loading the ELMo vectors for all datasets.')
        vec_suffix = "." + conf.context_emb.name + ".vec"
        conf.context_emb_size = load_elmo_vec(conf.train_file + vec_suffix,
                                              trains)
        load_elmo_vec(conf.dev_file + vec_suffix, devs)
        load_elmo_vec(conf.test_file + vec_suffix, tests)

    for insts in (trains, devs, tests):
        conf.use_iobes(insts)
    conf.build_label_idx(trains + devs + tests)

    if conf.context_emb == ContextEmb.mbert:
        # Imported lazily so the dependency is only needed on this path.
        from tokenizers import BertWordPieceTokenizer

        conf.bert_path = f'data/{conf.dataset}/distilbert-base-uncased'
        tokenizer = BertWordPieceTokenizer(f'{conf.bert_path}/vocab.txt',
                                           lowercase=True)
        for insts in (trains, devs, tests):
            conf.map_tokens_ids(insts, tokenizer)
    else:
        conf.build_word_idx(trains, devs, tests)
        conf.build_emb_table()

        for insts in (trains, devs, tests):
            conf.map_insts_ids(insts)

        print(f"num chars: {conf.num_char}")
        print(f"num words: {len(conf.word2idx)}")
    train_model(conf, conf.num_epochs, trains, devs, tests)
Ejemplo n.º 8
0
def main():
    """Prepare the datasets, then train or test depending on ``opt.mode``.

    Reads the three splits (the third ``read_txt`` argument is ``True`` for
    the training file and ``False`` for dev/test), optionally loads
    contextual vectors through the reader, builds the label index from the
    training set, builds the word index and embedding table, and maps all
    instances to ids. With ``opt.mode == "train"`` it calls
    ``learn_from_insts``; any other mode calls ``test_model`` on the test
    set. The selected mode is printed last.
    """
    parser = argparse.ArgumentParser(description="LSTM CRF implementation")
    opt = parse_arguments(parser)
    conf = Config(opt)

    reader = Reader(conf.digit2zero)
    setSeed(opt, conf.seed)

    trains = reader.read_txt(conf.train_file, conf.train_num, True)
    devs = reader.read_txt(conf.dev_file, conf.dev_num, False)
    tests = reader.read_txt(conf.test_file, conf.test_num, False)

    if conf.context_emb != ContextEmb.none:
        print('Loading the elmo vectors for all datasets.')
        vec_suffix = "." + conf.context_emb.name + ".vec"
        conf.context_emb_size = reader.load_elmo_vec(
            conf.train_file + vec_suffix, trains)
        reader.load_elmo_vec(conf.dev_file + vec_suffix, devs)
        reader.load_elmo_vec(conf.test_file + vec_suffix, tests)

    for insts in (trains, devs, tests):
        conf.use_iobes(insts)
    conf.build_label_idx(trains)

    conf.build_word_idx(trains, devs, tests)
    conf.build_emb_table()

    # The return values of map_insts_ids were bound to locals that were
    # never read; the calls are kept, the dead bindings are dropped.
    for insts in (trains, devs, tests):
        conf.map_insts_ids(insts)

    print("num chars: " + str(conf.num_char))
    print("num words: " + str(len(conf.word2idx)))

    if opt.mode == "train":
        learn_from_insts(conf, conf.num_epochs, trains, devs, tests)
    else:
        # Load the trained model.
        test_model(conf, tests)

    print(opt.mode)
Ejemplo n.º 9
0
def main():
    """Set up a Transformer-CRF run and train it on folds of the train set.

    Train/dev paths are derived from ``conf.dataset`` (``train.txt`` and
    ``valid.txt``), ``CUDA_VISIBLE_DEVICES`` is set from ``opt.device_num``,
    logging is directed to ``train.log`` under ``conf.model_folder``, and the
    shuffled training set is split into ``conf.num_folds`` consecutive chunks
    before ``train_model`` is called.
    """
    # NOTE(review): this message is logged before utils.set_logger runs
    # below, so it may not reach train.log - confirm.
    logging.info("Transformer implementation")
    arg_parser = argparse.ArgumentParser(
        description="Transformer CRF implementation")
    opt = parse_arguments_t(arg_parser)
    conf = Config(opt)
    conf.train_file = conf.dataset + "/train.txt"
    conf.dev_file = conf.dataset + "/valid.txt"
    os.environ['CUDA_VISIBLE_DEVICES'] = opt.device_num
    # data reader
    reader = Reader(conf.digit2zero)
    set_seed(opt, conf.seed)

    # set logger
    log_path = os.path.join(conf.model_folder, 'train.log')
    utils.set_logger(log_path)

    # Log every parsed option as "name: value".
    for key, value in vars(opt).items():
        logging.info(key + ": " + str(value))

    # read trains/devs
    logging.info("\n")
    logging.info("Loading the datasets...")
    trains = reader.read_txt(conf.train_file, conf.train_num)
    devs = reader.read_txt(conf.dev_file, conf.dev_num)

    logging.info("Building label idx ...")
    # build label2idx and idx2label
    conf.build_label_idx(trains + devs)

    random.shuffle(trains)
    # Set the prediction flag; if is_prediction is False, we will not update
    # this label.
    for inst in trains:
        inst.is_prediction = [False] * len(inst.input)
        for idx, tag in enumerate(inst.output):
            if tag == conf.O:
                inst.is_prediction[idx] = True

    # Divide the data into conf.num_folds parts (default 2, per the original
    # comment).
    fold_size = math.ceil(len(trains) / conf.num_folds)
    folds = [
        trains[k * fold_size:(k + 1) * fold_size]
        for k in range(conf.num_folds)
    ]

    train_model(config=conf, train_insts=folds, dev_insts=devs)
Ejemplo n.º 10
0
    return args


# Script-level setup: parse the options, build the Config and Reader, read
# the full training file and the test file, and prepare the label/word
# indices. The dev split is not used in this script (its lines are
# commented out).
parser = argparse.ArgumentParser()
opt = parse_arguments(parser)
conf = Config(opt)
reader = Reader(conf.digit2zero)

# Earlier variant with hard-coded StackOverflow paths, kept for reference:
# train_file = 'data/annotated_ner_data/StackOverflow/train.txt'
# dev_file = 'data/annotated_ner_data/StackOverflow/dev.txt'
# test_file = 'data/annotated_ner_data/StackOverflow/test.txt'
# dataset = reader.read_txt(train_file, -1)
# devs = reader.read_txt(dev_file, -1)
# tests = reader.read_txt(test_file, -1)

# -1 is passed as the instance-count argument
# (presumably "read everything"; verify against Reader.read_txt).
dataset = reader.read_txt(conf.train_all_file, -1)
# devs = reader.read_txt(conf.dev_file, -1)
tests = reader.read_txt(conf.test_file, -1)
print(len(dataset))

# setting for data
conf.use_iobes(dataset)
# conf.use_iobes(devs)
conf.use_iobes(tests)

# Labels are indexed from the training data only; the word index is built
# from train and test, with None passed in the dev slot.
conf.build_label_idx(dataset)
conf.build_word_idx(dataset, None, tests)
conf.build_emb_table()

conf.map_insts_ids(dataset)
# conf.map_insts_ids(devs)
Ejemplo n.º 11
0
    args = parser.parse_args()
    for k in args.__dict__:
        print(k + ": " + str(args.__dict__[k]))
    return args


# Script-level setup for the trigger dataset: parse the options, read the
# trigger file plus the dev/test splits, optionally load BERT vectors, and
# build the label and word indices.
parser = argparse.ArgumentParser()
opt = parse_arguments(parser)
conf = Config(opt)
reader = Reader(conf.digit2zero)
# read_trigger_txt returns the instances together with two lengths;
# -1 is passed as the instance-count argument.
dataset, max_length, label_length = reader.read_trigger_txt(
    conf.trigger_file, -1)
reader.merge_labels(dataset)

devs = reader.read_txt(conf.dev_file, conf.dev_num)
tests = reader.read_txt(conf.test_file, conf.test_num)
print(len(dataset))
# Vectors are loaded only for the trigger dataset, from
# "<trigger_file>.<context_emb name>.vec".
if conf.context_emb == ContextEmb.bert:
    print('Loading the BERT vectors for all datasets.')
    conf.context_emb_size = load_bert_vec(
        conf.trigger_file + "." + conf.context_emb.name + ".vec", dataset)

# setting for data
conf.use_iobes(dataset)
conf.use_iobes(devs)
conf.use_iobes(tests)

# The optimizer setting is overridden with the trigger-specific option.
conf.optimizer = opt.trig_optimizer
conf.build_label_idx(dataset)
conf.build_word_idx(dataset, devs, tests)
Ejemplo n.º 12
0
    parser.add_argument('--percentage',
                        type=int,
                        default=100,
                        help="how much percentage of training dataset to use")

    args = parser.parse_args()
    for k in args.__dict__:
        print(k + ": " + str(args.__dict__[k]))
    return args


# Script-level setup: parse the options, read the train/dev/test splits,
# optionally load BERT vectors, and build the label index.
parser = argparse.ArgumentParser()
opt = parse_arguments(parser)
conf = Config(opt)
reader = Reader(conf.digit2zero)
# The training split is capped by conf.train_num. It was previously capped by
# conf.dev_num, which silently truncated the training data whenever the two
# limits differed.
dataset = reader.read_txt(conf.train_file, conf.train_num)
devs = reader.read_txt(conf.dev_file, conf.dev_num)
tests = reader.read_txt(conf.test_file, conf.test_num)
print(len(dataset))
if conf.context_emb == ContextEmb.bert:
    print('Loading the BERT vectors for all datasets.')
    # NOTE(review): the vector file is derived from conf.trigger_file although
    # `dataset` was read from conf.train_file - confirm the two line up.
    conf.context_emb_size = load_bert_vec(
        conf.trigger_file + "." + conf.context_emb.name + ".vec", dataset)

# setting for data
for insts in (dataset, devs, tests):
    conf.use_iobes(insts)

# The optimizer setting is overridden with the trigger-specific option.
conf.optimizer = opt.trig_optimizer
conf.build_label_idx(dataset)