Example #1
0
def main():
    """Run the full trigger-based NER pipeline.

    Stages:
      1. Read the trigger-annotated dataset and the train/dev/test splits.
      2. (Optionally) attach precomputed BERT vectors.
      3. Train the soft matching module on the trigger data.
      4. Self-train the sequence labeling module, treating a slice of the
         training data as unlabeled.
    """
    parser = argparse.ArgumentParser()
    opt = parse_arguments(parser)
    conf = Config(opt)
    reader = Reader(conf.digit2zero)
    # -1 presumably means "no limit" on instances read — TODO confirm in Reader.
    # The second return value (max sentence length) is unused here.
    dataset, _, label_length = reader.read_trigger_txt(conf.trigger_file, -1)

    reader.merge_labels(dataset)

    trains = reader.read_txt(conf.train_all_file, conf.train_num)
    devs = reader.read_txt(conf.dev_file, conf.dev_num)
    tests = reader.read_txt(conf.test_file, conf.test_num)
    print(len(dataset))
    if conf.context_emb == ContextEmb.bert:
        print('Loading the BERT vectors for all datasets.')
        conf.context_emb_size = load_bert_vec(
            conf.trigger_file + "." + conf.context_emb.name + ".vec", dataset)

    # Convert every split to the IOBES tagging scheme (order preserved).
    for insts in (trains, dataset, devs, tests):
        conf.use_iobes(insts)

    conf.optimizer = opt.trig_optimizer
    conf.build_label_idx(dataset)
    conf.build_word_idx(trains, devs, tests)
    conf.build_emb_table()
    # Map tokens/labels to vocabulary ids for all splits (order preserved).
    for insts in (dataset, trains, devs, tests):
        conf.map_insts_ids(insts)

    # Keep only the configured fraction of the trigger-annotated data.
    dataset = reader.trigger_percentage(dataset, conf.percentage)
    encoder = SoftMatcher(conf, label_length)
    trainer = SoftMatcherTrainer(encoder, conf, devs, tests)

    # matching module training
    random.shuffle(dataset)
    trainer.train_model(conf.num_epochs_soft, dataset)
    logits, predicted, triggers = trainer.get_triggervec(dataset)
    # all the trigger vectors, trigger type, string name of the trigger
    triggers_remove = remove_duplicates(logits, predicted, triggers, dataset)

    # Split training data into a labeled head and an "unlabeled" tail.
    numbers = int(len(trains) * (1 - opt.unlabeled_percentage))
    print("number of train instances : ", numbers)
    # NOTE(review): initial_trains is never used below — confirm whether
    # self_training should receive it instead of (or besides) dataset.
    initial_trains = trains[:numbers]
    unlabeled_x = trains[numbers:]

    # Strip gold label ids so these instances behave as unlabeled data.
    for data in unlabeled_x:
        data.output_ids = None

    # sequence labeling module self-training
    random.shuffle(dataset)
    inference = SoftSequence(conf, encoder)
    sequence_trainer = SoftSequenceTrainer(inference, conf, devs, tests,
                                           triggers_remove)
    sequence_trainer.self_training(conf.num_epochs, dataset, unlabeled_x)
Example #2
0
                        default=100,
                        help="how much percentage of training dataset to use")

    args = parser.parse_args()
    for k in args.__dict__:
        print(k + ": " + str(args.__dict__[k]))
    return args


# Build the configuration from command-line arguments.
parser = argparse.ArgumentParser()
opt = parse_arguments(parser)
conf = Config(opt)

# Read the trigger-annotated dataset and merge its label set.
reader = Reader(conf.digit2zero)
dataset, max_length, label_length = reader.read_trigger_txt(conf.trigger_file, -1)
reader.merge_labels(dataset)

# Load the evaluation splits.
devs = reader.read_txt(conf.dev_file, conf.dev_num)
tests = reader.read_txt(conf.test_file, conf.test_num)
print(len(dataset))

if conf.context_emb == ContextEmb.bert:
    print('Loading the BERT vectors for all datasets.')
    vec_path = conf.trigger_file + "." + conf.context_emb.name + ".vec"
    conf.context_emb_size = load_bert_vec(vec_path, dataset)

# setting for data: convert every split to the IOBES tagging scheme.
for insts in (dataset, devs, tests):
    conf.use_iobes(insts)

conf.optimizer = opt.trig_optimizer