Esempio n. 1
0
def _context_vec_path(data_file, emb_name):
    """Return the context-embedding vector file path for ``data_file``.

    Every occurrence of the markers below is removed from ``data_file`` (in
    this order, exactly as the original chained ``str.replace`` calls did) and
    ``"." + emb_name + ".vec"`` is appended.
    """
    # Presumably these mark the dependency-annotation variant of a dataset;
    # the vector file is shared by all variants, hence the stripping.
    markers = (".sd", ".ud", ".sud", ".predsd", ".predud", ".stud", ".ssd")
    for marker in markers:
        data_file = data_file.replace(marker, "")
    return data_file + "." + emb_name + ".vec"


def main():
    """Entry point for the dependency-guided LSTM-CRF tagger.

    Parses the command-line options, reads the train/dev/test CoNLL files,
    optionally attaches contextual embedding vectors, builds the label,
    dependency-label and word indices, and then either trains a model
    (``opt.mode == "train"``) or evaluates a previously trained one.
    """
    parser = argparse.ArgumentParser(
        description="Dependency-Guided LSTM CRF implementation")
    opt = parse_arguments(parser)
    conf = Config(opt)

    reader = Reader(conf.digit2zero)
    setSeed(opt, conf.seed)

    # The training file is read with -1 (presumably "no limit"); the optional
    # sub-sampling to conf.train_num happens further down, after indexing.
    trains = reader.read_conll(conf.train_file, -1, True)
    devs = reader.read_conll(conf.dev_file, conf.dev_num, False)
    tests = reader.read_conll(conf.test_file, conf.test_num, False)

    if conf.context_emb != ContextEmb.none:
        emb_name = conf.context_emb.name
        print('Loading the {} vectors for all datasets.'.format(emb_name))
        # Only the training call's return value is kept as the embedding size.
        conf.context_emb_size = reader.load_elmo_vec(
            _context_vec_path(conf.train_file, emb_name), trains)
        reader.load_elmo_vec(
            _context_vec_path(conf.dev_file, emb_name), devs)
        reader.load_elmo_vec(
            _context_vec_path(conf.test_file, emb_name), tests)

    all_insts = trains + devs + tests

    conf.use_iobes(all_insts)
    conf.build_label_idx(trains)

    conf.build_deplabel_idx(all_insts)
    print("# deplabels: ", len(conf.deplabels))
    print("dep label 2idx: ", conf.deplabel2idx)

    conf.build_word_idx(trains, devs, tests)
    conf.build_emb_table()
    conf.map_insts_ids(all_insts)

    print("num chars: " + str(conf.num_char))
    print("num words: " + str(len(conf.word2idx)))

    if opt.mode == "train":
        if conf.train_num != -1:
            # Keep a random subset of conf.train_num training instances.
            random.shuffle(trains)
            trains = trains[:conf.train_num]
        learn_from_insts(conf, conf.num_epochs, trains, devs, tests)
    else:
        # Load the trained model and evaluate it on the test instances.
        test_model(conf, tests)

    print(opt.mode)
Esempio n. 2
0
def main():
    """Entry point: load the per-task NER arrays, then train or test the LSTM-CRF.

    The function has two phases:

    1. For every module named in ``TASKS`` it builds a shared character index
       and reads the word-, character- and gazetteer-level arrays.
    2. It parses the command-line options, reads the train/dev/test/target
       text files through ``Reader``, builds the label and word indices on
       ``Config`` and finally trains (``opt.mode == "train"``) or evaluates.
    """
    TASKS = ['ner_german', 'ner']
    USE_DEV = True

    # Phase 1a: union of the characters known to every task module.
    char_set = set()
    for task in TASKS:
        t = __import__(task)
        data_list = [t.TRAIN_DATA, t.DEV_DATA, t.TEST_DATA]
        task_char_index, _ = t.create_char_index(data_list)
        char_set.update(task_char_index)
    # Assign consecutive ids in set-iteration order (same as the manual counter).
    char_index = {char: idx for idx, char in enumerate(char_set)}

    # Phase 1b: read the arrays of every task.
    # NOTE(review): none of the arrays built in this loop are used later in
    # this function; the calls are kept so their side effects are unchanged.
    # NOTE(review): when USE_DEV is set, the TEST split of the 'ner' task is
    # stacked onto the training arrays and the DEV split is what ends up in the
    # held-out (t*) arrays -- confirm this role swap is intended.
    for task in TASKS:
        t = __import__(task)
        merge_extra = USE_DEV and task == 'ner'

        word_index, _ = t.create_word_index(
            [t.TRAIN_DATA, t.DEV_DATA, t.TEST_DATA])
        wx, y, m = t.read_data(t.TRAIN_DATA, word_index)
        if merge_extra:
            extra_wx, extra_y, extra_m = t.read_data(t.TEST_DATA, word_index)
            wx = np.vstack((wx, extra_wx))
            y = np.vstack((y, extra_y))
            m = np.vstack((m, extra_m))
        twx, ty, tm = t.read_data(t.DEV_DATA, word_index)

        x, cm = t.read_char_data(t.TRAIN_DATA, char_index)
        if merge_extra:
            extra_x, extra_cm = t.read_char_data(t.TEST_DATA, char_index)
            x = np.vstack((x, extra_x))
            cm = np.vstack((cm, extra_cm))
        tx, tcm = t.read_char_data(t.DEV_DATA, char_index)

        if task == 'ner':
            # Gazetteer features are only read for the 'ner' task.
            list_prefix = t.read_list()
            gaze = t.read_list_data(t.TRAIN_DATA, list_prefix)
            tgaze = t.read_list_data(t.DEV_DATA, list_prefix)
            if USE_DEV:
                extra_gaze = t.read_list_data(t.TEST_DATA, list_prefix)
                gaze = np.vstack((gaze, extra_gaze))
        else:
            gaze, tgaze = None, None

    # Phase 2: configuration-driven pipeline.
    parser = argparse.ArgumentParser(description="LSTM CRF implementation")
    opt = parse_arguments(parser)
    conf = Config(opt)

    reader = Reader(conf.digit2zero)
    setSeed(opt, conf.seed)

    trains = reader.read_txt(conf.train_file, conf.train_num, True)
    devs = reader.read_txt(conf.dev_file, conf.dev_num, False)
    tests = reader.read_txt(conf.test_file, conf.test_num, False)
    # NOTE(review): the attribute name has a doubled "_file" suffix; confirm
    # it matches the attribute actually defined on Config.
    trains_target = reader.read_txt(conf.train_target_file_file,
                                    conf.train_num, True)

    if conf.context_emb != ContextEmb.none:
        print('Loading the elmo vectors for all datasets.')
        vec_suffix = "." + conf.context_emb.name + ".vec"
        # Only the training call's return value is kept as the embedding size.
        conf.context_emb_size = reader.load_elmo_vec(
            conf.train_file + vec_suffix, trains)
        reader.load_elmo_vec(conf.dev_file + vec_suffix, devs)
        reader.load_elmo_vec(conf.test_file + vec_suffix, tests)

    for insts in (trains, devs, tests):
        conf.use_iobes(insts)
    conf.build_label_idx(trains)
    conf.use_iobes(trains_target)
    conf.build_label_idx_target(trains_target)

    conf.build_word_idx(trains, devs, tests)
    conf.build_emb_table()

    # The return values were never used, so only the calls are kept.
    for insts in (trains, devs, tests):
        conf.map_insts_ids(insts)

    print("num chars: " + str(conf.num_char))
    print("num words: " + str(len(conf.word2idx)))

    if opt.mode == "train":
        learn_from_insts(conf, conf.num_epochs, trains, devs, tests)
    else:
        # Load the trained model and evaluate it on the test instances.
        test_model(conf, tests)

    print(opt.mode)
Esempio n. 3
0
def main():
    """Entry point for joint training/testing on two NER datasets.

    Reads the train/dev/test splits of two datasets (tagged with dataset ids
    0 and 1 in the ``read_conll`` calls), builds shared label and word indices
    on ``conf``, copies the per-dataset label tables onto ``conf_conll`` and
    ``conf_ontonotes``, and then trains (``opt.mode == "train"``) or evaluates.
    """
    print('Reading arguments')
    parser = argparse.ArgumentParser(description="LSTM CRF implementation")
    opt = parse_arguments(parser)
    conf = Config(opt)
    conf_conll = Config_conll(opt)
    conf_ontonotes = Config_ontonotes(opt)

    reader = Reader(conf.digit2zero)
    setSeed(opt, conf.seed)

    # Dataset 0 comes from the *_file_1 paths, dataset 1 from *_file_2.
    trains_0 = reader.read_conll(conf.train_file_1, 0, conf.train_num, True)
    devs_0 = reader.read_conll(conf.dev_file_1, 0, conf.dev_num, False)
    tests_0 = reader.read_conll(conf.test_file_1, 0, conf.test_num, False)

    trains_1 = reader.read_conll(conf.train_file_2, 1, conf.train_num, True)
    devs_1 = reader.read_conll(conf.dev_file_2, 1, conf.dev_num, False)
    tests_1 = reader.read_conll(conf.test_file_2, 1, conf.test_num, False)

    trains_all = trains_0 + trains_1
    devs_all = devs_0 + devs_1
    tests_all = tests_0 + tests_1

    if conf.context_emb != ContextEmb.none:
        print('Loading the elmo vectors for all datasets.')
        vec_suffix = "." + conf.context_emb.name + ".vec"
        # Each vector file must be attached to the instances read from the
        # matching data file. The previous code loaded the *_file_1 vectors
        # into the instances read from *_file_2 and left the *_file_1
        # instances without vectors.
        # NOTE(review): this assumes .vec files exist for both datasets.
        conf.context_emb_size = reader.load_elmo_vec(
            conf.train_file_1 + vec_suffix, trains_0)
        reader.load_elmo_vec(conf.dev_file_1 + vec_suffix, devs_0)
        reader.load_elmo_vec(conf.test_file_1 + vec_suffix, tests_0)
        reader.load_elmo_vec(conf.train_file_2 + vec_suffix, trains_1)
        reader.load_elmo_vec(conf.dev_file_2 + vec_suffix, devs_1)
        reader.load_elmo_vec(conf.test_file_2 + vec_suffix, tests_1)

    for insts in (trains_all, devs_all, tests_all):
        conf.use_iobes(insts)
    conf.build_label_idx(trains_all)

    conf.build_word_idx(trains_all, devs_all, tests_all)
    conf.build_emb_table()

    # The return values were never used, so only the calls are kept.
    for insts in (trains_all, devs_all, tests_all):
        conf.map_insts_ids(insts)

    # Hand the per-dataset label tables to the dataset-specific configs.
    conf_conll.label_size = conf.label_size_0
    conf_conll.label2idx = conf.label2idx_0
    conf_conll.idx2labels = conf.idx2labels_0
    conf_ontonotes.label_size = conf.label_size_1
    conf_ontonotes.label2idx = conf.label2idx_1
    conf_ontonotes.idx2labels = conf.idx2labels_1

    print("num chars: " + str(conf.num_char))
    print("num words: " + str(len(conf.word2idx)))

    if opt.mode == "train":
        learn_from_insts(conf, conf_conll, conf_ontonotes, conf.num_epochs,
                         trains_all, devs_all, tests_all)
    else:
        # Load the trained model and evaluate it on the combined test set.
        test_model(conf, tests_all)

    print(opt.mode)