# File-name markers removed from a data file path before appending the
# context-embedding suffix. The order matches the original replace chain.
_VEC_PATH_MARKERS = (".sd", ".ud", ".sud", ".predsd", ".predud", ".stud",
                     ".ssd")


def _context_vec_path(data_file, emb_name):
    """Return the vector-file path for *data_file* and embedding *emb_name*.

    Every marker in ``_VEC_PATH_MARKERS`` is removed from the path (all
    occurrences, in order), then ``.<emb_name>.vec`` is appended.
    """
    for marker in _VEC_PATH_MARKERS:
        data_file = data_file.replace(marker, "")
    return data_file + "." + emb_name + ".vec"


def main():
    """Run the dependency-guided LSTM-CRF pipeline from the command line.

    Parses the options, reads the train/dev/test CoNLL files, optionally
    attaches contextual-embedding vectors, builds the label, dependency-label
    and word indices, and then either trains (``opt.mode == "train"``) or
    evaluates a trained model on the test set.
    """
    parser = argparse.ArgumentParser(
        description="Dependency-Guided LSTM CRF implementation")
    opt = parse_arguments(parser)
    conf = Config(opt)

    reader = Reader(conf.digit2zero)
    setSeed(opt, conf.seed)

    # The training file is read with -1 instead of conf.train_num; the
    # sub-sampling to train_num happens later, just before training.
    trains = reader.read_conll(conf.train_file, -1, True)
    devs = reader.read_conll(conf.dev_file, conf.dev_num, False)
    tests = reader.read_conll(conf.test_file, conf.test_num, False)

    if conf.context_emb != ContextEmb.none:
        emb_name = conf.context_emb.name
        print('Loading the {} vectors for all datasets.'.format(emb_name))
        # Only the training call's return value is kept as the embedding size.
        conf.context_emb_size = reader.load_elmo_vec(
            _context_vec_path(conf.train_file, emb_name), trains)
        reader.load_elmo_vec(_context_vec_path(conf.dev_file, emb_name), devs)
        reader.load_elmo_vec(
            _context_vec_path(conf.test_file, emb_name), tests)

    all_insts = trains + devs + tests
    conf.use_iobes(all_insts)
    # Entity labels are indexed from the training set only; dependency labels
    # are indexed from all three splits.
    conf.build_label_idx(trains)

    conf.build_deplabel_idx(all_insts)
    print("# deplabels: ", len(conf.deplabels))
    print("dep label 2idx: ", conf.deplabel2idx)

    conf.build_word_idx(trains, devs, tests)
    conf.build_emb_table()
    conf.map_insts_ids(all_insts)

    print("num chars: " + str(conf.num_char))
    print("num words: " + str(len(conf.word2idx)))

    if opt.mode == "train":
        if conf.train_num != -1:
            # Random subset of the training data, taken after indexing.
            random.shuffle(trains)
            trains = trains[:conf.train_num]
        learn_from_insts(conf, conf.num_epochs, trains, devs, tests)
    else:
        # Load the trained model and evaluate it on the test set.
        test_model(conf, tests)

    print(opt.mode)
def main():
    """Pre-process the task modules, then run the LSTM-CRF pipeline.

    First part: for every module named in ``TASKS`` a shared character index
    is built and the word, character and gazetteer data are read. None of
    those arrays are used later in this function.

    Second part: parses the command-line options, reads the train/dev/test
    files plus a target training file, builds the indices, and trains or
    evaluates depending on ``opt.mode``.
    """
    TASKS = ['ner_german', 'ner']
    USE_DEV = True

    # Collect the characters seen by any task, then number them.
    char_set = set()
    for task in TASKS:
        t = __import__(task)
        task_chars, _ = t.create_char_index(
            [t.TRAIN_DATA, t.DEV_DATA, t.TEST_DATA])
        char_set.update(ch for ch, _ in task_chars.items())
    char_index = {ch: idx for idx, ch in enumerate(char_set)}

    for task in TASKS:
        t = __import__(task)
        merge_extra = USE_DEV and task == 'ner'

        word_index, word_cnt = t.create_word_index(
            [t.TRAIN_DATA, t.DEV_DATA, t.TEST_DATA])

        # NOTE(review): the "dev_*" arrays come from TEST_DATA and the "t*"
        # arrays from DEV_DATA -- confirm this naming is intended.
        wx, y, m = t.read_data(t.TRAIN_DATA, word_index)
        if merge_extra:
            dev_wx, dev_y, dev_m = t.read_data(t.TEST_DATA, word_index)
            wx = np.vstack((wx, dev_wx))
            y = np.vstack((y, dev_y))
            m = np.vstack((m, dev_m))
        twx, ty, tm = t.read_data(t.DEV_DATA, word_index)

        x, cm = t.read_char_data(t.TRAIN_DATA, char_index)
        if merge_extra:
            dev_x, dev_cm = t.read_char_data(t.TEST_DATA, char_index)
            x = np.vstack((x, dev_x))
            cm = np.vstack((cm, dev_cm))
        tx, tcm = t.read_char_data(t.DEV_DATA, char_index)

        # Gazetteer features are only read for the 'ner' task.
        if task != 'ner':
            gaze, tgaze = None, None
        else:
            list_prefix = t.read_list()
            gaze = t.read_list_data(t.TRAIN_DATA, list_prefix)
            tgaze = t.read_list_data(t.DEV_DATA, list_prefix)
            if USE_DEV:
                dev_gaze = t.read_list_data(t.TEST_DATA, list_prefix)
                gaze = np.vstack((gaze, dev_gaze))

    parser = argparse.ArgumentParser(description="LSTM CRF implementation")
    opt = parse_arguments(parser)
    conf = Config(opt)

    reader = Reader(conf.digit2zero)
    setSeed(opt, conf.seed)

    trains = reader.read_txt(conf.train_file, conf.train_num, True)
    devs = reader.read_txt(conf.dev_file, conf.dev_num, False)
    tests = reader.read_txt(conf.test_file, conf.test_num, False)
    # NOTE(review): "train_target_file_file" looks like a doubled suffix --
    # verify the attribute name against Config.
    trains_target = reader.read_txt(conf.train_target_file_file,
                                    conf.train_num, True)

    if conf.context_emb != ContextEmb.none:
        print('Loading the elmo vectors for all datasets.')
        vec_suffix = "." + conf.context_emb.name + ".vec"
        # Only the training call's return value is kept as the embedding size.
        conf.context_emb_size = reader.load_elmo_vec(
            conf.train_file + vec_suffix, trains)
        reader.load_elmo_vec(conf.dev_file + vec_suffix, devs)
        reader.load_elmo_vec(conf.test_file + vec_suffix, tests)

    for insts in (trains, devs, tests):
        conf.use_iobes(insts)
    conf.build_label_idx(trains)

    # The target training set gets its own label index.
    conf.use_iobes(trains_target)
    conf.build_label_idx_target(trains_target)

    conf.build_word_idx(trains, devs, tests)
    conf.build_emb_table()

    for insts in (trains, devs, tests):
        conf.map_insts_ids(insts)

    print("num chars: " + str(conf.num_char))
    print("num words: " + str(len(conf.word2idx)))

    if opt.mode != "train":
        # Load the trained model and evaluate it on the test set.
        test_model(conf, tests)
    else:
        learn_from_insts(conf, conf.num_epochs, trains, devs, tests)

    print(opt.mode)
def main():
    """Run the two-corpus LSTM-CRF pipeline from the command line.

    Reads train/dev/test splits for two corpora (tagged 0 and 1 in the
    ``read_conll`` calls), merges them, builds the shared label and word
    indices, copies the per-corpus label tables into ``Config_conll`` and
    ``Config_ontonotes``, and then trains or evaluates depending on
    ``opt.mode``.
    """
    print('Reading arguments')
    parser = argparse.ArgumentParser(description="LSTM CRF implementation")
    opt = parse_arguments(parser)
    conf = Config(opt)
    conf_conll = Config_conll(opt)
    conf_ontonotes = Config_ontonotes(opt)

    reader = Reader(conf.digit2zero)
    setSeed(opt, conf.seed)

    # Corpus 0 is read from the *_file_1 paths, corpus 1 from *_file_2.
    trains_0 = reader.read_conll(conf.train_file_1, 0, conf.train_num, True)
    devs_0 = reader.read_conll(conf.dev_file_1, 0, conf.dev_num, False)
    tests_0 = reader.read_conll(conf.test_file_1, 0, conf.test_num, False)

    trains_1 = reader.read_conll(conf.train_file_2, 1, conf.train_num, True)
    devs_1 = reader.read_conll(conf.dev_file_2, 1, conf.dev_num, False)
    tests_1 = reader.read_conll(conf.test_file_2, 1, conf.test_num, False)

    trains_all = trains_0 + trains_1
    devs_all = devs_0 + devs_1
    tests_all = tests_0 + tests_1

    if conf.context_emb != ContextEmb.none:
        print('Loading the elmo vectors for all datasets.')
        vec_suffix = "." + conf.context_emb.name + ".vec"
        # Each vector file is paired with the instances read from the same
        # data file. The earlier code loaded the *_file_1 vectors into the
        # instances read from *_file_2 and left the first corpus without any.
        conf.context_emb_size = reader.load_elmo_vec(
            conf.train_file_1 + vec_suffix, trains_0)
        # NOTE(review): assumes a .vec file exists for every split of both
        # corpora and that load_elmo_vec updates the instances in place, so
        # the merged *_all lists see the vectors -- confirm against Reader.
        remaining = (
            (conf.dev_file_1, devs_0),
            (conf.test_file_1, tests_0),
            (conf.train_file_2, trains_1),
            (conf.dev_file_2, devs_1),
            (conf.test_file_2, tests_1),
        )
        for data_file, insts in remaining:
            reader.load_elmo_vec(data_file + vec_suffix, insts)

    conf.use_iobes(trains_all)
    conf.use_iobes(devs_all)
    conf.use_iobes(tests_all)
    conf.build_label_idx(trains_all)

    conf.build_word_idx(trains_all, devs_all, tests_all)
    conf.build_emb_table()

    # The return values of map_insts_ids were never used, so they are not
    # bound to names.
    conf.map_insts_ids(trains_all)
    conf.map_insts_ids(devs_all)
    conf.map_insts_ids(tests_all)

    # Hand each corpus-specific config its own label tables.
    conf_conll.label_size = conf.label_size_0
    conf_conll.label2idx = conf.label2idx_0
    conf_conll.idx2labels = conf.idx2labels_0

    conf_ontonotes.label_size = conf.label_size_1
    conf_ontonotes.label2idx = conf.label2idx_1
    conf_ontonotes.idx2labels = conf.idx2labels_1

    print("num chars: " + str(conf.num_char))
    print("num words: " + str(len(conf.word2idx)))

    if opt.mode == "train":
        learn_from_insts(conf, conf_conll, conf_ontonotes, conf.num_epochs,
                         trains_all, devs_all, tests_all)
    else:
        # Load the trained model and evaluate it on the merged test set.
        test_model(conf, tests_all)

    print(opt.mode)