def read_parse_write(elmo: ElmoEmbedder, infile: str, outfile: str, mode: str = "average", batch_size=0) -> None: """ Read the input files and write the vectors to the output files :param elmo: ELMo embedder :param infile: input files for the sentences :param outfile: output vector files :param mode: the mode of elmo vectors :return: """ reader = Reader() insts = reader.read_txt(infile, -1) f = open(outfile, 'wb') all_vecs = [] all_sents = [] for inst in insts: all_sents.append(inst.input.words) if batch_size < 1: # Not using batch for sent in tqdm(all_sents, desc="Elmo Embedding"): elmo_vecs = elmo.embed_sentence(sent) vec = parse_sentence(elmo_vecs, mode=mode) all_vecs.append(vec) else: # Batched prediction for elmo_vecs in tqdm(elmo.embed_sentences(all_sents, batch_size=batch_size), desc="Elmo Embedding", total=len(all_sents)): vec = parse_sentence(elmo_vecs, mode=mode) all_vecs.append(vec) print("Finishing embedding ELMo sequences, saving the vector files.") pickle.dump(all_vecs, f) f.close()
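# A minimal usage sketch, not from the original file: it assumes the AllenNLP
# ElmoEmbedder and hypothetical dataset paths. parse_sentence (defined elsewhere
# in this repo) is expected to collapse the three ELMo layers, e.g. by
# averaging, into one (sent_len, 1024) array per sentence.
if __name__ == "__main__":
    from allennlp.commands.elmo import ElmoEmbedder
    elmo = ElmoEmbedder(cuda_device=0)  # use cuda_device=-1 to run on CPU
    read_parse_write(elmo, "data/conll2003/train.txt",
                     "data/conll2003/train.elmo.vec",
                     mode="average", batch_size=64)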
def read_parse_write(tokenizer: BertTokenizer, bert_client: BertClient, infile: str, outfile: str, mode: str) -> None:
    """
    Read the input file and write the BERT vectors to the output file.
    :param tokenizer: BERT word-piece tokenizer
    :param bert_client: BertClient
    :param infile: input file for the sentences
    :param outfile: output vector file
    :param mode: the mode of bert word piece
    :return:
    """
    reader = Reader()
    insts = reader.read_txt(infile, -1)
    f = open(outfile, 'wb')
    all_vecs = []
    all_sents = []
    for inst in insts:
        all_sents.append(inst.input.words)
    for sent in tqdm(all_sents, desc="BERT encoding"):
        word_piece_tokens, word_to_piece_index = bert_tokenize_words(tokenizer, sent, mode=mode)
        ## exclude the [CLS] and [SEP] positions
        bert_vec = np.squeeze(bert_client.encode([word_piece_tokens], is_tokenized=True), axis=0)[1:-1, :]
        ## map the word-piece vectors back to one vector per original word
        bert_vec = bert_vec[word_to_piece_index, :]
        all_vecs.append(bert_vec)
    print("Finished embedding the BERT sequences, saving the vector file.")
    pickle.dump(all_vecs, f)
    f.close()
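# A minimal usage sketch with assumptions flagged inline: it requires a running
# bert-as-service server started with token-level pooling, e.g.
#   bert-serving-start -model_dir /path/to/bert-base-cased -pooling_strategy NONE
# so that encode() returns one vector per word piece. The dataset paths and the
# mode value are illustrative; see bert_tokenize_words for the accepted modes.
if __name__ == "__main__":
    from transformers import BertTokenizer
    from bert_serving.client import BertClient
    tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
    bert_client = BertClient()  # connects to localhost by default
    read_parse_write(tokenizer, bert_client, "data/conll2003/train.txt",
                     "data/conll2003/train.bert.vec", mode="first")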
def main():
    parser = argparse.ArgumentParser(description="Dependency-Guided LSTM CRF implementation")
    opt = parse_arguments(parser)
    conf = Config(opt)
    reader = Reader(conf.digit2zero)
    setSeed(opt, conf.seed)

    trains = reader.read_conll(conf.train_file, -1, True)
    devs = reader.read_conll(conf.dev_file, conf.dev_num, False)
    tests = reader.read_conll(conf.test_file, conf.test_num, False)

    if conf.context_emb != ContextEmb.none:
        print('Loading the {} vectors for all datasets.'.format(conf.context_emb.name))

        def vec_file(conll_file: str) -> str:
            ## strip the dependency-format marker before appending the vector-file suffix
            for suffix in (".sd", ".ud", ".sud", ".predsd", ".predud", ".stud", ".ssd"):
                conll_file = conll_file.replace(suffix, "")
            return conll_file + "." + conf.context_emb.name + ".vec"

        conf.context_emb_size = reader.load_elmo_vec(vec_file(conf.train_file), trains)
        reader.load_elmo_vec(vec_file(conf.dev_file), devs)
        reader.load_elmo_vec(vec_file(conf.test_file), tests)

    conf.use_iobes(trains + devs + tests)
    conf.build_label_idx(trains)
    conf.build_deplabel_idx(trains + devs + tests)
    print("# deplabels: ", len(conf.deplabels))
    print("dep label 2idx: ", conf.deplabel2idx)

    conf.build_word_idx(trains, devs, tests)
    conf.build_emb_table()
    conf.map_insts_ids(trains + devs + tests)

    print("num chars: " + str(conf.num_char))
    print("num words: " + str(len(conf.word2idx)))

    if opt.mode == "train":
        if conf.train_num != -1:
            random.shuffle(trains)
            trains = trains[:conf.train_num]
        learn_from_insts(conf, conf.num_epochs, trains, devs, tests)
    else:
        ## Load the trained model.
        test_model(conf, tests)
    print(opt.mode)
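# A quick illustration of the naming convention implemented by vec_file
# (paths hypothetical, assuming conf.context_emb.name == "elmo"): the
# dependency-format marker is stripped before the vector suffix is appended,
# so every dependency variant of a dataset shares one vector file.
#   vec_file("data/ontonotes/train.sd.conllx")     -> "data/ontonotes/train.conllx.elmo.vec"
#   vec_file("data/ontonotes/train.predud.conllx") -> "data/ontonotes/train.conllx.elmo.vec"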
def read_parse_write(elmo, infile, outfile, mode):
    reader = Reader()
    insts = reader.read_conll(infile, -1, True)
    f = open(outfile, 'wb')
    all_vecs = []
    for inst in insts:
        vec = parse_sentence(elmo, inst.input.words, mode=mode)
        all_vecs.append(vec)
    pickle.dump(all_vecs, f)
    f.close()
def read_parse_write(elmo, infile, outfile):
    reader = Reader()
    insts = reader.read_conll(infile, -1, True)
    f = open(outfile, 'wb')
    all_vecs = []
    for inst in insts:
        sent = embed_sent(elmo, inst.input.words)
        ## stack the per-token embeddings into a (sent_len, emb_dim) array
        arr = []
        for token in sent:
            arr.append(np.expand_dims(token.embedding.numpy(), axis=0))
        all_vecs.append(np.concatenate(arr))
    pickle.dump(all_vecs, f)
    f.close()
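# A minimal usage sketch, assuming embed_sent (defined elsewhere in this repo)
# wraps a flair-style embedder and returns a Sentence whose tokens carry
# .embedding tensors; the ELMoEmbeddings model name and the dataset paths below
# are illustrative, not from the original file.
if __name__ == "__main__":
    from flair.embeddings import ELMoEmbeddings
    elmo = ELMoEmbeddings("original")
    read_parse_write(elmo, "data/conll2003/train.sd.conllx",
                     "data/conll2003/train.elmo.vec")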
def read_parse_write(elmo: ElmoEmbedder, infile: str, outfile: str, mode: str = "average") -> None: """ Read the input files and write the vectors to the output files :param elmo: ELMo embedder :param infile: input files for the sentences :param outfile: output vector files :param mode: the mode of elmo vectors :return: """ reader = Reader() insts = reader.read_txt(infile, -1) f = open(outfile, 'wb') all_vecs = [] for inst in tqdm(insts): vec = parse_sentence(elmo, inst.input.words, mode=mode)#Remove pos_tags argument for model without additional embeedding for materials all_vecs.append(vec) print("Finishing embedding ELMo sequences, saving the vector files.") pickle.dump(all_vecs, f) f.close()
def read_parse_write(bert: DistilBertModel, bert_path: str, infile: str, outfile: str,
                     mode: str = "average", batch_size: int = 0) -> None:
    """
    Read the input file and write the vectors to the output file.
    :param bert: BERT embedder
    :param bert_path: path to the pretrained BERT model/tokenizer
    :param infile: input file for the sentences
    :param outfile: output vector file
    :param mode: the mode of bert vectors
    :param batch_size: batch size for the data loader
    :return:
    """
    reader = Reader()
    insts = reader.read_txt(infile, -1)
    f = open(outfile, 'wb')
    all_vecs = []
    all_sents = []
    for inst in insts:
        all_sents.append(inst.input.words)
    dataset = CustomDataset(all_sents, bert_path)
    batch_size = max(1, batch_size)  # make sure batch_size is at least 1
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False, num_workers=4)
    for batch, n_pads in tqdm(dataloader):
        with torch.no_grad():
            batch = batch.cuda() if CUDA else batch
            bert = bert.cuda() if CUDA else bert
            bert_batch_vecs = bert(batch)[0].cpu().numpy()
            vectors = parse_sentence(bert_batch_vecs, mode=mode)
            for j in range(vectors.shape[0]):
                ## strip the padding positions; slicing with :-0 would yield an
                ## empty array, so compute the end index explicitly
                end = vectors.shape[1] - int(n_pads[j])
                all_vecs.append(vectors[j, :end, :])
    print("Finished embedding the BERT sequences, saving the vector file.")
    pickle.dump(all_vecs, f)
    f.close()
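# A minimal usage sketch, assuming the Hugging Face transformers API; the model
# name and dataset paths are illustrative. CustomDataset (defined elsewhere in
# this repo) is expected to tokenize each sentence with the tokenizer found at
# bert_path and to yield (input_ids, n_pads) pairs.
if __name__ == "__main__":
    from transformers import DistilBertModel
    bert_path = "distilbert-base-uncased"
    bert = DistilBertModel.from_pretrained(bert_path)
    bert.eval()  # disable dropout for deterministic embeddings
    read_parse_write(bert, bert_path, "data/conll2003/train.txt",
                     "data/conll2003/train.bert.vec",
                     mode="average", batch_size=32)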
def main():
    parser = argparse.ArgumentParser(description="Dependency-Guided LSTM CRF implementation")
    opt = parse_arguments(parser)
    conf = Config(opt)
    reader = Reader(conf.digit2zero)
    setSeed(opt, conf.seed)

    trains = reader.read_conll(conf.train_file, -1, True)
    devs = reader.read_conll(conf.dev_file, conf.dev_num, False)
    tests = reader.read_conll(conf.test_file, conf.test_num, False)

    conf.use_iobes(trains)
    conf.build_label_idx(trains)
    conf.build_deplabel_idx(trains + devs + tests)
    print("# deplabels: ", len(conf.deplabels))
    print("dep label 2idx: ", conf.deplabel2idx)

    conf.build_word_idx(trains + devs + tests)
    conf.build_emb_table()
    conf.map_insts_ids(trains)

    print("num chars: " + str(conf.num_char))
    print("num words: " + str(len(conf.word2idx)))

    if opt.mode == "train":
        if conf.train_num != -1:
            random.shuffle(trains)
            trains = trains[:conf.train_num]
        learn_from_insts(conf, conf.num_epochs, trains)
    print(opt.mode)
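# A hedged invocation sketch: parse_arguments is defined elsewhere in this repo,
# but the use of opt.mode above suggests a --mode flag, e.g.
#   python main.py --mode train
# (the remaining flags depend on parse_arguments and are not shown here).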
def main():
    TASKS = ['ner_german', 'ner']
    USE_DEV = True

    ## build a single character index shared across all tasks
    char_set = set()
    for task in TASKS:
        t = __import__(task)
        data_list = [t.TRAIN_DATA, t.DEV_DATA, t.TEST_DATA]
        char_index, _ = t.create_char_index(data_list)
        for k, v in char_index.items():
            char_set.add(k)
    char_index, char_cnt = {}, 0
    for char in char_set:
        char_index[char] = char_cnt
        char_cnt += 1

    for task in TASKS:
        t = __import__(task)
        word_index, word_cnt = t.create_word_index([t.TRAIN_DATA, t.DEV_DATA, t.TEST_DATA])
        wx, y, m = t.read_data(t.TRAIN_DATA, word_index)
        if USE_DEV and task == 'ner':
            ## fold the original test split into training and evaluate on the dev split
            dev_wx, dev_y, dev_m = t.read_data(t.TEST_DATA, word_index)
            wx, y, m = np.vstack((wx, dev_wx)), np.vstack((y, dev_y)), np.vstack((m, dev_m))
        twx, ty, tm = t.read_data(t.DEV_DATA, word_index)

        x, cm = t.read_char_data(t.TRAIN_DATA, char_index)
        if USE_DEV and task == 'ner':
            dev_x, dev_cm = t.read_char_data(t.TEST_DATA, char_index)
            x, cm = np.vstack((x, dev_x)), np.vstack((cm, dev_cm))
        tx, tcm = t.read_char_data(t.DEV_DATA, char_index)

        if task == 'ner':
            list_prefix = t.read_list()
            gaze = t.read_list_data(t.TRAIN_DATA, list_prefix)
            tgaze = t.read_list_data(t.DEV_DATA, list_prefix)
            if USE_DEV:
                dev_gaze = t.read_list_data(t.TEST_DATA, list_prefix)
                gaze = np.vstack((gaze, dev_gaze))
        else:
            gaze, tgaze = None, None

    parser = argparse.ArgumentParser(description="LSTM CRF implementation")
    opt = parse_arguments(parser)
    conf = Config(opt)
    reader = Reader(conf.digit2zero)
    setSeed(opt, conf.seed)

    trains = reader.read_txt(conf.train_file, conf.train_num, True)
    devs = reader.read_txt(conf.dev_file, conf.dev_num, False)
    tests = reader.read_txt(conf.test_file, conf.test_num, False)
    trains_target = reader.read_txt(conf.train_target_file, conf.train_num, True)

    if conf.context_emb != ContextEmb.none:
        print('Loading the elmo vectors for all datasets.')
        conf.context_emb_size = reader.load_elmo_vec(conf.train_file + "." + conf.context_emb.name + ".vec", trains)
        reader.load_elmo_vec(conf.dev_file + "." + conf.context_emb.name + ".vec", devs)
        reader.load_elmo_vec(conf.test_file + "." + conf.context_emb.name + ".vec", tests)

    conf.use_iobes(trains)
    conf.use_iobes(devs)
    conf.use_iobes(tests)
    conf.build_label_idx(trains)

    conf.use_iobes(trains_target)
    conf.build_label_idx_target(trains_target)

    conf.build_word_idx(trains, devs, tests)
    conf.build_emb_table()

    ids_train = conf.map_insts_ids(trains)
    ids_dev = conf.map_insts_ids(devs)
    ids_test = conf.map_insts_ids(tests)

    print("num chars: " + str(conf.num_char))
    print("num words: " + str(len(conf.word2idx)))

    if opt.mode == "train":
        learn_from_insts(conf, conf.num_epochs, trains, devs, tests)
    else:
        ## Load the trained model.
        test_model(conf, tests)
    print(opt.mode)
#
# @author: Allan
#

from config.reader import Reader

file = "data/ontonotes/train.sd.conllx"
digit2zero = False
reader = Reader(digit2zero)

insts = reader.read_conll(file, -1, True)
# devs = reader.read_conll(conf.dev_file, conf.dev_num, False)
# tests = reader.read_conll(conf.test_file, conf.test_num, False)

out_dep_label2num = {}
out_doubledep2num = {}
out_word2num = {}
label2idx = {}


def not_entity(label: str):
    return not (label.startswith("B-") or label.startswith("I-"))


def is_entity(label: str):
    return label.startswith("B-") or label.startswith("I-")
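# A quick sanity check of the two predicates on typical BIO labels
# (the label strings below are illustrative):
assert is_entity("B-PER") and is_entity("I-ORG")
assert not_entity("O") and not is_entity("O")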
def main():
    print('Reading arguments')
    parser = argparse.ArgumentParser(description="LSTM CRF implementation")
    opt = parse_arguments(parser)
    conf = Config(opt)
    conf_conll = Config_conll(opt)
    conf_ontonotes = Config_ontonotes(opt)
    reader = Reader(conf.digit2zero)
    setSeed(opt, conf.seed)

    ## dataset 0: CoNLL, dataset 1: OntoNotes
    trains_0 = reader.read_conll(conf.train_file_1, 0, conf.train_num, True)
    devs_0 = reader.read_conll(conf.dev_file_1, 0, conf.dev_num, False)
    tests_0 = reader.read_conll(conf.test_file_1, 0, conf.test_num, False)
    trains_1 = reader.read_conll(conf.train_file_2, 1, conf.train_num, True)
    devs_1 = reader.read_conll(conf.dev_file_2, 1, conf.dev_num, False)
    tests_1 = reader.read_conll(conf.test_file_2, 1, conf.test_num, False)
    trains_all = trains_0 + trains_1
    devs_all = devs_0 + devs_1
    tests_all = tests_0 + tests_1

    if conf.context_emb != ContextEmb.none:
        print('Loading the elmo vectors for all datasets.')
        conf.context_emb_size = reader.load_elmo_vec(conf.train_file_1 + "." + conf.context_emb.name + ".vec", trains_1)
        reader.load_elmo_vec(conf.dev_file_1 + "." + conf.context_emb.name + ".vec", devs_1)
        reader.load_elmo_vec(conf.test_file_1 + "." + conf.context_emb.name + ".vec", tests_1)

    conf.use_iobes(trains_all)
    conf.use_iobes(devs_all)
    conf.use_iobes(tests_all)
    conf.build_label_idx(trains_all)

    conf.build_word_idx(trains_all, devs_all, tests_all)
    conf.build_emb_table()
    ids_train = conf.map_insts_ids(trains_all)
    ids_dev = conf.map_insts_ids(devs_all)
    ids_test = conf.map_insts_ids(tests_all)

    ## copy the per-dataset label vocabularies into the task-specific configs
    conf_conll.label_size = conf.label_size_0
    conf_conll.label2idx = conf.label2idx_0
    conf_conll.idx2labels = conf.idx2labels_0
    conf_ontonotes.label_size = conf.label_size_1
    conf_ontonotes.label2idx = conf.label2idx_1
    conf_ontonotes.idx2labels = conf.idx2labels_1

    print("num chars: " + str(conf.num_char))
    print("num words: " + str(len(conf.word2idx)))

    if opt.mode == "train":
        learn_from_insts(conf, conf_conll, conf_ontonotes, conf.num_epochs, trains_all, devs_all, tests_all)
    else:
        ## Load the trained model.
        test_model(conf, tests_all)
    print(opt.mode)
def use_iobes(insts):
    ## convert the BIO annotation of every instance to IOBES in place
    for inst in insts:
        output = inst.output
        for pos in range(len(output)):
            curr_entity = output[pos]
            if pos == len(output) - 1:
                ## last token: a beginning becomes a singleton, an inside becomes an end
                if curr_entity.startswith("B-"):
                    output[pos] = curr_entity.replace("B-", "S-")
                elif curr_entity.startswith("I-"):
                    output[pos] = curr_entity.replace("I-", "E-")
            else:
                next_entity = output[pos + 1]
                if curr_entity.startswith("B-"):
                    if next_entity.startswith("O") or next_entity.startswith("B-"):
                        output[pos] = curr_entity.replace("B-", "S-")
                elif curr_entity.startswith("I-"):
                    if next_entity.startswith("O") or next_entity.startswith("B-"):
                        output[pos] = curr_entity.replace("I-", "E-")


dataset = "ontonotes_chinese"
train = "../data/" + dataset + "/train.sd.conllx"
dev = "../data/" + dataset + "/dev.sd.conllx"
test = "../data/" + dataset + "/test.sd.conllx"
digit2zero = False
reader = Reader(digit2zero)

insts = reader.read_conll(train, -1, True)
insts += reader.read_conll(dev, -1, False)
insts += reader.read_conll(test, -1, False)
use_iobes(insts)

L = 3


def get_spans(output):
    output_spans = set()
    start = -1
    for i in range(len(output)):
        if output[i].startswith("B-"):
            start = i
        if output[i].startswith("E-"):
            ## close the span opened at the last B- tag; the (start, end, type)
            ## tuple is an assumed span representation
            output_spans.add((start, i, output[i][2:]))
        if output[i].startswith("S-"):
            output_spans.add((i, i, output[i][2:]))
    return output_spans
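# A small worked example (label strings illustrative): after the BIO -> IOBES
# conversion above, a two-token entity ends with E- and a singleton becomes S-,
# so get_spans recovers each entity as one (start, end, type) span.
#   BIO:   ["B-ORG", "O", "B-GPE", "I-GPE"]
#   IOBES: ["S-ORG", "O", "B-GPE", "E-GPE"]
#   spans: {(0, 0, "ORG"), (2, 3, "GPE")}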
    def nonterms(self):
        return list(self.nonterms_iter())

    def __eq__(self, other):
        return other is not None and self.pos == other.pos and self.children == other.children

    def __hash__(self):
        ## children is a list, so convert it to a tuple to make it hashable
        return hash((self.pos, tuple(self.children)))


if __name__ == "__main__":
    ## read the tree
    from config.reader import Reader

    reader = Reader()
    insts = reader.read_conll("../data/abc/train.conllx", number=1)
    for inst in insts:
        ## build one tree node per token and attach each node to its head
        nodes = [Tree(pos) for pos in range(len(inst.input.words))]
        root = Tree(-1)
        for pos, head in enumerate(inst.input.heads):
            if head != -1:
                nodes[head].add_child(nodes[pos])
            else:
                root.add_child(nodes[pos])
        inst.nodes = nodes
        for node in nodes:
            node.sort_children()
        print(root.leaves())
        for pos, node in enumerate(nodes):