def read_parse_write(elmo: ElmoEmbedder, infile: str, outfile: str, mode: str = "average", batch_size: int = 0) -> None:
    """
    Read the input file and write the ELMo vectors to the output file.
    :param elmo: ELMo embedder
    :param infile: input file with the sentences
    :param outfile: output vector file
    :param mode: how the ELMo layers are combined into word vectors
    :param batch_size: batch size for embedding; values < 1 disable batching
    :return:
    """
    reader = Reader()
    insts = reader.read_txt(infile, -1)
    all_vecs = []
    all_sents = []
    for inst in insts:
        all_sents.append(inst.input.words)
    if batch_size < 1:  # unbatched: embed one sentence at a time
        for sent in tqdm(all_sents, desc="Elmo Embedding"):
            elmo_vecs = elmo.embed_sentence(sent)
            vec = parse_sentence(elmo_vecs, mode=mode)
            all_vecs.append(vec)
    else:  # batched prediction
        for elmo_vecs in tqdm(elmo.embed_sentences(all_sents, batch_size=batch_size),
                              desc="Elmo Embedding", total=len(all_sents)):
            vec = parse_sentence(elmo_vecs, mode=mode)
            all_vecs.append(vec)
    print("Finished embedding the ELMo sequences; saving the vector file.")
    with open(outfile, 'wb') as f:
        pickle.dump(all_vecs, f)
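# The helper parse_sentence used above is not shown here. A minimal sketch of
# one plausible implementation, assuming `mode` chooses between averaging the
# three biLM layers and keeping only the top layer; the body below is an
# assumption, not necessarily the repository's actual helper.
import numpy as np

def parse_sentence_sketch(elmo_vecs: np.ndarray, mode: str = "average") -> np.ndarray:
    # elmo_vecs has shape (3, seq_len, 1024): one vector per token per biLM layer
    if mode == "average":
        return np.average(elmo_vecs, axis=0)  # mean over the three layers -> (seq_len, 1024)
    return elmo_vecs[-1]  # otherwise keep only the top LSTM layer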
def read_parse_write(tokenizer: BertTokenizer, bert_client: BertClient, infile: str, outfile: str, mode) -> None:
    """
    Read the input file and write the BERT vectors to the output file.
    :param tokenizer: BERT word-piece tokenizer
    :param bert_client: BertClient connected to a bert-as-service server
    :param infile: input file with the sentences
    :param outfile: output vector file
    :param mode: how word-piece vectors are mapped back to word vectors
    :return:
    """
    reader = Reader()
    insts = reader.read_txt(infile, -1)
    all_vecs = []
    all_sents = []
    for inst in insts:
        all_sents.append(inst.input.words)
    for sent in tqdm(all_sents, desc="BERT encoding"):
        word_piece_tokens, word_to_piece_index = bert_tokenize_words(tokenizer, sent, mode=mode)
        # exclude the [CLS] and [SEP] vectors
        bert_vec = np.squeeze(bert_client.encode([word_piece_tokens], is_tokenized=True), axis=0)[1:-1, :]
        # keep one vector per original word
        bert_vec = bert_vec[word_to_piece_index, :]
        all_vecs.append(bert_vec)
    print("Finished embedding the BERT sequences; saving the vector file.")
    with open(outfile, 'wb') as f:
        pickle.dump(all_vecs, f)
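# Usage sketch for the bert-as-service path above. The checkpoint name, file
# paths, and mode value are assumptions, and a bert-serving server must already
# be running with token-level output enabled (started with
# `-pooling_strategy NONE`), since the code slices per-word-piece vectors out
# of the response.
def demo_bert_encoding():
    tokenizer = BertTokenizer.from_pretrained("bert-base-cased")  # assumed checkpoint
    bert_client = BertClient()  # connects to a server on localhost by default
    # "first": assumed mode value for mapping word pieces back to words
    read_parse_write(tokenizer, bert_client, "data/train.txt", "data/train.bert.vec", mode="first")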
def read_parse_write(elmo: ElmoEmbedder, infile: str, outfile: str, mode: str = "average") -> None:
    """
    Read the input file and write the ELMo vectors to the output file.
    :param elmo: ELMo embedder
    :param infile: input file with the sentences
    :param outfile: output vector file
    :param mode: how the ELMo layers are combined into word vectors
    :return:
    """
    reader = Reader()
    insts = reader.read_txt(infile, -1)
    all_vecs = []
    for inst in tqdm(insts):
        # the pos_tags argument is removed for models without the additional embedding for materials
        vec = parse_sentence(elmo, inst.input.words, mode=mode)
        all_vecs.append(vec)
    print("Finished embedding the ELMo sequences; saving the vector file.")
    with open(outfile, 'wb') as f:
        pickle.dump(all_vecs, f)
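# Usage sketch: building the embedder and running the function above. ElmoEmbedder
# comes from allennlp.commands.elmo (AllenNLP 0.x) and downloads the default ELMo
# weights when constructed without arguments; the file paths are assumptions.
def demo_elmo_encoding():
    elmo = ElmoEmbedder(cuda_device=-1)  # set cuda_device=0 to embed on GPU
    read_parse_write(elmo, "data/train.txt", "data/train.elmo.vec", mode="average")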
def read_parse_write(bert: DistilBertModel, bert_path: str, infile: str, outfile: str, mode: str = "average", batch_size: int = 0) -> None:
    """
    Read the input file and write the BERT vectors to the output file.
    :param bert: BERT embedder
    :param bert_path: path or name of the pretrained checkpoint, used for tokenization
    :param infile: input file with the sentences
    :param outfile: output vector file
    :param mode: how the BERT vectors are combined into word vectors
    :param batch_size: batch size for encoding; values < 1 are clamped to 1
    :return:
    """
    reader = Reader()
    insts = reader.read_txt(infile, -1)
    all_vecs = []
    all_sents = []
    for inst in insts:
        all_sents.append(inst.input.words)
    dataset = CustomDataset(all_sents, bert_path)
    batch_size = max(1, batch_size)  # make sure batch_size is at least 1
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False, num_workers=4)
    bert = bert.cuda() if CUDA else bert
    for batch, n_pads in tqdm(dataloader):
        with torch.no_grad():
            batch = batch.cuda() if CUDA else batch
            bert_batch_vecs = bert(batch)[0].cpu().numpy()
            vectors = parse_sentence(bert_batch_vecs, mode=mode)
            for j in range(vectors.shape[0]):
                # strip the padding positions; slicing with :-0 would return an
                # empty array, so compute the end index explicitly
                end = vectors.shape[1] - int(n_pads[j])
                all_vecs.append(vectors[j, :end, :])
    print("Finished embedding the BERT sequences; saving the vector file.")
    with open(outfile, 'wb') as f:
        pickle.dump(all_vecs, f)
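# Usage sketch for the DistilBERT path above; the checkpoint name and file paths
# are assumptions. DistilBertModel.from_pretrained comes from HuggingFace
# transformers, and bert_path is reused inside CustomDataset for tokenization.
def demo_distilbert_encoding():
    bert_path = "distilbert-base-uncased"  # assumed checkpoint
    bert = DistilBertModel.from_pretrained(bert_path)
    bert.eval()  # disable dropout so the embeddings are deterministic
    read_parse_write(bert, bert_path, "data/train.txt", "data/train.distilbert.vec",
                     mode="average", batch_size=32)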
def main():
    TASKS = ['ner_german', 'ner']
    USE_DEV = True

    # Build a character index shared across all tasks.
    char_set = set()
    for task in TASKS:
        t = __import__(task)
        data_list = [t.TRAIN_DATA, t.DEV_DATA, t.TEST_DATA]
        char_index, _ = t.create_char_index(data_list)
        for k, v in char_index.items():
            char_set.add(k)
    char_index, char_cnt = {}, 0
    for char in char_set:
        char_index[char] = char_cnt
        char_cnt += 1

    # Load word-, char-, and gazetteer-level data for each task.
    for task in TASKS:
        t = __import__(task)
        word_index, word_cnt = t.create_word_index([t.TRAIN_DATA, t.DEV_DATA, t.TEST_DATA])
        wx, y, m = t.read_data(t.TRAIN_DATA, word_index)
        if USE_DEV and task == 'ner':
            dev_wx, dev_y, dev_m = t.read_data(t.TEST_DATA, word_index)
            wx, y, m = np.vstack((wx, dev_wx)), np.vstack((y, dev_y)), np.vstack((m, dev_m))
        twx, ty, tm = t.read_data(t.DEV_DATA, word_index)
        x, cm = t.read_char_data(t.TRAIN_DATA, char_index)
        if USE_DEV and task == 'ner':
            dev_x, dev_cm = t.read_char_data(t.TEST_DATA, char_index)
            x, cm = np.vstack((x, dev_x)), np.vstack((cm, dev_cm))
        tx, tcm = t.read_char_data(t.DEV_DATA, char_index)
        if task == 'ner':
            list_prefix = t.read_list()
            gaze = t.read_list_data(t.TRAIN_DATA, list_prefix)
            tgaze = t.read_list_data(t.DEV_DATA, list_prefix)
            if USE_DEV:
                dev_gaze = t.read_list_data(t.TEST_DATA, list_prefix)
                gaze = np.vstack((gaze, dev_gaze))
        else:
            gaze, tgaze = None, None

    # Parse the configuration and read the datasets.
    parser = argparse.ArgumentParser(description="LSTM CRF implementation")
    opt = parse_arguments(parser)
    conf = Config(opt)
    reader = Reader(conf.digit2zero)
    setSeed(opt, conf.seed)

    trains = reader.read_txt(conf.train_file, conf.train_num, True)
    devs = reader.read_txt(conf.dev_file, conf.dev_num, False)
    tests = reader.read_txt(conf.test_file, conf.test_num, False)
    trains_target = reader.read_txt(conf.train_target_file, conf.train_num, True)

    if conf.context_emb != ContextEmb.none:
        print('Loading the ELMo vectors for all datasets.')
        conf.context_emb_size = reader.load_elmo_vec(conf.train_file + "." + conf.context_emb.name + ".vec", trains)
        reader.load_elmo_vec(conf.dev_file + "." + conf.context_emb.name + ".vec", devs)
        reader.load_elmo_vec(conf.test_file + "." + conf.context_emb.name + ".vec", tests)

    # Convert the label scheme to IOBES and build the vocabularies.
    conf.use_iobes(trains)
    conf.use_iobes(devs)
    conf.use_iobes(tests)
    conf.build_label_idx(trains)
    conf.use_iobes(trains_target)
    conf.build_label_idx_target(trains_target)
    conf.build_word_idx(trains, devs, tests)
    conf.build_emb_table()

    ids_train = conf.map_insts_ids(trains)
    ids_dev = conf.map_insts_ids(devs)
    ids_test = conf.map_insts_ids(tests)

    print("num chars: " + str(conf.num_char))
    print("num words: " + str(len(conf.word2idx)))

    if opt.mode == "train":
        learn_from_insts(conf, conf.num_epochs, trains, devs, tests)
    else:
        # Load the trained model and evaluate it.
        test_model(conf, tests)
    print(opt.mode)
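# Invocation sketch; the flag name is an assumption based on opt.mode, since
# parse_arguments is not shown here:
#   python main.py --mode train   # train the LSTM-CRF
#   python main.py --mode test    # load the trained model and evaluate
if __name__ == "__main__":
    main()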