import json
import pickle

import torch
from torch.utils.data import DataLoader
from tqdm import tqdm

# RNNEncoder, RNNDecoder, Seq2Seq, and Tokenizer are project-local modules.


def eval(args):
    batch_size = 32
    device = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')

    # Build the seq2seq model and restore the trained weights.
    enc = RNNEncoder(300, args.embedding_file)
    dec = RNNDecoder(300, args.embedding_file)
    model = Seq2Seq(enc, dec, device).to(device)
    ckpt = torch.load(args.model_path, map_location=device)
    model.load_state_dict(ckpt['state_dict'])
    model.eval()

    # Restore the vocabulary so predicted ids can be decoded back to text.
    embedding_matrix = pickle.load(open(args.embedding_file, 'rb'))
    tokenizer = Tokenizer(lower=True)
    tokenizer.set_vocab(embedding_matrix.vocab)

    eval_data = pickle.load(open(args.test_data_path, 'rb'))
    eval_loader = DataLoader(eval_data,
                             batch_size=batch_size,
                             num_workers=0,
                             shuffle=False,
                             collate_fn=eval_data.collate_fn)

    prediction = {}
    with torch.no_grad():
        for batch in tqdm(eval_loader):
            pred = model(batch, 0)  # teacher-forcing ratio 0: decode greedily
            pred = torch.argmax(pred, dim=2)  # (batch, seq_len)
            for i in range(len(pred)):
                # Keep only the text before '</s>' and drop the leading '<s>'.
                prediction[batch['id'][i]] = tokenizer.decode(
                    pred[i]).split('</s>')[0].split(' ', 1)[1]

    # Write one JSON object per line, sorted by example id.
    pred_output = [
        json.dumps({'id': key, 'predict': value})
        for key, value in sorted(prediction.items(), key=lambda item: item[0])
    ]
    with open(args.output_path, 'w') as output_file:
        output_file.write('\n'.join(pred_output))
        output_file.write('\n')
def eval(args):
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    # Hyper-parameters must match the ones used at training time.
    BATCH_SIZE = 32
    ENC_HID_DIM = 128
    DEC_HID_DIM = 128
    N_LAYERS = 1
    ENC_DROPOUT = 0.5
    DEC_DROPOUT = 0.5
    PADDING_INDEX = 0

    # Restore the vocabulary and the pretrained embedding matrix.
    embedding = pickle.load(open(args.embedding_file, 'rb'))
    tokenizer = Tokenizer(lower=True)
    tokenizer.set_vocab(embedding.vocab)
    embedding_matrix = embedding.vectors.to(device)
    output_dim = len(embedding.vectors)
    embedding_dim = 300

    # Build the attention-based seq2seq model and load the checkpoint.
    attn = Attention(ENC_HID_DIM, DEC_HID_DIM)
    encoder = Encoder(embedding_dim, ENC_HID_DIM, DEC_HID_DIM,
                      embedding_matrix, N_LAYERS, ENC_DROPOUT)
    decoder = Decoder(output_dim, embedding_dim, ENC_HID_DIM, DEC_HID_DIM,
                      embedding_matrix, N_LAYERS, DEC_DROPOUT, attn)
    model = Seq2Seq(encoder, decoder, PADDING_INDEX, device).to(device)
    ckpt = torch.load(args.model_path, map_location=device)
    model.load_state_dict(ckpt['state_dict'])
    model.eval()

    eval_data = pickle.load(open(args.test_data_path, 'rb'))
    eval_loader = DataLoader(eval_data,
                             batch_size=BATCH_SIZE,
                             num_workers=0,
                             shuffle=False,
                             collate_fn=eval_data.collate_fn)

    prediction = {}
    with torch.no_grad():
        for batch in tqdm(eval_loader):
            pred, attention = model(batch, 0)
            pred = torch.argmax(pred, dim=2)  # (seq_len, batch)
            pred = pred.permute(1, 0)         # (batch, seq_len)
            for i in range(len(pred)):
                # Keep only the text before '</s>' and drop the leading '<s>'.
                prediction[batch['id'][i]] = tokenizer.decode(
                    pred[i]).split('</s>')[0].split(' ', 1)[1]

    # Write one JSON object per line, sorted by example id.
    pred_output = [
        json.dumps({'id': key, 'predict': value})
        for key, value in sorted(prediction.items(), key=lambda item: item[0])
    ]
    with open(args.output_path, 'w') as output_file:
        output_file.write('\n'.join(pred_output))
        output_file.write('\n')
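# A minimal sketch of how either eval() above might be invoked from the
# command line. The flag names simply mirror the attributes the functions
# read (model_path, embedding_file, test_data_path, output_path); the real
# project may wire these up differently, so treat this driver as an
# assumption rather than the repository's actual entry point.
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Run seq2seq evaluation.')
    parser.add_argument('--model_path', required=True)
    parser.add_argument('--embedding_file', required=True)
    parser.add_argument('--test_data_path', required=True)
    parser.add_argument('--output_path', required=True)
    eval(parser.parse_args())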
class Summarizer:
    """Use a test model to generate fielded query sentences from documents."""

    def __init__(self, f_abs, n_best=1, min_length=1, max_length=50,
                 beam_size=4, bert_model='bert-base-uncased'):
        self.n_best = n_best
        self.min_length = min_length
        self.max_length = max_length
        self.beam_size = beam_size
        self.abs_model = self.load_abs_model(f_abs)
        self.eval()
        logger.info(f'Loading BERT Tokenizer [{bert_model}]...')
        self.tokenizerB = BertTokenizer.from_pretrained(bert_model)
        self.spt_ids_B, self.spt_ids_C, self.eos_mapping = get_special_tokens()
        logger.info('Loading custom Tokenizer for using WBMET embeddings')
        self.tokenizerC = Tokenizer(self.abs_model.args.vocab_size)
        self.tokenizerC.from_pretrained(self.abs_model.args.file_dec_emb)

    @staticmethod
    def load_abs_model(f_abs):
        """Load a pre-trained abstractive model."""
        logger.info(f'Loading an abstractive test model from {f_abs}...')
        data = torch.load(f_abs, map_location=lambda storage, loc: storage)
        mdl = AbstractiveSummarizer(data['args'])
        # load_state_dict returns a (missing, unexpected) keys tuple, not the
        # module, so move the model to the GPU in a separate step.
        mdl.load_state_dict(data['model'])
        return mdl.cuda()

    def translate(self, docs):
        """Translate a batch of documents."""
        batch_size = docs.inp.size(0)
        spt_ids = self.spt_ids_C
        decode_strategy = BeamSearch(self.beam_size, batch_size, self.n_best,
                                     self.min_length, self.max_length,
                                     spt_ids, self.eos_mapping)
        return self._translate_batch_with_strategy(docs, decode_strategy)

    def _translate_batch_with_strategy(self, batch, decode_strategy):
        """Translate a batch of documents step by step using cache.

        :param batch (dict): A batch of documents
        :param decode_strategy (DecodeStrategy): A decode strategy for
            generating translations step by step, i.e., BeamSearch
        """
        # (1) Run the encoder on the src.
        ext_scores, hidden_states = \
            self.abs_model.encoder(batch.inp,
                                   attention_mask=batch.mask_inp,
                                   token_type_ids=batch.segs)

        # (2) Prepare the decoder and decode_strategy.
        self.abs_model.decoder.init_state(batch.inp)
        field_signals = batch.tgt[:, 0]
        fn_map_state, memory_bank, memory_pad_mask = \
            decode_strategy.initialize(hidden_states[-1], batch.src_lens,
                                       field_signals)
        if fn_map_state is not None:
            self.abs_model.decoder.map_state(fn_map_state)

        # (3) Begin decoding step by step:
        for step in range(decode_strategy.max_length):
            decoder_input = decode_strategy.current_predictions.unsqueeze(-1)
            dec_out, attns = self.abs_model.decoder(decoder_input, memory_bank,
                                                    memory_pad_mask, step=step)
            log_probs = self.abs_model.generator(dec_out[:, -1, :].squeeze(1))

            # Beam advance
            decode_strategy.advance(log_probs, attns)
            any_finished = decode_strategy.is_finished.any()
            if any_finished:
                decode_strategy.update_finished()
                if decode_strategy.done:
                    break
            select_indices = decode_strategy.select_indices
            if any_finished:
                # Reorder states.
                memory_bank = memory_bank.index_select(0, select_indices)
                memory_pad_mask = memory_pad_mask.index_select(
                    0, select_indices)
            if self.beam_size > 1 or any_finished:
                self.abs_model.decoder.map_state(
                    lambda state, dim: state.index_select(dim, select_indices))

        res = {
            'batch': batch,
            'gold_scores': self._gold_score(batch, hidden_states[-1],
                                            batch.mask_inp),
            'scores': decode_strategy.scores,
            'predictions': decode_strategy.predictions,
            'ext_scores': ext_scores,
            'attentions': decode_strategy.attention
        }
        return res

    def results_to_translations(self, data):
        """Convert results into Translation objects."""
        batch = data['batch']
        translations = []
        for i, did in enumerate(batch.did):
            src_input_ = batch.inp[i]
            src_ = self.tokenizerB.decode(src_input_)
            topic_ = \
                self.tokenizerC.convert_id_to_token(batch.tgt[i][0].item())
            pred_sents_ = [
                self.tokenizerC.decode(data['predictions'][i][n])
                for n in range(self.n_best)
            ]
            gold_sent_ = self.tokenizerC.decode(batch.tgt[i])
            x = Translation(did=did,
                            src_input=src_input_,
                            src=src_,
                            topic=topic_,
                            ext_scores=data['ext_scores'][i],
                            pred_sents=pred_sents_,
                            pred_scores=data['scores'][i],
                            gold_sent=gold_sent_,
                            gold_score=data['gold_scores'][i],
                            attentions=data['attentions'][i])
            translations.append(x)
        return translations

    def _gold_score(self, batch, memory_bank, memory_pad_mask):
        if hasattr(batch, 'tgt'):
            gs = self._score_target(batch, memory_bank, memory_pad_mask)
            self.abs_model.decoder.init_state(batch.inp)
        else:
            gs = [0] * batch.batch_size
        return gs

    def _score_target(self, batch, memory_bank, memory_pad_mask):
        tgt_in = batch.tgt[:, :-1]
        dec_out, _ = self.abs_model.decoder(tgt_in, memory_bank,
                                            memory_pad_mask)
        log_probs = self.abs_model.generator(dec_out)
        gold = batch.tgt[:, 1:]
        # Zero out the scores at padding positions before summing.
        tgt_pad_mask = gold.eq(self.spt_ids_C['[PAD]'])
        log_probs[tgt_pad_mask] = 0
        gold_scores = log_probs.gather(2, gold.unsqueeze(-1))
        gold_scores = gold_scores.sum(dim=1).view(-1)
        return gold_scores.tolist()

    def eval(self):
        self.abs_model.eval()
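# A hedged usage sketch for the Summarizer above. The checkpoint path and the
# batch source are assumptions: `batch_iter` stands in for whatever DataLoader
# the project uses, and each batch is expected to expose the attributes the
# class reads (.inp, .mask_inp, .segs, .src_lens, .tgt, .did).
summarizer = Summarizer('models/abs_model.pt', n_best=1, beam_size=4)
for batch in batch_iter:  # hypothetical iterator over test batches
    results = summarizer.translate(batch)
    for translation in summarizer.results_to_translations(results):
        print(translation.did, translation.pred_sents[0])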
# This snippet comes from a larger script: `tokenizer`, `dataset`, `hparams`,
# and `args` are assumed to be defined earlier.
with open(hparams.embedding_path, 'rb') as f:
    embedding = pickle.load(f)
tokenizer.set_vocab(embedding.vocab)

dataLoader = DataLoader(dataset,
                        hparams.batch_size,
                        shuffle=False,
                        collate_fn=dataset.collate_fn)

model = Model(hparams)
model.load(hparams.load_model_path)

st_time = datetime.now()
predicts = model.eval(dataLoader)

results = []
for predict in predicts:
    res = {
        "id": predict["id"],
        "predict": tokenizer.decode(predict["predict"])
    }
    results.append(res)

# Write one JSON object per line.
with open(args.store_path, 'w') as fp:
    for result in results:
        s = json.dumps(result)
        fp.write(f"{s}\n")

print(f"Cost time: {datetime.now()-st_time}")
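# All of the eval paths above emit predictions in the same JSON Lines format:
# one {"id": ..., "predict": ...} object per line. This small helper is an
# illustration (not part of the original code) of how such a file can be
# read back for scoring or inspection.
import json


def load_predictions(path):
    """Return a dict mapping example id to predicted text."""
    with open(path) as fp:
        return {obj['id']: obj['predict']
                for obj in (json.loads(line) for line in fp if line.strip())}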