# Module-level imports assumed: numpy as np, plus `data`, `tokenizer`, and
# `utils` from fairseq.
def score_sentence(self, line):
    # Tokenize the input sentence into a batch of size one.
    tokens = tokenizer.Tokenizer.tokenize(
        line, self.task.dictionary, add_if_not_exist=False).long()
    lengths = np.array([tokens.numel()])
    ds = data.TokenBlockDataset(
        tokens, lengths, self.args.tokens_per_sample,
        pad=self.task.dictionary.pad(),
        eos=self.task.dictionary.eos(),
        break_mode=self.args.sample_break_mode,
        include_targets=True,
    )

    # Create a batch iterator to wrap the data.
    add_eos_for_other_targets = (
        self.args.sample_break_mode is not None
        and self.args.sample_break_mode != 'none'
    )
    itr = self.task.get_batch_iterator(
        dataset=data.MonolingualDataset(
            ds, ds.sizes, self.task.dictionary, self.task.target_dictionary,
            add_eos_for_other_targets=add_eos_for_other_targets,
            shuffle=False, targets=self.task.targets,
        ),
        max_tokens=self.args.max_tokens or 3000,
        max_sentences=self.args.max_sentences,
        max_positions=utils.resolve_max_positions(
            *[model.max_positions() for model in self.models]),
        num_shards=self.args.num_shards,
        shard_id=self.args.shard_id,
        ignore_invalid_inputs=True,
    ).next_epoch_itr(shuffle=False)

    # Evaluate the sentence and return the fluency score.
    results = self.scorer.score_batched_itr(itr, cuda=self.use_cuda)
    for _, _, _, hypos in results:
        for hypo in hypos:
            # Ignore tokens with infinite scores, which can happen when
            # running low-precision inference on the GPU.
            pos_scores = hypo['positional_scores']
            word_prob = [
                score for score in pos_scores
                if score != float('-inf') and score != float('inf')
            ]
            return self._fluency_score(word_prob)
    return 0.0
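
# Usage sketch (hypothetical): `FluencyScorer` is an assumed wrapper class
# whose constructor builds self.args, self.task, self.models, self.scorer,
# and self.use_cuda before score_sentence() is called; the checkpoint path
# and data directory below are made up.
scorer = FluencyScorer('checkpoint_best.pt', 'data-bin/lm')
for line in ['he go to school every day .', 'he goes to school every day .']:
    print('{:.4f}\t{}'.format(scorer.score_sentence(line), line))
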
# Module-level imports assumed: torch, plus `data` and `utils` from fairseq.
def make_batches(self, lines):
    # Encode each input line into a tensor of token ids.
    token_lst = [
        self.task.source_dictionary.encode_line(
            line, add_if_not_exist=False).long()
        for line in lines
    ]
    length_lst = torch.LongTensor([tokens.numel() for tokens in token_lst])

    # Wrap the encoded lines in a dataset that also produces shifted targets.
    ds = data.TokenBlockDataset(
        token_lst, length_lst, self.args.tokens_per_sample,
        pad=self.task.dictionary.pad(),
        eos=self.task.dictionary.eos(),
        break_mode='eos', include_targets=True,
    )
    add_eos_for_other_targets = (
        self.args.sample_break_mode is not None
        and self.args.sample_break_mode != 'none'
    )

    # Batch the dataset and return a single-epoch iterator over it.
    itr = self.task.get_batch_iterator(
        dataset=data.MonolingualDataset(
            ds, ds.sizes, self.task.dictionary, self.task.target_dictionary,
            add_eos_for_other_targets, shuffle=False,
            targets=self.task.targets,
        ),
        max_tokens=self.args.max_tokens or 3000,
        max_sentences=self.args.max_sentences,
        max_positions=utils.resolve_max_positions(
            *[model.max_positions() for model in self.models]),
        num_shards=self.args.num_shards,
        shard_id=self.args.shard_id,
        ignore_invalid_inputs=True,
        num_workers=self.args.num_workers,
    ).next_epoch_itr(shuffle=False)
    return itr
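
# Usage sketch (hypothetical): consume the iterator with fairseq's
# SequenceScorer to get per-token scores for each line. Assumes this method
# lives on a wrapper object that also holds self.task and self.models; the
# SequenceScorer.generate() call follows the newer fairseq API and may
# differ across versions.
from fairseq.sequence_scorer import SequenceScorer

seq_scorer = SequenceScorer(self.task.target_dictionary)
for sample in self.make_batches(lines):
    # generate() returns one list of hypotheses per sentence in the batch.
    for hypos in seq_scorer.generate(self.models, sample):
        print(hypos[0]['positional_scores'])
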
# Module-level imports assumed: numpy as np, plus `data` and `tokenizer`
# from fairseq.
def make_batches(self, lines, src_dict, max_positions, tokenize=str.split):
    tokens = [
        tokenizer.Tokenizer.tokenize(
            src_str, src_dict, add_if_not_exist=False,
            tokenize=tokenize).long()
        for src_str in lines
    ]
    lengths = np.array([t.numel() for t in tokens])

    # Load the dataset. MonolingualDataset[i] = (source, future_target,
    # past_target); all targets are effectively ignored during inference.
    dataset = data.MonolingualDataset(
        dataset=[(s[:-1], s[1:], None) for s in tokens],
        sizes=lengths, src_vocab=src_dict, tgt_vocab=src_dict,
        add_eos_for_other_targets=False, shuffle=False,
    )
    itr = self.task.get_batch_iterator(
        dataset=dataset,
        max_tokens=100,
        max_sentences=5,
        max_positions=max_positions,
    ).next_epoch_itr(shuffle=False)
    return itr
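
# Usage sketch (hypothetical): the `tokenize` hook controls how each line is
# split before dictionary lookup; the default `str.split` gives whitespace
# tokenization, while e.g. `list` would give a character-level split.
# `self.models` and the fairseq `utils` module are assumed to be available.
max_positions = utils.resolve_max_positions(
    *[model.max_positions() for model in self.models])
itr = self.make_batches(lines, src_dict, max_positions, tokenize=str.split)
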
import numpy as np

from fairseq import data, tokenizer


def make_batches(lines, src_dict, max_positions):
    tokens = [
        tokenizer.Tokenizer.tokenize(
            src_str, src_dict, add_if_not_exist=False).long()
        for src_str in lines
    ]
    # Reverse mapping from token id back to word; unused below, but handy
    # when debugging the tokenized batches.
    idx_to_words = {v: k for k, v in src_dict.indices.items()}
    lengths = np.array([t.numel() for t in tokens])
    itr = data.EpochBatchIterator(
        dataset=data.MonolingualDataset(
            [(s[:-1], s[1:]) for s in tokens], lengths, src_dict, False),
        max_tokens=100,
        max_sentences=5,
        max_positions=max_positions,
    ).next_epoch_itr(shuffle=False)
    return itr
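
# Toy illustration (runnable as-is) of the (s[:-1], s[1:]) pairing used in
# the last two snippets: for language model scoring, the input is every
# token except the last, and the target is the same sequence shifted one
# position left, so position i predicts token i+1. The ids below are made up.
import torch

s = torch.LongTensor([42, 7, 19, 3, 2])  # e.g. "the cat sat ." plus </s>
source, target = s[:-1], s[1:]
print(source.tolist())  # [42, 7, 19, 3]
print(target.tolist())  # [7, 19, 3, 2]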