# Module-level imports assumed: numpy as np, plus `data`, `tokenizer`, and
# `utils` from fairseq.
def score_sentence(self, line):
    # Tokenize the input sentence into a batch of size one.
    tokens = tokenizer.Tokenizer.tokenize(
        line, self.task.dictionary, add_if_not_exist=False).long()
    lengths = np.array([tokens.numel()])
    ds = data.TokenBlockDataset(
        tokens, lengths, self.args.tokens_per_sample,
        pad=self.task.dictionary.pad(),
        eos=self.task.dictionary.eos(),
        break_mode=self.args.sample_break_mode,
        include_targets=True,
    )

    # Create a batch iterator to wrap the data.
    add_eos_for_other_targets = (
        self.args.sample_break_mode is not None
        and self.args.sample_break_mode != 'none'
    )
    itr = self.task.get_batch_iterator(
        dataset=data.MonolingualDataset(
            ds, ds.sizes, self.task.dictionary, self.task.target_dictionary,
            add_eos_for_other_targets=add_eos_for_other_targets,
            shuffle=False, targets=self.task.targets,
        ),
        max_tokens=self.args.max_tokens or 3000,
        max_sentences=self.args.max_sentences,
        max_positions=utils.resolve_max_positions(
            *[model.max_positions() for model in self.models]),
        num_shards=self.args.num_shards,
        shard_id=self.args.shard_id,
        ignore_invalid_inputs=True,
    ).next_epoch_itr(shuffle=False)

    # Evaluate the sentence and return the fluency score.
    results = self.scorer.score_batched_itr(itr, cuda=self.use_cuda)
    for _, _, _, hypos in results:
        for hypo in hypos:
            # Ignore tokens with infinite scores, which can happen when
            # running low-precision inference on the GPU.
            pos_scores = hypo['positional_scores']
            word_prob = [
                score for score in pos_scores
                if score != float('-inf') and score != float('inf')
            ]
            return self._fluency_score(word_prob)
    return 0.0
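
# Usage sketch (hypothetical): `FluencyScorer` is an assumed wrapper class
# whose constructor builds self.args, self.task, self.models, self.scorer,
# and self.use_cuda before score_sentence() is called; the checkpoint path
# and data directory below are made up.
scorer = FluencyScorer('checkpoint_best.pt', 'data-bin/lm')
for line in ['he go to school every day .', 'he goes to school every day .']:
    print('{:.4f}\t{}'.format(scorer.score_sentence(line), line))
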
# Module-level imports assumed: torch, plus `data` and `utils` from fairseq.
def make_batches(self, lines):
    # Encode each input line into a tensor of token ids.
    token_lst = [
        self.task.source_dictionary.encode_line(
            line, add_if_not_exist=False).long()
        for line in lines
    ]
    length_lst = torch.LongTensor([tokens.numel() for tokens in token_lst])

    # Wrap the encoded lines in a dataset that also produces shifted targets.
    ds = data.TokenBlockDataset(
        token_lst, length_lst, self.args.tokens_per_sample,
        pad=self.task.dictionary.pad(),
        eos=self.task.dictionary.eos(),
        break_mode='eos', include_targets=True,
    )
    add_eos_for_other_targets = (
        self.args.sample_break_mode is not None
        and self.args.sample_break_mode != 'none'
    )

    # Batch the dataset and return a single-epoch iterator over it.
    itr = self.task.get_batch_iterator(
        dataset=data.MonolingualDataset(
            ds, ds.sizes, self.task.dictionary, self.task.target_dictionary,
            add_eos_for_other_targets, shuffle=False,
            targets=self.task.targets,
        ),
        max_tokens=self.args.max_tokens or 3000,
        max_sentences=self.args.max_sentences,
        max_positions=utils.resolve_max_positions(
            *[model.max_positions() for model in self.models]),
        num_shards=self.args.num_shards,
        shard_id=self.args.shard_id,
        ignore_invalid_inputs=True,
        num_workers=self.args.num_workers,
    ).next_epoch_itr(shuffle=False)
    return itr
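
# Usage sketch (hypothetical): consume the iterator with fairseq's
# SequenceScorer to get per-token scores for each line. Assumes this method
# lives on a wrapper object that also holds self.task and self.models; the
# SequenceScorer.generate() call follows the newer fairseq API and may
# differ across versions.
from fairseq.sequence_scorer import SequenceScorer

seq_scorer = SequenceScorer(self.task.target_dictionary)
for sample in self.make_batches(lines):
    # generate() returns one list of hypotheses per sentence in the batch.
    for hypos in seq_scorer.generate(self.models, sample):
        print(hypos[0]['positional_scores'])
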
# Module-level imports assumed: numpy as np, plus `data` and `tokenizer`
# from fairseq.
def make_batches(self, lines, src_dict, max_positions, tokenize=str.split):
    tokens = [
        tokenizer.Tokenizer.tokenize(
            src_str, src_dict, add_if_not_exist=False,
            tokenize=tokenize).long()
        for src_str in lines
    ]
    lengths = np.array([t.numel() for t in tokens])

    # Load the dataset. MonolingualDataset[i] = (source, future_target,
    # past_target); all targets are effectively ignored during inference.
    dataset = data.MonolingualDataset(
        dataset=[(s[:-1], s[1:], None) for s in tokens],
        sizes=lengths, src_vocab=src_dict, tgt_vocab=src_dict,
        add_eos_for_other_targets=False, shuffle=False,
    )
    itr = self.task.get_batch_iterator(
        dataset=dataset,
        max_tokens=100,
        max_sentences=5,
        max_positions=max_positions,
    ).next_epoch_itr(shuffle=False)
    return itr
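
# Usage sketch (hypothetical): the `tokenize` hook controls how each line is
# split before dictionary lookup; the default `str.split` gives whitespace
# tokenization, while e.g. `list` would give a character-level split.
# `self.models` and the fairseq `utils` module are assumed to be available.
max_positions = utils.resolve_max_positions(
    *[model.max_positions() for model in self.models])
itr = self.make_batches(lines, src_dict, max_positions, tokenize=str.split)
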
import numpy as np

from fairseq import data, tokenizer


def make_batches(lines, src_dict, max_positions):
    tokens = [
        tokenizer.Tokenizer.tokenize(
            src_str, src_dict, add_if_not_exist=False).long()
        for src_str in lines
    ]
    # Reverse mapping from token id back to word; unused below, but handy
    # when debugging the tokenized batches.
    idx_to_words = {v: k for k, v in src_dict.indices.items()}
    lengths = np.array([t.numel() for t in tokens])
    itr = data.EpochBatchIterator(
        dataset=data.MonolingualDataset(
            [(s[:-1], s[1:]) for s in tokens], lengths, src_dict, False),
        max_tokens=100,
        max_sentences=5,
        max_positions=max_positions,
    ).next_epoch_itr(shuffle=False)
    return itr
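
# Toy illustration (runnable as-is) of the (s[:-1], s[1:]) pairing used in
# the last two snippets: for language model scoring, the input is every
# token except the last, and the target is the same sequence shifted one
# position left, so position i predicts token i+1. The ids below are made up.
import torch

s = torch.LongTensor([42, 7, 19, 3, 2])  # e.g. "the cat sat ." plus </s>
source, target = s[:-1], s[1:]
print(source.tolist())  # [42, 7, 19, 3]
print(target.tolist())  # [7, 19, 3, 2]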