Python collate_tokens Examples, fairseq.data.data_utils.collate_tokens Python Examples

Example #1

0

Show file

File: dptree2seq_dataset.py Project: MaratSaidov/source-code-summarization

    def merge_source(left_pad, move_eos_to_beginning=False):
        """Collate the per-sample ``source`` dicts into one batched dict.

        ``samples``, ``pad_idx`` and ``eos_idx`` are read from the enclosing
        scope.

        Args:
            left_pad: forwarded to the collate helpers.
            move_eos_to_beginning: forwarded to the collate helpers.

        Returns:
            dict with the keys ``nodes``, ``labels``, ``indices`` and
            ``length``, each holding the batched value.
        """
        assert samples[0]['source'] is not None
        # Turn the list of per-sample dicts into one dict of lists, using the
        # keys of the first sample.
        src = {
            k: [dic['source'][k] for dic in samples]
            for k in samples[0]['source']
        }

        nodes = src['nodes']
        labels = src['labels']
        indices = src['indices']
        length = src['length']

        nodes = data_utils.collate_tokens(nodes, pad_idx, eos_idx, left_pad,
                                          move_eos_to_beginning)
        labels = data_utils.collate_tokens(labels, pad_idx, eos_idx, left_pad,
                                           move_eos_to_beginning)
        # Indices are collated with 0 in place of the pad/eos ids.
        indices = dptree2seq_collate_indices(indices, 0, 0, left_pad,
                                             move_eos_to_beginning)
        # Non in-place unsqueeze: the in-place variant permanently changed the
        # shape of the tensors owned by the samples themselves.
        length = torch.cat([x.unsqueeze(0) for x in length], 0)

        return {
            'nodes': nodes,
            'labels': labels,
            'indices': indices,
            'length': length
        }

Example #2

0

Show file

    def evaluate_aspect(self):
        """Compute the mean loss over the ``'val'`` split of the aspect dataset.

        The split is walked in slices of ``LIL_BATCH_SIZE``. For each slice the
        two losses returned by ``_get_both_loss`` are summed under
        ``torch.no_grad()``.

        Returns:
            The arithmetic mean of the per-slice losses (each slice has equal
            weight regardless of its size).
        """
        assert 'val' in self._aspect_dataset
        self._bart.set_mode('train')
        self._bart.eval()

        val_examples = self._aspect_dataset['val']
        slice_losses = []
        for start in range(0, len(val_examples), LIL_BATCH_SIZE):
            examples = val_examples[start:start + LIL_BATCH_SIZE]
            token_pad = self._bart.dictionary.pad()

            lengths = torch.tensor([len(ex.src_tokens) for ex in examples])
            sources = collate_tokens([ex.src_tokens for ex in examples],
                                     pad_idx=token_pad)
            targets = collate_tokens([ex.tgt_tokens for ex in examples],
                                     pad_idx=token_pad)
            # Labels use their own padding index, not the token dictionary's.
            labels = collate_tokens([ex.tgt_labels for ex in examples],
                                    pad_idx=self._bart.pad_label_index)

            with torch.no_grad():
                seq2seq_loss, seqlab_loss = self._get_both_loss(
                    src_lengths=lengths,
                    src_tokens=sources,
                    tgt_tokens=targets,
                    tgt_labels=labels)
                total = seq2seq_loss + seqlab_loss

            slice_losses.append(total.item())

        return sum(slice_losses) / len(slice_losses)

Example #3

0

Show file

    def evaluate(self):
        """Compute the mean loss over the ``'dev'`` split.

        The split is walked in slices of ``LIL_BATCH_SIZE`` (with a progress
        bar). Each slice is scored by ``_get_label_smoothed_nll_loss`` with
        ``epsilon=0.`` under ``torch.no_grad()``.

        Returns:
            The arithmetic mean of the per-slice losses.
        """
        assert 'dev' in self._dataset
        self._model.split_to_gpus(n_gpus=1)
        self._model.eval()

        dev_examples = self._dataset['dev']
        progress = trange(0,
                          len(dev_examples),
                          LIL_BATCH_SIZE,
                          desc='Evaluating on Dev Set')

        slice_losses = []
        for start in progress:
            examples = dev_examples[start:start + LIL_BATCH_SIZE]
            token_pad = self._model.dictionary.pad()

            lengths = torch.tensor([len(ex.src_tokens) for ex in examples])
            sources = collate_tokens([ex.src_tokens for ex in examples],
                                     pad_idx=token_pad)
            targets = collate_tokens([ex.tgt_tokens for ex in examples],
                                     pad_idx=token_pad)

            with torch.no_grad():
                slice_loss = self._get_label_smoothed_nll_loss(
                    src_lengths=lengths,
                    src_tokens=sources,
                    tgt_tokens=targets,
                    epsilon=0.)

            slice_losses.append(slice_loss.item())

        return sum(slice_losses) / len(slice_losses)

Example #4

0

Show file

File: collaters.py Project: kayoyin/DialogueMT

    def collate(self, samples):
        """
        Collate a list of samples into a mini-batch dict.

        Args:
            samples: sequence of dicts, each providing ``"id"``, ``"source"``
                and ``"target"``.

        Returns:
            ``{}`` when ``samples`` is empty; otherwise a dict with ``id``,
            ``ntokens``, ``net_input`` (``src_tokens``, ``src_lengths``,
            ``prev_output_tokens``), ``target`` and ``nsentences``.
        """
        if len(samples) == 0:
            return {}

        sources = [s["source"] for s in samples]
        targets = [s["target"] for s in samples]

        ids = torch.LongTensor([s["id"] for s in samples])
        source = data_utils.collate_tokens(sources, self.pad_index, eos_idx=self.eos_index)
        target = data_utils.collate_tokens(targets, self.pad_index, eos_idx=self.eos_index)

        prev_output_tokens = data_utils.collate_tokens(
            targets,
            self.pad_index,
            self.eos_index,
            left_pad=False,
            move_eos_to_beginning=True,
        )

        # Lengths must come from the un-padded samples: every row of the
        # collated ``source`` has the same (padded) width, so measuring the
        # rows reported the maximum length for every sentence.
        src_lengths = torch.LongTensor([s.size(0) for s in sources])

        batch = {
            "id": ids,
            "ntokens": sum(len(t) for t in targets),
            "net_input": {
                "src_tokens": source,
                "src_lengths": src_lengths,
                "prev_output_tokens": prev_output_tokens,
            },
            "target": target,
            "nsentences": len(samples),
        }
        return batch

Example #5

0

Show file

    def _collate_target(
            self, samples: List[SpeechToSpeechDatasetItem]) -> torch.Tensor:
        """Batch the targets of ``samples`` and derive the decoder inputs.

        When ``self.target_is_code`` is set, targets are collated as token
        sequences and the decoder input is built from the ``pack_units``
        version of each target with EOS moved to the front. Otherwise targets
        are collated as frames and the decoder input is the target shifted by
        one step with an all-zero first frame.

        Returns:
            The triple ``(target, prev_output_tokens, target_lengths)``.
        """
        if not self.target_is_code:
            target = _collate_frames([item.target for item in samples],
                                     is_audio_input=False)
            bsz, _, d = target.size()
            # Zero frame in front, last frame dropped: a one-step shift.
            first_frame = target.new_full((bsz, 1, d), 0.0)
            prev_output_tokens = torch.cat((first_frame, target[:, :-1, :]),
                                           dim=1)
            lengths = [item.target.size(0) for item in samples]
            target_lengths = torch.tensor(lengths, dtype=torch.long)
            return target, prev_output_tokens, target_lengths

        target = fairseq_data_utils.collate_tokens(
            [item.target for item in samples],
            self.tgt_dict.pad(),
            self.tgt_dict.eos(),
            left_pad=False,
            move_eos_to_beginning=False,
        )
        # convert stacked units to a single id
        packed = [self.pack_units(item.target) for item in samples]
        prev_output_tokens = fairseq_data_utils.collate_tokens(
            packed,
            self.tgt_dict.pad(),
            self.tgt_dict.eos(),
            left_pad=False,
            move_eos_to_beginning=True,
        )
        # Lengths are measured on the packed targets, not the raw ones.
        target_lengths = torch.tensor([p.size(0) for p in packed],
                                      dtype=torch.long)
        return target, prev_output_tokens, target_lengths

Example #6

0

Show file

    def collater(self, samples: List[torch.Tensor]) -> torch.Tensor:
        """Collate target tensors into a batch dict.

        Returns:
            dict with ``prev_output_tokens`` (collated with EOS moved to the
            beginning), ``target`` (collated as-is), ``target_lengths`` (size
            of each sample along dim 0) and ``ntokens`` (sum of those sizes).
        """

        def _collate(shift_eos):
            # Right-padded batch, cast to long after collation.
            return fairseq_data_utils.collate_tokens(
                samples,
                self.dict.pad(),
                self.dict.eos(),
                left_pad=False,
                move_eos_to_beginning=shift_eos,
            ).long()

        target = _collate(False)
        prev_output_tokens = _collate(True)

        sizes = [sample.size(0) for sample in samples]

        return {
            "prev_output_tokens": prev_output_tokens,
            "target": target,
            "target_lengths": torch.tensor(sizes, dtype=torch.long),
            "ntokens": sum(sizes),
        }

Example #7

0

Show file

 def _prepare_batch_for_alignment(self, sample, hypothesis):
     """Expand the source batch per beam and collate the hypothesis tokens.

     Returns:
         ``(src_tokens, src_lengths, prev_output_tokens, tgt_tokens)`` where
         the source tensors are repeated ``self.beam_size`` times per
         sentence and the two token batches are built from every beam of
         every example in ``hypothesis``.
     """
     net_input = sample["net_input"]
     beam_size = self.beam_size

     src_tokens = net_input["src_tokens"]
     bsz = src_tokens.shape[0]
     flat_rows = bsz * beam_size
     # Insert a beam axis, repeat along it, then fold it into the batch axis.
     src_tokens = src_tokens[:, None, :].expand(-1, beam_size, -1)
     src_tokens = src_tokens.contiguous().view(flat_rows, -1)

     src_lengths = net_input["src_lengths"][:, None].expand(-1, beam_size)
     src_lengths = src_lengths.contiguous().view(flat_rows)

     # Flatten hypotheses example by example, beam by beam.
     beam_tokens = [beam["tokens"] for example in hypothesis for beam in example]

     prev_output_tokens = data_utils.collate_tokens(
         beam_tokens,
         self.pad,
         self.eos,
         self.left_pad_target,
         move_eos_to_beginning=True,
     )
     tgt_tokens = data_utils.collate_tokens(
         beam_tokens,
         self.pad,
         self.eos,
         self.left_pad_target,
         move_eos_to_beginning=False,
     )
     return src_tokens, src_lengths, prev_output_tokens, tgt_tokens

Example #8

0

Show file

File: inference.py Project: noe/iterative_expansion_lms

    def inference_step(self,
                       model: nn.Module,
                       previous_level_tokens: List[List[int]],
                       previous_level_heads: List[List[int]],
                       expansion_sampling: Callable[[torch.Tensor], torch.LongTensor] = None,
                       given_next_level_expansions: List[List[int]] = None,
                       ) -> Tuple[torch.Tensor, torch.Tensor, torch.LongTensor]:
        """
        Build the padded model inputs for one level and run ``model`` on them.

        :param model: called with the assembled keyword inputs.
        :param previous_level_tokens: per-sentence token ids of the previous level.
        :param previous_level_heads: per-sentence head indices of the previous
                 level (``-1`` marks the root node).
        :param expansion_sampling: passed to the model unchanged.
        :param given_next_level_expansions: optional per-sentence expansions;
                 when given they are padded and moved to ``self.device``.
        :return: the triple ``(token_logits, expansion_logits, expansion_ids)``
                 returned by the model.
        """

        def _pad(rows):
            # Right-pad a list of LongTensors with pad_idx; no EOS handling.
            return collate_tokens(rows,
                                  self.pad_idx,
                                  eos_idx=None,
                                  left_pad=False)

        prev_tokens = _pad([torch.LongTensor(tokens)
                            for tokens in previous_level_tokens])

        # Heads use -1 for the root node. Shifting by pad_idx + 2 maps that
        # -1 to pad_idx + 1, so no shifted head equals pad_idx.
        head_pos_shift = self.pad_idx + 2
        head_positions = _pad([torch.LongTensor(heads) + head_pos_shift
                               for heads in previous_level_heads])

        dependency_masks = [heads2causality_mask(heads)
                            for heads in previous_level_heads]
        # Each mask is inverted (1 - m) after swapping its two axes.
        inverted_masks = [1 - torch.ByteTensor(mask).permute(1, 0)
                          for mask in dependency_masks]
        prev_causality_masks = collate_masks(inverted_masks, self.pad_idx)

        if given_next_level_expansions is None:
            next_level_expansions = None
        else:
            next_level_expansions = _pad(
                [torch.LongTensor(expansions)
                 for expansions in given_next_level_expansions]).to(self.device)

        net_input = {
            KEY_PREV_LEVEL_TOKENS: prev_tokens.to(self.device),
            KEY_CAUSALITY_MASK: prev_causality_masks.to(self.device),
            KEY_HEAD_POSITIONS: head_positions.to(self.device),
            KEY_NEXT_LEVEL_EXPANS: next_level_expansions,
            'expansion_sampling': expansion_sampling,
        }

        token_logits, expansion_logits, expansion_ids = model(**net_input)

        return token_logits, expansion_logits, expansion_ids

Example #9

0

Show file

File: text_and_image_dataset.py Project: wh0330/OpenViDial

    def collater(self, samples):
        """Merge a list of samples to form a mini-batch.

        Returns ``{}`` for an empty list. Otherwise returns a dict with
        ``id``, ``net_input`` (``src_tokens``, zero-padded ``src_imgs``,
        ``src_lengths`` as a plain list, ``prev_output_tokens``), ``target``,
        ``ntokens`` and ``nsentences``.
        """
        if len(samples) == 0:
            return {}

        num_sentences = len(samples)
        source_imgs = [sample['source_imgs'] for sample in samples]
        source_texts = [sample['source_texts'] for sample in samples]
        targets = [sample['target'] for sample in samples]

        indices = torch.tensor([sample['id'] for sample in samples],
                               dtype=torch.long)
        source_lengths = [len(text) for text in source_texts]
        target_ntokens = sum(len(target) for target in targets)

        # Zero-pad the image features along dim 1 up to the longest sample.
        longest = max(imgs.size(0) for imgs in source_imgs)
        pad_imgs = torch.zeros([num_sentences, longest, self.img_dataset.dim],
                               dtype=torch.float)
        for row, imgs in enumerate(source_imgs):
            pad_imgs[row, :imgs.size(0)] = imgs

        pad_idx = self.vocab_dict.pad()
        eos_idx = self.vocab_dict.eos()

        source_texts_batch = data_utils.collate_tokens(
            source_texts,
            pad_idx=pad_idx,
            eos_idx=eos_idx,
            move_eos_to_beginning=False)
        target_batch = data_utils.collate_tokens(
            targets,
            pad_idx=pad_idx,
            eos_idx=eos_idx,
            move_eos_to_beginning=False)
        # Same targets with EOS moved to the front, used as decoder input.
        prev_target_batch = data_utils.collate_tokens(
            targets,
            pad_idx=pad_idx,
            eos_idx=eos_idx,
            move_eos_to_beginning=True)

        net_input = {
            'src_tokens': source_texts_batch,
            'src_imgs': pad_imgs,
            'src_lengths': source_lengths,
            'prev_output_tokens': prev_target_batch,
        }
        return {
            'id': indices,
            'net_input': net_input,
            'target': target_batch,
            'ntokens': target_ntokens,
            'nsentences': num_sentences,
        }

Example #10

0

Show file

    def collater(self, samples):
        """Merge image/caption samples into a mini-batch dict.

        Returns ``None`` for an empty ``samples`` list. Otherwise returns a
        dict with ``id``, ``net_input`` (``src_tokens``, ``src_locations``,
        ``src_lengths``, ``prev_output_tokens``), ``target``, ``ntokens`` and
        ``nsentences``.
        """
        num_sentences = len(samples)

        # FIXME: workaround for edge case in parallel processing
        # (framework passes empty samples list
        # to collater under certain conditions)
        if num_sentences == 0:
            return None

        sample_ids = [sample['id'] for sample in samples]
        features = [sample['source_features'] for sample in samples]
        locations = [sample['source_locations'] for sample in samples]
        target_samples = [sample['target'] for sample in samples]

        # Sizes are looked up in the underlying datasets by sample id.
        source_lengths = [self.img_ds.sizes[idx] for idx in sample_ids]
        target_ntokens = sum(self.cap_ds.sizes[idx] for idx in sample_ids)

        indices = torch.tensor(sample_ids, dtype=torch.long)

        source_feature_batch, source_location_batch = \
            self.img_ds.collater(list(zip(features, locations)))

        pad_idx = self.cap_dict.pad()
        eos_idx = self.cap_dict.eos()
        target_batch = data_utils.collate_tokens(target_samples,
                                                 pad_idx=pad_idx,
                                                 eos_idx=eos_idx,
                                                 move_eos_to_beginning=False)
        # Same targets with EOS moved to the front, used as decoder input.
        rotate_batch = data_utils.collate_tokens(target_samples,
                                                 pad_idx=pad_idx,
                                                 eos_idx=eos_idx,
                                                 move_eos_to_beginning=True)

        net_input = {
            'src_tokens': source_feature_batch,
            'src_locations': source_location_batch,
            'src_lengths': source_lengths,
            'prev_output_tokens': rotate_batch,
        }
        return {
            'id': indices,
            'net_input': net_input,
            'target': target_batch,
            'ntokens': target_ntokens,
            'nsentences': num_sentences,
        }

Example #11

0

Show file

File: speech_to_text_dataset.py Project: xuhu357/fairseq

    def collater(
        self, samples: List[SpeechToTextDatasetItem], return_order: bool = False
    ) -> Dict:
        """Collate speech-to-text items into a batch sorted by source length.

        Returns ``{}`` for an empty list. All batched tensors are reordered
        so that sources appear in descending number of frames. Target fields
        stay ``None`` when ``self.tgt_texts`` is ``None``. With
        ``return_order`` the sorting permutation is added under ``"order"``.
        """
        if len(samples) == 0:
            return {}

        indices = torch.tensor([item.index for item in samples], dtype=torch.long)
        frames = _collate_frames([item.source for item in samples],
                                 self.cfg.use_audio_input)
        # sort samples by descending number of frames
        frame_counts = torch.tensor([item.source.size()[0] for item in samples],
                                    dtype=torch.long)
        n_frames, order = frame_counts.sort(descending=True)

        out = {
            "id": indices.index_select(0, order),
            "net_input": {
                "src_tokens": frames.index_select(0, order),
                "src_lengths": n_frames,
                "prev_output_tokens": None,
            },
            "target": None,
            "target_lengths": None,
            "ntokens": None,
            "nsentences": len(samples),
        }

        if self.tgt_texts is not None:

            def _collate_targets(shift_eos):
                # Right-padded target batch, reordered like the sources.
                batch = fairseq_data_utils.collate_tokens(
                    [item.target for item in samples],
                    self.tgt_dict.pad(),
                    self.tgt_dict.eos(),
                    left_pad=False,
                    move_eos_to_beginning=shift_eos,
                )
                return batch.index_select(0, order)

            out["target"] = _collate_targets(False)
            target_sizes = [item.target.size()[0] for item in samples]
            out["target_lengths"] = torch.tensor(
                target_sizes, dtype=torch.long
            ).index_select(0, order)
            out["net_input"]["prev_output_tokens"] = _collate_targets(True)
            out["ntokens"] = sum(target_sizes)

        if return_order:
            out["order"] = order
        return out

Example #12

0

Show file

def fair_seq_no_neutral_sent_pair_classification(sentence_pairs, model,
                                                 gpu_available, logger):
    """Classify 'correct' and 'incorrect' sentence pairs and score each group.

    Args:
        sentence_pairs: mapping from a key to a dict with the entries
            ``'correct'`` and ``'incorrect'``, each a sequence of pairs.
        model: object providing ``cuda()``, ``eval()``, ``encode(a, b)`` and
            ``predict(head, batch)``.
        gpu_available: when truthy the model is moved to the GPU first.
        logger: logger used for progress messages.

    Returns:
        dict mapping every key to
        ``{'correct': {...}, 'incorrect': {...}}``, where each inner dict
        holds ``label_list`` (predicted label index per pair) and
        ``avg_accuracy`` (fraction of predictions equal to the expected
        label: 1 for 'correct' pairs, 0 for 'incorrect' pairs).
    """
    if gpu_available:
        model.cuda()
        logger.info("successfully moved model to gpu")

    model.eval()

    # (group name, label index that counts as a hit). 'correct' pairs are
    # expected to be entailment, 'incorrect' pairs contradiction.
    expected_labels = (('correct', 1), ('incorrect', 0))
    log_every = 240

    avg_responses = {}
    for counter, (key, corr_incorr_pair) in enumerate(sentence_pairs.items(),
                                                      start=1):
        group_results = {}
        for group, expected in expected_labels:
            batch = collate_tokens(
                [model.encode(pair[0], pair[1])
                 for pair in corr_incorr_pair[group]],
                pad_idx=1)
            logprobs = model.predict('sentence_classification_head', batch)

            result_list = logprobs.argmax(dim=1).tolist()
            group_results[group] = {
                'label_list': result_list,
                'avg_accuracy': result_list.count(expected) / len(result_list),
            }
        avg_responses[key] = group_results

        # Report the real progress; the old message claimed "10 more" while
        # firing every 240 groups.
        if counter % log_every == 0:
            logger.info("finished %d sentence-pair groups", counter)

    return avg_responses

Example #13

0

Show file

File: speech_to_text_dataset.py Project: gorokoba560/norm-analysis-of-transformer

    def collater(
            self, samples: List[Tuple[int, torch.Tensor,
                                      torch.Tensor]]) -> Dict:
        """Collate ``(index, source, target)`` triples into a sorted batch.

        Returns ``{}`` for an empty list. All batched tensors are reordered
        so that sources appear in descending size along dim 0. Target fields
        stay ``None`` when ``self.tgt_texts`` is ``None``.
        """
        if len(samples) == 0:
            return {}

        sample_ids, sources, targets = (list(column) for column in zip(*samples))

        indices = torch.tensor(sample_ids, dtype=torch.long)
        frames = _collate_frames(sources, self.data_cfg.use_audio_input)
        # sort samples by descending number of frames
        frame_counts = torch.tensor([src.size(0) for src in sources],
                                    dtype=torch.long)
        n_frames, order = frame_counts.sort(descending=True)

        out = {
            "id": indices.index_select(0, order),
            "net_input": {
                "src_tokens": frames.index_select(0, order),
                "src_lengths": n_frames,
                "prev_output_tokens": None,
            },
            "target": None,
            "target_lengths": None,
            "ntokens": None,
            "nsentences": len(samples),
        }

        if self.tgt_texts is not None:

            def _collate_targets(shift_eos):
                # Right-padded target batch, reordered like the sources.
                batch = fairseq_data_utils.collate_tokens(
                    targets,
                    self.tgt_dict.pad(),
                    self.tgt_dict.eos(),
                    left_pad=False,
                    move_eos_to_beginning=shift_eos,
                )
                return batch.index_select(0, order)

            target_sizes = [tgt.size(0) for tgt in targets]
            out["target"] = _collate_targets(False)
            out["target_lengths"] = torch.tensor(
                target_sizes, dtype=torch.long).index_select(0, order)
            out["net_input"]["prev_output_tokens"] = _collate_targets(True)
            out["ntokens"] = sum(target_sizes)

        return out

Example #14

0

Show file

File: tapt_trainer.py Project: UCSD-AI4H/COVID-Dialogue

    def train_epoch(self, batch_size, label_smooth_epsilon):
        """Run one training epoch with gradient accumulation.

        The ``'train'`` split is shuffled in place and walked in batches of
        ``batch_size``. Each batch is split into slices of ``LIL_BATCH_SIZE``
        whose gradients are accumulated before a single optimizer and
        scheduler step. ``gen_log`` is called every ``self._eval_steps``
        global steps, and the mean per-slice loss is printed at the end.

        Args:
            batch_size: number of examples per optimizer step.
            label_smooth_epsilon: epsilon forwarded to
                ``_get_label_smoothed_nll_loss``.
        """
        assert 'train' in self._dataset

        random.shuffle(self._dataset['train'])
        # Kept as a plain float. Accumulating the loss tensor itself would
        # keep every slice's autograd graph alive until the end of the epoch.
        print_train_loss = 0.0
        num = 0
        for i in trange(0,
                        len(self._dataset['train']),
                        batch_size,
                        desc='BART Training'):
            self._model.split_to_gpus(n_gpus=min(2, torch.cuda.device_count()))
            self._model.train()

            batch = self._dataset['train'][i:i + batch_size]

            self._optimizer.zero_grad()

            for j in range(0, len(batch), LIL_BATCH_SIZE):
                lil_batch = batch[j:j + LIL_BATCH_SIZE]

                src_lengths = torch.tensor(
                    [len(t.src_tokens) for t in lil_batch])
                src_tokens = collate_tokens(
                    [t.src_tokens for t in lil_batch],
                    pad_idx=self._model.dictionary.pad())
                tgt_tokens = collate_tokens(
                    [t.tgt_tokens for t in lil_batch],
                    pad_idx=self._model.dictionary.pad())

                loss = self._get_label_smoothed_nll_loss(
                    src_lengths=src_lengths,
                    src_tokens=src_tokens,
                    tgt_tokens=tgt_tokens,
                    epsilon=label_smooth_epsilon)
                num += 1
                # Weight each slice by its share of the nominal batch so the
                # accumulated gradient averages over ``batch_size``.
                loss = loss * len(lil_batch) / batch_size
                print_train_loss += loss.item() * batch_size

                if torch.isnan(loss):
                    # Skip backward so the NaN does not reach the gradients.
                    print('warning: nan loss')
                    print(f'tgt_text: {lil_batch[0].tgt_text}')
                else:
                    loss.backward()

            self._optimizer.step()
            self._lr_scheduler.step()

            self._global_step += 1
            if self._global_step % self._eval_steps == 0:
                self.gen_log()

        # An empty training split yields no slices; report NaN, not a crash.
        mean_loss = print_train_loss / num if num else float('nan')
        print("Training loss:", mean_loss)

Example #15

0

Show file

def default_collater(target_dict, samples, dataset=None):
    """Collate image/label samples into a mini-batch dict.

    ``None`` entries in ``samples`` are replaced by random items drawn from
    ``dataset`` so the batch keeps its size; the caller's list is left
    untouched.

    Args:
        target_dict: dictionary providing ``pad()`` and ``eos()``.
        samples: list of sample dicts (``id``, ``tfm_img``, ``label_ids``),
            possibly containing ``None`` entries.
        dataset: optional indexable source of replacement samples.

    Returns:
        ``None`` when ``samples`` is empty, or when it contains ``None`` and
        no (non-empty) ``dataset`` is given. Otherwise a dict with ``id``,
        ``net_input`` (``imgs``, ``prev_output_tokens``), ``ntokens``,
        ``nsentences`` and ``target``.
    """
    if not samples:
        return None
    if any(sample is None for sample in samples):
        if not dataset:
            return None
        len_batch = len(samples)
        # Work on a fresh list: appending to ``samples`` directly would leak
        # the replacement item into the caller's list.
        samples = [sample for sample in samples if sample is not None]
        # NOTE(review): never terminates if every item of ``dataset`` is None.
        while len(samples) < len_batch:
            candidate = dataset[random.choice(range(len(dataset)))]
            if candidate is not None:
                samples.append(candidate)

    indices = []
    imgs = []  # bs, c, h , w
    target_samples = []
    target_ntokens = 0

    for sample in samples:
        indices.append(sample['id'])
        imgs.append(sample['tfm_img'])
        target_samples.append(sample['label_ids'].long())
        target_ntokens += len(sample['label_ids'])

    num_sentences = len(samples)

    target_batch = data_utils.collate_tokens(target_samples,
                                             pad_idx=target_dict.pad(),
                                             eos_idx=target_dict.eos(),
                                             move_eos_to_beginning=False)
    # Same targets with EOS moved to the front, used as decoder input.
    rotate_batch = data_utils.collate_tokens(target_samples,
                                             pad_idx=target_dict.pad(),
                                             eos_idx=target_dict.eos(),
                                             move_eos_to_beginning=True)

    indices = torch.tensor(indices, dtype=torch.long)
    imgs = torch.stack(imgs, dim=0)

    return {
        'id': indices,
        'net_input': {
            'imgs': imgs,
            'prev_output_tokens': rotate_batch
        },
        'ntokens': target_ntokens,
        'nsentences': num_sentences,
        'target': target_batch
    }

Example #16

0

Show file

def get_dummy_input(T=100, D=80, B=5, K=100):
    """Build a random forward-input dict with length-sorted rows.

    Args:
        T: max sequence length.
        D: feature vector dimension.
        B: batch size.
        K: target dimension size; token ids are drawn from ``[0, K)``.

    Returns:
        dict with ``src_tokens`` (random ``(B, T, D)`` features),
        ``src_lengths`` (sorted in descending order) and
        ``prev_output_tokens`` (padded random token ids), all in the same
        row order.
    """
    forward_input = {}
    feature = torch.randn(B, T, D)
    # this (B, T, D) layout is just a convention, you can override it by
    # write your own _prepare_forward_input function
    src_lengths = torch.from_numpy(
        np.random.randint(low=1, high=T, size=B, dtype=np.int64))
    src_lengths[0] = T  # make sure the maximum length matches
    prev_output_tokens = []
    for b in range(B):
        # Each target is at most as long as its own source.
        token_length = np.random.randint(low=1, high=src_lengths[b].item() + 1)
        tokens = np.random.randint(low=0,
                                   high=K,
                                   size=token_length,
                                   dtype=np.int64)
        prev_output_tokens.append(torch.from_numpy(tokens))

    prev_output_tokens = fairseq_data_utils.collate_tokens(
        prev_output_tokens,
        pad_idx=1,
        eos_idx=2,
        left_pad=False,
        move_eos_to_beginning=False,
    )
    src_lengths, sorted_order = src_lengths.sort(descending=True)
    forward_input["src_tokens"] = feature.index_select(0, sorted_order)
    forward_input["src_lengths"] = src_lengths
    # Apply the same permutation to the targets; otherwise each target row
    # would be paired with another sample's source length.
    forward_input["prev_output_tokens"] = prev_output_tokens.index_select(
        0, sorted_order)

    return forward_input

Example #17

0

Show file

File: masked_language_pair_dataset.py Project: ljw9609/NMT-MASS

 def merge(key, left_pad):
     """Collate the ``key`` entry of every sample with ``collate_tokens``.

     ``samples``, ``pad_idx`` and ``eos_idx`` come from the enclosing scope.
     """
     values = [sample[key] for sample in samples]
     return data_utils.collate_tokens(values, pad_idx, eos_idx, left_pad)

Example #18

0

Show file

 def collater_seq_label(self, targets, pad):
     """Collate label sequences, right-padded with ``pad``.

     Returns:
         ``(batch, lengths, ntokens)``: the collated targets, a LongTensor
         with the length of each sequence, and the sum of those lengths as
         a Python number.
     """
     sizes = [len(seq) for seq in targets]
     lengths = torch.LongTensor(sizes)
     ntokens = lengths.sum().item()
     batch = data_utils.collate_tokens(targets, pad_idx=pad, left_pad=False)
     return batch, lengths, ntokens

Example #19

0

Show file

    def _collate_fn(self, items: List[List[torch.Tensor]]):
        # we don't use fairseq's batching functionality, so we expect a single
        # Tensor of type List[torch.Tensor]
        assert len(items) == 1

        # item will have shape B x T (the last batch may have length < T)
        id, item = items[0]
        item = data_utils.collate_tokens(item,
                                         pad_idx=self.source_dictionary.pad())
        B, T = item.size()

        # shift item one position over and append a padding token for the target
        target = torch.nn.functional.pad(item[:, 1:], (0, 1, 0, 0),
                                         value=self.target_dictionary.pad())

        # fairseq expects batches to have the following structure
        return {
            "id": torch.tensor([id] * item.size(0)),
            "net_input": {
                "src_tokens": item,
            },
            "target": target,
            "nsentences": item.size(0),
            "ntokens": item.numel(),
        }

Example #20

0

Show file

def get_roberta_preds(claim, evidences):
    """Label every (claim, evidence) pair with the module-level ``roberta``.

    Each evidence is paired with ``claim``, encoded, batched with padding
    index 1 and passed to the ``'mnli'`` head. The predicted label names are
    printed and returned, one per evidence.
    """
    label_names = {
        0: 'contradiction',
        1: 'neutral',
        2: 'entailment'
    }

    encoded_pairs = [roberta.encode(claim, evidence) for evidence in evidences]
    batch = collate_tokens(encoded_pairs, pad_idx=1)

    logprobs = roberta.predict('mnli', batch)

    # Highest-scoring class per row, mapped to its label name.
    preds = [label_names[idx] for idx in logprobs.argmax(dim=1).tolist()]
    print(preds)
    return preds

Example #21

0

Show file

    def extract_batch(self, sentence_string_batch):
        """Extract per-word features for a batch of sentences.

        Every sentence is encoded into wordpieces (cut to at most 512), the
        batch is padded with index 1 and passed to ``self.extract_features``.
        The features are then averaged per word with
        ``get_average_embeddings``.

        Returns:
            dict with three parallel lists: ``word_features``,
            ``wordpieces_roberta`` and ``word2piece_scattered_indices``.
        """
        src_wordpieces = []
        src_word2piece = []
        for sentence in sentence_string_batch:
            word2piece = get_wordpiece_to_word_map(sentence, self.roberta.bpe)
            pieces = self.roberta.encode(sentence)[:512]
            src_wordpieces.append(copy.deepcopy(pieces))
            src_word2piece.append(copy.deepcopy(word2piece))

        collated = collate_tokens(src_wordpieces, pad_idx=1)
        batch_features = self.extract_features(collated).detach().cpu()

        word_features_list = []
        wordpieces_list = []
        scattered_indices_list = []
        rows = zip(batch_features, src_word2piece, src_wordpieces)
        for sentence_features, word2piece, pieces in rows:
            # Drop the first position and everything from the last wordpiece
            # position onwards (which also removes the padding).
            trimmed = sentence_features[1:len(pieces) - 1]
            averaged = get_average_embeddings(trimmed.unsqueeze(0), word2piece)
            word_features_list.append(averaged[0])
            wordpieces_list.append(pieces)
            scattered_indices_list.append(
                get_scatter_indices(word2piece, reverse=True))

        return {
            "word_features": word_features_list,
            "wordpieces_roberta": wordpieces_list,
            "word2piece_scattered_indices": scattered_indices_list,
        }

Example #22

0

Show file

    def forward(self, src_tokens, src_lengths, prev_output_tokens):
        """Run the prober on representations taken from the NMT decoder.

        ``prev_output_tokens`` is a pair ``(decoder_input, target)``. The NMT
        model is run in eval mode without gradients. One decoder
        representation is then selected by the ``prob_*`` flags in
        ``self.args`` and passed to ``self.prober`` as its encoder output.
        """
        args = self.args
        all_sub_layers = (args.prob_self_attn or args.prob_ed_attn
                          or args.prob_ed_norm or args.prob_ffn)
        prev_output_tokens, target = prev_output_tokens

        self.nmt_model.eval()
        with torch.no_grad():
            encoder_out = self.nmt_model.encoder(src_tokens, src_lengths)
            decoder_out = self.nmt_model.decoder(prev_output_tokens, encoder_out, all_sub_layers=all_sub_layers)

        # prob sub_layers: the first flag that is set wins. Sub-layer lists
        # are indexed with prob_layer - 1, inner_states with prob_layer.
        extras = decoder_out[1]
        sub_layer_flags = (
            ('prob_self_attn', 'self_attn'),
            ('prob_ed_attn', 'ed_attn'),
            ('prob_ed_norm', 'ed_norm'),
            ('prob_ffn', 'ffn'),
        )
        for flag, name in sub_layer_flags:
            if getattr(args, flag):
                nmt_out = extras[name][args.prob_layer - 1]
                break
        else:
            nmt_out = extras['inner_states'][args.prob_layer]

        # Positions holding index 1 are masked; no mask when none match.
        encoder_padding_mask = prev_output_tokens.eq(1)
        if not encoder_padding_mask.any():
            encoder_padding_mask = None

        prob_input = {'encoder_out': nmt_out,
                      'encoder_padding_mask': encoder_padding_mask}
        # Remove index-1 entries from every target row, then re-collate with
        # pad 1 / eos 2, right padding and EOS moved to the beginning.
        unpadded_targets = [row[row.ne(1)] for row in target]
        prob_prev_output_tokens = collate_tokens(unpadded_targets, 1, 2, False, True)
        return self.prober(prob_prev_output_tokens, prob_input)

Example #23

0

Show file

def get_MRC_answer(fname, context, all_qas, ori_qas, options):
    """Pick the best option for each multiple-choice question and export attention.

    For every group of question+option strings in ``all_qas`` the function
    encodes each string together with ``context``, scores the batch with the
    ``sentence_classification_head``, and selects the highest-probability
    option. The chosen option letters, confidences, tokens and attention
    weights are handed to ``extract_word_file``.

    Args:
        fname: name forwarded to ``extract_word_file`` and appended to the
            module-level ``prefix`` in the response.
        context: passage text paired with every question+option string.
        all_qas: iterable of option groups; each group is a sequence of
            question+option strings for one question.
        ori_qas: forwarded unchanged to ``extract_word_file``.
        options: unused by this function; kept for interface compatibility.

    Returns:
        A two-item list: the answer string (one letter per question, 'A' for
        the first option, 'B' for the second, ...) and ``prefix + fname``.
    """
    answers = []
    confidents = []
    docs = []
    weights = []
    for qas in all_qas:
        batch = collate_tokens([model.encode(qa, context) for qa in qas], pad_idx=1)
        # print(model.extract_features_aligned_to_words(qa))
        logits, last_attn = model.predict('sentence_classification_head', batch, return_logits=True)
        # Pass dim explicitly: the implicit-dim form of softmax is deprecated,
        # warns on every call and chooses the axis from the tensor rank.
        probs = torch.nn.functional.softmax(logits.squeeze(), dim=-1)
        print(last_attn.shape)
        probs = np.asarray(probs.tolist()).flatten()
        print(probs)
        answer = np.argmax(probs)
        confident = probs[answer]
        answer_attn = last_attn[answer, 0, :]
        print(torch.max(answer_attn))
        print(torch.sum(answer_attn))
        toks, attns = model.extract_attention_to_words(qas[answer], context, answer_attn.squeeze())
        answers.append(chr(ord('A') + answer))
        confidents.append(confident)
        docs.append(toks)
        weights.append(attns.tolist())
    extract_word_file(docs, weights, ori_qas, answers, confidents, fname)
    # Build the answer string once from the per-question letters.
    response = ['答案:' + ''.join(answers), '{}{}'.format(prefix, fname)]
    return response

Example #24

0

Show file

    def collater(self, samples):
        """Pad per-sample feature tensors into one batch with a padding mask.

        Args:
            samples: list of dicts holding at least ``"id"`` and
                ``"features"`` (and ``"target"`` when ``self.labels`` is
                non-empty).

        Returns:
            ``{}`` for an empty list; otherwise a dict with ``"id"``,
            ``"net_input"`` (zero-padded ``"features"`` plus a boolean
            ``"padding_mask"`` that is True on padded positions) and, when
            ``self.labels`` is non-empty, a right-padded ``"target"``.
        """
        if len(samples) == 0:
            return {}

        feats = [sample["features"] for sample in samples]
        lengths = [len(feat) for feat in feats]
        longest = max(lengths)

        # Allocate from the first tensor so dtype/device follow the inputs.
        first = feats[0]
        batch = first.new_zeros(len(feats), longest, first.size(-1))
        mask = torch.BoolTensor(batch.shape[:-1]).fill_(False)
        for row, feat in enumerate(feats):
            used = lengths[row]
            batch[row, :used] = feat
            mask[row, used:] = True

        net_input = {"features": batch, "padding_mask": mask}
        res = {
            "id": torch.LongTensor([sample["id"] for sample in samples]),
            "net_input": net_input,
        }

        if len(self.labels) > 0:
            res["target"] = data_utils.collate_tokens(
                [sample["target"] for sample in samples],
                pad_idx=self.label_dict.pad(),
                left_pad=False,
            )
        return res

Example #25

0

Show file

    def test_bart_large_mnli(self):
        """Check single and batched MNLI predictions of the hub BART model.

        Expected labels: 0 (contradiction) for the first sentence pair and
        2 (entailment) for the second, both individually and as a batch.
        """
        with contextlib.redirect_stdout(StringIO()):
            # Fetch BART already finetuned for MNLI; eval() disables dropout.
            bart = fb_hub.load('bart.large.mnli')
            bart.eval()

            contradiction_pair = (
                'BART is a seq2seq model.',
                'BART is not sequence to sequence.',
            )
            entailment_pair = (
                'BART is denoising autoencoder.',
                'BART is version of autoencoder.',
            )

            # One pair at a time: encode, predict, compare with the label.
            for pair, expected in ((contradiction_pair, 0), (entailment_pair, 2)):
                tokens = bart.encode(pair[0], pair[1])
                prediction = bart.predict('mnli', tokens).argmax().item()
                self.assertEqual(prediction, expected)

            # Both pairs in a single padded batch.
            from fairseq.data.data_utils import collate_tokens
            encoded = [
                bart.encode(first, second)
                for first, second in (contradiction_pair, entailment_pair)
            ]
            batch = collate_tokens(encoded, pad_idx=1)
            logprobs = bart.predict('mnli', batch)
            self.assertEqual(logprobs.argmax(dim=1).tolist(), [0, 2])

Example #26

0

Show file

def eval_one_example():
    """Score four candidate answers for one hard-coded passage and print the logits.

    Each question+answer string is encoded together with the passage, the
    encodings are padded into one batch, and the flattened logits of the
    ``sentence_classification_head`` are printed followed by the elapsed
    wall-clock time. Nothing is returned.
    """
    context = 'The Sunset Pasta Cruise to Emerald Bay Saturday evening, September 25, 2010 You will cruise to Emerald Bay at Sunset, one of the most beautiful places in the world while dining on a Pasta Buffet and listening to live light dance music. Buses will pick up Sunset Pasta Cruise diners from the main entrance to the Horizon Casino Resort at: 4:40pm and 5:05pm on Saturday and take you the 1.5 miles to Ski Run Marina for boarding. Boarding is at Ski Run Marina at 5:15 p.m. (with departure at 5:45 p.m.), located in South Lake Tahoe. The cost for the cruise, pasta buffet, live music, and the 2.5-hour cruise to Emerald Bay is $55 (normally $75). The cost for children between 3-11 is $41 and under 3 is free. Must register the under 3 as well for the coast guard count. The Sunset Pasta Cruise will be limited to 200 guests. Large parties will be seated first to insure seating together. Pick up your Sunset Pasta Cruise tickets at the Expo at the Horizon Casino Resort before 3 p.m. on Saturday. Those unclaimed will be sold to those on the waiting list at that time. At approximately 5:45 pm any extra spaces will be sold to passengers on the dock. Children who require a seat must have a ticket as well. Closest lodging to the Pasta Cruise is: Super 8, Lakeland Village. Please note that our sponsor , the Riva Grill, is on the Lake close to the boarding area for the Tahoe Queen. A great gathering place to meet or to have dinner. Call Riva Grill (530) 542-2600 for lunch or dinner reservations while you are visiting Lake Tahoe.'

    qas = [
        'When will the cruise to Emerald Bay end? At about 7:00 pm.',
        'When will the cruise to Emerald Bay end? At about 8:20 pm.',
        'When will the cruise to Emerald Bay end? At about 9:20 pm.',
        'When will the cruise to Emerald Bay end? On Sunday morning.'
    ]

    started = time.time()
    # Index of the expected option; only referenced by the disabled check below.
    ans = 1

    # Encode every candidate with the passage and pad them into one batch.
    encoded = [roberta.encode(qa, context) for qa in qas]
    batch = collate_tokens(encoded, pad_idx=1)

    raw_logits = roberta.predict(
        'sentence_classification_head', batch, return_logits=True)
    logits = np.asarray(raw_logits.tolist()).flatten()

    print(logits)
    # assert np.argmax(logits) == ans
    finished = time.time()
    print("Time cost: {}s".format(finished - started))

Example #27

0

Show file

 def merge(key, left_pad, move_eos_to_beginning=False, pad_to_length=None):
     """Collate field ``key`` of every sample into one padded batch.

     ``samples``, ``pad_idx``, ``eos_idx`` and ``pad_to_multiple`` are taken
     from the enclosing scope.
     """
     values = [sample[key] for sample in samples]
     merged = data_utils.collate_tokens(
         values,
         pad_idx,
         eos_idx,
         left_pad,
         move_eos_to_beginning,
         pad_to_length=pad_to_length,
         pad_to_multiple=pad_to_multiple,
     )
     return merged

Example #28

0

Show file

File: core.py Project: tma15/fairseq

 def processTextInput(self, text):
     """Turn raw text into a one-sentence model input sample.

     The text goes through the optional pre-tokenizer and BPE tokenizer (in
     that order), is mapped to ids with the source dictionary (EOS appended,
     no new symbols added), and is collated into a batch of one.

     Returns:
         ``{"net_input": {...}}`` with ``src_tokens``, ``src_lengths`` and
         ``prev_output_tokens`` (always ``None``); moved to CUDA when
         ``self.use_cuda`` is set.
     """
     # Apply whichever tokenizers are configured, pre-tokenizer first.
     for tokenizer in (self.pre_tokenizer, self.bpe_tokenizer):
         if tokenizer is not None:
             text = tokenizer.encode(text)

     token_ids = self.src_dict.encode_line(
         text, add_if_not_exist=False, append_eos=True
     ).long()
     src_tokens = fairseq_data_utils.collate_tokens(
         [token_ids],
         self.src_dict.pad(),
         self.src_dict.eos(),
         left_pad=False,
         move_eos_to_beginning=False,
     )
     # Single-row batch: its width is the sequence length.
     src_lengths = torch.tensor([src_tokens.size(1)], dtype=torch.long)

     sample = {
         "net_input": {
             "src_tokens": src_tokens,
             "src_lengths": src_lengths,
             "prev_output_tokens": None,
         }
     }
     if self.use_cuda:
         sample = utils.move_to_cuda(sample)
     return sample

Example #29

0

Show file

 def collater(self, samples, input_shapes=None):
     """Pad ``samples`` into one batch via ``data_utils.collate_tokens``.

     Uses this dataset's ``pad_idx`` and ``left_pad`` settings and forwards
     ``input_shapes`` unchanged.
     """
     batch = data_utils.collate_tokens(
         samples, self.pad_idx, left_pad=self.left_pad, input_shapes=input_shapes
     )
     return batch

Example #30

0

Show file

    def collater(self, samples: List[SpeechToTextJointDatasetItem]) -> Dict:
        """Extend the parent batch with source-text tokens and language tags.

        The parent collater is asked to return its sample ordering so that
        the extra fields built here can be aligned with the parent's batch.

        Returns:
            ``{}`` when the parent produced an empty batch; otherwise a dict
            with ``id``, ``net_input``, ``target``, ``target_lengths``,
            ``ntokens`` and ``nsentences``.
        """
        s2t_out = super().collater(samples, return_order=True)
        if s2t_out == {}:
            return s2t_out
        net_input = s2t_out["net_input"]
        order = s2t_out["order"]

        has_src_text = self.src_texts is not None and self.src_dict is not None
        if has_src_text:
            padded_text = fairseq_data_utils.collate_tokens(
                [item.src_txt_tokens for item in samples],
                self.src_dict.pad(),
                self.src_dict.eos(),
                left_pad=False,
                move_eos_to_beginning=False,
            )
            # Reorder rows to match the parent's batch order.
            padded_text = padded_text.index_select(0, order)
            text_lengths = torch.tensor(
                [item.src_txt_tokens.size()[0] for item in samples],
                dtype=torch.long,
            ).index_select(0, order)
            net_input["src_txt_tokens"] = padded_text
            net_input["src_txt_lengths"] = text_lengths

        if self.tgt_texts is not None and samples[0].tgt_lang_tag is not None:
            # Overwrite the first decoder-input token of every row with the
            # language tag of the sample that ended up in that row.
            prev_tokens = net_input["prev_output_tokens"]
            for row in range(len(samples)):
                prev_tokens[row][0] = samples[order[row]].tgt_lang_tag

        out = {"id": s2t_out["id"], "net_input": net_input}
        for key in ("target", "target_lengths", "ntokens"):
            out[key] = s2t_out[key]
        out["nsentences"] = len(samples)
        return out