Example 1
    def transfer_distrib_spawn_state_on_fit_end(self, model, mp_queue, results):
        if self.trainer.distributed_backend not in ("ddp_spawn", "ddp_cpu", "tpu"):
            return

        # track the best model path
        best_model_path = None
        if self.trainer.checkpoint_callback is not None:
            best_model_path = self.trainer.checkpoint_callback.best_model_path

        if self.trainer.global_rank == 0 and mp_queue is not None:
            rank_zero_warn('cleaning up ddp environment...')
            # todo: pass the complete checkpoint as a state dictionary
            mp_queue.put(best_model_path)
            mp_queue.put(results)

            # save the last weights
            last_path = None
            if not self.trainer.testing and best_model_path is not None and len(best_model_path) > 0:
                last_path = re.sub('.ckpt', '.tmp_end.ckpt', best_model_path)
                state_dict = move_data_to_device(model.state_dict(), torch.device("cpu"))
                atomic_save(state_dict, last_path)
            mp_queue.put(last_path)
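
For context, the parent process is expected to drain this queue after the spawned workers exit. A minimal sketch of that consumer side, assuming the same three-put ordering used above (not the exact Lightning implementation):

def collect_spawn_results(mp_queue):
    # Reads must mirror the put order above:
    # best checkpoint path, then results, then the last-weights path.
    best_model_path = mp_queue.get()
    results = mp_queue.get()
    last_path = mp_queue.get()
    return best_model_path, results, last_path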
Example 2
    def on_save(self, checkpoint):
        """
        Move XLA tensors to CPU before saving, as recommended by the XLA guide:
        https://github.com/pytorch/xla/blob/master/API_GUIDE.md#saving-and-loading-xla-tensors
        """
        return move_data_to_device(checkpoint, torch.device("cpu"))
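
This hook relies on move_data_to_device walking supported collections recursively and calling .to() on anything that implements it, while leaving plain Python values untouched. A standalone sketch (the import path varies across Lightning versions):

import torch
from pytorch_lightning.utilities.apply_func import move_data_to_device

checkpoint = {
    "state_dict": {"layer.weight": torch.randn(2, 2)},  # moved tensor by tensor
    "epoch": 3,  # non-tensor values pass through unchanged
}
cpu_checkpoint = move_data_to_device(checkpoint, torch.device("cpu"))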
Example 3
def get_loader(params: DataParams, encoder: ContentsEncoder):
    tokenizer = AutoTokenizer.from_pretrained(params.pretrained_model_name)
    dataset = get_test_dataset(
        base_dir=params.mind_path,
        tokenizer=tokenizer,
    )

    encoder = encoder.eval()
    inputs = dataset.uniq_news_inputs
    feats = {
        k: encoder.forward(move_data_to_device(
            v, torch.device('cuda'))).squeeze().cpu()
        for k, v in tqdm(inputs.items(), desc='Encoding val candidates')
    }
    dataset.news_feature_map = feats

    loader = DataLoader(
        dataset,
        batch_size=64,
        collate_fn=MINDCollateVal(is_test=True),
        shuffle=False,
        pin_memory=True,
    )

    return loader
Example 4
def test_fn(model, val_loader, save_file_format=None):
    device = model.device
    state = model.training
    model.eval()
    scores, labels, edges = [], [], []
    for batch in val_loader:
        batch = move_data_to_device(batch, device)
        output = model.step(batch)
        label, score = output["label"], output["predict"]
        edge = batch.interaction_pair[:, batch.valid_mask.reshape(-1)]
        scores.append(score.detach().cpu())
        labels.append(label.cpu())
        edges.append(edge.cpu())
    model.train(state)
    scores = torch.cat(scores).numpy()
    labels = torch.cat(labels).numpy()
    edges = torch.cat(edges, dim=1).numpy()
    eval_start_time_stamp = time.time()
    metric = metric_fn.evaluate(predict=scores, label=labels)
    eval_end_time_stamp = time.time()
    logger.info(f"eval time cost: {eval_end_time_stamp-eval_star_time_stamp}")
    if save_file_format is not None:
        save_file = save_file_format.format(aupr=metric["aupr"], auroc=metric["auroc"])
        scio.savemat(save_file, {
            "row": edges[0],
            "col": edges[1],
            "score": scores,
            "label": labels,
        })
        logger.info(f"save time cost: {time.time()-eval_end_time_stamp}")
    return scores, labels, edges, metric
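
The snippet assumes a metric_fn.evaluate helper that returns a dict with "aupr" and "auroc" keys. A plausible stand-in built on scikit-learn, offered as an assumption rather than the project's actual code:

from sklearn.metrics import average_precision_score, roc_auc_score

def evaluate(predict, label):
    # AUPR is average precision over the flat score/label arrays;
    # AUROC is the area under the ROC curve.
    return {
        "aupr": average_precision_score(label, predict),
        "auroc": roc_auc_score(label, predict),
    }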
Example 5
    def transfer_batch_to_device(self, batch: Any, device: torch.device,
                                 dataloader_idx: int) -> Any:
        """Override this hook if your :class:`~torch.utils.data.DataLoader` returns tensors wrapped in a custom
        data structure.

        The data types listed below (and any arbitrary nesting of them) are supported out of the box:

        - :class:`torch.Tensor` or anything that implements `.to(...)`
        - :class:`list`
        - :class:`dict`
        - :class:`tuple`
        - :class:`torchtext.data.batch.Batch`

        For anything else, you need to define how the data is moved to the target device (CPU, GPU, TPU, ...).

        Note:
            This hook should only transfer the data and not modify it, nor should it move the data to
            any other device than the one passed in as argument (unless you know what you are doing).
            To check the current state of execution of this hook you can use
            ``self.trainer.training/testing/validating/predicting`` so that you can
            add different logic as per your requirement.

        Note:
            This hook only runs on single GPU training and DDP (no data-parallel).
            Data-Parallel support will come in the near future.

        Args:
            batch: A batch of data that needs to be transferred to a new device.
            device: The target device as defined in PyTorch.
            dataloader_idx: The index of the dataloader to which the batch belongs.

        Returns:
            A reference to the data on the new device.

        Example::

            def transfer_batch_to_device(self, batch, device, dataloader_idx):
                if isinstance(batch, CustomBatch):
                    # move all tensors in your custom data structure to the device
                    batch.samples = batch.samples.to(device)
                    batch.targets = batch.targets.to(device)
                elif dataloader_idx == 0:
                    # skip device transfer for the first dataloader or anything you wish
                    pass
                else:
                    batch = super().transfer_batch_to_device(batch, device, dataloader_idx)
                return batch

        Raises:
            MisconfigurationException:
                If using data-parallel, ``Trainer(strategy='dp')``.

        See Also:
            - :meth:`move_data_to_device`
            - :meth:`apply_to_collection`
        """
        return move_data_to_device(batch, device)
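
The docstring's example references a CustomBatch type without defining it. A self-contained sketch of what such a wrapper might look like (CustomBatch here is hypothetical):

import torch

class CustomBatch:
    # A wrapper that move_data_to_device cannot unpack on its own,
    # which is why the overriding hook must move its fields explicitly.
    def __init__(self, samples: torch.Tensor, targets: torch.Tensor):
        self.samples = samples
        self.targets = targets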
Example 6
    def transfer_batch_to_device(self,
                                 batch: Any,
                                 device: Optional[torch.device] = None) -> Any:
        """
        Override this hook if your :class:`~torch.utils.data.DataLoader` returns tensors
        wrapped in a custom data structure.

        The data types listed below (and any arbitrary nesting of them) are supported out of the box:

        - :class:`torch.Tensor` or anything that implements `.to(...)`
        - :class:`list`
        - :class:`dict`
        - :class:`tuple`
        - :class:`torchtext.data.batch.Batch`

        For anything else, you need to define how the data is moved to the target device (CPU, GPU, TPU, ...).

        Note:
            This hook should only transfer the data and not modify it, nor should it move the data to
            any other device than the one passed in as argument (unless you know what you are doing).

        Note:
            This hook only runs on single GPU training and DDP (no data-parallel).
            Data-Parallel support will come in the near future.

        Args:
            batch: A batch of data that needs to be transferred to a new device.
            device: The target device as defined in PyTorch.

        Returns:
            A reference to the data on the new device.

        Example::

            def transfer_batch_to_device(self, batch, device):
                if isinstance(batch, CustomBatch):
                    # move all tensors in your custom data structure to the device
                    batch.samples = batch.samples.to(device)
                    batch.targets = batch.targets.to(device)
                else:
                    batch = super().transfer_batch_to_device(batch, device)
                return batch

        Raises:
            MisconfigurationException:
                If using data-parallel, ``Trainer(accelerator='dp')``.

        See Also:
            - :meth:`move_data_to_device`
            - :meth:`apply_to_collection`
        """
        device = device or self.device
        return move_data_to_device(batch, device)
Example 7
    def transfer_batch_to_device(self,
                                 batch: Any,
                                 device: Optional[torch.device] = None) -> Any:
        """
        Override this hook if your :class:`~torch.utils.data.DataLoader` returns tensors
        wrapped in a custom data structure.

        The data types listed below (and any arbitrary nesting of them) are supported out of the box:

        - :class:`torch.Tensor` or anything that implements `.to(...)`
        - :class:`list`
        - :class:`dict`
        - :class:`tuple`
        - :class:`torchtext.data.batch.Batch`

        For anything else, you need to define how the data is moved to the target device (CPU, GPU, TPU, ...).

        Note:
            This hook should only transfer the data and not modify it, nor should it move the data to
            any other device than the one passed in as argument (unless you know what you are doing).

        Note:
            This hook only runs on single GPU training and DDP (no data-parallel).
            If you need multi-GPU support for your custom batch objects, you need to define your custom
            :class:`~torch.nn.parallel.DistributedDataParallel` or
            :class:`~pytorch_lightning.overrides.data_parallel.LightningDistributedDataParallel` and
            override :meth:`~pytorch_lightning.core.lightning.LightningModule.configure_ddp`.

        Args:
            batch: A batch of data that needs to be transferred to a new device.
            device: The target device as defined in PyTorch.

        Returns:
            A reference to the data on the new device.

        Example::

            def transfer_batch_to_device(self, batch, device):
                if isinstance(batch, CustomBatch):
                    # move all tensors in your custom data structure to the device
                    batch.samples = batch.samples.to(device)
                    batch.targets = batch.targets.to(device)
                else:
                    batch = super().transfer_batch_to_device(batch, device)
                return batch

        See Also:
            - :meth:`move_data_to_device`
            - :meth:`apply_to_collection`
        """
        device = device or self.device
        return move_data_to_device(batch, device)
Example 8
    def transfer_batch_to_device(self, batch, device):
        # DGLGraph's .to method as of 0.4.3.post2 doesn't take the non_blocking arg
        # which pytorch-lightning passes. So we need to customize this behavior
        assert isinstance(batch, list)
        return [
            (
                [g.to(device) for g in e]
                if isinstance(e, list) and isinstance(e[0], DGLGraph)
                else move_data_to_device(e, device)
            )
            if e is not None else None
            for e in batch
        ]
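
The comprehension reads as: lists of graphs are moved element by element, anything else is delegated to move_data_to_device, and None entries are preserved. A toy batch illustrating the shapes involved, with a hypothetical LegacyGraph standing in for a DGL 0.4.x graph:

import torch

class LegacyGraph:
    # Hypothetical stand-in for a DGL 0.4.x graph: .to() accepts only a device,
    # not the non_blocking kwarg mentioned in the comment above.
    def to(self, device):
        return self

# The hook above expects batches shaped like this: tensors, graph lists, or None.
batch = [torch.zeros(4, 3), [LegacyGraph(), LegacyGraph()], None]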
Example 9
    def on_validation_epoch_start(self):
        # Precompute features of the unique validation candidates to save time.
        val_dataset = cast(MINDDatasetVal, self.val_dataloader().dataset)

        if self.total_processed == 0:
            val_dataset.init_dummy_feature_map(self.model.encoder.dim)
            return

        encoder = self.model.encoder.eval()
        inputs = val_dataset.uniq_news_inputs
        feats = {
            k: encoder.forward(move_data_to_device(v, self.device)).squeeze().cpu()
            for k, v in tqdm(inputs.items(), desc='Encoding val candidates')
        }
        val_dataset.news_feature_map = feats
Example 10
    def transfer_batch_to_device(self, batch: Any,
                                 device: torch.device) -> Any:
        """
        Override this hook if your :class:`~torch.utils.data.DataLoader` returns tensors
        wrapped in a custom data structure.

        The data types listed below (and any arbitrary nesting of them) are supported out of the box:

        - :class:`torch.Tensor`
        - :class:`list`
        - :class:`dict`
        - :class:`tuple`
        - ``torchtext.data.Batch`` (COMING SOON)

        For anything else, you need to define how the data is moved to the target device (CPU, GPU, TPU, ...).

        Example::

            def transfer_batch_to_device(self, batch, device):
                if isinstance(batch, CustomBatch):
                    # move all tensors in your custom data structure to the device
                    batch.samples = batch.samples.to(device)
                    batch.targets = batch.targets.to(device)
                else:
                    batch = super().transfer_batch_to_device(batch, device)
                return batch

        Args:
            batch: A batch of data that needs to be transferred to a new device.
            device: The target device as defined in PyTorch.

        Returns:
            A reference to the data on the new device.

        Note:
            This hook should only transfer the data and not modify it, nor should it move the data to
            any other device than the one passed in as argument (unless you know what you are doing).
            The :class:`~pytorch_lightning.trainer.trainer.Trainer` already takes care of splitting the
            batch and determines the target devices.

        See Also:
            - :func:`~pytorch_lightning.utilities.apply_func.move_data_to_device`
            - :func:`~pytorch_lightning.utilities.apply_func.apply_to_collection`
        """
        return move_data_to_device(batch, device)
Example 11
def transfer_batch_to_device(batch: Any, device: torch.device) -> Any:
    """
    For sequence models, transfer the nested lists of items to the given GPU device.
    For all other models, this relies on Lightning's default code to move the batch of data to the GPU.
    :param batch: A batch of data coming from the dataloader.
    :param device: The target CUDA device.
    :return: A modified batch of data, where all tensors now live on the given CUDA device.
    """
    if not isinstance(batch, dict):
        raise ValueError(f"This function expects a dictionary input, but got: {type(batch)}")
    # For sequence models, this method receives a dictionary with "item": List[List[ScalarItem]]
    items = batch.get("items", None)
    if items is not None and isinstance(items, List) and isinstance(items[0], List) and \
            isinstance(items[0][0], ScalarItem):
        batch["items"] = [[j.to_device(device) for j in i] for i in items]
        return batch
    else:
        return move_data_to_device(batch, device)
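
ScalarItem is project-specific: it carries its own to_device method rather than a tensor-style .to(), which is why the generic utility cannot move it. A hypothetical minimal shape for such a class:

from dataclasses import dataclass
import torch

@dataclass
class ScalarItem:
    # Hypothetical sketch mirroring the to_device call used in the snippet above.
    value: torch.Tensor

    def to_device(self, device: torch.device) -> "ScalarItem":
        return ScalarItem(value=self.value.to(device))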
Example 12
    def to_device(self, obj: Union[nn.Module, Tensor, Any]) -> Union[nn.Module, Tensor, Any]:
        """Move a :class:`torch.nn.Module` or a collection of tensors to the current device, if it is not already
        on that device.

        Args:
            obj: An object to move to the device. Can be an instance of :class:`torch.nn.Module`, a tensor, or a
                 (nested) collection of tensors (e.g., a dictionary).

        Returns:
            A reference to the object that was moved to the new device.
        """
        if isinstance(obj, nn.Module):
            if self.device.type == "cuda":
                # need to call this manually here again in case we spawned with DDPSpawnStrategy
                # TODO: refactor to let plugin handle this cleanly
                torch.cuda.set_device(self.device)
            return obj.to(self.device)
        return move_data_to_device(obj, device=self.device)
Example 13
    def _get_input_array_copy(self, input_array=None) -> Any:
        """
        Returns a deep copy of the example input array in cases where it is expected that the
        input changes during the verification process.
        Arguments:
            input_array: The input to clone.
        """
        if input_array is None and isinstance(self.model, LightningModule):
            input_array = self.model.example_input_array
        input_array = deepcopy(input_array)

        if isinstance(self.model, LightningModule):
            input_array = self.model.transfer_batch_to_device(
                input_array, self.model.device)
        else:
            input_array = move_data_to_device(
                input_array, device=next(self.model.parameters()).device)

        return input_array
Example 14
def main():
    ckpt_path = '/mnt/ssdnfs/vfa-ruby/akirasosa/experiments/011_popularity/1614587740/checkpoints/last.ckpt'
    params = Params.load('./params/popularity/001.yaml')

    model = load_model(ckpt_path)
    loader = get_loader(params.data_params)

    preds = []
    for batch in tqdm(loader):
        batch = move_data_to_device(batch, model.device)
        out = model.forward(**batch)
        out = out.logits.cpu().numpy().reshape(-1)
        preds.append(out)
    preds = np.concatenate(preds)

    out_dir = Path('../tmp')
    out_dir.mkdir(exist_ok=True)

    df_sub = make_popularity_sub(preds)
    df_sub.to_parquet(out_dir / 'sub_popularity.pqt')
Example 15
def main():
    ckpt_path = '/mnt/ssdnfs/vfa-ruby/akirasosa/experiments/010_mind_nrms/1612799645/checkpoints/epoch=1-step=328001.ckpt'
    params = Params.load('./params/main/002.yaml')

    model = load_model(ckpt_path, params.module_params)
    loader = get_loader(params.data_params, model.encoder)

    preds = []
    for batch in tqdm(loader):
        batch = move_data_to_device(batch, torch.device('cuda'))
        logits = model.forward(batch)
        logits = logits.cpu().numpy().reshape(-1)
        preds.append(logits)
    preds = np.concatenate(preds)

    out_dir = Path('../tmp')
    out_dir.mkdir(exist_ok=True)

    df_sub = make_main_sub(preds)
    df_sub.to_parquet(out_dir / 'sub_nrms.pqt')
Example 16
    def _get_input_array_copy(self, input_array: Optional[Any] = None) -> Any:
        """Returns a deep copy of the example input array in cases where it is expected that the input changes
        during the verification process.

        Arguments:
            input_array: The input to clone.
        """
        if input_array is None and isinstance(self.model, LightningModule):
            input_array = self.model.example_input_array
        input_array = deepcopy(input_array)

        if isinstance(self.model, LightningModule):
            kwargs = {}
            if is_param_in_hook_signature(self.model.transfer_batch_to_device,
                                          "dataloader_idx"):
                # Required for Lightning 1.4 and above
                kwargs["dataloader_idx"] = 0
            input_array = self.model.transfer_batch_to_device(
                input_array, self.model.device, **kwargs)
        else:
            input_array = move_data_to_device(
                input_array, device=next(self.model.parameters()).device)

        return input_array
Example 17
# (the opening of this statement is missing from the source listing)
        test_dataset_registry[params.test_set](params.data_root))

print("Evaluating {} on {}".format(checkpoint_net.get_name(),
                                   checkpoint_net.hparams.test_set))

# TODO: load device from params
if checkpoint_net.hparams.n_eval_samples > 0:
    print("Number of samples: {}".format(
        checkpoint_net.hparams.n_eval_samples))
    trainer = Trainer(gpus=[0],
                      limit_test_batches=checkpoint_net.hparams.n_eval_samples)
else:
    print("Number of samples: {}".format(len(checkpoint_net.test_set)))
    trainer = Trainer(gpus=[0], limit_test_batches=1.0)

results = move_data_to_device(
    trainer.test(checkpoint_net)[0], torch.device("cpu"))


def save_results(result_dict: dict, output_dir, test_set, n_eval_samples,
                 name):
    if n_eval_samples < 1:
        n_eval_samples = "all"
    else:
        n_eval_samples = str(n_eval_samples)

    output_subdir = os.path.join(output_dir, test_set, n_eval_samples)
    os.makedirs(output_subdir, exist_ok=True)

    torch.save(result_dict, os.path.join(output_subdir, name + ".pt"))

Example 18
    def __transfer_batch_to_device(self, batch: Any, device: torch.device):
        model = self.get_model()
        if model is not None:
            return model.transfer_batch_to_device(batch, device)
        return move_data_to_device(batch, device)
Example 19
    def transfer_batch_to_device(self, batch, device):
        return move_data_to_device(batch, device)
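
Taken together, the examples share one pattern: delegate to move_data_to_device for anything it understands and special-case the rest. A standalone round-trip over the supported collection types (the import path may vary across Lightning versions):

import torch
from pytorch_lightning.utilities.apply_func import move_data_to_device

batch = {
    "images": torch.randn(8, 3, 32, 32),
    "meta": [(torch.tensor(1), "label-a")],  # strings pass through untouched
}
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
moved = move_data_to_device(batch, device)
assert moved["images"].device.type == device.type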