def transfer_distrib_spawn_state_on_fit_end(self, model, mp_queue, results):
    """Hand the outcome of a spawned fit back to the parent through ``mp_queue``.

    Nothing happens unless ``self.trainer.distributed_backend`` is one of
    ``"ddp_spawn"``, ``"ddp_cpu"`` or ``"tpu"``. On global rank 0, and only when a
    queue is supplied, three items are put on the queue in this order:

    1. the checkpoint callback's ``best_model_path`` (``None`` without a callback),
    2. ``results``,
    3. the path of a temporary checkpoint containing the CPU copy of
       ``model.state_dict()``, or ``None`` when no such file was written
       (trainer is testing, or there is no best model path).

    Args:
        model: Object whose ``state_dict()`` is saved as the last weights.
        mp_queue: Queue-like object exposing ``put``, or ``None``.
        results: Object forwarded unchanged through the queue.
    """
    if self.trainer.distributed_backend not in ("ddp_spawn", "ddp_cpu", "tpu"):
        return

    # track the best model path
    best_model_path = None
    if self.trainer.checkpoint_callback is not None:
        best_model_path = self.trainer.checkpoint_callback.best_model_path

    if self.trainer.global_rank == 0 and mp_queue is not None:
        rank_zero_warn('cleaning up ddp environment...')
        # todo, pass complete checkpoint as state dictionary
        mp_queue.put(best_model_path)
        mp_queue.put(results)

        # save the last weights
        last_path = None
        if not self.trainer.testing and best_model_path:
            # Only a trailing ".ckpt" is swapped. A regex such as '.ckpt' would
            # also rewrite directory names like "my_ckpt". Appending when the
            # suffix is absent guarantees the temporary file never has the same
            # name as the best checkpoint, so that file is never overwritten.
            suffix = '.ckpt'
            if best_model_path.endswith(suffix):
                stem = best_model_path[:-len(suffix)]
            else:
                stem = best_model_path
            last_path = stem + '.tmp_end.ckpt'
            state_dict = move_data_to_device(model.state_dict(), torch.device("cpu"))
            atomic_save(state_dict, last_path)
        mp_queue.put(last_path)
def on_save(self, checkpoint):
    """Return ``checkpoint`` with its data moved to the CPU prior to saving.

    Moving XLA tensors to CPU before saving is recommended by the XLA guide:
    https://github.com/pytorch/xla/blob/master/API_GUIDE.md#saving-and-loading-xla-tensors
    """
    cpu = torch.device("cpu")
    cpu_checkpoint = move_data_to_device(checkpoint, cpu)
    return cpu_checkpoint
def get_loader(params: DataParams, encoder: ContentsEncoder):
    """Build the test ``DataLoader`` with pre-encoded features attached to its dataset.

    The tokenizer named by ``params.pretrained_model_name`` is used to create the
    test dataset from ``params.mind_path``. Every entry of the dataset's
    ``uniq_news_inputs`` is moved to CUDA, passed through ``encoder`` (switched to
    eval mode), squeezed and copied back to CPU; the resulting mapping is stored
    as ``news_feature_map`` on the dataset.

    Args:
        params: Data parameters providing ``pretrained_model_name`` and ``mind_path``.
        encoder: Encoder whose ``forward`` is applied to each unique input.

    Returns:
        A non-shuffling ``DataLoader`` (batch size 64, pinned memory) over the dataset.
    """
    tokenizer = AutoTokenizer.from_pretrained(params.pretrained_model_name)
    dataset = get_test_dataset(
        base_dir=params.mind_path,
        tokenizer=tokenizer,
    )

    encoder = encoder.eval()
    unique_inputs = dataset.uniq_news_inputs

    encoded = {}
    for key, raw in tqdm(unique_inputs.items(), desc='Encoding val candidates'):
        on_gpu = move_data_to_device(raw, torch.device('cuda'))
        encoded[key] = encoder.forward(on_gpu).squeeze().cpu()
    dataset.news_feature_map = encoded

    return DataLoader(
        dataset,
        batch_size=64,
        collate_fn=MINDCollateVal(is_test=True),
        shuffle=False,
        pin_memory=True,
    )
def test_fn(model, val_loader, save_file_format=None):
    """Run ``model`` over ``val_loader`` in eval mode and compute metrics.

    Each batch is moved to ``model.device`` and passed to ``model.step``; the
    ``"label"`` and ``"predict"`` outputs, plus the columns of
    ``batch.interaction_pair`` selected by ``batch.valid_mask``, are gathered on
    the CPU. The model's previous training flag is restored afterwards, even when
    a batch raises.

    Args:
        model: Object exposing ``device``, ``training``, ``eval``, ``train`` and ``step``.
        val_loader: Iterable of batches.
        save_file_format: Optional format string with ``{aupr}`` / ``{auroc}``
            placeholders; when given, results are written with ``scio.savemat``.

    Returns:
        Tuple ``(scores, labels, edges, metric)`` where the first three are numpy
        arrays and ``metric`` is the result of ``metric_fn.evaluate``.
    """
    device = model.device
    state = model.training
    model.eval()
    scores, labels, edges = [], [], []
    try:
        for batch in val_loader:
            batch = move_data_to_device(batch, device)
            output = model.step(batch)
            label, score = output["label"], output["predict"]
            edge = batch.interaction_pair[:, batch.valid_mask.reshape(-1)]
            scores.append(score.detach().cpu())
            labels.append(label.cpu())
            edges.append(edge.cpu())
    finally:
        # Restore the caller's train/eval mode even if a batch failed, so an
        # exception here cannot silently leave the model stuck in eval mode.
        model.train(state)

    scores = torch.cat(scores).numpy()
    labels = torch.cat(labels).numpy()
    edges = torch.cat(edges, dim=1).numpy()

    eval_start_time_stamp = time.time()
    metric = metric_fn.evaluate(predict=scores, label=labels)
    eval_end_time_stamp = time.time()
    logger.info(f"eval time cost: {eval_end_time_stamp-eval_start_time_stamp}")

    if save_file_format is not None:
        save_file = save_file_format.format(aupr=metric["aupr"], auroc=metric["auroc"])
        scio.savemat(save_file, {"row": edges[0],
                                 "col": edges[1],
                                 "score": scores,
                                 "label": labels,
                                 })
        logger.info(f"save time cost: {time.time()-eval_end_time_stamp}")
    return scores, labels, edges, metric
def transfer_batch_to_device(self, batch: Any, device: torch.device, dataloader_idx: int) -> Any:
    """Move ``batch`` to ``device``; override for custom batch containers.

    Override this hook if your :class:`~torch.utils.data.DataLoader` returns
    tensors wrapped in a custom data structure.

    The following data types (and any arbitrary nesting of them) are supported
    out of the box:

    - :class:`torch.Tensor` or anything that implements `.to(...)`
    - :class:`list`
    - :class:`dict`
    - :class:`tuple`
    - :class:`torchtext.data.batch.Batch`

    For anything else, you need to define how the data is moved to the target
    device (CPU, GPU, TPU, ...).

    Note:
        This hook should only transfer the data and not modify it, nor should it
        move the data to any other device than the one passed in as argument
        (unless you know what you are doing). To check the current state of
        execution of this hook you can use
        ``self.trainer.training/testing/validating/predicting`` so that you can
        add different logic as per your requirement.

    Note:
        This hook only runs on single GPU training and DDP (no data-parallel).
        Data-Parallel support will come in near future.

    Args:
        batch: A batch of data that needs to be transferred to a new device.
        device: The target device as defined in PyTorch.
        dataloader_idx: The index of the dataloader to which the batch belongs.
            The default implementation does not use it.

    Returns:
        A reference to the data on the new device.

    Example::

        def transfer_batch_to_device(self, batch, device, dataloader_idx):
            if isinstance(batch, CustomBatch):
                # move all tensors in your custom data structure to the device
                batch.samples = batch.samples.to(device)
                batch.targets = batch.targets.to(device)
            elif dataloader_idx == 0:
                # skip device transfer for the first dataloader or anything you wish
                pass
            else:
                batch = super().transfer_batch_to_device(batch, device, dataloader_idx)
            return batch

    Raises:
        MisconfigurationException:
            If using data-parallel, ``Trainer(strategy='dp')``.

    See Also:
        - :meth:`move_data_to_device`
        - :meth:`apply_to_collection`
    """
    moved_batch = move_data_to_device(batch, device)
    return moved_batch
def transfer_batch_to_device(self, batch: Any, device: Optional[torch.device] = None) -> Any:
    """Move ``batch`` to ``device`` (falling back to ``self.device``).

    Override this hook if your :class:`~torch.utils.data.DataLoader` returns
    tensors wrapped in a custom data structure.

    The following data types (and any arbitrary nesting of them) are supported
    out of the box:

    - :class:`torch.Tensor` or anything that implements `.to(...)`
    - :class:`list`
    - :class:`dict`
    - :class:`tuple`
    - :class:`torchtext.data.batch.Batch`

    For anything else, you need to define how the data is moved to the target
    device (CPU, GPU, TPU, ...).

    Note:
        This hook should only transfer the data and not modify it, nor should it
        move the data to any other device than the one passed in as argument
        (unless you know what you are doing).

    Note:
        This hook only runs on single GPU training and DDP (no data-parallel).
        Data-Parallel support will come in near future.

    Args:
        batch: A batch of data that needs to be transferred to a new device.
        device: The target device as defined in PyTorch. When falsy (for
            example ``None``), ``self.device`` is used instead.

    Returns:
        A reference to the data on the new device.

    Example::

        def transfer_batch_to_device(self, batch, device):
            if isinstance(batch, CustomBatch):
                # move all tensors in your custom data structure to the device
                batch.samples = batch.samples.to(device)
                batch.targets = batch.targets.to(device)
            else:
                batch = super().transfer_batch_to_device(batch, device)
            return batch

    Raises:
        MisconfigurationException:
            If using data-parallel, ``Trainer(accelerator='dp')``.

    See Also:
        - :meth:`move_data_to_device`
        - :meth:`apply_to_collection`
    """
    target_device = device if device else self.device
    return move_data_to_device(batch, target_device)
def transfer_batch_to_device(self, batch: Any, device: Optional[torch.device] = None) -> Any:
    """Move ``batch`` to ``device`` (falling back to ``self.device``).

    Override this hook if your :class:`~torch.utils.data.DataLoader` returns
    tensors wrapped in a custom data structure.

    The following data types (and any arbitrary nesting of them) are supported
    out of the box:

    - :class:`torch.Tensor` or anything that implements `.to(...)`
    - :class:`list`
    - :class:`dict`
    - :class:`tuple`
    - :class:`torchtext.data.batch.Batch`

    For anything else, you need to define how the data is moved to the target
    device (CPU, GPU, TPU, ...).

    Note:
        This hook should only transfer the data and not modify it, nor should it
        move the data to any other device than the one passed in as argument
        (unless you know what you are doing).

    Note:
        This hook only runs on single GPU training and DDP (no data-parallel).
        If you need multi-GPU support for your custom batch objects, you need to
        define your custom :class:`~torch.nn.parallel.DistributedDataParallel` or
        :class:`~pytorch_lightning.overrides.data_parallel.LightningDistributedDataParallel`
        and override
        :meth:`~pytorch_lightning.core.lightning.LightningModule.configure_ddp`.

    Args:
        batch: A batch of data that needs to be transferred to a new device.
        device: The target device as defined in PyTorch. When falsy (for
            example ``None``), ``self.device`` is used instead.

    Returns:
        A reference to the data on the new device.

    Example::

        def transfer_batch_to_device(self, batch, device):
            if isinstance(batch, CustomBatch):
                # move all tensors in your custom data structure to the device
                batch.samples = batch.samples.to(device)
                batch.targets = batch.targets.to(device)
            else:
                batch = super().transfer_batch_to_device(batch, device)
            return batch

    See Also:
        - :meth:`move_data_to_device`
        - :meth:`apply_to_collection`
    """
    if not device:
        device = self.device
    moved_batch = move_data_to_device(batch, device)
    return moved_batch
def transfer_batch_to_device(self, batch, device):
    """Move every element of a list batch to ``device``, with DGLGraph support.

    Elements are handled one by one:

    - ``None`` stays ``None``;
    - a non-empty list whose first item is a ``DGLGraph`` is moved by calling
      ``.to(device)`` on each of its items;
    - everything else (including an empty list) goes through
      ``move_data_to_device``.

    Args:
        batch: The batch; must be a ``list``.
        device: The target device.

    Returns:
        A new list with the moved elements, in the original order.
    """
    # DGLGraph's .to method as of 0.4.3.post2 doesn't take the non_blocking arg
    # which pytorch-lightning passes. So we need to customize this behavior
    assert isinstance(batch, list)

    def _move(element):
        if element is None:
            return None
        # `element and ...` keeps the `element[0]` probe from raising IndexError
        # on an empty list.
        if isinstance(element, list) and element and isinstance(element[0], DGLGraph):
            return [graph.to(device) for graph in element]
        return move_data_to_device(element, device)

    return [_move(element) for element in batch]
def on_validation_epoch_start(self):
    """Attach a feature map to the validation dataset before the epoch runs.

    While ``self.total_processed`` is still 0 the dataset only receives a dummy
    feature map sized by the encoder's ``dim``. Otherwise every entry of the
    dataset's ``uniq_news_inputs`` is encoded once up front (to save time) and
    the CPU results are stored as ``news_feature_map``.
    """
    dataset = cast(MINDDatasetVal, self.val_dataloader().dataset)

    if self.total_processed == 0:
        dataset.init_dummy_feature_map(self.model.encoder.dim)
        return

    eval_encoder = self.model.encoder.eval()
    unique_inputs = dataset.uniq_news_inputs

    encoded = {}
    for key, raw in tqdm(unique_inputs.items(), desc='Encoding val candidates'):
        on_device = move_data_to_device(raw, self.device)
        encoded[key] = eval_encoder.forward(on_device).squeeze().cpu()

    dataset.news_feature_map = encoded
def transfer_batch_to_device(self, batch: Any, device: torch.device) -> Any:
    """Move ``batch`` to ``device``; override for custom batch containers.

    Override this hook if your :class:`~torch.utils.data.DataLoader` returns
    tensors wrapped in a custom data structure.

    The following data types (and any arbitrary nesting of them) are supported
    out of the box:

    - :class:`torch.Tensor`
    - :class:`list`
    - :class:`dict`
    - :class:`tuple`
    - ``torchtext.data.Batch`` (COMING SOON)

    For anything else, you need to define how the data is moved to the target
    device (CPU, GPU, TPU, ...).

    Example::

        def transfer_batch_to_device(self, batch, device):
            if isinstance(batch, CustomBatch):
                # move all tensors in your custom data structure to the device
                batch.samples = batch.samples.to(device)
                batch.targets = batch.targets.to(device)
            else:
                batch = super().transfer_batch_to_device(batch, device)
            return batch

    Args:
        batch: A batch of data that needs to be transferred to a new device.
        device: The target device as defined in PyTorch.

    Returns:
        A reference to the data on the new device.

    Note:
        This hook should only transfer the data and not modify it, nor should it
        move the data to any other device than the one passed in as argument
        (unless you know what you are doing). The
        :class:`~pytorch_lightning.trainer.trainer.Trainer` already takes care of
        splitting the batch and determines the target devices.

    See Also:
        - :func:`~pytorch_lightning.utilities.apply_func.move_data_to_device`
        - :func:`~pytorch_lightning.utilities.apply_func.apply_to_collection`
    """
    moved_batch = move_data_to_device(batch, device)
    return moved_batch
def transfer_batch_to_device(batch: Any, device: torch.device) -> Any:
    """
    For sequence models, transfer the nested lists of items to the given device. For all other models,
    this relies on Lightning's default code to move the batch of data to the device.

    A batch is treated as a sequence-model batch when ``batch["items"]`` is a non-empty list whose first
    element is a non-empty list starting with a ``ScalarItem``. In that case ``batch["items"]`` is replaced
    in place with the moved items and the same dictionary is returned. Empty outer or inner lists are
    handed to the default code instead of raising ``IndexError``.

    :param batch: A batch of data coming from the dataloader. Must be a dictionary.
    :param device: The target device.
    :return: The batch of data, where the handled data now lives on the given device.
    :raises ValueError: If ``batch`` is not a dictionary.
    """
    if not isinstance(batch, dict):
        raise ValueError(f"This function expects a dictionary input, but got: {type(batch)}")
    # For sequence models, this method receives a dictionary with "items": List[List[ScalarItem]]
    items = batch.get("items", None)
    # The truthiness checks keep the [0] probes safe when a list is empty.
    is_sequence_batch = (
        isinstance(items, List)
        and bool(items)
        and isinstance(items[0], List)
        and bool(items[0])
        and isinstance(items[0][0], ScalarItem)
    )
    if is_sequence_batch:
        batch["items"] = [[j.to_device(device) for j in i] for i in items]
        return batch
    return move_data_to_device(batch, device)
def to_device(self, obj: Union[nn.Module, Tensor, Any]) -> Union[nn.Module, Tensor, Any]:
    """Move a :class:`torch.nn.Module` or a collection of tensors to ``self.device``.

    Args:
        obj: An object to move to the device. Can be an instance of :class:`torch.nn.Module`, a tensor, or a
            (nested) collection of tensors (e.g., a dictionary).

    Returns:
        A reference to the object that was moved to the new device.
    """
    if not isinstance(obj, nn.Module):
        return move_data_to_device(obj, device=self.device)

    if self.device.type == "cuda":
        # need to call this manually here again in case we spawned with DDPSpawnStrategy
        # TODO: refactor to let plugin handle this cleanly
        torch.cuda.set_device(self.device)
    moved_module = obj.to(self.device)
    return moved_module
def _get_input_array_copy(self, input_array=None) -> Any:
    """
    Produce a deep copy of the example input array, placed on the model's device.

    A copy is used because the input may change during the verification process.
    When no input is given and the model is a ``LightningModule``, its
    ``example_input_array`` is the source of the copy.

    Arguments:
        input_array: The input to clone.
    """
    is_lightning = isinstance(self.model, LightningModule)
    if input_array is None and is_lightning:
        input_array = self.model.example_input_array
    cloned = deepcopy(input_array)

    if is_lightning:
        return self.model.transfer_batch_to_device(cloned, self.model.device)

    # Plain modules: use the device of the first parameter.
    param_device = next(self.model.parameters()).device
    return move_data_to_device(cloned, device=param_device)
def main():
    """Predict with the popularity checkpoint and write the submission parquet.

    Loads the parameters and the model, runs every batch of the loader through
    ``model.forward`` on the model's device, flattens the logits and stores the
    submission frame as ``../tmp/sub_popularity.pqt``.
    """
    ckpt_path = '/mnt/ssdnfs/vfa-ruby/akirasosa/experiments/011_popularity/1614587740/checkpoints/last.ckpt'
    params = Params.load('./params/popularity/001.yaml')
    model = load_model(ckpt_path)
    loader = get_loader(params.data_params)

    chunks = []
    for batch in tqdm(loader):
        on_device = move_data_to_device(batch, model.device)
        output = model.forward(**on_device)
        flat_logits = output.logits.cpu().numpy().reshape(-1)
        chunks.append(flat_logits)
    preds = np.concatenate(chunks)

    out_dir = Path('../tmp')
    out_dir.mkdir(exist_ok=True)

    submission = make_popularity_sub(preds)
    submission.to_parquet(out_dir / 'sub_popularity.pqt')
def main():
    """Predict with the NRMS checkpoint and write the submission parquet.

    Loads the parameters and the model, builds the loader from the model's
    encoder, runs every batch through ``model.forward`` on CUDA, flattens the
    logits and stores the submission frame as ``../tmp/sub_nrms.pqt``.
    """
    ckpt_path = '/mnt/ssdnfs/vfa-ruby/akirasosa/experiments/010_mind_nrms/1612799645/checkpoints/epoch=1-step=328001.ckpt'
    params = Params.load('./params/main/002.yaml')
    model = load_model(ckpt_path, params.module_params)
    loader = get_loader(params.data_params, model.encoder)

    chunks = []
    for batch in tqdm(loader):
        on_gpu = move_data_to_device(batch, torch.device('cuda'))
        flat_logits = model.forward(on_gpu).cpu().numpy().reshape(-1)
        chunks.append(flat_logits)
    preds = np.concatenate(chunks)

    out_dir = Path('../tmp')
    out_dir.mkdir(exist_ok=True)

    submission = make_main_sub(preds)
    submission.to_parquet(out_dir / 'sub_nrms.pqt')
def _get_input_array_copy(self, input_array: Optional[Any] = None) -> Any:
    """Produce a deep copy of the example input array, placed on the model's device.

    A copy is used because the input may change during the verification process.
    When no input is given and the model is a ``LightningModule``, its
    ``example_input_array`` is the source of the copy.

    Arguments:
        input_array: The input to clone.
    """
    is_lightning = isinstance(self.model, LightningModule)
    if input_array is None and is_lightning:
        input_array = self.model.example_input_array
    cloned = deepcopy(input_array)

    if not is_lightning:
        # Plain modules: use the device of the first parameter.
        param_device = next(self.model.parameters()).device
        return move_data_to_device(cloned, device=param_device)

    extra_kwargs = {}
    if is_param_in_hook_signature(self.model.transfer_batch_to_device, "dataloader_idx"):
        # Required for Lightning 1.4 and above
        extra_kwargs["dataloader_idx"] = 0
    return self.model.transfer_batch_to_device(cloned, self.model.device, **extra_kwargs)
test_dataset_registry[params.test_set](params.data_root)) print("Evaluating {} on {}".format(checkpoint_net.get_name(), checkpoint_net.hparams.test_set)) # TODO: load device from params if checkpoint_net.hparams.n_eval_samples > 0: print("Number of samples: {}".format( checkpoint_net.hparams.n_eval_samples)) trainer = Trainer(gpus=[0], limit_test_batches=checkpoint_net.hparams.n_eval_samples) else: print("Number of samples: {}".format(len(checkpoint_net.test_set))) trainer = Trainer(gpus=[0], limit_test_batches=1.0) results = move_data_to_device( trainer.test(checkpoint_net)[0], torch.device("cpu")) def save_results(result_dict: dict, output_dir, test_set, n_eval_samples, name): if n_eval_samples < 1: n_eval_samples = "all" else: n_eval_samples = str(n_eval_samples) output_subdir = os.path.join(output_dir, test_set, n_eval_samples) os.makedirs(output_subdir, exist_ok=True) torch.save(result_dict, os.path.join(output_subdir, name + ".pt"))
def __transfer_batch_to_device(self, batch: Any, device: torch.device):
    """Move ``batch`` to ``device``, preferring the model's own transfer hook.

    When ``self.get_model()`` returns ``None`` the generic
    ``move_data_to_device`` is used instead.
    """
    model = self.get_model()
    if model is None:
        return move_data_to_device(batch, device)
    return model.transfer_batch_to_device(batch, device)
def transfer_batch_to_device(self, batch, device):
    """Move ``batch`` to ``device`` by delegating to ``move_data_to_device``."""
    moved_batch = move_data_to_device(batch, device)
    return moved_batch