def load_task(self):
    """Build the single_point_lmdb train (and optional val) datasets,
    distributed samplers, and dataloaders, then set up the target
    normalizer from config-provided mean/std.

    Raises:
        NotImplementedError: if the task dataset is not single_point_lmdb,
            or if labels should be normalized but no target_mean is given.
    """
    task_name = self.config["task"]["dataset"]
    print("### Loading dataset: {}".format(task_name))

    self.parallel_collater = ParallelCollater(1)

    # Only LMDB single-point data is supported by this trainer.
    if task_name != "single_point_lmdb":
        raise NotImplementedError

    dataset_cls = registry.get_dataset_class(task_name)
    optim_cfg = self.config["optim"]

    self.train_dataset = dataset_cls(self.config["dataset"])
    self.train_sampler = DistributedSampler(
        self.train_dataset,
        num_replicas=distutils.get_world_size(),
        rank=distutils.get_rank(),
        shuffle=True,
    )
    self.train_loader = DataLoader(
        self.train_dataset,
        batch_size=optim_cfg["batch_size"],
        collate_fn=self.parallel_collater,
        num_workers=optim_cfg["num_workers"],
        pin_memory=True,
        sampler=self.train_sampler,
    )

    self.val_loader = self.test_loader = None
    self.val_sampler = None

    if "val_dataset" in self.config:
        self.val_dataset = dataset_cls(self.config["val_dataset"])
        self.val_sampler = DistributedSampler(
            self.val_dataset,
            num_replicas=distutils.get_world_size(),
            rank=distutils.get_rank(),
            shuffle=False,
        )
        self.val_loader = DataLoader(
            self.val_dataset,
            optim_cfg.get("eval_batch_size", 64),
            collate_fn=self.parallel_collater,
            num_workers=optim_cfg["num_workers"],
            pin_memory=True,
            sampler=self.val_sampler,
        )

    self.num_targets = 1

    # Normalizer for the dataset.
    # Compute mean, std of training set labels.
    self.normalizers = {}
    dataset_cfg = self.config["dataset"]
    if dataset_cfg.get("normalize_labels", True):
        # Stats must be supplied explicitly for lmdb datasets.
        if "target_mean" not in dataset_cfg:
            raise NotImplementedError
        self.normalizers["target"] = Normalizer(
            mean=dataset_cfg["target_mean"],
            std=dataset_cfg["target_std"],
            device=self.device,
        )
def predict(self, dataset, batch_size=32):
    """Run the model over a dataset and return energies and forces.

    Args:
        dataset: Either a dataset-config dict (containing "src", plus
            "traj" for non-lmdb datasets) or an already-collated
            torch_geometric.data.Batch.
        batch_size (int): Loader batch size when ``dataset`` is a dict.

    Returns:
        dict: ``{"energy": [float, ...], "forces": [ndarray, ...]}`` with
        one (natoms_i, 3) numpy array per structure.

    Raises:
        NotImplementedError: for unsupported ``dataset`` argument types.
    """
    if isinstance(dataset, dict):
        if self.config["task"]["dataset"] == "trajectory_lmdb":
            print(
                "### Generating predictions on {}.".format(dataset["src"])
            )
        else:
            print(
                "### Generating predictions on {}.".format(
                    dataset["src"] + dataset["traj"]
                )
            )
        dataset = registry.get_dataset_class(
            self.config["task"]["dataset"]
        )(dataset)
        data_loader = DataLoader(
            dataset,
            batch_size=batch_size,
            shuffle=False,
            collate_fn=self.parallel_collater,
        )
    elif isinstance(dataset, torch_geometric.data.Batch):
        # A single pre-collated batch: wrap so it iterates like a loader
        # that yields one batch_list.
        data_loader = [[dataset]]
    else:
        raise NotImplementedError

    self.model.eval()
    predictions = {"energy": [], "forces": []}

    for batch_list in data_loader:
        out = self._forward(batch_list)
        # BUGFIX: guard each denorm on its own normalizer key. The original
        # denormed forces whenever "target" was present, raising KeyError
        # when "grad_target" had never been set up.
        if self.normalizers is not None and "target" in self.normalizers:
            out["energy"] = self.normalizers["target"].denorm(
                out["energy"]
            )
        if (
            self.normalizers is not None
            and "grad_target" in self.normalizers
        ):
            out["forces"] = self.normalizers["grad_target"].denorm(
                out["forces"]
            )

        predictions["energy"].extend(out["energy"].tolist())

        # Forces come back as one flat (sum(natoms), 3) tensor; split it
        # back into per-structure arrays using each structure's atom count.
        atoms_sum = 0
        batch_natoms = torch.cat([batch.natoms for batch in batch_list])
        for natoms in batch_natoms:
            n = int(natoms)  # plain int index instead of a 0-d tensor
            predictions["forces"].append(
                out["forces"][atoms_sum : atoms_sum + n]
                .cpu()
                .detach()
                .numpy()
            )
            atoms_sum += n

    return predictions
def predict(self, src, batch_size=32):
    """Predict (de-normalized) targets for every sample found at ``src``.

    Args:
        src: Source path/identifier understood by the registered dataset
            class for this task.
        batch_size (int): Dataloader batch size.

    Returns:
        list: One predicted value per sample.
    """
    print("### Generating predictions on {}.".format(src))

    dataset_cls = registry.get_dataset_class(self.config["task"]["dataset"])
    loader = dataset_cls({"src": src}).get_full_dataloader(
        batch_size=batch_size)

    self.model.eval()
    # Hoisted: whether to undo label normalization on the outputs.
    denorm_outputs = self.config["dataset"].get("normalize_labels", True)

    predictions = []
    for batch in loader:
        batch.to(self.device)
        out, _ = self._forward(batch)
        result = out["output"]
        if denorm_outputs:
            result = self.normalizers["target"].denorm(result)
        predictions.extend(result.tolist())

    return predictions
def predict(self, src, batch_size=32):
    """Predict targets (and uncertainties) with the convolution-fed GP.

    Args:
        src: Source path/identifier understood by the registered dataset
            class of the underlying convolutional trainer.
        batch_size (int): Dataloader batch size.

    Returns:
        tuple: ``(targets_pred, targets_std)`` as returned by the
        GPyTorch trainer.

    Raises:
        AttributeError: if the convolution normalizer does not exist yet,
            i.e. the CFGP has likely not been trained.
    """
    print("### Generating predictions on {}.".format(src))

    # Parse the data
    dataset_config = {"src": src}
    dataset = registry.get_dataset_class(
        self.conv_trainer.config["task"]["dataset"])(dataset_config)
    data_loader = dataset.get_full_dataloader(batch_size=batch_size)

    # Get the convolutions
    convs, targets_actual = self._get_convolutions(data_loader)
    try:
        normed_convs = self.conv_normalizer.norm(convs)
    except AttributeError as error:
        # FIX: chain with `from error` so the re-raised exception carries
        # the original as its explicit cause instead of an implicit
        # "during handling of..." context.
        raise type(error)(str(error) + "; error may have occurred "
                          "because the CFGP may not have been trained yet"
                          ) from error

    # Feed the convolutions into the GP
    targets_pred, targets_std = self.gpytorch_trainer.predict(normed_convs)
    return targets_pred, targets_std
def load_task(self):
    """Build datasets, samplers, and dataloaders for energy training.

    Requires the single_point_lmdb dataset. The train/val/test splits are
    each optional and are only constructed when present in the config.
    Also sets up the config-provided target normalizer.

    Raises:
        AssertionError: if the task dataset is not single_point_lmdb.
        NotImplementedError: if normalize_labels is on without target_mean.
    """
    assert (self.config["task"]["dataset"] == "single_point_lmdb"
            ), "EnergyTrainer requires single_point_lmdb dataset"

    logging.info(f"Loading dataset: {self.config['task']['dataset']}")

    # 0 GPU copies when on CPU, otherwise collate onto one GPU; second arg
    # toggles on-the-fly graph construction.
    self.parallel_collater = ParallelCollater(
        0 if self.cpu else 1,
        self.config["model_attributes"].get("otf_graph", False),
    )

    self.val_loader = self.test_loader = self.train_loader = None
    self.val_sampler = self.test_sampler = self.train_sampler = None

    dataset_cls = registry.get_dataset_class(self.config["task"]["dataset"])
    optim_cfg = self.config["optim"]

    if self.config.get("dataset", None):
        self.train_dataset = dataset_cls(self.config["dataset"])
        self.train_sampler = self.get_sampler(
            self.train_dataset,
            optim_cfg["batch_size"],
            shuffle=True,
        )
        self.train_loader = self.get_dataloader(
            self.train_dataset,
            self.train_sampler,
        )

    # Val and test are configured identically: no shuffling, eval batch size
    # (falling back to the train batch size).
    eval_batch_size = optim_cfg.get("eval_batch_size",
                                    optim_cfg["batch_size"])
    for cfg_key, split in (("val_dataset", "val"), ("test_dataset", "test")):
        split_cfg = self.config.get(cfg_key, None)
        if not split_cfg:
            continue
        split_dataset = dataset_cls(split_cfg)
        split_sampler = self.get_sampler(
            split_dataset,
            eval_batch_size,
            shuffle=False,
        )
        setattr(self, split + "_dataset", split_dataset)
        setattr(self, split + "_sampler", split_sampler)
        setattr(self, split + "_loader",
                self.get_dataloader(split_dataset, split_sampler))

    self.num_targets = 1

    # Normalizer for the dataset.
    # Compute mean, std of training set labels.
    self.normalizers = {}
    if self.normalizer.get("normalize_labels", False):
        if "target_mean" not in self.normalizer:
            raise NotImplementedError
        self.normalizers["target"] = Normalizer(
            mean=self.normalizer["target_mean"],
            std=self.normalizer["target_std"],
            device=self.device,
        )
def load_task(self):
    """Load datasets/dataloaders for this task and set up label normalizers.

    For "trajectory_lmdb" this builds a shuffled train loader plus an
    optional val loader; any other dataset must provide its own
    train/val/test split via ``get_dataloaders``. Optionally logs
    label-distribution histograms when visualization is enabled.

    FIX: the "grad_target" normalizer previously keyed its branch off
    "target_mean", so a config with "target_mean" but without
    "grad_target_mean" raised KeyError when reading the grad stats. It now
    checks "grad_target_mean", matching the forces-trainer variant of this
    method elsewhere in the codebase.
    """
    print("### Loading dataset: {}".format(self.config["task"]["dataset"]))

    self.parallel_collater = ParallelCollater(self.config["optim"].get(
        "num_gpus", 1))
    if self.config["task"]["dataset"] == "trajectory_lmdb":
        # LMDB trajectories: construct train (and optional val) loaders.
        self.train_dataset = registry.get_dataset_class(
            self.config["task"]["dataset"])(self.config["dataset"])
        self.train_loader = DataLoader(
            self.train_dataset,
            batch_size=self.config["optim"]["batch_size"],
            shuffle=True,
            collate_fn=self.parallel_collater,
            num_workers=self.config["optim"]["num_workers"],
        )
        self.val_loader = self.test_loader = None
        if "val_dataset" in self.config:
            self.val_dataset = registry.get_dataset_class(
                self.config["task"]["dataset"])(self.config["val_dataset"])
            self.val_loader = DataLoader(
                self.val_dataset,
                self.config["optim"].get("eval_batch_size", 64),
                shuffle=False,
                collate_fn=self.parallel_collater,
                num_workers=self.config["optim"]["num_workers"],
            )
    else:
        # Other datasets provide their own train/val/test split.
        self.dataset = registry.get_dataset_class(
            self.config["task"]["dataset"])(self.config["dataset"])
        (
            self.train_loader,
            self.val_loader,
            self.test_loader,
        ) = self.dataset.get_dataloaders(
            batch_size=self.config["optim"]["batch_size"],
            collate_fn=self.parallel_collater,
        )

    self.num_targets = 1

    # Normalizer for the dataset.
    # Compute mean, std of training set labels.
    self.normalizers = {}
    if self.config["dataset"].get("normalize_labels", True):
        if "target_mean" in self.config["dataset"]:
            self.normalizers["target"] = Normalizer(
                mean=self.config["dataset"]["target_mean"],
                std=self.config["dataset"]["target_std"],
                device=self.device,
            )
        else:
            # No explicit stats in config: fit on the training labels.
            self.normalizers["target"] = Normalizer(
                tensor=self.train_loader.dataset.data.y[
                    self.train_loader.dataset.__indices__],
                device=self.device,
            )

    # If we're computing gradients wrt input, set mean of normalizer to 0 --
    # since it is lost when compute dy / dx -- and std to forward target std
    if "grad_input" in self.config["task"]:
        if self.config["dataset"].get("normalize_labels", True):
            if "grad_target_mean" in self.config["dataset"]:
                self.normalizers["grad_target"] = Normalizer(
                    mean=self.config["dataset"]["grad_target_mean"],
                    std=self.config["dataset"]["grad_target_std"],
                    device=self.device,
                )
            else:
                self.normalizers["grad_target"] = Normalizer(
                    tensor=self.train_loader.dataset.data.y[
                        self.train_loader.dataset.__indices__],
                    device=self.device,
                )
            # Gradient targets keep the forward std but a zero mean.
            self.normalizers["grad_target"].mean.fill_(0)

    if self.is_vis and self.config["task"]["dataset"] != "qm9":
        # Plot label distribution.
        plots = [
            plot_histogram(
                self.train_loader.dataset.data.y.tolist(),
                xlabel="{}/raw".format(self.config["task"]["labels"][0]),
                ylabel="# Examples",
                title="Split: train",
            ),
            plot_histogram(
                self.val_loader.dataset.data.y.tolist(),
                xlabel="{}/raw".format(self.config["task"]["labels"][0]),
                ylabel="# Examples",
                title="Split: val",
            ),
            plot_histogram(
                self.test_loader.dataset.data.y.tolist(),
                xlabel="{}/raw".format(self.config["task"]["labels"][0]),
                ylabel="# Examples",
                title="Split: test",
            ),
        ]
        self.logger.log_plots(plots)
def load_task(self):
    """Load datasets and dataloaders for force training.

    For the "trajectory_lmdb" dataset this builds a shuffled train loader,
    optional val/test loaders, and an optional distributed "relax" loader
    (configured under the task key, backed by a single_point_lmdb
    dataset). Any other dataset must provide its own train/val/test split
    via ``get_dataloaders``. Also sets up the energy ("target") and force
    ("grad_target") normalizers and, on the master rank only, optional
    label-distribution plots.
    """
    print("### Loading dataset: {}".format(self.config["task"]["dataset"]))

    # Collate onto 1 GPU unless running on CPU; second arg toggles
    # on-the-fly graph construction in the collater.
    self.parallel_collater = ParallelCollater(
        1 if not self.cpu else 0,
        self.config["model_attributes"].get("otf_graph", False),
    )
    if self.config["task"]["dataset"] == "trajectory_lmdb":
        self.train_dataset = registry.get_dataset_class(
            self.config["task"]["dataset"])(self.config["dataset"])
        self.train_loader = DataLoader(
            self.train_dataset,
            batch_size=self.config["optim"]["batch_size"],
            shuffle=True,
            collate_fn=self.parallel_collater,
            num_workers=self.config["optim"]["num_workers"],
            pin_memory=True,
        )
        self.val_loader = self.test_loader = None
        if "val_dataset" in self.config:
            self.val_dataset = registry.get_dataset_class(
                self.config["task"]["dataset"])(self.config["val_dataset"])
            self.val_loader = DataLoader(
                self.val_dataset,
                self.config["optim"].get("eval_batch_size", 64),
                shuffle=False,
                collate_fn=self.parallel_collater,
                num_workers=self.config["optim"]["num_workers"],
                pin_memory=True,
            )
        if "test_dataset" in self.config:
            self.test_dataset = registry.get_dataset_class(
                self.config["task"]["dataset"])(
                    self.config["test_dataset"])
            self.test_loader = DataLoader(
                self.test_dataset,
                self.config["optim"].get("eval_batch_size", 64),
                shuffle=False,
                collate_fn=self.parallel_collater,
                num_workers=self.config["optim"]["num_workers"],
                pin_memory=True,
            )

        if "relax_dataset" in self.config["task"]:
            # The relaxation set must point at an existing lmdb file.
            assert os.path.isfile(
                self.config["task"]["relax_dataset"]["src"])

            self.relax_dataset = registry.get_dataset_class(
                "single_point_lmdb")(self.config["task"]["relax_dataset"])

            # Relaxations are sharded across ranks with no shuffling.
            self.relax_sampler = DistributedSampler(
                self.relax_dataset,
                num_replicas=distutils.get_world_size(),
                rank=distutils.get_rank(),
                shuffle=False,
            )
            self.relax_loader = DataLoader(
                self.relax_dataset,
                batch_size=self.config["optim"].get("eval_batch_size", 64),
                collate_fn=self.parallel_collater,
                num_workers=self.config["optim"]["num_workers"],
                pin_memory=True,
                sampler=self.relax_sampler,
            )
    else:
        # Non-lmdb datasets provide their own train/val/test split.
        self.dataset = registry.get_dataset_class(
            self.config["task"]["dataset"])(self.config["dataset"])
        (
            self.train_loader,
            self.val_loader,
            self.test_loader,
        ) = self.dataset.get_dataloaders(
            batch_size=self.config["optim"]["batch_size"],
            collate_fn=self.parallel_collater,
        )

    self.num_targets = 1

    # Normalizer for the dataset.
    # Compute mean, std of training set labels.
    self.normalizers = {}
    if self.config["dataset"].get("normalize_labels", False):
        if "target_mean" in self.config["dataset"]:
            self.normalizers["target"] = Normalizer(
                mean=self.config["dataset"]["target_mean"],
                std=self.config["dataset"]["target_std"],
                device=self.device,
            )
        else:
            # No explicit stats in config: fit on the training labels.
            # NOTE(review): assumes the dataset exposes ``data.y`` and
            # ``__indices__`` — that does not hold for trajectory_lmdb;
            # confirm this path is only reached for in-memory datasets.
            self.normalizers["target"] = Normalizer(
                tensor=self.train_loader.dataset.data.y[
                    self.train_loader.dataset.__indices__],
                device=self.device,
            )

    # If we're computing gradients wrt input, set mean of normalizer to 0 --
    # since it is lost when compute dy / dx -- and std to forward target std
    if self.config["model_attributes"].get("regress_forces", True):
        if self.config["dataset"].get("normalize_labels", False):
            if "grad_target_mean" in self.config["dataset"]:
                self.normalizers["grad_target"] = Normalizer(
                    mean=self.config["dataset"]["grad_target_mean"],
                    std=self.config["dataset"]["grad_target_std"],
                    device=self.device,
                )
            else:
                self.normalizers["grad_target"] = Normalizer(
                    tensor=self.train_loader.dataset.data.y[
                        self.train_loader.dataset.__indices__],
                    device=self.device,
                )
            # Gradient targets keep the forward std but a zero mean.
            self.normalizers["grad_target"].mean.fill_(0)

    # Only the master rank logs histograms (and never for qm9).
    if (self.is_vis and self.config["task"]["dataset"] != "qm9"
            and distutils.is_master()):
        # Plot label distribution.
        plots = [
            plot_histogram(
                self.train_loader.dataset.data.y.tolist(),
                xlabel="{}/raw".format(self.config["task"]["labels"][0]),
                ylabel="# Examples",
                title="Split: train",
            ),
            plot_histogram(
                self.val_loader.dataset.data.y.tolist(),
                xlabel="{}/raw".format(self.config["task"]["labels"][0]),
                ylabel="# Examples",
                title="Split: val",
            ),
            plot_histogram(
                self.test_loader.dataset.data.y.tolist(),
                xlabel="{}/raw".format(self.config["task"]["labels"][0]),
                ylabel="# Examples",
                title="Split: test",
            ),
        ]
        self.logger.log_plots(plots)
def load_task(self):
    """Instantiate the task's dataset, select the target column(s), build
    train/val/test dataloaders, set up label normalizers, and optionally
    log label-distribution histograms.
    """
    task_cfg = self.config["task"]
    dataset_cfg = self.config["dataset"]

    print("### Loading dataset: {}".format(task_cfg["dataset"]))

    dataset = registry.get_dataset_class(task_cfg["dataset"])(dataset_cfg)

    # Multi-target datasets may be narrowed down to one label column.
    if task_cfg["dataset"] not in ["qm9", "dogss"]:
        num_targets = 1
    else:
        num_targets = dataset.data.y.shape[-1]
        if ("label_index" in task_cfg
                and task_cfg["label_index"] is not False):
            dataset.data.y = dataset.data.y[:, int(task_cfg["label_index"])]
            num_targets = 1
    self.num_targets = num_targets

    loaders = dataset.get_dataloaders(
        batch_size=int(self.config["optim"]["batch_size"]))
    self.train_loader, self.val_loader, self.test_loader = loaders

    # Normalizer for the dataset.
    # Compute mean, std of training set labels.
    self.normalizers = {}
    normalize = dataset_cfg.get("normalize_labels", True)
    if normalize:
        train_ds = self.train_loader.dataset
        self.normalizers["target"] = Normalizer(
            train_ds.data.y[train_ds.__indices__],
            self.device,
        )

    # If we're computing gradients wrt input, set mean of normalizer to 0 --
    # since it is lost when compute dy / dx -- and std to forward target std
    if "grad_input" in task_cfg and normalize:
        train_ds = self.train_loader.dataset
        grad_normalizer = Normalizer(
            train_ds.data.y[train_ds.__indices__],
            self.device,
        )
        grad_normalizer.mean.fill_(0)
        self.normalizers["grad_target"] = grad_normalizer

    if self.is_vis and task_cfg["dataset"] != "qm9":
        # Plot label distribution.
        xlabel = "{}/raw".format(task_cfg["labels"][0])
        plots = [
            plot_histogram(
                self.train_loader.dataset.data.y.tolist(),
                xlabel=xlabel,
                ylabel="# Examples",
                title="Split: train",
            ),
            plot_histogram(
                self.val_loader.dataset.data.y.tolist(),
                xlabel=xlabel,
                ylabel="# Examples",
                title="Split: val",
            ),
            plot_histogram(
                self.test_loader.dataset.data.y.tolist(),
                xlabel=xlabel,
                ylabel="# Examples",
                title="Split: test",
            ),
        ]
        self.logger.log_plots(plots)