def load(Cls, model=None, params=None, preprocessor=None, **kwargs):
    """Load a model

    Parameters
    ----------
    model : str
        The path to load the model from the .ml4c file for inference.
    params : str
        The path to load the .params file with users' inputs.
    preprocessor : str
        The path to load the file with the sklearn preprocessor object.
    """
    kwargs["ml4chem_path"] = model
    kwargs["preprocessor"] = preprocessor

    with open(params) as ml4chem_params:
        ml4chem_params = json.load(ml4chem_params)
        model_type = ml4chem_params["model"].get("type")

        if model_type == "svm":
            model_params = ml4chem_params["model"]
            del model_params["name"]  # delete unneeded key, value
            del model_params["type"]  # delete unneeded key, value
            from ml4chem.models.kernelridge import KernelRidge

            weights = load(model)
            # TODO remove after de/serialization is fixed.
            weights = {key.decode("utf-8"): value for key, value in weights.items()}
            model_params.update({"weights": weights})
            model = KernelRidge(**model_params)
        else:
            # Instantiate the model class
            model_params = ml4chem_params["model"]
            del model_params["name"]  # delete unneeded key, value
            del model_params["type"]  # delete unneeded key, value
            from ml4chem.models.neuralnetwork import NeuralNetwork

            model = NeuralNetwork(**model_params)

    # Instantiation of fingerprint class
    fingerprint_params = ml4chem_params.get("fingerprints", None)

    if fingerprint_params is None:
        fingerprints = fingerprint_params
    else:
        name = fingerprint_params.get("name")
        del fingerprint_params["name"]

        fingerprints = dynamic_import(name, "ml4chem.fingerprints")
        fingerprints = fingerprints(**fingerprint_params)

    calc = Cls(fingerprints=fingerprints, model=model, **kwargs)

    return calc
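
# Usage sketch (illustrative, not part of the library): restoring a saved
# calculator for inference with this classmethod. The import path and file
# names are hypothetical and assume a model written out by the corresponding
# save routine.
def _example_load_calculator():
    from ml4chem import Potentials  # assumed import location for this version

    calc = Potentials.load(
        model="cu_training.ml4c",  # hypothetical serialized model
        params="cu_training.params",  # hypothetical JSON params file
        preprocessor="cu_training.scaler",  # hypothetical sklearn preprocessor
    )
    return calc
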
def load_encoder(self, encoder, **kwargs):
    """Load an autoencoder in eval() mode

    Parameters
    ----------
    encoder : dict
        Dictionary with structure:

        >>> encoder = {'model': file.ml4c, 'params': file.params}

    data : obj
        data object
    svm : bool
        Whether or not these features are going to be used for kernel
        methods.

    Returns
    -------
    autoencoder.eval() : obj
        Autoencoder model object in eval mode to get the latent space.
    """
    params_path = encoder.get("params")
    model_path = encoder.get("model")

    model_params = json.load(open(params_path, "r"))
    model_params = model_params.get("model")
    name = model_params.pop("name")
    del model_params["type"]  # delete unneeded key, value

    input_dimension = model_params.pop("input_dimension")
    output_dimension = model_params.pop("output_dimension")

    autoencoder = dynamic_import(
        name, "ml4chem.atomistic.models", alt_name="autoencoders"
    )
    autoencoder = autoencoder(**model_params)
    autoencoder.prepare_model(input_dimension, output_dimension, **kwargs)
    autoencoder.load_state_dict(torch.load(model_path), strict=True)

    return autoencoder.eval()
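
# Usage sketch (illustrative): the encoder argument is a dictionary pointing to
# the serialized autoencoder and its params file, mirroring the docstring
# above. File names are hypothetical; `features` is an instance of this class
# and `data` is the Data handler object.
def _example_load_encoder(features, data):
    encoder = {
        "model": "autoencoder.ml4c",  # hypothetical state_dict file
        "params": "autoencoder.params",  # hypothetical JSON params file
    }
    # Returns the autoencoder in eval() mode, ready to produce the latent space.
    return features.load_encoder(encoder, data=data, purpose="inference")
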
def calculate(self, images, purpose="training", data=None, svm=False):
    """Return features per atom in an atoms object

    Parameters
    ----------
    images : dict
        Hashed images using the Data class.
    purpose : str
        The supported purposes are: 'training', 'inference'.
    data : obj
        data object
    svm : bool
        Whether or not these features are going to be used for kernel
        methods.

    Returns
    -------
    feature_space : dict
        A dictionary with key hash and value as a list with the following
        structure: {'hash': [('H', [vector])]}
    """
    # Now, we need to take the inputs and convert them to the right feature
    # space.
    name, kwargs = self.features
    features = dynamic_import(name, "ml4chem.atomistic.features")
    features = features(**kwargs)

    feature_space = features.calculate(images, data=data, purpose=purpose, svm=False)

    preprocessor = Preprocessing(self.preprocessor, purpose=purpose)
    preprocessor.set(purpose=purpose)

    encoder = self.load_encoder(self.encoder, data=data, purpose=purpose)

    if self.preprocessor is not None and purpose == "training":
        hashes, symbols, _latent_space = encoder.get_latent_space(
            feature_space, svm=True, purpose="preprocessing"
        )
        _latent_space = preprocessor.fit(_latent_space, scheduler=self.scheduler)

        latent_space = OrderedDict()

        # TODO parallelize this.
        index = 0
        for i, hash in enumerate(hashes):
            pairs = []

            for symbol in symbols[i]:
                feature_vector = _latent_space[index]

                if svm is False:
                    feature_vector = torch.tensor(
                        feature_vector, requires_grad=False, dtype=torch.float
                    )

                pairs.append((symbol, feature_vector))
                index += 1

            latent_space[hash] = pairs

        del _latent_space

        # Save preprocessor.
        preprocessor.save_to_file(preprocessor, self.save_preprocessor)

    elif self.preprocessor is not None and purpose == "inference":
        hashes, symbols, _latent_space = encoder.get_latent_space(
            feature_space, svm=True, purpose="preprocessing"
        )
        scaled_latent_space = preprocessor.transform(_latent_space)

        latent_space = OrderedDict()

        # TODO parallelize this.
        index = 0
        for i, hash in enumerate(hashes):
            pairs = []

            for symbol in symbols[i]:
                feature_vector = scaled_latent_space[index]

                if svm is False:
                    feature_vector = torch.tensor(
                        feature_vector, requires_grad=False, dtype=torch.float
                    )

                pairs.append((symbol, feature_vector))
                index += 1

            latent_space[hash] = pairs

        del _latent_space

    else:
        if encoder.name() == "VAE":
            purpose = "inference"
        latent_space = encoder.get_latent_space(
            feature_space, svm=svm, purpose=purpose
        )

    self.feature_space = latent_space

    return latent_space
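
# Illustrative sketch of the returned structure: an OrderedDict keyed by image
# hash whose values are lists of (symbol, feature_vector) pairs, i.e. one
# latent vector per atom. The hash, symbols, and numbers below are made up.
def _example_latent_space_structure():
    from collections import OrderedDict

    import torch

    latent_space = OrderedDict()
    latent_space["0xf00dcafe"] = [  # hypothetical image hash
        ("H", torch.tensor([0.12, -0.53], dtype=torch.float)),
        ("H", torch.tensor([0.11, -0.49], dtype=torch.float)),
        ("O", torch.tensor([1.02, 0.33], dtype=torch.float)),
    ]
    return latent_space
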
def train(
    self,
    inputs,
    targets,
    data=None,
    optimizer=(None, None),
    epochs=100,
    regularization=None,
    convergence=None,
    lossfxn=None,
    device="cpu",
    batch_size=None,
    lr_scheduler=None,
    independent_loss=True,
    loss_weights=None,
):
    """Train the models

    Parameters
    ----------
    inputs : dict
        Dictionary with hashed feature space.
    targets : list
        The expected values that the model has to learn aka y.
    data : object
        Data object created from the handler.
    optimizer : tuple
        The optimizer is a tuple with the structure:

        >>> ('adam', {'lr': float, 'weight_decay': float})

    epochs : int
        Number of full training cycles.
    regularization : float
        This is the L2 regularization. It is not the same as weight decay.
    convergence : dict
        Instead of using epochs, users can set a convergence criterion.

        >>> convergence = {"rmse": [0.04, 0.02]}

    lossfxn : obj
        A loss function object.
    device : str
        Calculation can be run in the cpu or cuda (gpu).
    batch_size : int
        Number of data points per batch to use for training. Default is None.
    lr_scheduler : tuple
        Tuple with structure: scheduler's name and a dictionary with keyword
        arguments.

        >>> lr_scheduler = ('ReduceLROnPlateau', {'mode': 'min', 'patience': 10})

    independent_loss : bool
        Whether or not models' weights are optimized independently.
    loss_weights : list
        How much the loss of model(i) contributes to the total loss.
    """
    self.epochs = epochs

    # Convergence criterion
    if isinstance(convergence["rmse"], (float, int)):
        convergence["rmse"] = np.array(
            [convergence["rmse"] for model in range(len(self.models))]
        )
    elif isinstance(convergence["rmse"], list):
        if len(convergence["rmse"]) != len(self.models):
            raise ValueError(
                "Your convergence list is not the same length as the number of models."
            )
        convergence["rmse"] = np.array(convergence["rmse"])

    logger.info(" ")
    logging.info("Model Merger")
    logging.info("============")
    now = datetime.datetime.now()
    logger.info("Module accessed on {}.".format(now.strftime("%Y-%m-%d %H:%M:%S")))
    logging.info("Merging the following models:")

    for model in self.models:
        logging.info(" - {}.".format(model.name()))

    logging.info("Loss functions:")

    if loss_weights is None:
        self.loss_weights = [1.0 / len(lossfxn) for l in lossfxn]
    else:
        self.loss_weights = loss_weights

    for index, l in enumerate(lossfxn):
        logging.info(
            " - Name: {}; Weight: {}.".format(l.__name__, self.loss_weights[index])
        )

    logging.info("Convergence criterion: {}.".format(convergence))

    # If no batch_size is provided, the whole training set length is the batch.
    if batch_size is None:
        batch_size = len(inputs.values())

    if isinstance(batch_size, int):
        chunks = []
        for inputs_ in inputs:
            if inspect.ismethod(inputs_):
                chunks.append(inputs_)
            else:
                chunks.append(list(get_chunks(inputs_, batch_size, svm=False)))

        targets = [
            list(get_chunks(target, batch_size, svm=False)) for target in targets
        ]
        atoms_per_image = list(get_chunks(data.atoms_per_image, batch_size, svm=False))

    if lossfxn is None:
        self.lossfxn = [None for model in self.models]
    else:
        self.lossfxn = lossfxn

    self.device = device

    # Population of extra attributes needed by the models, and further data
    # preprocessing.
    for index, loss in enumerate(lossfxn):
        _args, _varargs, _keywords, _defaults = inspect.getargspec(loss)
        if "latent" in _args:
            train = dynamic_import(
                "train", "ml4chem.atomistic.models", alt_name="autoencoders"
            )
            self.inputs_chunk_vals = train.get_inputs_chunks(chunks[index])
        else:
            self.inputs_chunk_vals = None

    parameters = []
    for index, model in enumerate(self.models):
        parameters += model.parameters()

        if model.name() == "PytorchPotentials":
            # These models require targets as tensors.
            self.atoms_per_image = torch.tensor(
                atoms_per_image, requires_grad=False, dtype=torch.float
            )
            _targets = [
                torch.tensor(batch, requires_grad=False) for batch in targets[index]
            ]
            targets[index] = _targets
            del _targets
        elif model.name() in ModelMerger.autoencoders:
            targets[index] = lod_to_list(targets[index])

    # Data scattering
    client = dask.distributed.get_client()

    # self.targets = [client.scatter(target) for target in targets]
    self.targets = [target for target in targets]
    self.chunks = []

    for i, chunk in enumerate(chunks):
        if inspect.ismethod(chunk) is False:
            self.chunks.append(client.scatter(chunk))
        else:
            # This list comprehension is useful to have the same number of
            # functions as the number of chunks without users' input.
            chunk = [chunk for _ in range(len(self.targets[i]))]
            self.chunks.append(chunk)

    del chunks

    logger.info(" ")
    logging.info("Batch Information")
    logging.info("-----------------")
    logging.info("Number of batches:")
    for index, c in enumerate(self.chunks):
        logging.info(" - Model {}, {}.".format(index, len(c)))
    logging.info("Batch size: {} elements per batch.\n".format(batch_size))

    # Define optimizer
    self.optimizer_name, self.optimizer = get_optimizer(optimizer, parameters)

    if lr_scheduler is not None:
        self.scheduler = get_lr_scheduler(self.optimizer, lr_scheduler)

    logger.info(" ")
    logger.info("Starting training...")
    logger.info(" ")
    logger.info(
        "{:6s} {:19s} {:12s} {:8s}".format("Epoch", "Time Stamp", "Loss", "RMSE (ave)")
    )
    logger.info(
        "{:6s} {:19s} {:12s} {:8s}".format(
            "------", "-------------------", "------------", "--------------"
        )
    )

    converged = False
    epoch = 0

    if independent_loss is False:
        # Convert list of chunks from [[a, c], [b, d]] to [[a, b], [c, d]]
        self.chunks = list(map(list, zip(*self.chunks)))

    old_state_dict = {}
    for key in self.models[1].state_dict():
        old_state_dict[key] = self.models[1].state_dict()[key].clone()

    from ml4chem.atomistic.models.autoencoders import Annealer

    annealer = Annealer()

    while not converged:
        epoch += 1
        self.annealing = annealer.update(epoch)

        self.optimizer.zero_grad()  # clear previous gradients

        if independent_loss:
            losses = []
            outputs = []
            for model_index, model in enumerate(self.models):
                loss, output = self.closure(
                    model_index, model, independent_loss, name=model.name()
                )
                losses.append(loss)
                outputs.append(output)
        else:
            loss, outputs = self.closure(index, self.models, independent_loss)

        rmse = []
        for i, model in enumerate(self.models):
            outputs_ = outputs[i]
            targets_ = self.targets[i]

            if model.name() == "VAE":
                # VAE usually returns a complex output with mus and sigmas,
                # but we only need mus at this stage.
                outputs_ = [sublist[0] for sublist in outputs_]
            rmse.append(compute_rmse(outputs_, targets_))

        rmse = np.array(rmse)
        _rmse = np.average(rmse)

        if self.optimizer_name != "LBFGS":
            self.optimizer.step()
        else:
            options = {"closure": self.closure, "current_loss": loss, "max_ls": 10}
            self.optimizer.step(options)

        ts = time.time()
        ts = datetime.datetime.fromtimestamp(ts).strftime("%Y-%m-%d %H:%M:%S")
        logger.info("{:6d} {} {:8e} {:8f}".format(epoch, ts, loss, _rmse))

        if convergence is None and epoch == self.epochs:
            converged = True
        elif convergence is not None and (rmse <= convergence["rmse"]).all():
            converged = True

            new_state_dict = {}
            for key in self.models[1].state_dict():
                new_state_dict[key] = self.models[1].state_dict()[key].clone()

            for key in old_state_dict:
                if not (old_state_dict[key] == new_state_dict[key]).all():
                    print("Diff in {}".format(key))
                else:
                    print("No diff in {}".format(key))

            print(convergence)
            print(rmse)

    print("Final")
    print(convergence)
    print(rmse)
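
# Usage sketch (illustrative): training two merged models. `merger`, `inputs`,
# `targets`, `data_handler`, and `losses` are assumed to exist; the call only
# shows the expected argument structures (per-model convergence, loss weights,
# optimizer and lr_scheduler tuples).
def _example_merged_training(merger, inputs, targets, data_handler, losses):
    merger.train(
        inputs,
        targets,
        data=data_handler,
        optimizer=("adam", {"lr": 1e-3, "weight_decay": 0.0}),
        epochs=2000,
        convergence={"rmse": [0.04, 0.02]},  # one RMSE threshold per model
        lossfxn=losses,  # one loss function per model
        lr_scheduler=("ReduceLROnPlateau", {"mode": "min", "patience": 10}),
        independent_loss=True,
        loss_weights=[1.0, 1.0],
    )
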
def closure(self, index, model, independent_loss, name=None):
    """Closure

    This method clears previous gradients, iterates over batches,
    accumulates the gradients, reduces the gradients, updates model
    params, and finally returns loss and outputs_.

    Parameters
    ----------
    index : int
        Index of model.
    model : obj
        Model object.
    independent_loss : bool
        Whether or not models' weights are optimized independently.
    name : str, optional
        Model class's name, by default None.

    Returns
    -------
    loss, outputs
        A tuple with loss function magnitudes and a tensor with outputs.
    """
    client = dask.distributed.get_client()

    if name == "PytorchPotentials" and independent_loss:
        train = dynamic_import(
            "train", "ml4chem.atomistic.models", alt_name="neuralnetwork"
        )

        inputs = []
        # FIXME this is not scaling to n number of models.
        for chunk_index, chunk in enumerate(self.chunks[index - 1]):
            inputs_ = self.chunks[index][chunk_index](OrderedDict(chunk.result()))
            inputs.append(client.scatter(inputs_))

        loss, outputs_ = train.closure(
            inputs,
            self.targets[index],
            model,
            self.lossfxn[index],
            self.atoms_per_image,
            self.device,
        )
        return loss, outputs_

    elif name in ModelMerger.autoencoders and independent_loss:
        train = dynamic_import(
            "train", "ml4chem.atomistic.models", alt_name="autoencoders"
        )

        targets = self.targets[index]

        loss, outputs_ = train.closure(
            self.chunks[index],
            targets,
            model,
            self.lossfxn[index],
            self.device,
            self.inputs_chunk_vals,
        )
        return loss, outputs_

    else:
        # Models are dependent on each other.
        running_loss = torch.tensor(0, dtype=torch.float)
        accumulation = []

        for index, chunk in enumerate(self.chunks):
            accumulation.append(
                client.submit(
                    self.train_batches,
                    *(
                        index,
                        chunk,
                        self.targets,
                        self.models,
                        self.lossfxn,
                        self.atoms_per_image,
                        self.device,
                    )
                )
            )

        dask.distributed.wait(accumulation)
        accumulation = client.gather(accumulation)

        grads = {}
        outputs_ = {}
        losses = {}

        # Note: only the per-model index is used here; the enumerate index of
        # the batch is not needed.
        for _, (outputs, loss, grad) in enumerate(accumulation):
            for model_index in range(len(self.models)):
                if model_index not in grads.keys():
                    grads[model_index] = []
                    outputs_[model_index] = []
                    losses[model_index] = []

                running_loss += loss[model_index]
                losses[model_index].append(loss[model_index])
                grads[model_index].append(np.array(grad[model_index]))
                outputs_[model_index].append(outputs[model_index])

        # Sum gradients per model.
        for key, grad in grads.items():
            grads[key] = sum(grad)

        # Update the gradients of the model.
        for model_index, model in enumerate(self.models):
            for index, param in enumerate(model.parameters()):
                param.grad = torch.tensor(grads[model_index][index])

        return running_loss, outputs_
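
# Minimal sketch (illustrative, not part of the class): the dependent-loss
# branch above reduces per-batch gradients by summing them per model and then
# writes the result into each parameter's .grad before optimizer.step(). The
# helper below only mirrors that final assignment step; `summed_grads` is a
# hypothetical sequence with one gradient array per parameter.
def _example_assign_summed_gradients(model, summed_grads):
    import torch

    for param_index, param in enumerate(model.parameters()):
        # Overwrite the autograd-computed gradient with the reduced one.
        param.grad = torch.tensor(summed_grads[param_index])
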
def load(Cls, model=None, params=None, preprocessor=None, **kwargs):
    """Load ML4Chem models

    Parameters
    ----------
    model : str
        The path to load the model from the .ml4c file for inference.
    params : str
        The path to load the .params file with users' inputs.
    preprocessor : str
        The path to load the file with the sklearn preprocessor object.
    """
    kwargs["ml4chem_path"] = model
    kwargs["preprocessor"] = preprocessor

    with open(params, "rb") as ml4chem_params:
        ml4chem_params = json.load(ml4chem_params)
        model_type = ml4chem_params["model"].get("type")

        model_params = ml4chem_params["model"]
        class_name = model_params["class_name"]
        module_name = Potentials.module_names[model_params["name"]]

        model_class = dynamic_import(
            class_name, "ml4chem.atomistic.models", alt_name=module_name
        )

        delete = ["name", "type", "class_name"]
        for param in delete:
            # delete unneeded (key, value) pairs.
            del model_params[param]

        if model_type == "svm":
            weights = load(model)
            # TODO remove after de/serialization is fixed.
            try:
                weights = {
                    key.decode("utf-8"): value for key, value in weights.items()
                }
            except AttributeError:
                weights = {key: value for key, value in weights.items()}

            model_params.update({"weights": weights})
            model = model_class(**model_params)
        else:
            # Instantiate the model class
            model = model_class(**model_params)

    # Instantiation of fingerprint class
    fingerprint_params = ml4chem_params.get("features", None)

    if fingerprint_params is None:
        features = None
    else:
        if "kwargs" in fingerprint_params.keys():
            update_dict_with = fingerprint_params.pop("kwargs")
            fingerprint_params.update(update_dict_with)

        name = fingerprint_params.get("name")
        del fingerprint_params["name"]

        features = dynamic_import(name, "ml4chem.atomistic.features")
        features = features(**fingerprint_params)

    calc = Cls(features=features, model=model, **kwargs)

    return calc
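
# Usage sketch (illustrative): restoring a trained calculator and attaching it
# to an ASE Atoms object for inference. The import path and file names are
# hypothetical and assume a model previously saved by this class.
def _example_inference():
    from ase.build import molecule
    from ml4chem.atomistic import Potentials  # assumed import location

    calc = Potentials.load(
        model="cu_training.ml4c",
        params="cu_training.params",
        preprocessor="cu_training.scaler",
    )
    atoms = molecule("H2O")
    atoms.set_calculator(calc)
    return atoms.get_potential_energy()
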
def train(
    self,
    training_set,
    epochs=100,
    lr=0.001,
    convergence=None,
    device="cpu",
    optimizer=(None, None),
    lossfxn=None,
    regularization=0.0,
    batch_size=None,
    **kwargs
):
    """Method to train models

    Parameters
    ----------
    training_set : object, list
        List containing the training set.
    epochs : int
        Number of full training cycles.
    lr : float
        Learning rate.
    convergence : dict
        Instead of using epochs, users can set a convergence criterion.
    device : str
        Calculation can be run in the cpu or cuda (gpu).
    optimizer : tuple
        The optimizer is a tuple with the structure:

        >>> ('adam', {'lr': float, 'weight_decay': float})

    lossfxn : object
        A loss function object.
    regularization : float
        This is the L2 regularization. It is not the same as weight decay.
    batch_size : int
        Number of data points per batch to use for training. Default is None.
    """
    purpose = "training"

    # Raw input and targets aka X, y
    data_handler = Data(training_set, purpose=purpose)
    training_set, targets = data_handler.get_data(purpose=purpose)

    # Now let's featurize
    # SVM models
    if self.model.name() in Potentials.svm_models:
        # Mapping raw positions into a feature space aka X
        feature_space, reference_features = self.features.calculate(
            training_set, data=data_handler, purpose=purpose, svm=True
        )

        self.model.prepare_model(
            feature_space, reference_features, data=data_handler
        )
        self.model.train(feature_space, targets)
    else:
        # Mapping raw positions into a feature space aka X
        feature_space = self.features.calculate(
            training_set, data=data_handler, purpose=purpose, svm=False
        )

        # Fixed fingerprint dimension
        input_dimension = len(list(feature_space.values())[0][0][-1])
        self.model.prepare_model(input_dimension, data=data_handler)

        # CUDA stuff
        if device == "cuda":
            logger.info("Checking if CUDA is available...")
            use_cuda = torch.cuda.is_available()
            if use_cuda:
                count = torch.cuda.device_count()
                logger.info("ML4Chem found {} CUDA devices available.".format(count))

                for index in range(count):
                    device_name = torch.cuda.get_device_name(index)
                    if index == 0:
                        device_name += " (Default)"
                    logger.info(" - {}.".format(device_name))
            else:
                logger.warning("No CUDA available. We will use CPU.")
                device = "cpu"

        device_ = torch.device(device)
        self.model.to(device_)

        # This is something specific to pytorch.
        module = Potentials.module_names[self.model.name()]
        train = dynamic_import("train", "ml4chem.atomistic.models", alt_name=module)

        # Let's train
        train(
            feature_space,
            targets,
            model=self.model,
            data=data_handler,
            optimizer=optimizer,
            regularization=regularization,
            epochs=epochs,
            convergence=convergence,
            lossfxn=lossfxn,
            device=device,
            batch_size=batch_size,
            **kwargs
        )

    self.save(self.model, features=self.features, path=self.path, label=self.label)
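
# Usage sketch (illustrative): a minimal end-to-end training call. The
# trajectory file, feature and model constructors, and their keyword arguments
# are assumptions for illustration and may differ from the installed version.
def _example_training():
    from ase.io import Trajectory
    from ml4chem.atomistic import Potentials
    from ml4chem.atomistic.features import Gaussian
    from ml4chem.atomistic.models.neuralnetwork import NeuralNetwork

    images = Trajectory("cu_training.traj")  # hypothetical training images

    calc = Potentials(
        features=Gaussian(cutoff=6.5, normalized=True),
        model=NeuralNetwork(hiddenlayers=(10, 10), activation="relu"),
        label="cu_training",
    )
    calc.train(
        training_set=images,
        epochs=100,
        convergence={"energy": 5e-3},
        optimizer=("adam", {"lr": 1e-3, "weight_decay": 0.0}),
    )
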
def train(
    self,
    inputs,
    targets,
    data=None,
    optimizer=(None, None),
    regularization=None,
    epochs=100,
    convergence=None,
    lossfxn=None,
    device="cpu",
    batch_size=None,
    lr_scheduler=None,
    independent_loss=True,
    loss_weights=None,
):
    # Keep the number of epochs for the convergence check below.
    self.epochs = epochs

    logger.info(" ")
    logging.info("Model Merger")
    logging.info("============")
    logging.info("Merging the following models:")

    for model in self.models:
        logging.info(" - {}.".format(model.name()))

    logging.info("Loss functions:")

    if loss_weights is None:
        self.loss_weights = [1.0 / len(lossfxn) for l in lossfxn]
    else:
        self.loss_weights = loss_weights

    for l in lossfxn:
        logging.info(" - {}.".format(l.__name__))

    # If no batch_size is provided, the whole training set length is the batch.
    if batch_size is None:
        batch_size = len(inputs.values())

    if isinstance(batch_size, int):
        chunks = []
        for inputs_ in inputs:
            if inspect.ismethod(inputs_):
                chunks.append(inputs_)
            else:
                chunks.append(list(get_chunks(inputs_, batch_size, svm=False)))

        targets = [
            list(get_chunks(target, batch_size, svm=False)) for target in targets
        ]
        atoms_per_image = list(get_chunks(data.atoms_per_image, batch_size, svm=False))

    if lossfxn is None:
        self.lossfxn = [None for model in self.models]
    else:
        self.lossfxn = lossfxn

    self.device = device

    # Population of extra attributes needed by the models, and further data
    # preprocessing.
    for index, loss in enumerate(lossfxn):
        _args, _varargs, _keywords, _defaults = inspect.getargspec(loss)
        if "latent" in _args:
            train = dynamic_import("train", "ml4chem.models", alt_name="autoencoders")
            self.inputs_chunk_vals = train.get_inputs_chunks(chunks[index])

    parameters = []
    for index, model in enumerate(self.models):
        parameters += model.parameters()

        if model.name() == "PytorchPotentials":
            # These models require targets as tensors.
            self.atoms_per_image = torch.tensor(
                atoms_per_image, requires_grad=False, dtype=torch.float
            )
            _targets = [
                torch.tensor(batch, requires_grad=False) for batch in targets[index]
            ]
            targets[index] = _targets
            del _targets
        elif model.name() == "AutoEncoder":
            targets[index] = lod_to_list(targets[index])

    # Data scattering
    client = dask.distributed.get_client()

    # self.targets = [client.scatter(target) for target in targets]
    self.targets = [target for target in targets]
    self.chunks = []

    for i, chunk in enumerate(chunks):
        if inspect.ismethod(chunk) is False:
            self.chunks.append(client.scatter(chunk))
        else:
            # This list comprehension is useful to have the same number of
            # functions as the number of chunks without users' input.
            chunk = [chunk for _ in range(len(self.targets[i]))]
            self.chunks.append(chunk)

    del chunks

    logger.info(" ")
    logging.info("Batch Information")
    logging.info("-----------------")
    logging.info("Number of batches:")
    for index, c in enumerate(self.chunks):
        logging.info(" - Model {}, {}.".format(index, len(c)))
    logging.info("Batch size: {} elements per batch.\n".format(batch_size))

    # Define optimizer
    self.optimizer_name, self.optimizer = get_optimizer(optimizer, parameters)

    if lr_scheduler is not None:
        self.scheduler = get_lr_scheduler(self.optimizer, lr_scheduler)

    logger.info(" ")
    logger.info("Starting training...")
    logger.info(" ")
    logger.info(
        "{:6s} {:19s} {:12s} {:8s}".format("Epoch", "Time Stamp", "Loss", "RMSE (ave)")
    )
    logger.info(
        "{:6s} {:19s} {:12s} {:8s}".format(
            "------", "-------------------", "------------", "--------------"
        )
    )

    converged = False
    epoch = 0

    if independent_loss is False:
        # Convert list of chunks from [[a, c], [b, d]] to [[a, b], [c, d]]
        self.chunks = list(map(list, zip(*self.chunks)))

    old_state_dict = {}
    for key in self.models[1].state_dict():
        old_state_dict[key] = self.models[1].state_dict()[key].clone()

    while not converged:
        epoch += 1

        self.optimizer.zero_grad()  # clear previous gradients

        if independent_loss:
            losses = []
            for model_index, model in enumerate(self.models):
                name = model.name()
                loss, outputs = self.closure(
                    model_index, model, independent_loss, name=name
                )
                losses.append(loss)
        else:
            loss, outputs = self.closure(index, self.models, independent_loss)

        rmse = []
        for i, model in enumerate(self.models):
            rmse.append(compute_rmse(outputs[i], self.targets[i]))
        # print(outputs[1])
        # print(targets[1])
        # print(rmse)
        _rmse = np.average(rmse)

        if self.optimizer_name != "LBFGS":
            self.optimizer.step()
        else:
            options = {"closure": self.closure, "current_loss": loss, "max_ls": 10}
            self.optimizer.step(options)

        ts = time.time()
        ts = datetime.datetime.fromtimestamp(ts).strftime("%Y-%m-%d %H:%M:%S")
        logger.info("{:6d} {} {:8e} {:8f}".format(epoch, ts, loss, _rmse))

        if convergence is None and epoch == self.epochs:
            converged = True
        elif convergence is not None and all(
            i <= convergence["rmse"] for i in rmse
        ):
            converged = True

            new_state_dict = {}
            for key in self.models[1].state_dict():
                new_state_dict[key] = self.models[1].state_dict()[key].clone()

            for key in old_state_dict:
                if not (old_state_dict[key] == new_state_dict[key]).all():
                    print("Diff in {}".format(key))
                else:
                    print("No diff in {}".format(key))
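
# Illustrative sketch (hypothetical helper): the stopping rule used above is
# either a fixed number of epochs or, when a convergence dictionary is given,
# every model's RMSE falling below its "rmse" threshold.
def _example_is_converged(rmse, convergence, epoch, max_epochs):
    if convergence is None:
        return epoch == max_epochs
    return all(value <= convergence["rmse"] for value in rmse)
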