def trainer(self):
    """Run the training loop until convergence.

    Repeatedly evaluates ``train.closure`` over the scattered data chunks,
    steps the optimizer, and logs loss / RMSE per epoch.  Stops when either
    ``self.epochs`` is reached (no convergence criterion given) or the
    per-image RMSE drops below ``self.convergence["energy"]``.
    """
    converged = False
    _loss = []   # history of per-epoch loss values
    _rmse = []   # history of per-epoch RMSE/image values
    epoch = 0
    while not converged:
        epoch += 1
        self.optimizer.zero_grad()  # clear previous gradients
        # Forward/backward pass over all chunks; returns scalar loss tensor
        # and per-image model outputs.
        loss, outputs_ = train.closure(
            self.chunks,
            self.targets,
            self.model,
            self.lossfxn,
            self.atoms_per_image,
            self.device,
        )
        # We step the optimizer
        if self.optimizer_name != "LBFGS":
            self.optimizer.step()
        else:
            # NOTE(review): `self.closure` is not defined anywhere visible in
            # this class; sibling trainers pass `train.closure` here — confirm
            # the LBFGS path actually works before relying on it.
            options = {
                "closure": self.closure,
                "current_loss": loss,
                "max_ls": 10
            }
            self.optimizer.step(options)
        # RMSE per image and per/atom
        client = dask.distributed.get_client()
        rmse = client.submit(compute_rmse, *(outputs_, self.targets))
        # Reshape to a row vector so compute_rmse can normalize per atom.
        atoms_per_image = self.atoms_per_image.view(1, -1)
        rmse_atom = client.submit(
            compute_rmse, *(outputs_, self.targets, atoms_per_image))
        rmse = rmse.result()
        rmse_atom = rmse_atom.result()
        _loss.append(loss.item())
        _rmse.append(rmse)
        # In the case that lr_scheduler is not None
        if self.lr_scheduler is not None:
            self.scheduler.step(loss)
        ts = time.time()
        ts = datetime.datetime.fromtimestamp(ts).strftime("%Y-%m-%d "
                                                          "%H:%M:%S")
        logger.info("{:6d} {} {:8e} {:8f} {:8f}".format(
            epoch, ts, loss, rmse, rmse_atom))
        # Convergence: fixed epoch budget, or energy-RMSE threshold.
        if self.convergence is None and epoch == self.epochs:
            converged = True
        elif self.convergence is not None and rmse < self.convergence[
                "energy"]:
            converged = True
    training_time = time.time() - self.initial_time
    h, m, s = convert_elapsed_time(training_time)
    logger.info(
        "Training finished in {} hours {} minutes {:.2f} seconds.".format(
            h, m, s))
def calculate_features(self, images=None, purpose="training", data=None,
                       svm=False):
    """Calculate the features per atom in an atoms objects

    Parameters
    ----------
    image : dict
        Hashed images using the DataSet class.
    purpose : str
        The supported purposes are: 'training', 'inference'.
    data : obj
        data object
    svm : bool
        Whether or not these features are going to be used for kernel
        methods.

    Returns
    -------
    feature_space : dict
        A dictionary with key hash and value as a list with the following
        structure: {'hash': [('H', [vector]]}
    reference_space : dict
        A reference space useful for SVM models.
    """
    logger.info(" ")
    logger.info("Fingerprinting")
    logger.info("==============")

    # FIXME the block below should become a function.
    # Fast path: reuse previously dumped features when the file exists and
    # overwriting is disabled.
    if os.path.isfile(self.filename) and self.overwrite is False:
        logger.warning("Loading features from {}.".format(self.filename))
        logger.info(" ")
        svm_keys = [b"feature_space", b"reference_space"]
        data = load(self.filename)

        data_hashes = list(data.keys())
        image_hashes = list(images.keys())

        if image_hashes == data_hashes:
            # Check if both lists are the same.
            return data
        elif any(i in image_hashes for i in data_hashes):
            # Check if any of the elem
            _data = {}
            for hash in image_hashes:
                _data[hash] = data[hash]
            return _data

        if svm_keys == list(data.keys()):
            feature_space = data[svm_keys[0]]
            reference_space = data[svm_keys[1]]
            return feature_space, reference_space

    initial_time = time.time()

    # Verify that we know the unique element symbols
    if data.unique_element_symbols is None:
        logger.info(
            "Getting unique element symbols for {}".format(purpose))

        unique_element_symbols = data.get_unique_element_symbols(
            images, purpose=purpose)
        unique_element_symbols = unique_element_symbols[purpose]

        logger.info(
            "Unique chemical elements: {}".format(unique_element_symbols))

    # we make the features
    self.GP = self.custom.get("GP", None)

    if self.GP is None:
        custom = self.custom.get("user_input", None)
        self.GP = self.make_symmetry_functions(
            unique_element_symbols, custom=custom,
            angular_type=self.angular_type)
        self.custom.update({"GP": self.GP})
    else:
        logger.info(
            'Using parameters from file to create symmetry functions...\n')

    self.print_fingerprint_params(self.GP)

    preprocessor = Preprocessing(self.preprocessor, purpose=purpose)
    preprocessor.set(purpose=purpose)

    # We start populating computations to get atomic fingerprints.
    logger.info("")
    logger.info("Adding atomic feature calculations to scheduler...")

    ini = end = 0
    computations = []
    atoms_index_map = [
    ]  # This list is used to reconstruct images from atoms.

    for image in images.items():
        key, image = image
        end = ini + len(image)
        atoms_index_map.append(list(range(ini, end)))
        ini = end
        for atom in image:
            index = atom.index
            symbol = atom.symbol
            nl = get_neighborlist(image, cutoff=self.cutoff)
            # n_indices: neighbor indices for central atom_i.
            # n_offsets: neighbor offsets for central atom_i.
            n_indices, n_offsets = nl[atom.index]
            n_symbols = np.array(image.get_chemical_symbols())[n_indices]
            # Apply periodic-boundary offsets to get true neighbor positions.
            neighborpositions = image.positions[n_indices] + np.dot(
                n_offsets, image.get_cell())
            afp = self.get_atomic_fingerprint(
                atom,
                index,
                symbol,
                n_symbols,
                neighborpositions,
                self.preprocessor,
                image_molecule=image,
                weighted=self.weighted,
                n_indices=n_indices,
            )
            computations.append(afp)

    scheduler_time = time.time() - initial_time
    h, m, s = convert_elapsed_time(scheduler_time)
    logger.info("... finished in {} hours {} minutes {:.2f}"
                " seconds.".format(h, m, s))

    # In this block we compute the fingerprints.
    logger.info("")
    logger.info("Computing fingerprints...")

    stacked_features = dask.compute(*computations,
                                    scheduler=self.scheduler)

    if self.preprocessor is not None:
        stacked_features = np.array(stacked_features)

    # Clean
    del computations

    if purpose == "training":
        # To take advantage of dask_ml we need to convert our numpy array
        # into a dask array.
        client = dask.distributed.get_client()

        if self.preprocessor is not None:
            scaled_feature_space = []
            dim = stacked_features.shape
            stacked_features = dask.array.from_array(stacked_features,
                                                     chunks=dim)
            stacked_features = preprocessor.fit(stacked_features,
                                                scheduler=self.scheduler)
            atoms_index_map = [
                client.scatter(chunk) for chunk in atoms_index_map
            ]

            for indices in atoms_index_map:
                features = client.submit(self.stack_features,
                                         *(indices, stacked_features))
                scaled_feature_space.append(features)
        # More data processing depending on the method used.
        else:
            feature_space = []
            atoms_index_map = [
                client.scatter(chunk) for chunk in atoms_index_map
            ]

            for indices in atoms_index_map:
                features = client.submit(self.stack_features,
                                         *(indices, stacked_features))
                feature_space.append(features)

        del stacked_features
        computations = []

        if svm:
            reference_space = []
            for i, image in enumerate(images.items()):
                computations.append(
                    self.restack_image(
                        i,
                        image,
                        scaled_feature_space=scaled_feature_space,
                        svm=svm))
                # image = (hash, ase_image) -> tuple
                for atom in image[1]:
                    reference_space.append(
                        self.restack_atom(i, atom, scaled_feature_space))

            reference_space = dask.compute(*reference_space,
                                           scheduler=self.scheduler)
        else:
            # NOTE(review): the UnboundLocalError fallback is deliberate —
            # `scaled_feature_space` only exists when a preprocessor ran.
            try:
                for i, image in enumerate(images.items()):
                    computations.append(
                        self.restack_image(
                            i,
                            image,
                            scaled_feature_space=scaled_feature_space,
                            svm=svm,
                        ))
            except UnboundLocalError:
                # scaled_feature_space does not exist.
                for i, image in enumerate(images.items()):
                    computations.append(
                        self.restack_image(i,
                                           image,
                                           feature_space=feature_space,
                                           svm=svm))

        feature_space = dask.compute(*computations,
                                     scheduler=self.scheduler)
        feature_space = OrderedDict(feature_space)

        del computations

        preprocessor.save_to_file(preprocessor, self.save_preprocessor)

        fp_time = time.time() - initial_time
        h, m, s = convert_elapsed_time(fp_time)
        logger.info("Fingerprinting finished in {} hours {} minutes {:.2f}"
                    " seconds.".format(h, m, s))

        if svm:
            if self.filename is not None:
                logger.info("Fingerprints saved to {}.".format(
                    self.filename))
                data = {"feature_space": feature_space}
                data.update({"reference_space": reference_space})
                dump(data, filename=self.filename)
            return feature_space, reference_space
        else:
            if self.filename is not None:
                logger.info("Fingerprints saved to {}.".format(
                    self.filename))
                dump(feature_space, filename=self.filename)
            return feature_space

    elif purpose == "inference":
        feature_space = OrderedDict()
        scaled_feature_space = preprocessor.transform(stacked_features)

        # TODO this has to be parallelized.
        for key, image in images.items():
            if key not in feature_space.keys():
                feature_space[key] = []
            for index, atom in enumerate(image):
                symbol = atom.symbol

                if svm:
                    scaled = scaled_feature_space[index]
                    # TODO change this to something more elegant later
                    try:
                        self.reference_space
                    except AttributeError:
                        # If self.reference does not exist it means that
                        # reference_space is being loaded by Messagepack.
                        symbol = symbol.encode("utf-8")
                else:
                    scaled = torch.tensor(
                        scaled_feature_space[index],
                        requires_grad=False,
                        dtype=torch.float,
                    )

                feature_space[key].append((symbol, scaled))

        fp_time = time.time() - initial_time
        h, m, s = convert_elapsed_time(fp_time)
        logger.info("Fingerprinting finished in {} hours {} minutes {:.2f}"
                    " seconds.".format(h, m, s))

        return feature_space
def __init__(
    self,
    inputs,
    targets,
    model=None,
    data=None,
    optimizer=(None, None),
    regularization=None,
    epochs=100,
    convergence=None,
    lossfxn=None,
    device="cpu",
    batch_size=None,
    lr_scheduler=None,
):
    """Prepare data batches, optimizer, and dask futures, then train.

    Parameters
    ----------
    inputs : dict
        Hashed feature space per image.
    targets : list
        Per-image target values (e.g. energies).
    model : obj
        The torch model to train.
    data : obj
        Data object providing ``atoms_per_image``.
    optimizer : tuple
        (name, kwargs) consumed by ``get_optimizer``.
    regularization : float, optional
        Unused here; kept for interface compatibility.
    epochs : int
        Maximum number of epochs when no convergence criterion is given.
    convergence : dict, optional
        e.g. ``{"energy": 1e-3}`` RMSE threshold.
    lossfxn : callable, optional
        Loss function; defaults to ``AtomicMSELoss``.
    device : str
        "cpu" or "cuda".
    batch_size : int, optional
        Elements per batch; defaults to the full data set.
    lr_scheduler : tuple, optional
        Consumed by ``get_lr_scheduler``.
    """
    self.initial_time = time.time()

    atoms_per_image = data.atoms_per_image

    if batch_size is None:
        batch_size = len(inputs.values())

    if isinstance(batch_size, int):
        # Data batches
        chunks = list(get_chunks(inputs, batch_size, svm=False))
        targets = list(get_chunks(targets, batch_size, svm=False))
        atoms_per_image = list(
            get_chunks(atoms_per_image, batch_size, svm=False))

    logger.info(" ")
    # Fix: use the module-level `logger` consistently; the original mixed
    # in root-logger `logging.info` calls, which bypass this module's
    # handler configuration.
    logger.info("Batch Information")
    logger.info("-----------------")
    logger.info("Number of batches: {}.".format(len(chunks)))
    logger.info("Batch size: {} elements per batch.".format(batch_size))
    logger.info(" ")

    atoms_per_image = torch.tensor(atoms_per_image,
                                   requires_grad=False,
                                   dtype=torch.float)

    targets = torch.tensor(targets, requires_grad=False)

    if device == "cuda":
        logger.info("Moving data to CUDA...")
        atoms_per_image = atoms_per_image.cuda()
        targets = targets.cuda()

        _inputs = OrderedDict()
        for hash, f in inputs.items():
            _inputs[hash] = []
            for features in f:
                symbol, vector = features
                _inputs[hash].append((symbol, vector.cuda()))
        inputs = _inputs

        move_time = time.time() - self.initial_time
        h, m, s = convert_elapsed_time(move_time)
        # Fix: the original string used a backslash continuation inside the
        # literal, injecting a run of spaces into the logged message.
        logger.info("Data moved to GPU in {} hours {} minutes "
                    "{:.2f} seconds.".format(h, m, s))
        logger.info(" ")

    # Define optimizer
    self.optimizer_name, self.optimizer = get_optimizer(
        optimizer, model.parameters())

    if lr_scheduler is not None:
        self.scheduler = get_lr_scheduler(self.optimizer, lr_scheduler)

    logger.info(" ")
    logger.info("Starting training...")
    logger.info(" ")
    logger.info("{:6s} {:19s} {:12s} {:8s} {:8s}".format(
        "Epoch", "Time Stamp", "Loss", "RMSE/img", "RMSE/atom"))
    logger.info("{:6s} {:19s} {:12s} {:8s} {:8s}".format(
        "------", "-------------------", "------------", "--------",
        "---------"))

    self.atoms_per_image = atoms_per_image
    self.convergence = convergence
    self.device = device
    self.epochs = epochs
    self.model = model
    self.lr_scheduler = lr_scheduler

    # Data scattering
    client = dask.distributed.get_client()
    self.chunks = [client.scatter(chunk) for chunk in chunks]
    self.targets = [client.scatter(target) for target in targets]

    if lossfxn is None:
        self.lossfxn = AtomicMSELoss
    else:
        self.lossfxn = lossfxn

    # Let the hunger games begin...
    self.trainer()
def __init__(
    self,
    inputs,
    targets,
    model=None,
    data=None,
    optimizer=(None, None),
    regularization=None,
    epochs=100,
    convergence=None,
    lossfxn=None,
    device="cpu",
    batch_size=None,
    lr_scheduler=None,
    uncertainty=None,
    checkpoint=None,
    test=None,
):
    """Prepare batched data, optimizer, and dask futures, then train.

    Parameters
    ----------
    inputs : dict
        Hashed feature space per image.
    targets : list
        Per-image target values.
    model : obj
        The torch model to train.
    data : obj
        Data object providing ``atoms_per_image``.
    optimizer : tuple
        (name, kwargs) consumed by ``get_optimizer``.
    regularization : float, optional
        Unused here; kept for interface compatibility.
    epochs : int
        Maximum number of epochs.
    convergence : dict, optional
        e.g. ``{"energy": 1e-3}`` RMSE threshold.
    lossfxn : callable, optional
        Loss function; defaults to ``AtomicMSELoss``.
    device : str
        "cpu" or "cuda".
    batch_size : int, optional
        Elements per batch; defaults to the full data set.
    lr_scheduler : tuple, optional
        Consumed by ``get_lr_scheduler``.
    uncertainty : list, optional
        Per-image uncertainties used to penalize the loss.
    checkpoint : dict, optional
        Keyword arguments forwarded to ``checkpoint_save`` each epoch.
    test : dict, optional
        Test-set ``features``/``targets``/``data`` for per-epoch evaluation.
    """
    self.initial_time = time.time()

    if lossfxn is None:
        lossfxn = AtomicMSELoss

    logger.info("")
    logger.info("Training")
    logger.info("========")
    logger.info(f"Convergence criteria: {convergence}")
    logger.info(f"Loss function: {lossfxn.__name__}")
    if uncertainty is not None:
        logger.info("Options:")
        logger.info(f" - Uncertainty penalization: {pformat(uncertainty)}")
    logger.info("")

    atoms_per_image = data.atoms_per_image

    if batch_size is None:
        batch_size = len(inputs.values())

    if isinstance(batch_size, int):
        # Data batches
        chunks = list(get_chunks(inputs, batch_size, svm=False))
        targets = list(get_chunks(targets, batch_size, svm=False))
        atoms_per_image = list(
            get_chunks(atoms_per_image, batch_size, svm=False))

        # Fix: compare against None with `is not`, not `!=` (identity check).
        if uncertainty is not None:
            uncertainty = list(get_chunks(uncertainty, batch_size, svm=False))
            uncertainty = [
                torch.tensor(u, requires_grad=False, dtype=torch.float)
                for u in uncertainty
            ]

    logger.info("")
    # Fix: use the module logger consistently (was root-logger logging.info).
    logger.info("Batch Information")
    logger.info("-----------------")
    logger.info("Number of batches: {}.".format(len(chunks)))
    logger.info("Batch size: {} elements per batch.".format(batch_size))
    logger.info(" ")

    atoms_per_image = [
        torch.tensor(n_atoms, requires_grad=False, dtype=torch.float)
        for n_atoms in atoms_per_image
    ]

    targets = [torch.tensor(t, requires_grad=False) for t in targets]

    if device == "cuda":
        logger.info("Moving data to CUDA...")

        # Fix: atoms_per_image and targets are *lists* of tensors at this
        # point, so .cuda() must be applied per element; calling it on the
        # list itself raised AttributeError.
        atoms_per_image = [t.cuda() for t in atoms_per_image]
        targets = [t.cuda() for t in targets]

        _inputs = OrderedDict()
        for hash, f in inputs.items():
            _inputs[hash] = []
            for features in f:
                symbol, vector = features
                _inputs[hash].append((symbol, vector.cuda()))
        inputs = _inputs

        move_time = time.time() - self.initial_time
        h, m, s = convert_elapsed_time(move_time)
        # Fix: normalized the log string (original used a backslash
        # continuation inside the literal, injecting stray whitespace).
        logger.info("Data moved to GPU in {} hours {} minutes "
                    "{:.2f} seconds.".format(h, m, s))
        logger.info(" ")

    # Define optimizer
    self.optimizer_name, self.optimizer = get_optimizer(
        optimizer, model.parameters()
    )

    if lr_scheduler is not None:
        self.scheduler = get_lr_scheduler(self.optimizer, lr_scheduler)

    self.atoms_per_image = atoms_per_image
    self.convergence = convergence
    self.device = device
    self.epochs = epochs
    self.model = model
    self.lr_scheduler = lr_scheduler
    self.lossfxn = lossfxn
    self.checkpoint = checkpoint
    self.test = test

    # Data scattering
    client = dask.distributed.get_client()
    self.chunks = [client.scatter(chunk) for chunk in chunks]
    self.targets = [client.scatter(target) for target in targets]

    if uncertainty is not None:
        self.uncertainty = [client.scatter(u) for u in uncertainty]
    else:
        self.uncertainty = uncertainty

    # Let the hunger games begin...
    self.trainer()
def trainer(self):
    """Run the training loop, optionally evaluating a test set each epoch.

    Logs a wider table (train + test errors) when ``self.test`` is set.
    Convergence: either the epoch budget is exhausted, or the per-image
    RMSE falls below ``self.convergence["energy"]``.
    """
    logger.info(" ")
    logger.info("Starting training...\n")

    if self.test is None:
        logger.info(
            "{:6s} {:19s} {:12s} {:12s} {:8s}".format(
                "Epoch", "Time Stamp", "Loss", "Error/img", "Error/atom"
            )
        )
        logger.info(
            "{:6s} {:19s} {:12s} {:8s} {:8s}".format(
                "------",
                "-------------------",
                "------------",
                "------------",
                "------------",
            )
        )
    else:
        test_features = self.test.get("features", None)
        test_targets = self.test.get("targets", None)
        test_data = self.test.get("data", None)

        logger.info(
            "{:6s} {:19s} {:12s} {:12s} {:12s} {:12s} {:16s}".format(
                "Epoch",
                "Time Stamp",
                "Loss",
                "Error/img",
                "Error/atom",
                "Error/img (t)",
                "Error/atom (t)",
            )
        )
        logger.info(
            "{:6s} {:19s} {:12s} {:8s} {:8s} {:8s} {:8s}".format(
                "------",
                "-------------------",
                "------------",
                "------------",
                "------------",
                "------------",
                "------------",
            )
        )

    converged = False
    _loss = []   # per-epoch loss history
    _rmse = []   # per-epoch RMSE/image history
    epoch = 0
    # Client fetched once; reused for all submits inside the loop.
    client = dask.distributed.get_client()

    while not converged:
        epoch += 1
        self.optimizer.zero_grad()  # clear previous gradients
        loss, outputs_ = train.closure(
            self.chunks,
            self.targets,
            self.uncertainty,
            self.model,
            self.lossfxn,
            self.atoms_per_image,
            self.device,
        )
        # We step the optimizer
        if self.optimizer_name != "LBFGS":
            self.optimizer.step()
        else:
            # NOTE(review): `self.closure` is not defined in the visible
            # code; other trainers pass `train.closure` — confirm.
            options = {"closure": self.closure, "current_loss": loss, "max_ls": 10}
            self.optimizer.step(options)

        # RMSE per image and per/atom
        rmse = client.submit(compute_rmse, *(outputs_, self.targets))
        # atoms_per_image is a list of tensors here; concatenate for the
        # per-atom normalization.
        atoms_per_image = torch.cat(self.atoms_per_image)
        rmse_atom = client.submit(
            compute_rmse, *(outputs_, self.targets, atoms_per_image)
        )
        rmse = rmse.result()
        rmse_atom = rmse_atom.result()

        _loss.append(loss.item())
        _rmse.append(rmse)

        # In the case that lr_scheduler is not None
        if self.lr_scheduler is not None:
            self.scheduler.step(loss)
            # NOTE(review): debug print left in place; consider logger.debug.
            print("Epoch {} lr {}".format(epoch, get_lr(self.optimizer)))

        ts = time.time()
        ts = datetime.datetime.fromtimestamp(ts).strftime("%Y-%m-%d "
                                                          "%H:%M:%S")

        if self.test is None:
            logger.info(
                "{:6d} {} {:8e} {:4e} {:4e}".format(
                    epoch, ts, loss.detach(), rmse, rmse_atom
                )
            )
        else:
            # Evaluate the held-out test set with the model in eval mode.
            test_model = self.model.eval()
            test_predictions = test_model(test_features).detach()
            rmse_test = client.submit(
                compute_rmse, *(test_predictions, test_targets)
            )

            atoms_per_image_test = torch.tensor(
                test_data.atoms_per_image, requires_grad=False
            )
            rmse_atom_test = client.submit(
                compute_rmse,
                *(test_predictions, test_targets, atoms_per_image_test),
            )

            rmse_test = rmse_test.result()
            rmse_atom_test = rmse_atom_test.result()

            logger.info(
                "{:6d} {} {:8e} {:4e} {:4e} {:4e} {:4e}".format(
                    epoch,
                    ts,
                    loss.detach(),
                    rmse,
                    rmse_atom,
                    rmse_test,
                    rmse_atom_test,
                )
            )

        if self.checkpoint is not None:
            self.checkpoint_save(epoch, self.model, **self.checkpoint)

        if self.convergence is None and epoch == self.epochs:
            converged = True
        elif self.convergence is not None and rmse < self.convergence["energy"]:
            converged = True

    training_time = time.time() - self.initial_time

    h, m, s = convert_elapsed_time(training_time)
    logger.info(
        "Training finished in {} hours {} minutes {:.2f} seconds.".format(h, m, s)
    )
def trainer(self):
    """Run the training loop (autoencoder-style trainer with annealing).

    Each epoch builds the closure arguments (optionally with an annealing
    coefficient and latent-space penalization), steps the optimizer, and
    logs loss and RMSE until the convergence criterion or the epoch budget
    is met.
    """
    converged = False
    _loss = []   # per-epoch loss history
    _rmse = []   # per-epoch RMSE history
    epoch = 0

    annealer = Annealer()

    # Fix: fetch the dask client once outside the loop (the original
    # called get_client() every epoch; sibling trainers hoist it).
    client = dask.distributed.get_client()

    while not converged:
        epoch += 1

        if self.anneal:
            annealing = annealer.update(epoch)
            # NOTE(review): debug print retained to preserve behavior;
            # consider logger.debug instead.
            print(annealing)
        else:
            annealing = None

        self.optimizer.zero_grad()  # clear previous gradients

        args = {
            "chunks": self.chunks,
            "targets": self.targets,
            "model": self.model,
            "lossfxn": self.lossfxn,
            "device": self.device,
            "inputs_chunk_vals": self.inputs_chunk_vals,
            "annealing": annealing,
        }

        if self.penalize_latent:
            args.update({"penalize_latent": self.penalize_latent})

        loss, outputs_ = train.closure(**args)

        # We step the optimizer
        if self.optimizer_name != "LBFGS":
            self.optimizer.step()
        else:
            # LBFGS re-evaluates the closure internally during line search.
            self.optimizer.extra_arguments = args
            options = {"closure": train.closure, "current_loss": loss, "max_ls": 10}
            self.optimizer.step(options)

        # RMSE per image.
        # Fix: removed a dead `rmse = []` assignment that was immediately
        # overwritten by the submit below.
        rmse = client.submit(compute_rmse, *(outputs_, self.targets))
        rmse = rmse.result()

        _loss.append(loss.item())
        _rmse.append(rmse)

        # In the case that lr_scheduler is not None
        if self.lr_scheduler is not None:
            self.scheduler.step(loss)

        ts = time.time()
        ts = datetime.datetime.fromtimestamp(ts).strftime("%Y-%m-%d "
                                                          "%H:%M:%S")
        logger.info("{:6d} {} {:8e} {:8f}".format(epoch, ts, loss, rmse))

        # Converged when the RMSE threshold is met, or when the epoch
        # budget runs out (with or without a convergence criterion).
        if self.convergence is not None and rmse < self.convergence["rmse"]:
            converged = True
        elif self.convergence is not None and epoch == self.epochs:
            converged = True
        elif self.convergence is None and epoch == self.epochs:
            converged = True

    training_time = time.time() - self.initial_time

    h, m, s = convert_elapsed_time(training_time)
    logger.info(
        "Training finished in {} hours {} minutes {:.2f} seconds.".format(h, m, s)
    )
def calculate_features(self, images=None, purpose="training", data=None,
                       svm=False):
    """Return features per atom in an atoms objects

    Parameters
    ----------
    image : dict
        Hashed images using the DataSet class.
    purpose : str
        The supported purposes are: 'training', 'inference'.
    data : obj
        data object
    svm : bool
        Whether or not these features are going to be used for kernel
        methods.

    Returns
    -------
    feature_space : dict
        A dictionary with key hash and value as a list with the following
        structure: {'hash': [('H', [vector]]}
    """
    logger.info(" ")
    logger.info("Fingerprinting")
    logger.info("==============")

    # Fast path: reuse features dumped to disk unless overwriting.
    if os.path.isfile(self.filename) and self.overwrite is False:
        logger.warning("Loading features from {}.".format(self.filename))
        logger.info(" ")
        svm_keys = [b"feature_space", b"reference_space"]
        data = load(self.filename)

        if svm_keys == list(data.keys()):
            feature_space = data[svm_keys[0]]
            reference_space = data[svm_keys[1]]
            return feature_space, reference_space
        else:
            return data

    initial_time = time.time()

    # Verify that we know the unique element symbols
    if data.unique_element_symbols is None:
        logger.info(
            "Getting unique element symbols for {}".format(purpose))

        unique_element_symbols = data.get_unique_element_symbols(
            images, purpose=purpose)
        unique_element_symbols = unique_element_symbols[purpose]

        logger.info(
            "Unique chemical elements: {}".format(unique_element_symbols))

    preprocessor = Preprocessing(self.preprocessor, purpose=purpose)
    preprocessor.set(purpose=purpose)

    # We start populating computations with delayed functions to operate
    # with dask's scheduler. These computations get cartesian coordinates.
    computations = []
    for image in images.items():
        key, image = image
        feature_vectors = []
        computations.append(feature_vectors)
        for atom in image:
            if self.preprocessor is not None:
                # In this case we will preprocess data and need numpy
                # arrays to operate with sklearn.
                afp = self.get_atomic_features(atom, svm=True)
                feature_vectors.append(afp[1])
            else:
                afp = self.get_atomic_features(atom, svm=svm)
                feature_vectors.append(afp)

    # In this block we compute the delayed functions in computations.
    feature_space = dask.compute(*computations, scheduler=self.scheduler)

    hashes = list(images.keys())

    if self.preprocessor is not None and purpose == "training":
        feature_space = np.array(feature_space)
        dim = feature_space.shape

        if len(dim) > 1:
            # Rectangular case: every image has the same atom count, so
            # the array can be flattened to (n_images*n_atoms, n_features)
            # for fitting and restored afterwards.
            d1, d2, d3 = dim
            feature_space = feature_space.reshape(d1 * d2, d3)
            feature_space = preprocessor.fit(feature_space,
                                             scheduler=self.scheduler)
            feature_space = feature_space.reshape(d1, d2, d3)
        else:
            # Ragged case: images differ in atom count; stack all atom
            # vectors and remember per-image index ranges.
            atoms_index_map = []
            stack = []
            d1 = ini = end = 0
            for i in feature_space:
                end = ini + len(i)
                atoms_map = list(range(ini, end))
                atoms_index_map.append(atoms_map)
                ini = end
                for j in i:
                    stack.append(j)
                d1 += 1
            feature_space = np.array(stack)
            d2 = len(stack[0])
            del stack

        # More data processing depending on the method used.
        computations = []
        if svm:
            reference_space = []
            for i, image in enumerate(images.items()):
                computations.append(
                    self.restack_image(i, image, feature_space, svm=svm))
                # image = (hash, ase_image) -> tuple
                for atom in image[1]:
                    reference_space.append(
                        self.restack_atom(i, atom, feature_space))
            reference_space = dask.compute(*reference_space,
                                           scheduler=self.scheduler)
        else:
            for i, image in enumerate(images.items()):
                computations.append(
                    self.restack_image(i, image, feature_space, svm=svm))

        feature_space = dask.compute(*computations,
                                     scheduler=self.scheduler)
        feature_space = OrderedDict(feature_space)

        # Save preprocessor.
        preprocessor.save_to_file(preprocessor, self.save_preprocessor)

    elif self.preprocessor is not None and purpose == "inference":
        # We take stacked features and preprocess them
        stacked_features = np.array(feature_space)
        d1, d2, d3 = stacked_features.shape
        stacked_features = stacked_features.reshape(d1 * d2, d3)
        feature_space = OrderedDict()
        scaled_feature_space = preprocessor.transform(stacked_features)

        # Once preprocessed, they are wrapped as a dictionary.
        # TODO this has to be parallelized.
        for key, image in images.items():
            if key not in feature_space.keys():
                feature_space[key] = []
            for index, atom in enumerate(image):
                symbol = atom.symbol

                if svm:
                    scaled = scaled_feature_space[index]
                    # TODO change this to something more elegant later
                    try:
                        self.reference_space
                    except AttributeError:
                        # If self.reference does not exist it means that
                        # reference_space is being loaded by Messagepack.
                        symbol = symbol.encode("utf-8")
                else:
                    scaled = torch.tensor(
                        scaled_feature_space[index],
                        requires_grad=False,
                        dtype=torch.float,
                    )
                feature_space[key].append((symbol, scaled))
    else:
        # No preprocessing: pair raw per-image feature lists with hashes.
        feature_space = OrderedDict(zip(hashes, feature_space))

    fp_time = time.time() - initial_time

    h, m, s = convert_elapsed_time(fp_time)
    logger.info("Fingerprinting finished in {} hours {} minutes {:.2f} "
                "seconds.\n".format(h, m, s))

    if svm:
        data = {"feature_space": feature_space}
        dump(data, filename=self.filename)
    else:
        dump(feature_space, filename=self.filename)

    return feature_space
def __init__(
    self,
    inputs,
    targets,
    model=None,
    data=None,
    optimizer=(None, None),
    regularization=None,
    epochs=100,
    convergence=None,
    lossfxn=None,
    device="cpu",
    batch_size=None,
    lr_scheduler=None,
    **kwargs
):
    """Prepare batches and optimizer for training, then run the trainer.

    Supported ``**kwargs`` keys are ``anneal`` and ``penalize_latent``;
    unknown keys are silently ignored, missing ones default to ``None``.
    """
    supported_keys = ["anneal", "penalize_latent"]
    if len(kwargs.items()) == 0:
        for k in supported_keys:
            setattr(self, k, None)
    else:
        for k, v in kwargs.items():
            if k in supported_keys:
                setattr(self, k, v)

    self.initial_time = time.time()

    if device == "cuda":
        # NOTE(review): dead branch — the GPU move below is disabled; the
        # orphan triple-quoted string is commented-out code kept verbatim.
        pass
        """
        logger.info('Moving data to CUDA...')

        targets = targets.cuda()
        _inputs = OrderedDict()

        for hash, f in inputs.items():
            _inputs[hash] = []
            for features in f:
                symbol, vector = features
                _inputs[hash].append((symbol, vector.cuda()))

        del inputs
        inputs = _inputs

        move_time = time.time() - initial_time
        h, m, s = convert_elapsed_time(move_time)
        logger.info('Data moved to GPU in {} hours {} minutes {:.2f} seconds.'
                    .format(h, m, s))
        """

    if batch_size is None:
        batch_size = len(inputs.values())

    if isinstance(batch_size, int):
        chunks = list(get_chunks(inputs, batch_size, svm=False))
        targets_ = list(get_chunks(targets, batch_size, svm=False))
        del targets

        # This change is needed because the targets are features or
        # positions and they are built as a dictionary.
        targets = lod_to_list(targets_)

    logging.info("Batch size: {} elements per batch.".format(batch_size))

    if device == "cuda":
        logger.info("Moving data to CUDA...")

        # NOTE(review): `targets` is a list here (from lod_to_list), so
        # targets.cuda() looks like it would raise AttributeError — confirm
        # whether this CUDA path has ever been exercised.
        targets = targets.cuda()
        _inputs = OrderedDict()

        for hash, f in inputs.items():
            _inputs[hash] = []
            for features in f:
                symbol, vector = features
                _inputs[hash].append((symbol, vector.cuda()))

        inputs = _inputs

        move_time = time.time() - self.initial_time
        h, m, s = convert_elapsed_time(move_time)
        logger.info(
            "Data moved to GPU in {} hours {} minutes {:.2f} \
                seconds.".format(
                h, m, s
            )
        )
        logger.info(" ")

    # Define optimizer
    self.optimizer_name, self.optimizer = get_optimizer(
        optimizer, model.parameters()
    )

    if lr_scheduler is not None:
        self.scheduler = get_lr_scheduler(self.optimizer, lr_scheduler)

    if lossfxn is None:
        # NOTE(review): sibling trainers default to AtomicMSELoss; this one
        # deliberately(?) uses plain MSELoss — confirm intent.
        self.lossfxn = MSELoss
        self.inputs_chunk_vals = None
    else:
        logger.info("Using custom loss function...")
        logger.info("")
        self.lossfxn = lossfxn
        self.inputs_chunk_vals = self.get_inputs_chunks(chunks)

    logger.info(" ")
    logger.info("Starting training...")
    logger.info(" ")
    logger.info(
        "{:6s} {:19s} {:12s} {:9s}".format("Epoch", "Time Stamp", "Loss", "Rec Err")
    )
    logger.info(
        "{:6s} {:19s} {:12s} {:9s}".format(
            "------", "-------------------", "------------", "--------"
        )
    )

    # Data scattering
    client = dask.distributed.get_client()
    self.chunks = [client.scatter(chunk) for chunk in chunks]
    self.targets = [client.scatter(target) for target in targets]

    self.device = device
    self.epochs = epochs
    self.model = model
    self.lr_scheduler = lr_scheduler
    self.convergence = convergence

    # Let the hunger game begin...
    self.trainer()
def calculate(self, images=None, purpose="training", data=None, svm=False):
    """Calculate the features per atom in an atoms objects

    Parameters
    ----------
    image : dict
        Hashed images using the Data class.
    purpose : str
        The supported purposes are: 'training', 'inference'.
    data : obj
        data object
    svm : bool
        Whether or not these features are going to be used for kernel
        methods.

    Returns
    -------
    feature_space : dict
        A dictionary with key hash and value as a list with the following
        structure: {'hash': [('H', [vector]]}
    reference_space : dict
        A reference space useful for SVM models.
    """
    client = dask.distributed.get_client()
    logger.info(" ")
    logger.info("Featurization")
    logger.info("=============")
    now = datetime.datetime.now()
    logger.info("Module accessed on {}.".format(
        now.strftime("%Y-%m-%d %H:%M:%S")))
    logger.info(f"Module name: {self.name()}.")

    # FIXME the block below should become a function.
    # Fast path: reuse previously dumped features unless overwriting.
    if os.path.isfile(self.filename) and self.overwrite is False:
        logger.warning(f"Loading features from {self.filename}.")
        logger.info(" ")
        svm_keys = [b"feature_space", b"reference_space"]
        data = load(self.filename)

        data_hashes = list(data.keys())
        image_hashes = list(images.keys())

        if image_hashes == data_hashes:
            # Check if both lists are the same.
            return data
        elif any(i in image_hashes for i in data_hashes):
            # Check if any of the elem
            _data = {}
            for hash in image_hashes:
                _data[hash] = data[hash]
            return _data

        if svm_keys == list(data.keys()):
            feature_space = data[svm_keys[0]]
            reference_space = data[svm_keys[1]]
            return feature_space, reference_space

    initial_time = time.time()

    # Verify that we know the unique element symbols
    if data.unique_element_symbols is None:
        logger.info(f"Getting unique element symbols for {purpose}")

        unique_element_symbols = data.get_unique_element_symbols(
            images, purpose=purpose)
        unique_element_symbols = unique_element_symbols[purpose]

        logger.info(f"Unique chemical elements: {unique_element_symbols}")

    elif isinstance(data.unique_element_symbols, dict):
        unique_element_symbols = data.unique_element_symbols[purpose]

        logger.info(f"Unique chemical elements: {unique_element_symbols}")

    # we make the features
    self.GP = self.custom.get("GP", None)

    if self.GP is None:
        custom = self.custom.get("user_input", None)
        self.GP = self.make_symmetry_functions(
            unique_element_symbols, custom=custom,
            angular_type=self.angular_type)
        self.custom.update({"GP": self.GP})
    else:
        logger.info(
            "Using parameters from file to create symmetry functions...\n")

    self.print_features_params(self.GP)

    symbol = data.unique_element_symbols[purpose][0]
    sample = np.zeros(len(self.GP[symbol]))
    # Feature-vector length per atom (same for every element here).
    self.dimension = len(sample)

    preprocessor = Preprocessing(self.preprocessor, purpose=purpose)
    preprocessor.set(purpose=purpose)

    # We start populating computations to get atomic features.
    logger.info("")
    logger.info(
        "Embarrassingly parallel computation of atomic features...")

    stacked_features = []
    atoms_index_map = [
    ]  # This list is used to reconstruct images from atoms.

    if self.batch_size is None:
        self.batch_size = data.get_total_number_atoms()

    chunks = get_chunks(images, self.batch_size, svm=svm)

    ini = end = 0
    for chunk in chunks:
        images_ = OrderedDict(chunk)
        intermediate = []

        for image in images_.items():
            _, image = image
            end = ini + len(image)
            atoms_index_map.append(list(range(ini, end)))
            ini = end
            for atom in image:
                index = atom.index
                symbol = atom.symbol

                # Radial and angular descriptors may use different cutoffs.
                cutoff_keys = ["radial", "angular"]
                n_symbols, neighborpositions = {}, {}

                if isinstance(self.cutoff, dict):
                    for cutoff_key in cutoff_keys:
                        nl = get_neighborlist(
                            image, cutoff=self.cutoff[cutoff_key])
                        # n_indices: neighbor indices for central atom_i.
                        # n_offsets: neighbor offsets for central atom_i.
                        n_indices, n_offsets = nl[atom.index]

                        n_symbols_ = np.array(
                            image.get_chemical_symbols())[n_indices]
                        n_symbols[cutoff_key] = n_symbols_

                        neighborpositions_ = image.positions[
                            n_indices] + np.dot(n_offsets,
                                                image.get_cell())
                        neighborpositions[cutoff_key] = neighborpositions_
                else:
                    for cutoff_key in cutoff_keys:
                        nl = get_neighborlist(image, cutoff=self.cutoff)
                        # n_indices: neighbor indices for central atom_i.
                        # n_offsets: neighbor offsets for central atom_i.
                        n_indices, n_offsets = nl[atom.index]

                        n_symbols_ = np.array(
                            image.get_chemical_symbols())[n_indices]
                        n_symbols[cutoff_key] = n_symbols_

                        neighborpositions_ = image.positions[
                            n_indices] + np.dot(n_offsets,
                                                image.get_cell())
                        neighborpositions[cutoff_key] = neighborpositions_

                afp = self.get_atomic_features(
                    atom,
                    index,
                    symbol,
                    n_symbols,
                    neighborpositions,
                    image_molecule=image,
                    weighted=self.weighted,
                    n_indices=n_indices,
                )

                intermediate.append(afp)

        # Persist this batch on the cluster before building the next one,
        # so memory is not held on the submitting process.
        intermediate = client.persist(intermediate,
                                      scheduler=self.scheduler)
        stacked_features += intermediate
        del intermediate

    scheduler_time = time.time() - initial_time

    dask.distributed.wait(stacked_features)

    h, m, s = convert_elapsed_time(scheduler_time)
    logger.info("... finished in {} hours {} minutes {:.2f}"
                " seconds.".format(h, m, s))

    logger.info("")

    if self.preprocessor is not None:
        scaled_feature_space = []

        # To take advantage of dask_ml we need to convert our numpy array
        # into a dask array.
        logger.info("Converting features to dask array...")
        stacked_features = [
            da.from_delayed(lazy, dtype=float, shape=sample.shape)
            for lazy in stacked_features
        ]
        # Chunk rows by image so each dask chunk holds whole images.
        layout = {0: tuple(len(i) for i in atoms_index_map), 1: -1}
        # stacked_features = dask.array.stack(stacked_features, axis=0).rechunk(layout)
        stacked_features = da.stack(stacked_features,
                                    axis=0).rechunk(layout)

        logger.info("Shape of array is {} and chunks {}.".format(
            stacked_features.shape, stacked_features.chunks))

        # Note that dask_ml by default convert the output of .fit
        # in a concrete value.
        if purpose == "training":
            stacked_features = preprocessor.fit(stacked_features,
                                                scheduler=self.scheduler)
        else:
            stacked_features = preprocessor.transform(stacked_features)

        atoms_index_map = [
            client.scatter(indices) for indices in atoms_index_map
        ]
        # stacked_features = [client.scatter(features) for features in stacked_features]
        stacked_features = client.scatter(stacked_features,
                                          broadcast=True)

        logger.info("Stacking features using atoms index map...")

        for indices in atoms_index_map:
            features = client.submit(self.stack_features,
                                     *(indices, stacked_features))
            # features = self.stack_features(indices, stacked_features)
            scaled_feature_space.append(features)

    else:
        scaled_feature_space = []
        atoms_index_map = [
            client.scatter(chunk) for chunk in atoms_index_map
        ]
        stacked_features = client.scatter(stacked_features,
                                          broadcast=True)

        for indices in atoms_index_map:
            features = client.submit(self.stack_features,
                                     *(indices, stacked_features))
            scaled_feature_space.append(features)

    scaled_feature_space = client.gather(scaled_feature_space)

    # Clean
    del stacked_features

    # Restack images
    feature_space = []

    if svm and purpose == "training":
        logger.info("Building array with reference space.")
        reference_space = []

        for i, image in enumerate(images.items()):
            restacked = client.submit(
                self.restack_image,
                *(i, image, scaled_feature_space, svm))

            # image = (hash, ase_image) -> tuple
            for atom in image[1]:
                restacked_atom = client.submit(
                    self.restack_atom, *(i, atom, scaled_feature_space))
                reference_space.append(restacked_atom)

            feature_space.append(restacked)

        reference_space = client.gather(reference_space)

    elif svm is False and purpose == "training":
        for i, image in enumerate(images.items()):
            restacked = client.submit(
                self.restack_image,
                *(i, image, scaled_feature_space, svm))
            feature_space.append(restacked)

    else:
        # NOTE(review): the UnboundLocalError fallback is deliberate —
        # `scaled_feature_space` only exists when featurization ran above.
        try:
            for i, image in enumerate(images.items()):
                restacked = client.submit(
                    self.restack_image,
                    *(i, image, scaled_feature_space, svm))
                feature_space.append(restacked)
        except UnboundLocalError:
            # scaled_feature_space does not exist.
            for i, image in enumerate(images.items()):
                restacked = client.submit(self.restack_image,
                                          *(i, image, feature_space,
                                            svm))
                feature_space.append(restacked)

    feature_space = client.gather(feature_space)
    feature_space = OrderedDict(feature_space)

    fp_time = time.time() - initial_time

    h, m, s = convert_elapsed_time(fp_time)

    logger.info("Featurization finished in {} hours {} minutes {:.2f}"
                " seconds.".format(h, m, s))

    if svm and purpose == "training":
        client.restart()  # Reclaims memory aggressively
        preprocessor.save_to_file(preprocessor, self.save_preprocessor)

        if self.filename is not None:
            logger.info(f"features saved to {self.filename}.")
            data = {"feature_space": feature_space}
            data.update({"reference_space": reference_space})
            dump(data, filename=self.filename)
            self.feature_space = feature_space
            self.reference_space = reference_space

        return self.feature_space, self.reference_space

    elif svm is False and purpose == "training":
        client.restart()  # Reclaims memory aggressively
        preprocessor.save_to_file(preprocessor, self.save_preprocessor)

        if self.filename is not None:
            logger.info(f"features saved to {self.filename}.")
            dump(feature_space, filename=self.filename)
            self.feature_space = feature_space

        return self.feature_space
    else:
        self.feature_space = feature_space
        return self.feature_space
def calculate(self, images=None, purpose="training", data=None, svm=False):
    """Calculate the features per atom in an atoms object

    Orchestrates a dask-distributed featurization: per-atom features are
    computed in parallel, regrouped per image, optionally cached to
    ``self.filename``, and returned as an ordered feature space.

    Parameters
    ----------
    images : dict
        Hashed images using the Data class (key: hash, value: image).
    purpose : str
        The supported purposes are: 'training', 'inference'.
    data : obj
        data object
    svm : bool
        Whether or not these features are going to be used for kernel
        methods.

    Returns
    -------
    feature_space : dict
        A dictionary with key hash and value as a list with the following
        structure: {'hash': [('H', [vector]]}
    reference_space : dict
        A reference space useful for SVM models. Only returned when
        ``svm`` is True and ``purpose`` is 'training'.
    """
    # Obtain the already-running dask distributed client for this process.
    client = dask.distributed.get_client()
    logger.info(" ")
    logger.info("Featurization")
    logger.info("=============")
    now = datetime.datetime.now()
    logger.info("Module accessed on {}.".format(
        now.strftime("%Y-%m-%d %H:%M:%S")))

    # FIXME the block below should become a function.
    if os.path.isfile(self.filename) and self.overwrite is False:
        # A cache file exists and overwriting is disabled; try to serve
        # the request from the cached features instead of recomputing.
        logger.warning("Loading features from {}.".format(self.filename))
        logger.info(" ")
        svm_keys = [b"feature_space", b"reference_space"]
        data = load(self.filename)

        data_hashes = list(data.keys())
        image_hashes = list(images.keys())

        if image_hashes == data_hashes:
            # Check if both lists are the same; the cache covers exactly
            # the requested images, return it as-is.
            return data
        elif any(i in image_hashes for i in data_hashes):
            # Check if any of the cached hashes overlaps the requested
            # images; if so, return only the requested subset.
            # NOTE(review): this indexes data[hash] for every requested
            # hash, which assumes every requested hash is cached -- a
            # partial overlap in the other direction would KeyError here.
            _data = {}
            for hash in image_hashes:
                _data[hash] = data[hash]
            return _data

        if svm_keys == list(data.keys()):
            # The cache holds an SVM-style payload (feature space plus
            # reference space) rather than per-hash features.
            feature_space = data[svm_keys[0]]
            reference_space = data[svm_keys[1]]
            return feature_space, reference_space

    initial_time = time.time()

    # Verify that we know the unique element symbols
    if data.unique_element_symbols is None:
        logger.info(
            "Getting unique element symbols for {}".format(purpose))

        unique_element_symbols = data.get_unique_element_symbols(
            images, purpose=purpose)
        unique_element_symbols = unique_element_symbols[purpose]

        logger.info(
            "Unique chemical elements: {}".format(unique_element_symbols))
    elif isinstance(data.unique_element_symbols, dict):
        unique_element_symbols = data.unique_element_symbols[purpose]

        logger.info(
            "Unique chemical elements: {}".format(unique_element_symbols))

    # we make the features
    preprocessor = Preprocessing(self.preprocessor, purpose=purpose)
    preprocessor.set(purpose=purpose)

    # We start populating computations to get atomic features.
    logger.info("")
    logger.info(
        "Embarrassingly parallel computation of atomic features...")

    stacked_features = []
    atoms_symbols_map = [
    ]  # This list is used to reconstruct images from atoms.

    if self.batch_size is None:
        # Default to a single batch covering every atom in the data set.
        self.batch_size = data.get_total_number_atoms()

    chunks = get_chunks(images, self.batch_size, svm=svm)

    for chunk in chunks:
        images_ = OrderedDict(chunk)
        intermediate = []

        for image in images_.items():
            key, image = image
            atoms_symbols_map.append(image.get_chemical_symbols())
            # Use .create() class method from dscribe.
            _features = dask.delayed(self.create)(image)
            intermediate.append(_features)

        # Submit this chunk of delayed feature computations to the
        # cluster and accumulate the resulting futures.
        intermediate = client.compute(intermediate,
                                      scheduler=self.scheduler)
        stacked_features += intermediate
        del intermediate

    # scheduler_time = time.time() - initial_time

    # dask.distributed.wait(stacked_features)

    logger.info("")

    if self.preprocessor is not None:
        # Preprocessing of these features is not supported here yet.
        raise NotImplementedError
    else:
        # No preprocessing: regroup the flat per-atom features back into
        # per-image structures using the recorded symbols map.
        scaled_feature_space = []
        atoms_symbols_map = [
            client.scatter(chunk) for chunk in atoms_symbols_map
        ]
        # Broadcast the full feature list so every worker holds a copy.
        stacked_features = client.scatter(stacked_features, broadcast=True)

        for image_index, symbols in enumerate(atoms_symbols_map):
            features = client.submit(
                self.stack_features,
                *(symbols, image_index, stacked_features))
            scaled_feature_space.append(features)

        scaled_feature_space = client.gather(scaled_feature_space)

    # Clean
    del stacked_features

    # Restack images
    feature_space = []

    if svm and purpose == "training":
        for i, image in enumerate(images.items()):
            restacked = client.submit(
                self.restack_image, *(i, image, scaled_feature_space, svm))
            feature_space.append(restacked)
    elif svm is False and purpose == "training":
        for i, image in enumerate(images.items()):
            restacked = client.submit(
                self.restack_image, *(i, image, scaled_feature_space, svm))
            feature_space.append(restacked)
    else:
        # Inference path; fall back when no scaled space was produced.
        try:
            for i, image in enumerate(images.items()):
                restacked = client.submit(
                    self.restack_image,
                    *(i, image, scaled_feature_space, svm))
                feature_space.append(restacked)
        except UnboundLocalError:
            # scaled_feature_space does not exist.
            for i, image in enumerate(images.items()):
                restacked = client.submit(self.restack_image,
                                          *(i, image, feature_space, svm))
                feature_space.append(restacked)

    feature_space = client.gather(feature_space)

    if svm and purpose == "training":
        # FIXME This might need to be improved
        logger.info("Building array with reference space.")
        # Flatten the per-image feature lists into one reference space.
        hashes, reference_space = list(zip(*feature_space))
        del hashes
        reference_space = list(
            itertools.chain.from_iterable(reference_space))
        logger.info("Finished reference space.")

    feature_space = OrderedDict(feature_space)

    fp_time = time.time() - initial_time

    h, m, s = convert_elapsed_time(fp_time)
    logger.info("Featurization finished in {} hours {} minutes {:.2f}"
                " seconds.".format(h, m, s))

    if svm and purpose == "training":
        client.restart()  # Reclaims memory aggressively
        preprocessor.save_to_file(preprocessor, self.save_preprocessor)

        if self.filename is not None:
            logger.info("features saved to {}.".format(self.filename))
            data = {"feature_space": feature_space}
            data.update({"reference_space": reference_space})
            dump(data, filename=self.filename)

        self.feature_space = feature_space
        self.reference_space = reference_space

        return self.feature_space, self.reference_space
    elif svm is False and purpose == "training":
        client.restart()  # Reclaims memory aggressively
        preprocessor.save_to_file(preprocessor, self.save_preprocessor)

        if self.filename is not None:
            logger.info("features saved to {}.".format(self.filename))
            dump(feature_space, filename=self.filename)

        self.feature_space = feature_space

        return self.feature_space
    else:
        # Inference: keep the feature space in memory, nothing persisted.
        self.feature_space = feature_space

        return self.feature_space
def prepare_model(self,
                  feature_space,
                  reference_features,
                  data=None,
                  purpose="training"):
    """Prepare the Kernel Ridge Regression model

    Parameters
    ----------
    feature_space : dict
        A dictionary with hash, fingerprint structure.
    reference_features : dict
        A dictionary with raveled tuples of symbol, atomic fingerprint.
    data : object
        DataSet object created from the handler.
    purpose : str
        Purpose of this model: 'training', 'inference'.

    Notes
    -----
    This method builds the atomic kernel matrices and the LT vectors
    needed to apply the atomic decomposition Ansatz. Results are stored
    on the instance as ``self.K`` and ``self.LT``.
    """
    if purpose == "training":
        logger.info("Model Training")
        logger.info("Model name: {}.".format(self.name()))
        logger.info("Kernel parameters:")
        logger.info(" - Kernel function: {}.".format(self.kernel))
        logger.info(" - Sigma: {}.".format(self.sigma))
        logger.info(" - Lamda: {}.".format(self.lamda))

    # The kernel matrix is square: one row/column per reference feature.
    dim = len(reference_features)

    # --- Atomic kernel matrices ---
    initial_time = time.time()

    logger.info("Computing Kernel Matrix...")
    # We start populating computations with delayed functions to
    # operate with dask's scheduler
    logger.warning(" Adding calculations to scheduler...")
    # Bug fix: `purpose` was previously not forwarded, but
    # get_kernel_matrix() takes it as a required positional argument
    # (see its signature), so this call raised a TypeError.
    computations = self.get_kernel_matrix(feature_space,
                                          reference_features, purpose)

    scheduler_time = time.time() - initial_time
    h, m, s = convert_elapsed_time(scheduler_time)
    logger.info(" {} kernel evaluations added in {} hours {} minutes "
                "{:.2f} seconds.".format(len(computations), h, m, s))

    if self.batch_size is not None:
        # Group the evaluations so dask computes them in bounded batches.
        computations = list(get_chunks(computations, self.batch_size))
        logger.info(
            " The calculations were batched in groups of {}.".format(
                self.batch_size))

    # We compute the calculations with dask and the result is converted
    # to numpy array.
    logger.info(" Evaluating atomic similarities...")

    if self.batch_size is None:
        kernel_matrix = dask.compute(*computations,
                                     scheduler=self.scheduler)
    else:
        kernel_matrix = []
        for chunk in computations:
            kernel_matrix.append(
                dask.compute(*chunk, scheduler=self.scheduler))

    self.K = np.array(kernel_matrix).reshape(dim, dim)

    build_time = time.time() - initial_time
    h, m, s = convert_elapsed_time(build_time)
    logger.info("Kernel matrix built in {} hours {} minutes {:.2f} "
                "seconds.".format(h, m, s))

    # --- LT vectors --- we build the LT matrix needed for ADA.
    # NOTE(review): get_kernel_matrix() also builds self.LT when
    # purpose == 'training'; this recomputation overwrites it -- confirm
    # whether both code paths are still needed.
    logger.info("Building LT matrix")
    computations = []
    # Only the index is needed here; avoid shadowing the
    # `feature_space` argument with the loop variable.
    for index, _ in enumerate(feature_space.items()):
        computations.append(self.get_lt(index))

    self.LT = np.array(
        (dask.compute(*computations, scheduler=self.scheduler)))

    lt_time = time.time() - initial_time
    h, m, s = convert_elapsed_time(lt_time)
    logger.info(
        "LT matrix built in {} hours {} minutes {:.2f} seconds.".format(
            h, m, s))
def get_kernel_matrix(self, feature_space, reference_features,
                      purpose="training"):
    """Get kernel matrix delayed computations

    Parameters
    ----------
    feature_space : dict or list
        Dictionary with hash and features, or a list of
        (symbol, fingerprint) tuples.
    reference_features : dict or list
        Array with reference feature space.
    purpose : str
        Purpose of this kernel matrix. Accepted arguments are 'training',
        and 'inference'. Defaults to 'training' so existing callers that
        omit it keep working (backward compatible).

    Returns
    -------
    kernel_matrix : list
        List with kernel matrix values.

    Notes
    -----
    This class method expects the feature_space to be an OrderedDict and
    reference_features a list, but it turns out that for computing
    variances, it might be the case the feature_space is also a list.

    As a side effect, when ``purpose == 'training'`` this method also
    populates ``self.fingerprint_map`` and builds ``self.LT``.
    """
    # Dispatch table mapping the configured kernel name to its function.
    call = {"exponential": exponential, "laplacian": laplacian, "rbf": rbf}

    initial_time = time.time()

    if isinstance(reference_features, dict):
        # This is the case when the reference_features are a
        # dictionary, too. If that's true we have to convert it to a list.
        reference_features = list(reference_features.values())[0]

    chunks = list(get_chunks(feature_space, self.batch_size))

    logger.info(
        " The calculations are distributed in {} batches of {} atoms.".
        format(len(chunks), self.batch_size))

    counter = 0
    kernel_matrix = []

    for c, chunk in enumerate(chunks):
        chunk_initial_time = time.time()
        logger.info(
            " Computing kernel functions for chunk {}...".format(c))
        intermediates = []

        if isinstance(feature_space, dict) and isinstance(
                reference_features, list):
            if isinstance(chunk, dict) is False:
                chunk = OrderedDict(chunk)

            # Fixed typo: was `reference_lenght`.
            reference_length = len(reference_features)

            for hash, _feature_space in chunk.items():
                f_map = []

                for i_symbol, i_afp in _feature_space:
                    i_symbol = decode(i_symbol)
                    f_map.append(1)

                    if purpose == "training":
                        # Start at `counter` so each symmetric pair is
                        # evaluated only once (upper-triangular sweep).
                        for j in range(counter, reference_length):
                            j_symbol, j_afp = reference_features[j]
                            kernel = call[self.kernel](i_afp, j_afp,
                                                       i_symbol, j_symbol,
                                                       self.sigma)
                            intermediates.append(kernel)
                        counter += 1
                    else:
                        # Inference: evaluate against the full reference
                        # space for every atom.
                        for j_symbol, j_afp in reference_features:
                            j_symbol = decode(j_symbol)
                            kernel = call[self.kernel](i_afp, j_afp,
                                                       i_symbol, j_symbol,
                                                       self.sigma)
                            intermediates.append(kernel)

                self.fingerprint_map.append(f_map)
        elif isinstance(feature_space, list) and isinstance(
                reference_features, list):
            # Variance-style computation: both sides are plain lists.
            for i_symbol, i_afp in chunk:
                for j_symbol, j_afp in reference_features:
                    i_symbol = decode(i_symbol)
                    j_symbol = decode(j_symbol)
                    kernel = call[self.kernel](i_afp, j_afp, i_symbol,
                                               j_symbol, self.sigma)
                    intermediates.append(kernel)

        # Evaluate this chunk's kernels now to bound memory usage.
        kernel_matrix += dask.compute(intermediates,
                                      scheduler=self.scheduler)[0]
        del intermediates

        chunk_final_time = time.time() - chunk_initial_time
        h, m, s = convert_elapsed_time(chunk_final_time)
        logger.info(" ...finished in {} hours {} minutes {:.2f} "
                    "seconds.".format(h, m, s))

    # dask.distributed.wait(kernel_matrix)
    del reference_features
    # kernel_matrix = client.gather(kernel_matrix)

    build_time = time.time() - initial_time
    h, m, s = convert_elapsed_time(build_time)
    logger.info("Kernel matrix built in {} hours {} minutes {:.2f} "
                "seconds.".format(h, m, s))

    # --- LT vectors --- we build the LT matrix needed for ADA.
    if purpose == "training":
        self.LT = []
        logger.info("Building LT matrix")
        computations = []
        # Only the index is needed; avoid shadowing the `feature_space`
        # argument with the loop variable (the original code did).
        for index, _ in enumerate(feature_space.items()):
            computations.append(self.get_lt(index))

        computations = list(get_chunks(computations, self.batch_size))

        logger.info(
            " The calculations are distributed in {} batches of {} molecules."
            .format(len(computations), self.batch_size))

        for lt_chunk in computations:
            self.LT += dask.compute(*lt_chunk, scheduler=self.scheduler)

        self.LT = np.array(self.LT)
        # Fixed: the original `del chunk` here deleted the outer kernel
        # loop's variable; just drop the batched computations list.
        del computations

        lt_time = time.time() - initial_time
        h, m, s = convert_elapsed_time(lt_time)
        logger.info(
            "LT matrix built in {} hours {} minutes {:.2f} seconds.".
            format(h, m, s))

    return kernel_matrix