def get_skorch_regressor():
    X, y = make_regression(100, 5, n_informative=3, random_state=0)
    X = X.astype(np.float32)
    y = y / np.std(y)
    y = y.reshape(-1, 1).astype(np.float32)
    X_df = pd.DataFrame(X, columns=['col' + str(i) for i in range(X.shape[1])])

    class MyModule(nn.Module):
        def __init__(self, input_units=5, num_units=5, nonlin=nn.ReLU()):
            super(MyModule, self).__init__()
            self.dense0 = nn.Linear(input_units, num_units)
            self.nonlin = nonlin
            self.dense1 = nn.Linear(num_units, num_units)
            self.output = nn.Linear(num_units, 1)

        def forward(self, X, **kwargs):
            X = self.nonlin(self.dense0(X))
            X = self.nonlin(self.dense1(X))
            X = self.output(X)
            return X

    model = NeuralNetRegressor(
        MyModule,
        max_epochs=20,
        lr=0.2,
        iterator_train__shuffle=True,
    )
    model.fit(X_df.values, y)
    return model, X_df, y
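# The helper above returns a fitted skorch regressor together with the frame it
# was trained on. A minimal sketch of how it might be used downstream (the
# variable names here are illustrative, not from the original code):
model, X_df, y = get_skorch_regressor()
y_pred = model.predict(X_df.values)                 # inputs are already float32
train_loss = model.history[:, 'train_loss']         # per-epoch training loss
print("final train loss:", train_loss[-1])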
def fit_custom_pytorch_module_w_skorch(module, X, y, hyperparams):
    """Fit a custom PyTorch module using Skorch."""
    skorch_net = NeuralNetRegressor(
        module=module,
        optimizer=torch.optim.Adam,
        lr=hyperparams["learning_rate"],
        optimizer__weight_decay=hyperparams["l2_decay"],
        max_epochs=hyperparams["max_epochs"],
        batch_size=hyperparams["batch_size"],
        iterator_train__shuffle=True,
    )
    skorch_net.fit(X, y)
    return skorch_net
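# A usage sketch for the helper above, assuming a simple nn.Module and a
# hyperparameter dict with the keys the function reads; the concrete values
# and the toy data are placeholders:
import numpy as np
import torch
import torch.nn as nn

hyperparams = {
    "learning_rate": 1e-3,
    "l2_decay": 1e-5,
    "max_epochs": 20,
    "batch_size": 64,
}
module = nn.Sequential(nn.Linear(5, 16), nn.ReLU(), nn.Linear(16, 1))
X = np.random.rand(100, 5).astype(np.float32)
y = np.random.rand(100, 1).astype(np.float32)   # 2-D float32 target for NeuralNetRegressor
net = fit_custom_pytorch_module_w_skorch(module, X, y, hyperparams)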
def optimize(model):
    logger.info("Checkpoint2")
    X = model.predictor_src  # + self.predictor_tgt
    y = model.predictor_tgt
    # y = model.config.sentence_level
    print(X)
    print(y)

    # Hyperparameter tuning with random search
    net = NeuralNetRegressor(
        model,
        max_epochs=10,
        lr=0.1,
        # Shuffle training data on each epoch
        iterator_train__shuffle=True,
    )
    net.fit(X, y)
    y_proba = net.predict_proba(X)

    # Deactivate skorch-internal train-valid split and verbose logging
    net.set_params(train_split=False, verbose=0)
    params = {
        'max_epochs': [7],
        'lr': [1e-3, 2e-3],
        'batch_size': [32, 64],
        # The module__ entries assume the wrapped module exposes these
        # constructor arguments
        'module__hidden_LSTM': [32, 64, 128],
        'module__dropout': [0.5],
    }
    # Use a regression metric; 'accuracy' is only defined for classifiers
    gs = RandomizedSearchCV(net, params, refit=False, cv=3,
                            scoring='neg_mean_squared_error', verbose=2)
    gs.fit(X, y)
    print("best score: {:.3f}, best params: {}".format(
        gs.best_score_, gs.best_params_))
    return
def train_nn_model_validate1(nodes, X_train_scaled, Y_train, max_evals=10):
    # A 0.12 holdout felt too large -- a lot of data went unused -- so a plain
    # 0.14 split without stratification is used here instead.
    # X_split_train, X_split_test, Y_split_train, Y_split_test = train_test_split(
    #     X_train_scaled, Y_train, test_size=0.12, stratify=Y_train)
    X_split_train, X_split_test, Y_split_train, Y_split_test = train_test_split(
        X_train_scaled, Y_train, test_size=0.14)

    # Neural-network training is not fully stable because of random
    # initialization, dropout, etc. To compensate, train several times and
    # keep the best model.
    best_rmse = 99999999999.9
    best_model = 0.0
    for j in range(0, max_evals):
        rsg = NeuralNetRegressor(
            lr=nodes["lr"],
            optimizer__weight_decay=nodes["optimizer__weight_decay"],
            criterion=nodes["criterion"],
            batch_size=nodes["batch_size"],
            optimizer__betas=nodes["optimizer__betas"],
            module=create_nn_module(nodes["input_nodes"], nodes["hidden_layers"],
                                    nodes["hidden_nodes"], nodes["output_nodes"],
                                    nodes["percentage"]),
            max_epochs=nodes["max_epochs"],
            callbacks=[
                skorch.callbacks.EarlyStopping(patience=nodes["patience"])
            ],
            device=nodes["device"],
            optimizer=nodes["optimizer"])
        init_module(rsg.module, nodes["weight_mode"], nodes["bias"])

        rsg.fit(X_split_train.astype(np.float32), Y_split_train.astype(np.float32))
        # Y_pred = rsg.predict(X_split_test.astype(np.float32))
        metric = cal_nnrsg_rmse(rsg, X_split_test, Y_split_test)
        best_model, best_rmse, flag = record_best_model_rmse(
            rsg, metric, best_model, best_rmse)

    return best_model, best_rmse
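# train_nn_model_validate1 relies on cal_nnrsg_rmse and record_best_model_rmse,
# which are defined elsewhere. A minimal sketch of what they are assumed to do
# (compute validation RMSE and keep whichever model scores lowest):
def cal_nnrsg_rmse(rsg, X, Y):
    Y_pred = rsg.predict(X.astype(np.float32))
    return np.sqrt(np.mean((Y_pred.flatten() - np.asarray(Y).flatten()) ** 2))


def record_best_model_rmse(rsg, metric, best_model, best_rmse):
    # Returns (model, rmse, improved_flag)
    if metric < best_rmse:
        return rsg, metric, True
    return best_model, best_rmse, False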
    iterator_train__collate_fn=collate_pool,
    iterator_train__shuffle=True,
    iterator_valid__pin_memory=True,
    iterator_valid__num_workers=0,
    iterator_valid__collate_fn=collate_pool,
    iterator_valid__shuffle=False,
    device=device,
    criterion=torch.nn.L1Loss,
    dataset=MergeDataset,
    callbacks=[cp, load_best_valid_loss, LR_schedule])

# Assign everything to their respective dictionaries
nets[ads] = net
cps[ads] = cp
best_finders[ads] = load_best_valid_loss
lr_schedulers[ads] = LR_schedule

# Block the data
for ads, net in nets.items():
    _sdts_train = []
    _targets_train = []
    for doc, sdt, target in zip(docs_train, sdts_train, targets_train):
        if doc['adsorbate'] == ads:
            _sdts_train.append(sdt)
            _targets_train.append(target)
    _targets_train = np.array(_targets_train)

    # Fit
    net.initialize()
    net.fit(_sdts_train, _targets_train)
class PyTorchModel(BaseModel):
    def build_model(
        self,
        network=MVRegressor,
        device: str = "cpu",
        scale_data: bool = False,
        num_layers: int = 10,
        num_units: int = 50,
        dropout: float = 0.5,
        num_epochs: int = 10,
        batch_size: int = 128,
    ):
        self.scale_data = scale_data
        self.num_layers = num_layers
        self.num_units = num_units
        self.dropout = dropout
        self.num_epochs = num_epochs
        self.batch_size = batch_size

        if not all([hasattr(self, "input_dim"), hasattr(self, "output_dim")]):
            raise ValueError("Please load dataset first to obtain proper sizes")

        if device == "cpu":
            self.device = device
        else:
            use_cuda = torch.cuda.is_available()
            self.device = torch.device("cuda" if use_cuda else "cpu")

        self.model = NeuralNetRegressor(
            network,
            device=self.device,
            module__input_dim=self.input_dim,
            module__output_dim=self.output_dim,
            module__n_layers=self.num_layers,
            module__num_units=self.num_units,
            module__p_dropout=self.dropout,
            max_epochs=self.num_epochs,
            criterion=nn.MSELoss,
            batch_size=self.batch_size,
            # Shuffle training data on each epoch
            iterator_train__shuffle=True,
            callbacks=[
                (
                    "lr_scheduler",
                    LRScheduler(policy=CyclicLR, base_lr=0.001, max_lr=0.01,
                                step_every="batch"),
                ),
            ],
        )

    def fit(self, X, y, **fit_params):
        if self.scale_data:
            X, y = self.scalar(X, y)
        X, y = (
            torch.tensor(X).float().to(device=self.device),
            torch.tensor(y).float().to(device=self.device),
        )
        self.model.fit(X, y, **fit_params)

    def load_model(
        self,
        input_dim: str,
        output_dim: str,
        filename: str,
        scale_data: bool = False,
    ):
        self.scale_data = scale_data
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.build_model(scale_data=scale_data)
        self.model = pickle.load(open(filename, "rb"))

    def predict(self, X):
        if self.scale_data:
            X = self.xscalar.transform(X)
        X = torch.tensor(X).float().to(device=self.device)
        preds = self.model.predict(X)
        if self.scale_data:
            preds = self.yscalar.inverse_transform(preds)
        return preds

    def sweep(
        self,
        params: Dict,
        X,
        y,
        search_algorithm: str = "bayesian",
        num_trials: int = 3,
        scoring_func: str = "r2",
        early_stopping: bool = False,
        results_csv_path: str = "outputs/results.csv",
        splitting_criteria: str = "timeseries",
        num_splits: int = 5,
        test_indices: list = None,  # required when splitting_criteria == "fixed"
    ):
        start_dir = str(pathlib.Path(os.getcwd()).parent)
        module_dir = str(pathlib.Path(__file__).parent)
        # temporarily change directory to file directory and then reset
        os.chdir(module_dir)

        if self.scale_data:
            X, y = self.scalar(X, y)
        X, y = (
            torch.tensor(X).float().to(device=self.device),
            torch.tensor(y).float().to(device=self.device),
        )

        if splitting_criteria.lower() == "cv":
            cv = None
        elif splitting_criteria.lower() == "timeseries":
            cv = TimeSeriesSplit(n_splits=num_splits)
        elif splitting_criteria.lower() == "grouped":
            cv = GroupShuffleSplit(n_splits=num_splits)
        elif splitting_criteria.lower() == "fixed":
            if type(test_indices) != list:
                raise ValueError("fixed split used but no test-indices provided...")
            cv = PredefinedSplit(test_fold=test_indices)
        else:
            raise ValueError(
                f"Unknown splitting criteria provided: {splitting_criteria}, "
                "should be one of [cv, timeseries, grouped, fixed]"
            )

        if search_algorithm.lower() == "bohb":
            early_stopping = True

        if search_algorithm.lower() in ["bohb", "bayesian", "hyperopt", "optuna"]:
            search = TuneSearchCV(
                self.model,
                params,
                search_optimization=search_algorithm,
                n_trials=num_trials,
                early_stopping=early_stopping,
                scoring=scoring_func,
            )
        elif search_algorithm == "grid":
            search = GridSearchCV(
                self.model,
                param_grid=params,
                refit=True,
                cv=num_trials,
                scoring=scoring_func,
            )
        elif search_algorithm == "random":
            search = RandomizedSearchCV(
                self.model,
                param_distributions=params,
                refit=True,
                cv=num_trials,
                scoring=scoring_func,
            )
        else:
            raise NotImplementedError(
                "Search algorithm should be one of grid, hyperopt, bohb, optuna, "
                "bayesian, or random"
            )

        with mlflow.start_run() as run:
            search.fit(X, y)
        self.model = search.best_estimator_
        # set path back to initial
        os.chdir(start_dir)

        results_df = pd.DataFrame(search.cv_results_)
        logger.info(f"Best hyperparams: {search.best_params_}")
        if not pathlib.Path(results_csv_path).parent.exists():
            pathlib.Path(results_csv_path).parent.mkdir(exist_ok=True, parents=True)
        logger.info(f"Saving sweeping results to {results_csv_path}")
        logger.info(f"Best score: {search.best_score_}")
        results_df.to_csv(results_csv_path)

        cols_keep = [col for col in results_df if "param_" in col]
        cols_keep += ["mean_test_score"]
        results_df = results_df[cols_keep]

        return results_df
device = 'cpu'
# device = 'cuda:0'

net = NeuralNetRegressor(
    module=FullNN(unique_atoms, [fp_length, 5, 5], device,
                  forcetraining=forcetraining),
    criterion=CustomLoss,
    criterion__force_coefficient=0.3,
    optimizer=torch.optim.LBFGS,
    lr=1,
    batch_size=400,
    max_epochs=50,
    iterator_train__collate_fn=collate_amp,
    iterator_valid__collate_fn=collate_amp,
    device=device,
    train_split=None,
    callbacks=[
        EpochScoring(forces_score, on_train=True, use_caching=True,
                     target_extractor=target_extractor),
        EpochScoring(energy_score, on_train=True, use_caching=True,
                     target_extractor=target_extractor),
    ],
)
net.fit(data, None)
early = EarlyStopping(patience=args.patience, threshold=args.threshold)

# Wrap the model with NeuralNetRegressor to configure the training parameters
net = NeuralNetRegressor(model,
                         max_epochs=args.epochs,
                         lr=args.lr,
                         batch_size=args.batch_size,
                         optimizer__momentum=args.momentum,
                         iterator_train__shuffle=False,
                         iterator_valid__shuffle=False
                         # callbacks=[early]
                         )

start_training = time.time()
net.fit(X_train, y_train)

# Save the training time
b = open("train_temps.txt", "a+")
b.write("Iteration: " + str(number) + '\n')
b.write("Length X: " + str(len(X)) + '\n')
b.write("Length X train: " + str(len(X_train)) + '\n')
b.write("Length X test: " + str(len(X_test)) + '\n')
b.write(" Time to train: " + str(secs2hours(time.time() - start_training)) + '\n')
b.write(" Time to train: " + str(time.time() - start_training) + '\n')
b.close()

# Visualize the loss as the network trained:
# plot training and validation loss
epochs = [i for i in range(len(net.history))]
    optimizer__momentum=0.9,
    optimizer__weight_decay=0.001,
    iterator_train__shuffle=True,
    iterator_train__num_workers=10,
    iterator_valid__shuffle=True,
    iterator_valid__num_workers=10,
    train_split=predefined_split(valid0df),
    # skorch expects a single torch device; for multi-GPU training wrap the
    # module (e.g. with DataParallel) instead
    device='cuda:0')

rlimit = resource.getrlimit(resource.RLIMIT_NOFILE)
resource.setrlimit(resource.RLIMIT_NOFILE, (2048, rlimit[1]))

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(11, 4))

print("Fitting")
net.fit(train0df, y=None)
print("Fit completed")

history = net.history
train_loss0 = history[:, 'train_loss']
valid_loss0 = history[:, 'valid_loss']
ax1.plot(train_loss0)
ax1.plot(valid_loss0)
ax1.legend(['train_loss', 'valid_loss'])

net.save_params(f_params='dcs0_0005.pkl',
                f_optimizer='dcs0_0005_optimizer.pkl',
                f_history='dcs0_0005_history.json')

pred = net.predict_proba(valid0)
label = valid0.get_label()
accuracy = concordance_index(pred, label)
        out = self.l1(out)
        out = self.fc2(out)
        return out


net_regr = NeuralNetRegressor(
    Net(hidden_size=500),
    max_epochs=5000,
    lr=0.01,
    device='cuda',
    optimizer=torch.optim.Adam,
    train_split=None,
    verbose=1,
)

res = net_regr.fit(t_d_inp, t_d_oup)

# save
net_regr.save_params(f_params='step1result')

pred = net_regr.predict(test_inp)
mse = ((test_oup - pred)**2).mean()
print('test error = ' + str(mse))

# plot 1: training loss
loss = net_regr.history[:, 'train_loss']
plt.figure()
plt.plot(loss)
plt.ylabel('loss')
plt.ylim([0, loss[-1] * 4])

# plot 2
plt.figure()
s = 3
    max_epochs=10,
    lr=0.1,
    verbose=1,
)

# ================= Split the dataset using GroupShuffleSplit ===================
gss = GroupShuffleSplit(n_splits=10, test_size=0.15, random_state=42)
gss.get_n_splits(X_regr, y_regr, groups=y)

for train_index, test_index in gss.split(X_regr, y_regr, groups=y):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_regr_train, X_regr_test = X_regr[train_index], X_regr[test_index]
    y_regr_train, y_regr_test = y_regr[train_index], y_regr[test_index]

    # ============================== Train the model ============================
    net_regr.fit(X_regr_train, y_regr_train)

    # ============== Test/validate the model via 10-fold cross-validation =======
    y_regr_pred = cross_val_predict(net_regr, X_regr_test, y_regr_test, cv=10)

    # ============================ Evaluate the model ============================
    score_regression = r2_score(y_regr_test, y_regr_pred)
    score_r2 = pearsonr(y_regr_test.flatten(), y_regr_pred.flatten())[0]**2
    rmse = sqrt(mean_squared_error(y_regr_test, y_regr_pred))
    # score_regression = mean_squared_error(y_regr_pred, y_regr_test)
    accuracy_regression.append(score_regression)
    accuracy_score_r2.append(score_r2)
    accuracy_rmse.append(rmse)
    print(accuracy_regression)
    print(score_r2)
    iterator_valid__num_workers=0,
    iterator_valid__collate_fn=collate_pool,
    iterator_valid__shuffle=False,
    device=device,
    criterion=torch.nn.L1Loss,
    dataset=MergeDataset,
    callbacks=[cp, load_best_valid_loss, LR_schedule])


# # Training
# We can train a new model...

# In[ ]:

net.initialize()
net.fit(sdts_train, targets_train)


# ...or load whatever is cached

# In[ ]:

net.initialize()
net.load_params(f_history='valid_best_history.json',
                f_optimizer='valid_best_optimizer.pt',
                f_params='valid_best_params.pt')


# # Assess performance

# In[ ]:

import numpy as np
    ):
        super(RegressorModule, self).__init__()
        self.num_units = num_units
        self.nonlin = nonlin

        self.dense0 = nn.Linear(20, num_units)
        self.dense1 = nn.Linear(num_units, 10)
        self.output = nn.Linear(10, 1)

    def forward(self, X, **kwargs):
        X = self.nonlin(self.dense0(X))
        X = F.relu(self.dense1(X))
        X = self.output(X)
        return X


net_regr = NeuralNetRegressor(
    RegressorModule,
    max_epochs=20,
    lr=0.1,
    device='cuda',
)

net_regr.fit(X_regr, y_regr)

y_pred = net_regr.predict(X_regr[:5])
y_pred

a, b = net_regr.train_split(X_regr)
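# X_regr and y_regr above are assumed to be float32 arrays with a 2-D target,
# which is what NeuralNetRegressor expects, and with 20 input features to match
# dense0. A minimal sketch of how they could be generated:
from sklearn.datasets import make_regression

X_regr, y_regr = make_regression(1000, 20, n_informative=10, random_state=0)
X_regr = X_regr.astype(np.float32)
y_regr = y_regr.reshape(-1, 1).astype(np.float32)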
net_regr = NeuralNetRegressor(
    Net(hidden_size=500,
        input_size=x_train.shape[1],
        output_size=y_train.shape[1]),
    criterion=torch.nn.MSELoss,
    optimizer=torch.optim.Adam,
    max_epochs=5000,
    lr=0.001,
    device='cuda',
    train_split=None,
    verbose=1,
    batch_size=-1,
)

res = net_regr.fit(x_train, y_train)

# save
net_regr.save_params(f_params='step1result')

# model = Net(input_size, hidden_size, output_size, dropout_rate)
# # print model summary
# nodata = np.prod(x_train.shape)
# noparas = sum([param.nelement() for param in model.parameters()])
# print("Total number of data elements:" + str(nodata))
# print("Total number of parameters  :" + str(noparas))
# for name, param in model.named_parameters():
#     print(name, "\t", param.nelement(), "\t\t", param.data.shape)
# if noparas > nodata:
#     print("Using too many neurons!!!")
# else:
#     print("Network is OK!")
    module__n_h=4,
    optimizer=Adam,
    iterator_train__pin_memory=True,
    iterator_train__num_workers=0,
    iterator_train__collate_fn=collate_pool,
    iterator_train__shuffle=True,
    iterator_valid__pin_memory=True,
    iterator_valid__num_workers=0,
    iterator_valid__collate_fn=collate_pool,
    iterator_valid__shuffle=False,
    device=device,
    criterion=torch.nn.L1Loss,
    dataset=MergeDataset,
    callbacks=[cp, load_best_valid_loss, LR_schedule])

net.initialize()
net.fit(stds_train_, targets_train_)
nets.append(net)


# # Loading models
# It takes a few hours to fit the 5-model ensemble. You can either do it via
# notebook (above) or via `sbatch submit_ensemble_fitting.sh`. Either way, you
# load the results here.

# In[5]:

import numpy as np
from sklearn.model_selection import KFold
from torch.optim import Adam
import skorch.callbacks.base
from skorch.callbacks import Checkpoint  # needs skorch >= 0.4
from skorch.callbacks.lr_scheduler import LRScheduler
from skorch import NeuralNetRegressor
from cgcnn.dropoutmodel10 import CrystalGraphConvNet
class PyTorchModel(BaseModel):
    def build_model(
        self,
        network=MVRegressor,
        device: str = "cpu",
        scale_data: bool = False,
        num_layers: int = 10,
        num_units: int = 50,
        dropout: float = 0.5,
        num_epochs: int = 10,
        batch_size: int = 128,
    ):
        self.scale_data = scale_data
        self.num_layers = num_layers
        self.num_units = num_units
        self.dropout = dropout
        self.num_epochs = num_epochs
        self.batch_size = batch_size

        if not all([hasattr(self, "input_dim"), hasattr(self, "output_dim")]):
            raise ValueError("Please load dataset first to obtain proper sizes")

        if device == "cpu":
            self.device = device
        else:
            use_cuda = torch.cuda.is_available()
            self.device = torch.device("cuda" if use_cuda else "cpu")

        self.model = NeuralNetRegressor(
            network,
            device=self.device,
            module__input_dim=self.input_dim,
            module__output_dim=self.output_dim,
            module__n_layers=self.num_layers,
            module__num_units=self.num_units,
            module__p_dropout=self.dropout,
            max_epochs=self.num_epochs,
            criterion=nn.MSELoss,
            batch_size=self.batch_size,
            # Shuffle training data on each epoch
            iterator_train__shuffle=True,
            callbacks=[
                (
                    "lr_scheduler",
                    LRScheduler(policy=CyclicLR, base_lr=0.001, max_lr=0.01,
                                step_every="batch"),
                ),
            ],
        )

    def fit(self, X, y, **fit_params):
        if self.scale_data:
            X, y = self.scalar(X, y)
        X, y = (
            torch.tensor(X).float().to(device=self.device),
            torch.tensor(y).float().to(device=self.device),
        )
        self.model.fit(X, y, **fit_params)

    def load_model(
        self,
        input_dim: str,
        output_dim: str,
        filename: str,
        scale_data: bool = False,
    ):
        self.scale_data = scale_data
        self.input_dim = input_dim
        self.output_dim = output_dim
        # pass scale_data through so build_model does not reset it
        self.build_model(scale_data=scale_data)
        self.model = pickle.load(open(filename, "rb"))

    def predict(self, X):
        if self.scale_data:
            X = self.xscalar.transform(X)
        X = torch.tensor(X).float().to(device=self.device)
        preds = self.model.predict(X)
        if self.scale_data:
            preds = self.yscalar.inverse_transform(preds)
        return preds

    def sweep(
        self,
        params: Dict,
        X,
        y,
        search_algorithm: str = "bayesian",
        num_trials: int = 3,
        scoring_func: str = "r2",
    ):
        from tune_sklearn import TuneGridSearchCV, TuneSearchCV

        X, y = (
            torch.tensor(X).float().to(device=self.device),
            torch.tensor(y).float().to(device=self.device),
        )
        tune_search = TuneSearchCV(
            self.model,
            params,
            search_optimization=search_algorithm,
            n_trials=num_trials,
            early_stopping=True,
            scoring=scoring_func,
        )
        tune_search.fit(X, y)

        return tune_search
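# A sketch of how the sweep method above might be called on an already-built
# PyTorchModel instance (here called pytorch_model, with X_train / y_train
# assumed to exist); the searched parameters address the wrapped
# NeuralNetRegressor directly and the ranges are placeholders:
params = {
    "lr": [1e-3, 1e-2, 1e-1],
    "max_epochs": [10, 20],
    "optimizer__weight_decay": [0.0, 1e-4],
}
tune_search = pytorch_model.sweep(params, X_train, y_train,
                                  search_algorithm="hyperopt",
                                  num_trials=10, scoring_func="r2")
print(tune_search.best_params_)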
TRAIN_DIR = os.path.join(os.path.abspath("."), "..", "cropped_data")

cp = Checkpoint(dirname='segnet_mse_no_sigmoid_sgd_150ep_b8_lr_0.01_30enc/checkpoints')
train_end_cp = TrainEndCheckpoint(dirname='segnet_mse_no_sigmoid_sgd_150ep_b8_lr_0.01_30enc/checkpoints')
load_state = LoadInitState(checkpoint=cp)

net = NeuralNetRegressor(
    SegNet,
    module__encoding_size=30,
    device=torch.device("cuda:0" if torch.cuda.is_available() else "cpu"),
    max_epochs=150,
    batch_size=8,
    criterion=MSELoss,
    lr=0.01,
    iterator_train__shuffle=True,
    optimizer=torch.optim.SGD,
    optimizer__momentum=.9,
    callbacks=[cp, train_end_cp, load_state]
)

if __name__ == '__main__':
    mean = np.array([0.5020, 0.4690, 0.4199])
    std = np.array([0.2052, 0.2005, 0.1966])

    aug_tran = transforms.Compose([
        transforms.Resize(SIZE, interpolation=3),
        transforms.ToTensor(),
        transforms.Normalize(mean=mean, std=std)
    ])

    dataset = AutoEncoderImageDataset(TRAIN_DIR, transform=aug_tran)
    net.fit(dataset, y=None)
class AtomsTrainer:
    def __init__(self, config):
        self.config = config
        self.pretrained = False

    def load(self):
        self.load_config()
        self.load_rng_seed()
        self.load_dataset()
        self.load_model()
        self.load_criterion()
        self.load_optimizer()
        self.load_logger()
        self.load_extras()
        self.load_skorch()

    def load_config(self):
        self.timestamp = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
        self.identifier = self.config["cmd"].get("identifier", False)
        if self.identifier:
            self.identifier = self.timestamp + "-{}".format(self.identifier)
        else:
            self.identifier = self.timestamp
        self.device = torch.device(self.config["optim"].get("device", "cpu"))
        self.debug = self.config["cmd"].get("debug", False)
        run_dir = self.config["cmd"].get("run_dir", "./")
        os.chdir(run_dir)
        if not self.debug:
            self.cp_dir = os.path.join(run_dir, "checkpoints", self.identifier)
            print(f"Results saved to {self.cp_dir}")
            os.makedirs(self.cp_dir, exist_ok=True)

    def load_rng_seed(self):
        seed = self.config["cmd"].get("seed", 0)
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

    def get_unique_elements(self, training_images):
        elements = np.array(
            [atom.symbol for atoms in training_images for atom in atoms])
        elements = np.unique(elements)
        return elements

    def load_dataset(self):
        training_images = self.config["dataset"]["raw_data"]
        # TODO: Scalability when dataset too large to fit into memory
        if isinstance(training_images, str):
            training_images = ase.io.read(training_images, ":")
        self.elements = self.config["dataset"].get(
            "elements", self.get_unique_elements(training_images))
        self.forcetraining = self.config["model"].get("get_forces", True)
        self.fp_scheme = self.config["dataset"].get("fp_scheme", "gaussian").lower()
        self.fp_params = self.config["dataset"]["fp_params"]
        self.cutoff_params = self.config["dataset"].get(
            "cutoff_params", {"cutoff_func": "Cosine"})
        self.train_dataset = AtomsDataset(
            images=training_images,
            descriptor_setup=(
                self.fp_scheme,
                self.fp_params,
                self.cutoff_params,
                self.elements,
            ),
            forcetraining=self.forcetraining,
            save_fps=self.config["dataset"].get("save_fps", True),
        )
        self.target_scaler = self.train_dataset.target_scaler
        if not self.debug:
            normalizers = {"target": self.target_scaler}
            torch.save(normalizers, os.path.join(self.cp_dir, "normalizers.pt"))
        self.input_dim = self.train_dataset.input_dim
        self.val_split = self.config["dataset"].get("val_split", 0)
        print("Loading dataset: {} images".format(len(self.train_dataset)))

    def load_model(self):
        elements = list_symbols_to_indices(self.elements)
        self.model = BPNN(elements=elements,
                          input_dim=self.input_dim,
                          **self.config["model"])
        print("Loading model: {} parameters".format(self.model.num_params))

    def load_extras(self):
        callbacks = []
        load_best_loss = train_end_load_best_loss(self.identifier)
        self.split = CVSplit(cv=self.val_split) if self.val_split != 0 else 0

        metrics = evaluator(
            self.val_split,
            self.config["optim"].get("metric", "mae"),
            self.identifier,
            self.forcetraining,
        )
        callbacks.extend(metrics)

        if not self.debug:
            callbacks.append(load_best_loss)
        scheduler = self.config["optim"].get("scheduler", None)
        if scheduler:
            scheduler = LRScheduler(scheduler,
                                    **self.config["optim"]["scheduler_params"])
            callbacks.append(scheduler)
        if self.config["cmd"].get("logger", False):
            from skorch.callbacks import WandbLogger

            callbacks.append(
                WandbLogger(
                    self.wandb_run,
                    save_model=False,
                    keys_ignored="dur",
                ))
        self.callbacks = callbacks

    def load_criterion(self):
        self.criterion = self.config["optim"].get("loss_fn", CustomLoss)

    def load_optimizer(self):
        self.optimizer = self.config["optim"].get("optimizer", torch.optim.Adam)

    def load_logger(self):
        if self.config["cmd"].get("logger", False):
            import wandb

            self.wandb_run = wandb.init(
                name=self.identifier,
                config=self.config,
                id=self.timestamp,
            )

    def load_skorch(self):
        skorch.net.to_tensor = to_tensor

        collate_fn = DataCollater(train=True, forcetraining=self.forcetraining)
        self.net = NeuralNetRegressor(
            module=self.model,
            criterion=self.criterion,
            criterion__force_coefficient=self.config["optim"].get(
                "force_coefficient", 0),
            criterion__loss=self.config["optim"].get("loss", "mse"),
            optimizer=self.optimizer,
            lr=self.config["optim"].get("lr", 1e-1),
            batch_size=self.config["optim"].get("batch_size", 32),
            max_epochs=self.config["optim"].get("epochs", 100),
            iterator_train__collate_fn=collate_fn,
            iterator_train__shuffle=True,
            iterator_valid__collate_fn=collate_fn,
            iterator_valid__shuffle=False,
            device=self.device,
            train_split=self.split,
            callbacks=self.callbacks,
            verbose=self.config["cmd"].get("verbose", True),
        )
        print("Loading skorch trainer")

    def train(self, raw_data=None):
        if raw_data is not None:
            self.config["dataset"]["raw_data"] = raw_data
        if not self.pretrained:
            self.load()
        self.net.fit(self.train_dataset, None)

    def predict(self, images, batch_size=32):
        if len(images) < 1:
            warnings.warn("No images found!", stacklevel=2)
            return images

        a2d = AtomsToData(
            descriptor=self.train_dataset.descriptor,
            r_energy=False,
            r_forces=False,
            save_fps=True,
            fprimes=self.forcetraining,
            cores=1,
        )
        data_list = a2d.convert_all(images, disable_tqdm=True)

        self.net.module.eval()
        collate_fn = DataCollater(train=False, forcetraining=self.forcetraining)

        predictions = {"energy": [], "forces": []}
        for data in data_list:
            collated = collate_fn([data])
            energy, forces = self.net.module(collated)
            energy = self.target_scaler.denorm(
                energy, pred="energy").detach().tolist()
            forces = self.target_scaler.denorm(
                forces, pred="forces").detach().numpy()

            predictions["energy"].extend(energy)
            predictions["forces"].append(forces)

        return predictions

    def load_pretrained(self, checkpoint_path=None):
        print(f"Loading checkpoint from {checkpoint_path}")
        self.load()
        self.net.initialize()
        self.pretrained = True
        try:
            self.net.load_params(
                f_params=os.path.join(checkpoint_path, "params.pt"),
                f_optimizer=os.path.join(checkpoint_path, "optimizer.pt"),
                f_criterion=os.path.join(checkpoint_path, "criterion.pt"),
                f_history=os.path.join(checkpoint_path, "history.json"),
            )
            # TODO(mshuaibi): remove dataset load, use saved normalizers
        except NotImplementedError:
            print("Unable to load checkpoint!")
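# A minimal sketch of the config dict that the AtomsTrainer above reads (the
# keys mirror what its load_* methods access). The "model" entries are forwarded
# to BPNN, and fp_params is a hypothetical symmetry-function setup defined
# elsewhere, so these values are placeholders rather than recommended settings:
config = {
    "model": {"get_forces": True},
    "optim": {"device": "cpu", "lr": 1e-2, "batch_size": 32, "epochs": 100,
              "force_coefficient": 0.04, "metric": "mae"},
    "dataset": {"raw_data": "train.traj", "val_split": 0.1,
                "fp_scheme": "gaussian",
                "fp_params": fp_params,  # hypothetical fingerprint parameters
                "save_fps": True},
    "cmd": {"debug": False, "run_dir": "./", "seed": 1,
            "identifier": "example", "verbose": True, "logger": False},
}

trainer = AtomsTrainer(config)
trainer.train()
predictions = trainer.predict(test_images)   # test_images: list of ASE Atoms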
class AtomsTrainer:
    def __init__(self, config={}):
        self.config = config
        self.pretrained = False

    def load(self, load_dataset=True):
        self.load_config()
        self.load_rng_seed()
        if load_dataset:
            self.load_dataset()
        self.load_model()
        self.load_criterion()
        self.load_optimizer()
        self.load_logger()
        self.load_extras()
        self.load_skorch()

    def load_config(self):
        dtype = self.config["cmd"].get("dtype", torch.FloatTensor)
        torch.set_default_tensor_type(dtype)
        self.timestamp = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
        self.identifier = self.config["cmd"].get("identifier", False)
        if self.identifier:
            self.identifier = self.timestamp + "-{}".format(self.identifier)
        else:
            self.identifier = self.timestamp
        self.gpus = self.config["optim"].get("gpus", 0)
        if self.gpus > 0:
            self.output_device = 0
            self.device = f"cuda:{self.output_device}"
        else:
            self.device = "cpu"
            self.output_device = -1
        self.debug = self.config["cmd"].get("debug", False)
        run_dir = self.config["cmd"].get("run_dir", "./")
        os.chdir(run_dir)
        if not self.debug:
            self.cp_dir = os.path.join(run_dir, "checkpoints", self.identifier)
            print(f"Results saved to {self.cp_dir}")
            os.makedirs(self.cp_dir, exist_ok=True)

    def load_rng_seed(self):
        seed = self.config["cmd"].get("seed", 0)
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

    def get_unique_elements(self, training_images):
        elements = np.array(
            [atom.symbol for atoms in training_images for atom in atoms]
        )
        elements = np.unique(elements)
        return elements

    def load_dataset(self):
        training_images = self.config["dataset"]["raw_data"]
        # TODO: Scalability when dataset too large to fit into memory
        if isinstance(training_images, str):
            training_images = ase.io.read(training_images, ":")
        del self.config["dataset"]["raw_data"]
        self.elements = self.config["dataset"].get(
            "elements", self.get_unique_elements(training_images)
        )
        self.forcetraining = self.config["model"].get("get_forces", True)
        self.fp_scheme = self.config["dataset"].get("fp_scheme", "gaussian").lower()
        self.fp_params = self.config["dataset"]["fp_params"]
        self.save_fps = self.config["dataset"].get("save_fps", True)
        self.cutoff_params = self.config["dataset"].get(
            "cutoff_params", {"cutoff_func": "Cosine"}
        )
        descriptor_setup = (
            self.fp_scheme,
            self.fp_params,
            self.cutoff_params,
            self.elements,
        )
        self.train_dataset = AtomsDataset(
            images=training_images,
            descriptor_setup=descriptor_setup,
            forcetraining=self.forcetraining,
            save_fps=self.config["dataset"].get("save_fps", True),
            scaling=self.config["dataset"].get(
                "scaling", {"type": "normalize", "range": (0, 1)}
            ),
        )
        self.feature_scaler = self.train_dataset.feature_scaler
        self.target_scaler = self.train_dataset.target_scaler
        self.input_dim = self.train_dataset.input_dim
        self.val_split = self.config["dataset"].get("val_split", 0)
        self.config["dataset"]["descriptor"] = descriptor_setup
        if not self.debug:
            normalizers = {
                "target": self.target_scaler,
                "feature": self.feature_scaler,
            }
            torch.save(normalizers, os.path.join(self.cp_dir, "normalizers.pt"))
            # clean/organize config
            self.config["dataset"]["fp_length"] = self.input_dim
            torch.save(self.config, os.path.join(self.cp_dir, "config.pt"))
        print("Loading dataset: {} images".format(len(self.train_dataset)))

    def load_model(self):
        elements = list_symbols_to_indices(self.elements)
        self.model = BPNN(
            elements=elements, input_dim=self.input_dim, **self.config["model"]
        )
        print("Loading model: {} parameters".format(self.model.num_params))
        self.forcetraining = self.config["model"].get("get_forces", True)
        collate_fn = DataCollater(train=True, forcetraining=self.forcetraining)
        self.parallel_collater = ParallelCollater(self.gpus, collate_fn)
        if self.gpus > 0:
            self.model = DataParallel(
                self.model,
                output_device=self.output_device,
                num_gpus=self.gpus,
            )

    def load_extras(self):
        callbacks = []
        load_best_loss = train_end_load_best_loss(self.identifier)
        self.val_split = self.config["dataset"].get("val_split", 0)
        self.split = CVSplit(cv=self.val_split) if self.val_split != 0 else 0

        metrics = evaluator(
            self.val_split,
            self.config["optim"].get("metric", "mae"),
            self.identifier,
            self.forcetraining,
        )
        callbacks.extend(metrics)

        if not self.debug:
            callbacks.append(load_best_loss)
        scheduler = self.config["optim"].get("scheduler", None)
        if scheduler:
            scheduler = LRScheduler(scheduler["policy"], **scheduler["params"])
            callbacks.append(scheduler)
        if self.config["cmd"].get("logger", False):
            from skorch.callbacks import WandbLogger

            callbacks.append(
                WandbLogger(
                    self.wandb_run,
                    save_model=False,
                    keys_ignored="dur",
                )
            )
        self.callbacks = callbacks

    def load_criterion(self):
        self.criterion = self.config["optim"].get("loss_fn", CustomLoss)

    def load_optimizer(self):
        self.optimizer = {
            "optimizer": self.config["optim"].get("optimizer", torch.optim.Adam)
        }
        optimizer_args = self.config["optim"].get("optimizer_args", False)
        if optimizer_args:
            self.optimizer.update(optimizer_args)

    def load_logger(self):
        if self.config["cmd"].get("logger", False):
            import wandb

            self.wandb_run = wandb.init(
                name=self.identifier,
                config=self.config,
            )

    def load_skorch(self):
        skorch.net.to_tensor = to_tensor

        self.net = NeuralNetRegressor(
            module=self.model,
            criterion=self.criterion,
            criterion__force_coefficient=self.config["optim"].get(
                "force_coefficient", 0
            ),
            criterion__loss=self.config["optim"].get("loss", "mse"),
            lr=self.config["optim"].get("lr", 1e-1),
            batch_size=self.config["optim"].get("batch_size", 32),
            max_epochs=self.config["optim"].get("epochs", 100),
            iterator_train__collate_fn=self.parallel_collater,
            iterator_train__shuffle=True,
            iterator_train__pin_memory=True,
            iterator_valid__collate_fn=self.parallel_collater,
            iterator_valid__shuffle=False,
            iterator_valid__pin_memory=True,
            device=self.device,
            train_split=self.split,
            callbacks=self.callbacks,
            verbose=self.config["cmd"].get("verbose", True),
            **self.optimizer,
        )
        print("Loading skorch trainer")

    def train(self, raw_data=None):
        if raw_data is not None:
            self.config["dataset"]["raw_data"] = raw_data
        if not self.pretrained:
            self.load()
        stime = time.time()
        self.net.fit(self.train_dataset, None)
        elapsed_time = time.time() - stime
        print(f"Training completed in {elapsed_time}s")

    def predict(self, images, disable_tqdm=True):
        if len(images) < 1:
            warnings.warn("No images found!", stacklevel=2)
            return images

        self.descriptor = construct_descriptor(self.config["dataset"]["descriptor"])
        a2d = AtomsToData(
            descriptor=self.descriptor,
            r_energy=False,
            r_forces=False,
            save_fps=self.config["dataset"].get("save_fps", True),
            fprimes=self.forcetraining,
            cores=1,
        )
        data_list = a2d.convert_all(images, disable_tqdm=disable_tqdm)
        self.feature_scaler.norm(data_list, disable_tqdm=disable_tqdm)

        self.net.module.eval()
        collate_fn = DataCollater(train=False, forcetraining=self.forcetraining)

        predictions = {"energy": [], "forces": []}
        for data in data_list:
            collated = collate_fn([data]).to(self.device)
            energy, forces = self.net.module([collated])
            energy = self.target_scaler.denorm(
                energy.detach().cpu(), pred="energy"
            ).tolist()
            forces = self.target_scaler.denorm(
                forces.detach().cpu(), pred="forces"
            ).numpy()

            predictions["energy"].extend(energy)
            predictions["forces"].append(forces)

        return predictions

    def load_pretrained(self, checkpoint_path=None, gpu2cpu=False):
        """
        Args:
            checkpoint_path: str, path to checkpoint directory
            gpu2cpu: bool, True if the checkpoint was trained with GPUs and you
                wish to load it on cpu instead.
        """
        self.pretrained = True
        print(f"Loading checkpoint from {checkpoint_path}")
        assert os.path.isdir(
            checkpoint_path
        ), f"Checkpoint: {checkpoint_path} not found!"
        if not self.config:
            # prediction only
            self.config = torch.load(os.path.join(checkpoint_path, "config.pt"))
            self.config["cmd"]["debug"] = True
            self.elements = self.config["dataset"]["descriptor"][-1]
            self.input_dim = self.config["dataset"]["fp_length"]
            if gpu2cpu:
                self.config["optim"]["gpus"] = 0
            self.load(load_dataset=False)
        else:
            # prediction + retraining
            self.load(load_dataset=True)
        self.net.initialize()

        if gpu2cpu:
            params_path = os.path.join(checkpoint_path, "params_cpu.pt")
            if not os.path.exists(params_path):
                params = torch.load(
                    os.path.join(checkpoint_path, "params.pt"),
                    map_location=torch.device("cpu"),
                )
                new_dict = OrderedDict()
                for k, v in params.items():
                    # strip the "module." prefix added by DataParallel
                    name = k[7:]
                    new_dict[name] = v
                torch.save(new_dict, params_path)
        else:
            params_path = os.path.join(checkpoint_path, "params.pt")

        try:
            self.net.load_params(
                f_params=params_path,
                f_optimizer=os.path.join(checkpoint_path, "optimizer.pt"),
                f_criterion=os.path.join(checkpoint_path, "criterion.pt"),
                f_history=os.path.join(checkpoint_path, "history.json"),
            )
            normalizers = torch.load(os.path.join(checkpoint_path, "normalizers.pt"))
            self.feature_scaler = normalizers["feature"]
            self.target_scaler = normalizers["target"]
        except NotImplementedError:
            print("Unable to load checkpoint!")

    def get_calc(self):
        return AMPtorch(self)