def train(self, path_model, epochs, device="cuda"):
    self.cnn_model.to(device)
    optimizer = Adam(self.cnn_model.parameters(), lr=0.001)
    loss_fn = nn.CrossEntropyLoss()
    min_loss = None
    self.cnn_model.train()
    for epoch in range(epochs):
        for train_data in self.train_dataset_loader:
            images, labels = train_data
            images = images.to(device)
            labels = labels.to(device)
            # forward
            outputs = self.cnn_model(images)
            loss = loss_fn(outputs, labels)
            # backward
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        # check if this is the best model so far and store it on disk
        print("epoch: {}, loss: {}".format(epoch, loss.item()))
        if min_loss is None or loss.item() < min_loss:
            print("New best model found!")
            torch_save(self.cnn_model, path_model)
            min_loss = loss.item()
    return self.cnn_model
def save_var_to_checkpoint(filename, name, mask, sd_key):
    checkpoint = load_checkpoint(filename)
    sd = checkpoint[sd_key] if sd_key is not None else checkpoint
    if name not in sd:
        raise RuntimeError('Variable {} not found in {}'.format(name, filename))
    sd[name] = sd[name] * mask
    torch_save(checkpoint, filename)
    del checkpoint
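# Hypothetical usage sketch (the file name, variable name, tensor shape, and the
# 'state_dict' key are assumptions, not part of the original snippet): zero out
# part of a saved linear layer's weights with a binary mask.
import torch

mask = torch.ones(256, 128)
mask[:, 64:] = 0  # keep only the first 64 input features
save_var_to_checkpoint('checkpoint.pth', 'fc1.weight', mask, sd_key='state_dict')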
def save(self, stats, model):
    m_value = stats['val']['loss'][-1]
    if self.save_mode and (not self.last_value or self.last_value > m_value):
        print("Saving model...")
        if hasattr(model, 'get_init_params'):
            with open(self.init_params_name, 'w') as fd:
                json.dump(model.get_init_params(), fd)
        torch_save(model.state_dict(), self.m_name)
        self.last_value = m_value
def save_checkpoint(path, total_epochs, model, loss, environment):
    make_directories_for_file(path)
    dictionary = dict()
    dictionary["total_epochs"] = total_epochs
    dictionary["model_states"] = model.state_dict()
    dictionary["loss"] = loss
    dictionary["environment"] = environment.to_dict()
    torch_save(dictionary, path)
    logger.info(f"Saved checkpoint in epoch {total_epochs} to '{path}'.")
def save(cls, weights: dict, path: str):
    """
    Save model state

    :param weights: state
    :param path: state file path
    :return: None
    """
    # Lazy load torch
    from torch import save as torch_save

    torch_save(weights, path)
def save() -> None:
    torch_save(
        dict(
            model_state_dict=self.model.state_dict(),
            optimizer_state_dict=self.optimizer.state_dict(),
            scheduler_state_dict=self.scheduler.state_dict(),
            epoch=epoch,
            iteration=iteration,
        ),
        checkpoint_path,
    )
def validation_phase_ended(self, model, criterion, optimizer, **kwargs):
    ForwardStatsCallback.validation_phase_ended(self, model, criterion, optimizer, **kwargs)
    m_value = self.stats['val']['loss'][-1]
    if self.save_mode and (not self.last_value or self.last_value > m_value):
        print("Saving model...")
        if hasattr(model, 'get_init_params'):
            with open(self.init_params_name, 'w') as fd:
                json.dump(model.get_init_params(), fd)
        torch_save(model.state_dict(), self.m_name)
        self.last_value = m_value
def train_evaluate_trvate(self, train_dl, valid_dl, test_dl, epochCnt=500, saveBestModelName=None):
    # define the optimization
    criterion = CrossEntropyLoss()
    optimizer = SGD(self.parameters(), lr=0.01, momentum=0.9)
    # enumerate epochs
    accvectr = np.zeros(epochCnt)
    accvecva = np.zeros(epochCnt)
    accvecte = np.zeros(epochCnt)
    acc_va_max = 0
    preds_best, labels_best = None, None
    for epoch in range(epochCnt):
        # enumerate mini batches
        self.train()
        for i, (inputs, targets) in enumerate(train_dl):
            # clear the gradients
            optimizer.zero_grad()
            # compute the model output
            yhat = self.forward(inputs)
            # calculate loss
            loss = criterion(yhat, targets.squeeze_())
            # credit assignment
            loss.backward()
            # update model weights
            optimizer.step()
        acc_tr, _, _ = self.evaluate_model(train_dl)
        acc_va, _, _ = self.evaluate_model(valid_dl)
        acc_te, preds_te, labels_te = self.evaluate_model(test_dl)
        if acc_va_max < acc_va:
            preds_best, labels_best = preds_te, labels_te
            print("best validation epoch so far - epoch ", epoch, "va: %.3f" % acc_va, "te: %.3f" % acc_te)
            acc_va_max = acc_va
            if saveBestModelName is not None:
                print("Saving model at : ", saveBestModelName)
                torch_save(self.state_dict(), saveBestModelName)
                print("Model saved..")
        else:
            print("epoch ", epoch, "tr: %.3f" % acc_tr, "va: %.3f" % acc_va, "te: %.3f" % acc_te)
        accvectr[epoch] = acc_tr
        accvecva[epoch] = acc_va
        accvecte[epoch] = acc_te
    return accvectr, accvecva, accvecte, preds_best, labels_best
def update_best_model(bmodel, bhist, new_model, new_h, phase="val", metric="loss", ind=-1):
    if new_model is not None and new_h is not None:
        if new_h[phase][metric][ind] < bhist[phase][metric][ind]:
            bmodel = new_model
            bhist = new_h
            if hasattr(bmodel, 'get_init_params'):
                with open('best_model_params.json', 'w') as fd:
                    json.dump(bmodel.get_init_params(), fd)
            torch_save(bmodel.state_dict(), "best_model.pth")
            with open('best_history.json', 'w') as fd:
                json.dump(bhist, fd)
    return bmodel, bhist
def save_checkpoint(
    model,
    infos,
    optimizer,
    checkpoint_dir=None,
    job_id=None,
    histories=None,
    append="",
):
    # Modify appendage
    if len(append) > 0:
        append = "-" + append

    # If checkpoint_dir doesn't exist, create it
    if not isdir(checkpoint_dir):
        makedirs(checkpoint_dir)

    # Set file names
    checkpoint_path = join(checkpoint_dir, f"model{append}.pth")
    optimizer_path = join(checkpoint_dir, f"optimizer{append}.pth")
    infos_path = join(checkpoint_dir, f"infos_{job_id}{append}.pkl")
    histories_path = join(checkpoint_dir, f"histories_{job_id}{append}.pkl")

    # Save checkpoint data
    print(f"Saving checkpoint to {checkpoint_path}")
    torch_save(model.state_dict(), checkpoint_path)

    # Save optimizer data
    torch_save(optimizer.state_dict(), optimizer_path)

    # Save infos data
    with open(infos_path, "wb") as f:
        pickle_dump(infos, f)

    # Save histories data
    if histories is not None:
        with open(histories_path, "wb") as f:
            pickle_dump(histories, f)
def _clone_or_save(self, with_data=True, file_path=None, method='torch'):
    data, o = None, None
    if not with_data:
        data = self._pull_data()
    xtrn = self._pull_xtrn()
    if file_path:
        with open(file_path, 'wb') as f:
            if method == 'pickle':
                pickle.dump(self, f, 2)
            elif method == 'torch':
                torch_save(self, f)
            else:
                raise ValueError(f'ERROR: Unknown method "{method}"')
    else:
        o = deepcopy(self)
    if xtrn is not None:
        self._push_xtrn(xtrn)
        if o is not None:
            o._push_xtrn(xtrn)
    if data:
        self._push_data(data)
    return o if o is not None else xtrn
def save_checkpoint(engine, model, optimizer, lr_scheduler, amp, no_checkpoints, checkpoint_directory):
    step = engine.state.iteration
    checkpoints = [
        int(e.split("/")[-1].split("_")[-1].split(".")[0])
        for e in glob(checkpoint_directory + "*.pth")
    ]
    checkpoints.sort()
    if len(checkpoints) > no_checkpoints:
        os_remove(checkpoint_directory + "checkpoint_" + str(checkpoints[0]) + ".pth")
    torch_save(
        {
            'model': model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'lr_scheduler': lr_scheduler.state_dict(),
            'engine': engine.state_dict(),
            'amp': amp.state_dict()
        },
        checkpoint_directory + "checkpoint_" + str(step) + ".pth"
    )
def get_torch_object_bytes(obj):
    with TemporaryFile() as f:
        torch_save(obj, f)
        f.seek(0)
        b = f.read()
    return b
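# A minimal round-trip sketch (assumed, not part of the original snippet): the bytes
# produced by get_torch_object_bytes can be restored with torch.load from an
# in-memory buffer.
from io import BytesIO
from torch import load as torch_load

def load_torch_object_from_bytes(b):
    # torch.load accepts any file-like object, so wrap the raw bytes in a buffer
    return torch_load(BytesIO(b))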
def save_model(model, path):
    torch_save(model.state_dict(), path)
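# Hedged companion sketch (not in the original): loading a state_dict written by
# save_model back into an already-constructed model instance.
from torch import load as torch_load

def load_model(model, path, device="cpu"):
    # map_location lets a checkpoint saved on GPU be restored on a CPU-only machine
    state_dict = torch_load(path, map_location=device)
    model.load_state_dict(state_dict)
    return model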
if TRAIN_AGENT:
    acc_rewards, stats = train_agent(env_name='Banana',
                                     num_episodes=NUM_TRAIN_EPISODES,
                                     env_platform='unity',
                                     max_num_steps=MAX_NUM_STEPS)
    experiment_filename = '{epoch_train_time}-a_{alpha}-g_{gamma}-e_{epsilon}-edecay_{epsilon_decay}-emin_{epsilon_min}'\
        .format(epoch_train_time=stats.epoch_train_time,
                alpha=ALPHA,
                gamma=GAMMA,
                epsilon=EPSILON_START,
                epsilon_min=EPSILON_MIN,
                epsilon_decay=EPSILON_DECAY)
    print("\n\nScore: {}".format(acc_rewards))
    if RESULTS_CONFIG.SAVE_MODEL:
        torch_save(agent.online_q_network.state_dict(), experiment_filename + '.pth')
    if RESULTS_CONFIG.SAVE_REWARDS_DATA:
        pkl_dump(acc_rewards, open('./results/' + experiment_filename + ".p", 'wb'))

if TEST_AGENT:
    model = torch.load('./models/checkpoint' + '.pth')
    model = agent.target_q_network
    model.eval()
    test_epoch_rewards = []  # list containing scores from each episode_idx
    first_time_solved = False
    test_start_time = time()
def save(self, file_name):
    torch_save(self.__ann.state_dict(), file_name)
def save(self, file_name):
    torch_save(self, file_name)
def save_policy_model(self, model, count):
    torch_save(model.state_dict(), self.dir + "/models/" + str(count))
def doTask(self, run_id, nn, lvl, task_id, chromosome_id, env_id, rl, poet_loop_counter,
           noisy=False, algo='CoDE', ngames=1000, popsize=100):
    """
    :param nn: PyTorch nn state_dict
    :param lvl: flat lvl string
    :param task_id: EVALUATE the NN or OPTIMIZE it
    :param chromosome_id: id of NN-GG pair
    :param rl: use RL?
    :param poet_loop_counter: poet number loop
    :return:
    """
    # update network and env to execute on task
    self.pair.env.generator.update_from_lvl_string(lvl)
    self.pair.nn.load_state_dict(nn)

    if task_id == ADPTASK.EVALUATE:
        self.pair.noisy = noisy
        score = self.pair.evaluate(rl=rl)
        return {
            'won': self.pair.env.win == 'Win',
            'chromosome_id': chromosome_id,
            'env_id': env_id,
            'score': score,
        }
    elif task_id == ADPTASK.OPTIMIZE:
        # run optimization here
        if rl:
            # optimizes in place
            run_ppo(policy_agent=self.pair,
                    env_fn=self.pair.env.make,
                    path=f'{self.pair.prefix}/runs',
                    pair_id=chromosome_id,
                    outer_poet_loop_count=poet_loop_counter,
                    n_concurrent_games=1,
                    frames=ngames * self.game_length)
        else:
            objective = PyTorchObjective(agent=self.pair, popsize=popsize)
            ans = run_DE(AE_pair=objective,
                         results_prefix=self.prefix,
                         unique_run_id=run_id,
                         pair_id=chromosome_id,
                         poet_loop_counter=poet_loop_counter,
                         generation_max=ngames // popsize,
                         scaling_factor=0.6,
                         crossover_rate=0.4,
                         lower_bound=-5,
                         upper_bound=5)
            objective.update_nn(ans)
            del objective

        # get score of optimized weights
        score = self.pair.evaluate(rl=rl)
        state_dict = self.pair.nn.state_dict()

        # save best weights
        # destination = f"{self.pair.prefix}/results_{run_id}/{chromosome_id}/final_weights_poet{poet_loop_counter}.pt"
        # torch_save(state_dict, destination)

        # did the agent WIN the game?
        if self.pair.env.win == 'Win':
            path = os.path.join(
                f'{self.prefix}',
                f'{chromosome_id}',
                f'winning_weights_poet{poet_loop_counter}.pt')
            torch_save(state_dict, path)

        return {
            'won': self.pair.env.win == 'Win',
            'score': score,
            'chromosome_id': chromosome_id,
            'env_id': env_id,
            'nn': state_dict
        }
    else:
        raise ValueError('unspecified task requested')
def save_layer(layer: nn.Module, path: str):
    logging.info('### Stored layer as {} ###'.format(path))
    ensure_dir(path)
    torch_save(layer.state_dict(), path)
plt.legend(legend, loc='lower right')
plt.show()

# Show a test image
single_image_dataset = SingleImageDataset()
while True:
    command = input("Test model on random image (y/n)? ")
    if command != 'y':
        break
    single_image_dataset.load_image(rotor_dataset.random_image_filepath())
    data_loader = DataLoader(single_image_dataset, batch_size=1)
    data_iter = iter(data_loader)
    image = next(data_iter)

    # Generate the steering value prediction
    output = model(image)
    _, result_value = torch_max(output, dim=1)
    result_name = Label.label_index_to_name(result_value)
    show_image(image, 0, result_value, None)

# Save off the model
save_model = input("Save model (y/n)? ")
if save_model == 'y':
    model_export_filepath = str(Path(os.getcwd()) / Path('nn_model.pt'))
    torch_save(model.state_dict(), model_export_filepath)
def train_one(model_folder):
    new_model_folder_name = model_folder.replace('_created', '_training')
    os_rename(model_folder, new_model_folder_name)
    frequencies = glob(os_path_join(new_model_folder_name, 'k_*'))
    for frequency in frequencies:
        # Load model
        print('train.py: training {}'.format(frequency))
        model_params_path = os_path_join(frequency, model_params_fname)

        # create model
        model, model_params = get_which_model_from_params_fname(
            model_params_path, return_params=True)
        if 'cuda' in model_params:
            using_cuda = model_params['cuda'] and torch_cuda_is_available()
        else:
            using_cuda = torch_cuda_is_available()

        if using_cuda is True:
            model.cuda()

        # save initial weights
        if 'save_initial' in model_params and model_params['save_initial'] and model_params['save_dir']:
            suffix = '_initial'
            path = add_suffix_to_path(model_params['save_dir'], suffix)
            ensure_dir(path)
            torch_save(model.state_dict(), os_path_join(path, MODEL_DATA_FNAME))
            save_model_params(os_path_join(path, model_params_fname), model_params)

        # loss
        if 'cost_function' in model_params:
            loss = model_params['cost_function']
        elif 'loss_function' in model_params:
            loss = model_params['loss_function']
        else:
            raise ValueError('model_params missing key cost_function or loss_function')

        if loss not in ['MSE', 'L1', 'SmoothL1']:
            raise TypeError('Error must be MSE, L1, or SmoothL1. You gave ' + str(loss))
        if loss == 'MSE':
            from torch.nn import MSELoss
            loss = MSELoss()
        elif loss == 'L1':
            from torch.nn import L1Loss
            loss = L1Loss()
        elif loss == 'SmoothL1':
            from torch.nn import SmoothL1Loss
            loss = SmoothL1Loss()

        # optimizer
        if model_params['optimizer'] == 'Adam':
            from torch.optim import Adam
            optimizer = Adam(model.parameters(),
                             lr=model_params['learning_rate'],
                             weight_decay=model_params['weight_decay'])
        elif model_params['optimizer'] == 'SGD':
            from torch.optim import SGD
            optimizer = SGD(model.parameters(),
                            lr=model_params['learning_rate'],
                            momentum=model_params['momentum'],
                            weight_decay=model_params['weight_decay'])
        else:
            raise ValueError("model_params['optimizer'] must be either Adam or SGD. Got "
                             + model_params['optimizer'])

        logger = Logger()

        # Load training, validation, and test data
        # Load primary training data
        dat_train = ApertureDataset(
            model_params['data_train'],
            NUM_SAMPLES_TRAIN,
            k=model_params['k'],
            target_is_data=model_params['data_is_target'])
        loader_train = DataLoader(dat_train,
                                  batch_size=model_params['batch_size'],
                                  shuffle=True,
                                  num_workers=DATALOADER_NUM_WORKERS,
                                  pin_memory=using_cuda)

        # Load secondary training data - used to evaluate training loss after every epoch
        dat_train2 = ApertureDataset(
            model_params['data_train'],
            NUM_SAMPLES_TRAIN_EVAL,
            k=model_params['k'],
            target_is_data=model_params['data_is_target'])
        loader_train_eval = DataLoader(dat_train2,
                                       batch_size=model_params['batch_size'],
                                       shuffle=False,
                                       num_workers=DATALOADER_NUM_WORKERS,
                                       pin_memory=using_cuda)

        # Load validation data - used to evaluate validation loss after every epoch
        dat_val = ApertureDataset(
            model_params['data_val'],
            NUM_SAMPLES_VALID,
            k=model_params['k'],
            target_is_data=model_params['data_is_target'])
        loader_val = DataLoader(dat_val,
                                batch_size=model_params['batch_size'],
                                shuffle=False,
                                num_workers=DATALOADER_NUM_WORKERS,
                                pin_memory=using_cuda)

        trainer = Trainer(
            model=model,
            loss=loss,
            optimizer=optimizer,
            patience=model_params['patience'],
            loader_train=loader_train,
            loader_train_eval=loader_train_eval,
            loader_val=loader_val,
            cuda=using_cuda,
            logger=logger,
            data_noise_gaussian=model_params['data_noise_gaussian'],
            save_dir=frequency)

        # run training
        trainer.train()

    os_rename(new_model_folder_name, new_model_folder_name.replace('_training', '_trained'))
                [pairs[j].generator for j in range(len(pairs))],
                availableChildren,
                transfer_eval=True)

            send_work(distributed_work, ADPTASK.EVALUATE, parent, unique_run_id, i)

            # get answers from children
            transfer_eval_answers = waitForAndCollectAnswers(parent, availableChildren,
                                                             distributed_work, unique_run_id,
                                                             i, ADPTASK.EVALUATE)

            # use information to determine if NN i should migrate to env j.
            perform_transfer(pairs, transfer_eval_answers, i, unique_run_id, stats)

            # save checkpoints of networks into POET folder
            for pair in pairs:
                torch_save(pair.nn.state_dict(), os.path.join(tdir, f'network{pair.id}.pt'))
                with open(os.path.join(tdir, f'lvl{pair.id}.txt'), 'w+') as fname:
                    fname.write(str(pair.generator))

            i += 1
            pbar.update(1)
            if i >= args.num_poet_loops:
                done = True

    except KeyboardInterrupt as e:
        print(e)
        pbar.close()
        dieAndKillChildren(parent, pairs)
        import sys
        sys.exit(0)
def save(self, file_name):
    torch_save(self.__model.state_dict(), file_name)