import os

import tensorflow as tf
from stable_baselines import PPO2
from stable_baselines.common.policies import MlpPolicy
from stable_baselines.common.vec_env import SubprocVecEnv


def train_policy():
    # PPO2 hyperparameters; the rollout batch is n_steps * n_envs = 200 * 5
    # = 1000 samples, which divides evenly into nminibatches=5.
    # log_relative_path is expected to be defined at module level.
    ppo_config = {
        "gamma": 0.9988,
        "n_steps": 200,
        "ent_coef": 0,
        "learning_rate": 0.001,
        "vf_coef": 0.99,
        "max_grad_norm": 0.1,
        "lam": 0.95,
        "nminibatches": 5,
        "noptepochs": 100,
        "cliprange": 0.2,
        "tensorboard_log": log_relative_path,
    }
    os.makedirs(log_relative_path, exist_ok=True)  # don't fail if the dir exists
    policy_kwargs = dict(act_fun=tf.nn.tanh, net_arch=[256, 128])
    # Run 5 environment copies in subprocesses.
    env = SubprocVecEnv([_make_env(rank=i) for i in range(5)])
    model = PPO2(MlpPolicy, env, _init_setup_model=True,
                 policy_kwargs=policy_kwargs, verbose=1, **ppo_config)
    model.learn(total_timesteps=1000, tb_log_name="ppo2",
                reset_num_timesteps=False)
    model.save(os.path.join(log_relative_path, 'model'))
    env.env_method("save_world", log_relative_path)
    env.close()
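
# train_policy() assumes a module-level `_make_env` factory, which is not part
# of this excerpt. A minimal sketch of such a rank-seeded factory (the env id
# below is a placeholder, not the one the original code used):
import gym


def _make_env(rank, seed=0):
    def _init():
        env = gym.make('CartPole-v1')  # placeholder env id
        env.seed(seed + rank)  # give each subprocess env a distinct seed
        return env
    return _init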
def _eval_model(model, env_id, ob_shape, num_eps, plot=False):
    # NUM_CPU and L (episode length) are module-level constants;
    # make_env(env_id) returns an env-constructing thunk defined elsewhere.
    test_env = SubprocVecEnv([make_env(env_id)])
    sharpe_ratios = []
    for episode in range(num_eps):
        # Pad the single test observation with zeros so the batch matches
        # the (NUM_CPU,) + ob_shape layout the model was trained with.
        zero_completed_obs = np.zeros((NUM_CPU,) + ob_shape)
        zero_completed_obs[0, :] = test_env.reset()
        state = None
        for _ in range(L):
            action, state = model.predict(zero_completed_obs, state=state,
                                          deterministic=True)
            zero_completed_obs[0, :], reward, done, _ = test_env.env_method(
                'step', action[0], indices=0)[0]
        sharpe_ratios.append(
            test_env.env_method('get_sharpe_ratio', indices=0)[0])
        if plot:
            test_env.env_method('render', indices=0)
    test_env.close()
    # Return the average Sharpe ratio over all evaluation episodes.
    return sum(sharpe_ratios) / len(sharpe_ratios)
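
# Why the zero padding: a recurrent policy trained on NUM_CPU parallel envs
# keeps per-env hidden state, so predict() expects a batch of NUM_CPU
# observations even when evaluating a single env. A usage sketch (model path,
# env id and observation shape are all illustrative):
if __name__ == "__main__":
    model = PPO2.load("model")
    avg_sharpe = _eval_model(model, 'TradingEnv-v0', ob_shape=(32,), num_eps=10)
    print("average Sharpe ratio:", avg_sharpe)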
if __name__ == "__main__": batch_size = 8 num_envs = 8 num_gpus = torch.cuda.device_count() def make_env(index): return lambda: gym.make( 'MetaEnv-v0', device=torch.device('cuda', index=index % num_gpus)) env = SubprocVecEnv([make_env(x) for x in range(num_envs)], start_method='forkserver') # env.get_valid_actions = lambda: np.array([e.get_valid_actions() for e in env.envs]) env.get_valid_actions = lambda: np.array( env.env_method('get_valid_actions')) model = algo.MaskedPPO(CustomLSTMPolicy, env, verbose=1, n_steps=20, nminibatches=batch_size, tensorboard_log="../out/meta_opt/") model.learn(total_timesteps=100000, log_interval=10) model.save('meta_optimizer') obs = env.reset() state = None total_rewards = 0 done = [False for _ in range(env.num_envs)]
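    # The excerpt ends right after initializing the rollout bookkeeping; a
    # minimal sketch of the recurrent evaluation loop that typically follows
    # (MaskedPPO.predict is assumed to take the standard stable-baselines
    # state/mask arguments, with action masking handled internally):
    while not all(done):
        action, state = model.predict(obs, state=state, mask=done,
                                      deterministic=True)
        obs, rewards, done, _ = env.step(action)
        total_rewards += rewards
    print("per-env returns:", total_rewards)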
def evaluate_model_on_set(
        set_path,
        model,
        config_path=None,
        config_kw=None,
        metrics=("success", "control_variation", "rise_time", "overshoot",
                 "settling_time"),
        norm_data_path=None,
        num_envs=1,
        turbulence_intensity="none",
        use_pid=False,
        writer=None,
        timestep=None,
):
    """
    :param set_path: (str) path to test set file
    :param model: (PPO2 object or [PIDController]) the controller to be evaluated
    :param config_path: (str) path to gym environment configuration file
    :param config_kw: (dict) dictionary of key-value pairs that override settings in the gym environment configuration file
    :param metrics: ([str]) list of metrics to be computed and recorded
    :param norm_data_path: (str) path to folder containing normalization statistics
    :param num_envs: (int) number of gym environments to run in parallel using multiprocessing
    :param turbulence_intensity: (str) the intensity setting of the wind turbulence
    :param use_pid: (bool) whether the evaluated controller is a PID controller or not
    :param writer: (tensorboard writer) if supplied, evaluation results are written to the tensorboard log; otherwise they are printed to standard output
    :param timestep: (int) the timestep results are logged at when using tensorboard
    :return: (dict) the metrics computed for the evaluated controller on the test set
    """
    scenarios = list(np.load(set_path, allow_pickle=True))
    scenario_count = len(scenarios)
    if config_kw is None:
        config_kw = {}
    config_kw.update({
        "steps_max": 1500,
        "target": {
            "on_success": "done",
            "success_streak_fraction": 1,
            "success_streak_req": 100,
            "states": {
                0: {"bound": 5},
                1: {"bound": 5},
                2: {"bound": 2},
            },
        },
    })
    if use_pid:
        config_kw["action"] = {"scale_space": False}
    sim_config_kw = {
        # The default argument is the lowercase string "none", so the
        # comparison must be lowercase as well (the original compared
        # against "None", which enabled turbulence by default).
        "turbulence": turbulence_intensity != "none",
        "turbulence_intensity": turbulence_intensity,
    }
    test_env = SubprocVecEnv([
        make_env(config_path, i, config_kw=config_kw,
                 sim_config_kw=sim_config_kw) for i in range(num_envs)
    ])
    if use_pid:
        dt = test_env.get_attr("simulator")[0].dt
        for pid in model:
            pid.dt = dt
        env_cfg = test_env.get_attr("cfg")[0]
        obs_states = [var["name"] for var in env_cfg["observation"]["states"]]
        try:
            phi_i, theta_i, Va_i = (
                obs_states.index("roll"),
                obs_states.index("pitch"),
                obs_states.index("Va"),
            )
            omega_i = [
                obs_states.index("omega_p"),
                obs_states.index("omega_q"),
                obs_states.index("omega_r"),
            ]
        except ValueError:
            print("When using PID, roll, pitch, Va, omega_p, omega_q and "
                  "omega_r must be part of the observation vector.")
    else:
        test_env = VecNormalize(test_env)
        if model.env is not None:
            test_env.obs_rms = model.env.obs_rms
            test_env.ret_rms = model.env.ret_rms
        else:
            assert norm_data_path is not None
            test_env.load_running_average(norm_data_path)
        test_env.training = False

    res = {metric: {} for metric in metrics}
    res["rewards"] = [[] for _ in range(scenario_count)]
    active_envs = [i < scenario_count for i in range(num_envs)]
    env_scen_i = [i for i in range(num_envs)]
    test_done = False
    obs = np.array([np.zeros(test_env.observation_space.shape)
                    for _ in range(num_envs)])
    done = [True for _ in range(num_envs)]
    info = None

    while not test_done:
        for i, env_done in enumerate(done):
            if env_done:
                if len(scenarios) > 0 or active_envs[i]:
                    if len(scenarios) > 0:
                        print("{}/{} scenarios left".format(
                            len(scenarios), scenario_count))
                        scenario = scenarios.pop(0)
                        env_scen_i[i] = (scenario_count - 1) - len(scenarios)
                        obs[i] = test_env.env_method("reset", indices=i,
                                                     **scenario)[0]
                        if use_pid:
                            model[i].reset()
                            model[i].set_reference(
                                scenario["target"]["roll"],
                                scenario["target"]["pitch"],
                                scenario["target"]["Va"],
                            )
                    else:
                        active_envs[i] = False
                    if info is not None:
                        # Record the finished episode's metrics.
                        for metric in metrics:
                            if isinstance(info[i][metric], dict):
                                for state, value in info[i][metric].items():
                                    if state not in res[metric]:
                                        res[metric][state] = []
                                    res[metric][state].append(value)
                            else:
                                if "all" not in res[metric]:
                                    res[metric]["all"] = []
                                res[metric]["all"].append(info[i][metric])
        if len(scenarios) == 0:
            test_done = not any(active_envs)
        if use_pid:
            actions = []
            for i, pid in enumerate(model):
                roll, pitch, Va = obs[i, phi_i], obs[i, theta_i], obs[i, Va_i]
                omega = obs[i, omega_i]
                if info is not None and "target" in info[i]:
                    pid.set_reference(
                        phi=info[i]["target"]["roll"],
                        theta=info[i]["target"]["pitch"],
                        va=info[i]["target"]["Va"],
                    )
                actions.append(pid.get_action(roll, pitch, Va, omega))
            actions = np.array(actions)
        else:
            actions, _ = model.predict(obs, deterministic=True)
        obs, rew, done, info = test_env.step(actions)
        for i, env_rew in enumerate(rew):
            res["rewards"][env_scen_i[i]].append(env_rew)

    if writer is not None:
        summaries = []
        for metric, metric_v in res.items():
            if isinstance(metric_v, dict):
                for state, v in metric_v.items():
                    summaries.append(tf.Summary.Value(
                        tag="test_set/{}_{}".format(metric, state),
                        simple_value=np.nanmean(v),
                    ))
        writer.add_summary(tf.Summary(value=summaries), timestep)
    else:
        print_results(res)

    return res
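
# evaluate_model_on_set relies on a `make_env` factory that is not shown in
# this excerpt. A minimal sketch of one, assuming a gym-fixed-wing style
# environment class (import path and constructor signature are assumptions):
from gym_fixed_wing.fixed_wing import FixedWingAircraft


def make_env(config_path, rank, config_kw=None, sim_config_kw=None):
    def _init():
        env = FixedWingAircraft(config_path, config_kw=config_kw,
                                sim_config_kw=sim_config_kw)
        env.seed(rank)  # distinct seed per worker
        return env
    return _init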
def main():
    args = get_configuration()
    args.state_dim = util.get_state_dim(args)

    if not os.path.exists(args.outdir):
        os.makedirs(args.outdir, exist_ok=True)

    # Bind the (argparse) args into the policy constructor, since
    # stable-baselines instantiates policies with a fixed signature.
    if args.graph_embedding:
        class MyPolicy(EmbeddingPolicy):
            def __init__(self, sess, ob_space, ac_space, n_env, n_steps,
                         n_batch, reuse=True, **_kwargs):
                super().__init__(sess, ob_space, ac_space, n_env, n_steps,
                                 n_batch, args, reuse=reuse, **_kwargs)
    else:
        class MyPolicy(EnigmaPolicy):
            def __init__(self, sess, ob_space, ac_space, n_env, n_steps,
                         n_batch, reuse=True, **_kwargs):
                super().__init__(sess, ob_space, ac_space, n_env, n_steps,
                                 n_batch, args, reuse=reuse, **_kwargs)

    t0 = time.time()

    from mpi4py import MPI as mpi
    comm = mpi.COMM_WORLD
    rank = comm.Get_rank()
    world_size = comm.Get_size()  # renamed from `all`, which shadowed the builtin

    # Pin each MPI rank to one of the visible GPUs, round-robin.
    gpus = os.environ["CUDA_VISIBLE_DEVICES"].split(',')
    gpu_count = len(gpus)
    gpu = gpus[rank % gpu_count]
    os.environ["CUDA_VISIBLE_DEVICES"] = gpu
    print("My rank is {} out of {}, using GPU {}".format(rank, world_size, gpu))

    if args.model_type == "ppo2":
        from stable_baselines import PPO2 as PPO
        env = SubprocVecEnv([(lambda: ProofEnv.ProofEnv(args))
                             for _ in range(args.parallel_envs)])  # , start_method="spawn")
    elif args.model_type == "ppo1":
        args.parallel_envs = 1
        env = DummyVecEnv([lambda: ProofEnv.ProofEnv(args)])
        # from stable_baselines import PPO1 as PPO
        from ppo import PPO1 as PPO

    if args.saved_model is None:
        myPolicy = MyPolicy
        if args.model_type == "ppo2":
            model = PPO(
                policy=myPolicy,
                env=env,
                n_steps=args.actorbatch,
                # nminibatches=args.optim_stepsize,
                lam=0.95,
                gamma=args.gamma,
                noptepochs=4,
                ent_coef=args.entcoeff,
                learning_rate=lambda f: f * 2.5e-4,
                cliprange=lambda f: f * 0.1,
                verbose=1)
        elif args.model_type == "ppo1":
            model = PPO(myPolicy, env,
                        verbose=2,
                        timesteps_per_actorbatch=args.actorbatch,
                        schedule=args.lr_schedule,
                        optim_stepsize=args.optim_stepsize,
                        entcoeff=args.entcoeff,
                        optim_batchsize=args.optim_batchsize,
                        gamma=args.gamma)
    else:
        print("Loading model from {}".format(args.saved_model))
        model = PPO.load(args.saved_model)
        model.set_env(env)

    counter = 0
    # A single env_method call with explicit indices already reaches every
    # env; the original wrapped this in a redundant per-env loop.
    env.env_method("set_model", model, indices=list(range(args.parallel_envs)))

    modelfiles = []
    for train_timestep, train_dir in zip(args.train_timesteps, args.train_dirs):
        problem_files = sorted(util.list_problems(train_dir))
        problem_files = util.split_list(problem_files, world_size)[rank]
        problem_files_splitted = util.split_list(problem_files,
                                                 args.parallel_envs,
                                                 extensible=False)
        if args.add_repeating_pretraining:
            for ind in range(args.parallel_envs):
                env.env_method("set_source", problem_files_splitted[ind],
                               indices=[ind], generator_type="repeating")
            # all_thread_timestep = train_timestep * world_size
            print("PRETRAINING")
            model.learn(total_timesteps=train_timestep)
            print("Pretraining on {} finished in {}".format(
                train_dir, util.format_time(time.time() - t0)))

        for ind in range(args.parallel_envs):
            env.env_method("set_source", problem_files_splitted[ind],
                           indices=[ind])
        # all_thread_timestep = train_timestep * world_size
        model.learn(total_timesteps=train_timestep)

        modelfile = "{}/ppo1_fcop_train_{}".format(args.outdir, counter)
        modelfiles.append(modelfile)
        if rank == 0:
            model.save(modelfile)
        # logger.logkv("finished_train_problems", counter)
        counter += 1
        print("Training on {} finished in {}".format(
            train_dir, util.format_time(time.time() - t0)))

    statistics_list = env.get_attr("statistics",
                                   indices=list(range(args.parallel_envs)))
    blacklist_list = env.get_attr("blacklist",
                                  indices=list(range(args.parallel_envs)))
    for i, statistics in enumerate(statistics_list):
        print("ENV {} - {} - blacklist: {}\n".format(rank, i, blacklist_list[i]))
        util.print_problemdict(statistics, rank)
        # for f in statistics:
        #     statistics[f]["mcts"].display_tree([0])

    # util.print_problemdict(env.envs[0].statistics)

    if len(args.train_dirs) > 0 and len(args.train_timesteps) > 0:  # we did training
        print("We have finished training, rank {}".format(rank))
        # for p in problem_files:
        #     vis_policy.vis_policy(env.envs[0], model, p)

    env.close()
    del env
    del model

    # here we wait for everyone
    comm.Barrier()

    print("We have started evaluation, rank {}".format(rank))

    # evaluation without training
    if (args.saved_model is not None) and (len(args.train_dirs) == 0):
        # no training, just evaluation
        modelfiles = [args.saved_model]

    for evaldir in args.evaldirs:
        for model_index, modelfile in enumerate(modelfiles):
            eval.eval_mpi(args, evaldir, modelfile, model_index)
            # here we wait for everyone
            comm.Barrier()
# Tail of a make_env factory; the function head lies outside this excerpt,
# and create_yumi is the env-constructing thunk it returns.
        logging_level=logging_level)
    return create_yumi


logging_level = logging.DEBUG if args.debug else logging.INFO
yumis = [make_env(args.render, i, seed=i) for i in range(n_cpu)]
env = SubprocVecEnv(yumis)
model = PPO2.load(args.model_path, env=env, policy=MlpPolicy)

n_episodes = 100 if real else 5000
states, actions, next_states, parameters, steps = [], [], [], [], []
horizon = env.env_method('get_horizon')[0]
n_steps = (horizon * n_episodes) // n_cpu
deterministic = False

obs = env.reset()
for step in range(n_steps):  # iterates env steps, not episodes
    states.extend(obs)
    action, _states = model.predict(obs, deterministic=deterministic)
    obs, rewards, done, info = env.step(action)
    actions.extend(action)
    next_states.extend(obs)
    dynamics = env.env_method('get_dynamics')
    parameters.extend(dynamics)
    steps.append(env.env_method('get_step'))
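
# A typical follow-up (not part of the excerpt): stack the collected
# transitions and persist them, e.g. for fitting a dynamics model. The file
# name is illustrative; numpy is assumed to be imported as np.
np.savez('yumi_rollouts.npz',
         states=np.asarray(states),
         actions=np.asarray(actions),
         next_states=np.asarray(next_states),
         parameters=np.asarray(parameters),
         steps=np.asarray(steps))
env.close()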
def main(args):
    log_dir = args.log_path if (args.log_path is not None) else \
        "/tmp/stable_baselines_" + time.strftime('%Y-%m-%d-%H-%M-%S')
    configure_logger(log_dir)
    set_global_seeds(args.seed)

    n_cpu = get_num_workers(args.env) if not args.play else 1
    env_kwargs = get_env_kwargs(args.env, args.random_ratio, args.sequential,
                                args.reward_type, args.n_object,
                                args.curriculum)

    def make_thunk(rank):
        return lambda: make_env(env_id=args.env, rank=rank, log_dir=log_dir,
                                flatten_dict=True, kwargs=env_kwargs)

    env = SubprocVecEnv([make_thunk(i) for i in range(n_cpu)])

    eval_env_kwargs = env_kwargs.copy()
    eval_env_kwargs['random_ratio'] = 0.0
    if "use_cu" in eval_env_kwargs:
        eval_env_kwargs['use_cu'] = False
    eval_env = make_env(env_id=args.env, rank=0, flatten_dict=True,
                        kwargs=eval_env_kwargs)
    print(eval_env)

    if not args.play:
        os.makedirs(log_dir, exist_ok=True)
        train_kwargs = get_train_kwargs("ppo", args, parsed_action_noise=None,
                                        eval_env=eval_env)
        # policy = 'MlpPolicy'
        from utils.attention_policy import AttentionPolicy
        register_policy('AttentionPolicy', AttentionPolicy)
        policy_kwargs = get_policy_kwargs("ppo", args)
        print(policy_kwargs)

        model = PPO2(args.policy, env, verbose=1, nminibatches=32, lam=0.95,
                     noptepochs=10, ent_coef=0.01, learning_rate=3e-4,
                     cliprange=0.2, policy_kwargs=policy_kwargs,
                     **train_kwargs)
        print(model.get_parameter_list())

        def callback(_locals, _globals):
            num_update = _locals["update"]
            if 'FetchStack' in args.env:
                mean_eval_reward = stack_eval_model(eval_env, _locals["self"])
            else:
                mean_eval_reward = eval_model(eval_env, _locals["self"])
            log_eval(num_update, mean_eval_reward)
            if num_update % 10 == 0:
                model_path = os.path.join(log_dir,
                                          'model_' + str(num_update // 10))
                model.save(model_path)
                print('model saved to', model_path)
            return True

        model.learn(total_timesteps=int(args.num_timesteps),
                    callback=callback, seed=args.seed, log_interval=1)
        model.save(os.path.join(log_dir, 'final'))
    else:
        assert args.load_path is not None
        model = PPO2.load(args.load_path)

        fig, ax = plt.subplots(1, 1, figsize=(8, 8))
        obs = env.reset()
        goal_dim = env.get_attr('goal')[0].shape[0]
        # Keep resetting until the env is in the configuration we want to show.
        if 'FetchStack' in args.env:
            while env.get_attr('current_nobject')[0] != env.get_attr('n_object')[0] or \
                    env.get_attr('task_mode')[0] != 1:
                obs = env.reset()
        elif 'FetchPush' in args.env:
            while not (1.25 < obs[0][6] < 1.33 and obs[0][7] < 0.61
                       and 0.7 < obs[0][4] < 0.8):
                obs = env.reset()
            env.env_method('set_goal', np.array([1.2, 0.75, 0.425, 1, 0]))
            obs = env.env_method('get_obs')
            obs[0] = np.concatenate([
                obs[0][key]
                for key in ['observation', 'achieved_goal', 'desired_goal']
            ])
        else:
            while np.argmax(obs[0][-goal_dim + 3:]) != 0:
                obs = env.reset()
        print('achieved_goal', obs[0][-2 * goal_dim:-goal_dim],
              'goal', obs[0][-goal_dim:])

        episode_reward = 0.0
        num_episode = 0
        frame_idx = 0
        images = []
        if 'max_episode_steps' not in env_kwargs.keys():
            env_kwargs['max_episode_steps'] = 100

        for i in range(env_kwargs['max_episode_steps'] * 10):
            img = env.render(mode='rgb_array')
            ax.cla()
            ax.imshow(img)
            if env.get_attr('goal')[0].shape[0] <= 3:
                ax.set_title('episode ' + str(num_episode) +
                             ', frame ' + str(frame_idx))
            else:
                ax.set_title('episode ' + str(num_episode) +
                             ', frame ' + str(frame_idx) + ', goal idx ' +
                             str(np.argmax(env.get_attr('goal')[0][3:])))
            if 'FetchStack' in args.env:
                tasks = ['pick and place', 'stack']
                ax.set_title('episode ' + str(num_episode) +
                             ', frame ' + str(frame_idx) + ', task: ' +
                             tasks[np.argmax(obs[0][-2 * goal_dim - 2:-2 * goal_dim])])
            images.append(img)
            action, _ = model.predict(obs)
            obs, reward, done, _ = env.step(action)
            episode_reward += reward
            frame_idx += 1
            if not args.export_video:
                plt.pause(0.1)
            else:
                plt.imsave(os.path.join(os.path.dirname(args.load_path),
                                        'tempimg%d.png' % i), img)
            if done:  # n_cpu == 1 in play mode, so the one-element array is truthy
                print('episode_reward', episode_reward)
                if 'FetchStack' in args.env:
                    while env.get_attr('current_nobject')[0] != env.get_attr('n_object')[0] or \
                            env.get_attr('task_mode')[0] != 1:
                        obs = env.reset()
                else:
                    while np.argmax(obs[0][-goal_dim + 3:]) != 0:
                        obs = env.reset()
                print('goal', obs[0][-goal_dim:])
                episode_reward = 0.0
                frame_idx = 0
                num_episode += 1
                if num_episode >= 10:
                    break

        if args.export_video:
            os.system('ffmpeg -r 5 -start_number 0 -i ' +
                      os.path.dirname(args.load_path) +
                      '/tempimg%d.png -c:v libx264 -pix_fmt yuv420p ' +
                      os.path.join(os.path.dirname(args.load_path),
                                   args.env + '.mp4'))
            for i in range(env_kwargs['max_episode_steps'] * 10):
                try:
                    os.remove(os.path.join(os.path.dirname(args.load_path),
                                           'tempimg' + str(i) + '.png'))
                except OSError:
                    pass
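
# A safer variant of the export_video branch above, if desired: build the
# ffmpeg command as an argument list with subprocess instead of concatenating
# a shell string for os.system (same flags; the helper name is hypothetical):
import subprocess


def export_video(load_path, env_name):
    base = os.path.dirname(load_path)
    subprocess.run(['ffmpeg', '-r', '5', '-start_number', '0',
                    '-i', os.path.join(base, 'tempimg%d.png'),
                    '-c:v', 'libx264', '-pix_fmt', 'yuv420p',
                    os.path.join(base, env_name + '.mp4')],
                   check=True)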
# Tail of a make_env(rank, seed=...) factory; the function head lies outside
# this excerpt. _init is the env-constructing thunk it returns.
        env.seed(seed + rank)
        return env
    set_global_seeds(seed)
    return _init


if __name__ == '__main__':
    log_path = os.path.join(*__file__.split('/')[:-2], 'log',
                            'run_' + datetime.now().strftime('%m%d%H%M'))
    env_vec = SubprocVecEnv([make_env(i) for i in range(4)],
                            start_method='spawn')
    net_arch = [dict(pi=[512, 256], vf=[512, 256])]
    # Pull normalization statistics from the first worker env.
    obs_norm_init = env_vec.env_method('obs_norm_params', indices=0)[0]
    act_norm_init = env_vec.env_method('act_norm_params', indices=0)[0]
    policy_kwargs = dict(act_fun=tf.nn.relu, net_arch=net_arch,
                         obs_norm_init=obs_norm_init,
                         act_norm_init=act_norm_init)
    n_time_step = 10**5
    model = PPO2(NormalMlpPolicy, env_vec,
                 gamma=0.95,
                 n_steps=8192,
                 nminibatches=4,
                 noptepochs=4,
                 learning_rate=5e-4,
while True:
    # env.set_attr("keyboard_u", keyboard_u)
    env.render()
    action, _states = model.predict(obs, deterministic=True)
    action[0] = 0
    obs, rewards, dones, info = env.step(action)
    episode_reward += rewards[0]
    if dones[0]:
        # performance columns: 0 = episode return, 1 = record_count,
        # 2 = termination reason reported by why_done()
        performance[cnt, 0] = episode_reward
        episode_reward = 0
        performance[cnt, 1] = env.get_attr("record_count")[0]
        # print(env.get_attr("record_count"))
        performance[cnt, 2] = env.env_method("why_done")[0]
        # print(env.env_method("why_done"))
        if int(performance[cnt, 2]) != 0:
            performance[cnt, 1] = np.inf
        cnt += 1
        break

print(performance)
print(np.mean(performance[:, 0]),
      np.min(performance[:, 1]) * 0.1,
      np.max(performance[:, 1]) * 0.1,
      np.mean(performance[:, 1]) * 0.1,
      len(performance[performance[:, 2] == 0]),
      len(performance[performance[:, 2] == 1]),
      len(performance[performance[:, 2] == 2]))
def test(model_name, env_name, num_cpu, log_dir):
    env_id = env_name + 'NoFrameskip-v4'
    env = SubprocVecEnv([
        make_env(env_id, i, log_dir, useMonitor=False) for i in range(num_cpu)
    ])
    # env = Monitor(env, log_dir, allow_early_resets=True)
    model = get_model(model_name, env, log_dir)
    model = model.load(log_dir + model_name + '_' + env_name, env=env)
    obs = env.reset()

    from matplotlib import pyplot as plt
    show_num = 1
    while True:
        action, _states = model.predict(obs)
        # obs, rewards, done, info = env.step([int(input('action:'))] * num_cpu)
        obs, rewards, done, info = env.step(action)
        img = obs[show_num, :, :, :]
        fig = plt.figure(0)
        plt.clf()
        plt.imshow(img / 255)
        fig.canvas.draw()
        if 'SelfAttention' in model_name and 'Box' in env_name and 'World' in env_name:
            agent_position = env.env_method('get_current_agent_position')[show_num]
            print('agent_position', agent_position)
            attention = model.get_attention(obs, _states, done)[0]

            # head 0: attention over the 14x14 grid, row-major, so the cell
            # above is index - 14 and the cell below is index + 14; the cell
            # below is valid only while index + 14 < 196 (14 * 14 cells).
            attention0 = attention[show_num][0][agent_position]
            up = attention0[agent_position - 14] if agent_position - 14 >= 0 else 0
            down = attention0[agent_position + 14] if agent_position + 14 < 196 else 0
            attention0_udlr = [up, down,
                               attention0[agent_position - 1],
                               attention0[agent_position + 1]]
            # print('top :{} left:{}'.format(attention0[agent_position - 14], attention0[agent_position - 1]))
            attention0 = np.reshape(attention0, [14, 14])
            fig = plt.figure(2)
            plt.clf()
            plt.imshow(attention0, cmap='gray')
            fig.canvas.draw()

            # head 1
            attention1 = attention[show_num][1][agent_position]
            up = attention1[agent_position - 14] if agent_position - 14 >= 0 else 0
            down = attention1[agent_position + 14] if agent_position + 14 < 196 else 0
            attention1_udlr = [up, down,
                               attention1[agent_position - 1],
                               attention1[agent_position + 1]]
            attention1 = np.reshape(attention1, [14, 14])
            fig = plt.figure(3)
            plt.clf()
            plt.imshow(attention1, cmap='gray')
            fig.canvas.draw()

            print(action[show_num],
                  np.argmax(attention0_udlr), np.argmax(attention1_udlr),
                  "max attention:",
                  np.max(attention0_udlr), np.max(attention1_udlr))
        # env.render()
        plt.pause(0.000001)
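
# For reference, the row-major neighbour arithmetic used above, as a small
# hypothetical helper: cell i sits at (row, col) = divmod(i, 14), its
# vertical neighbours are i -/+ width, and its horizontal neighbours are
# i -/+ 1 (invalid at the grid edges).
def grid_neighbours(i, width=14, size=196):
    up = i - width if i - width >= 0 else None
    down = i + width if i + width < size else None
    left = i - 1 if i % width != 0 else None
    right = i + 1 if (i + 1) % width != 0 else None
    return up, down, left, right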
def run_experiment(
        not_save=False,
        folder='experiments',
        weights_location=None,
        tag=None,
        env='Base',
        env_num=4,
        n=0,
        save_interval=10000,
        train_steps=int(1e6),
        description=None,
        weights=None,
        n_steps=200,
        gamma=0.99,
        max_steps=None,
):
    if weights is not None and not os.path.isfile(weights):
        raise ValueError("Weights do not exist")

    # Save the call arguments before `env` is rebound below.
    args = deepcopy(locals())

    # Resolve the environment class by name.
    env = getattr(environments, env)

    # Generate environments
    if max_steps is not None:
        env = SubprocVecEnv(
            [lambda: env(max_steps=max_steps) for i in range(env_num)])
    else:
        env = SubprocVecEnv([lambda: env() for i in range(env_num)])
    args['env_config'] = str(env.env_method("get_org_config")[0])

    # Check if folder exists and if it is a valid name
    if not not_save:
        exp_id, logger, logs_folder, experiment_csv, experiment_folder = \
            create_experiment_folder(folder=folder, tag=tag, args=args)
    else:
        exp_id = -1  # renamed from `id`, which shadowed the builtin
        logs_folder = None
        logger = None
        experiment_folder = None

    if weights is not None:
        model = PPO2.load(
            weights,
            verbose=0,
            tensorboard_log=logs_folder,
            max_grad_norm=100,
            n_steps=n_steps,
            gamma=gamma,
            # policy_kwargs={'data_format': 'NCHW'},
        )
        model.set_env(env)
    else:
        model = PPO2(
            CnnPolicy,
            env,
            verbose=0,
            tensorboard_log=logs_folder,
            max_grad_norm=100,
            n_steps=n_steps,
            # policy_kwargs={'data_format': 'NCHW'},
        )

    # set bar
    callback = Callback(
        not_save=not_save,
        logger=logger,
        train_steps=train_steps,
        n=n,
        experiment_folder=experiment_folder,
        save_interval=save_interval,
        id=exp_id,
    )

    # Start running experiment
    # Creating nice table
    _width = 40
    del args['env_config']
    max_k_width = max([len(k) for k in args])
    print("\n{}".format("#" * _width))
    print("# {1:^{0}} #".format(_width - 4, "RUNNING EXPERIMENT"))
    print("# {1:^{0}} #".format(_width - 4, ""))
    print("# {1:<{0}} #".format(
        _width - 4, "{0:{2}s}: {1:03d}".format("ID", exp_id, max_k_width)))
    for k, v in args.items():
        if type(v) is int:
            print("# {1:<{0}} #".format(
                _width - 4, "{0:{2}s}: {1:0d}".format(k, v, max_k_width)))
        elif type(v) is float:
            print("# {1:<{0}} #".format(
                _width - 4, "{0:{2}s}: {1:0.3f}".format(k, v, max_k_width)))
        else:
            print("# {1:<{0}} #".format(
                _width - 4, "{0:{2}s}: {1:s}".format(k, str(v), max_k_width)))
    print("{}".format("#" * _width))
    del args

    print("\n############ STARTING TRAINING ###########\n")
    try:
        with tqdm.tqdm(total=train_steps, leave=True) as bar:
            callback.set_bars(bar)
            model.learn(
                total_timesteps=train_steps,
                callback=callback,
            )
        if not not_save:
            model.save(experiment_folder + "/weights_final")
    except KeyboardInterrupt:
        if not not_save and input(
                "Do you want to DELETE this experiment? (Yes/n) ") == "Yes":
            remove_experiment(experiment_folder, folder, experiment_csv, exp_id)
        else:
            if not not_save:
                model.save(experiment_folder + "/weights_final")
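
# Example invocation, e.g. from a driver script (argument values illustrative):
if __name__ == '__main__':
    run_experiment(env='Base', env_num=4, n_steps=200,
                   train_steps=int(1e5), tag='smoke-test')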