def run_model(params, rollout_size=50, num_steps=50):
    """Perform the training operation.

    Parameters
    ----------
    params : dict
        flow-specific parameters (see flow/utils/registry.py)
    rollout_size : int
        length of a single rollout
    num_steps : int
        total number of training steps

    Returns
    -------
    stable_baselines.*
        the trained model
    """
    constructor = env_constructor(params, version=0)()
    env = DummyVecEnv([lambda: constructor])
    model = TRPO(
        'MlpPolicy',
        env,
        verbose=2,
        timesteps_per_batch=rollout_size,
        gamma=0.999,
        policy_kwargs={"net_arch": [100, 50, 25]},
    )
    model.learn(total_timesteps=num_steps)
    return model
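# Hedged usage sketch for run_model above. `make_flow_params` is a hypothetical
# stand-in for whatever builds the flow-specific parameter dict (see
# flow/utils/registry.py); only run_model itself comes from the snippet above.
def demo_run_model():
    flow_params = make_flow_params()  # hypothetical helper, not part of flow's API
    model = run_model(flow_params, rollout_size=1000, num_steps=50000)
    model.save("trpo_flow_model")  # stable-baselines models can be saved this way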
def train_trpo(seed):
    """Test TRPO on the uav_env (cartesian, discrete).

    Reference signature:
    TRPO(policy, env, gamma=0.99, timesteps_per_batch=1024, max_kl=0.01,
         cg_iters=10, lam=0.98, entcoeff=0.0, cg_damping=0.01,
         vf_stepsize=0.0003, vf_iters=3, verbose=0, tensorboard_log=None,
         _init_setup_model=True)
    """
    algo = 'TRPO'
    num_timesteps = 3000000

    env = set_up_env(seed)

    global best_mean_reward, n_steps
    best_mean_reward, n_steps = -np.inf, 0

    # Tested with: timesteps_per_batch=1024
    model = TRPO(policy=MlpPolicy, env=env, gamma=0.99,
                 timesteps_per_batch=128, max_kl=0.01, cg_iters=10, lam=0.98,
                 entcoeff=0.0, cg_damping=0.01, vf_stepsize=0.0003,
                 vf_iters=3, verbose=0,
                 tensorboard_log="./logs/{}/tensorboard/{}/".format(
                     EXPERIMENT_NATURE, algo))

    model.learn(total_timesteps=num_timesteps, callback=callback, seed=seed,
                log_interval=500, tb_log_name="seed_{}".format(seed))

    model = TRPO.load(log_dir + 'best_model.pkl')

    evaluation = evaluate_model(env, model, 100)

    os.makedirs('./logs/{}/csv/{}/'.format(EXPERIMENT_NATURE, algo),
                exist_ok=True)
    os.rename('/tmp/gym/monitor.csv',
              "./logs/{}/csv/{}/seed_{}.csv".format(EXPERIMENT_NATURE, algo, seed))

    env.close()
    del model, env
    gc.collect()
    return evaluation
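# Hedged sketch of the `callback` and `log_dir` that train_trpo relies on. It
# mirrors the common stable-baselines-2 "save best model" recipe built on
# Monitor logs; the original implementation is not shown in this file, so
# treat the body below as an assumption (the /tmp/gym/ path matches the
# monitor.csv that train_trpo renames afterwards).
import numpy as np
from stable_baselines.results_plotter import load_results, ts2xy

log_dir = "/tmp/gym/"
best_mean_reward, n_steps = -np.inf, 0

def callback(_locals, _globals):
    """Called by model.learn at every step; saves the best model every 1000 steps."""
    global best_mean_reward, n_steps
    if (n_steps + 1) % 1000 == 0:
        x, y = ts2xy(load_results(log_dir), 'timesteps')
        if len(x) > 0:
            mean_reward = np.mean(y[-100:])
            if mean_reward > best_mean_reward:
                best_mean_reward = mean_reward
                _locals['self'].save(log_dir + 'best_model.pkl')
    n_steps += 1
    return True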
def run_experiment(args):
    randomization_settings = {
        "engagement_distance": (100, 100),
        "turnframes": (args.turnframes, args.turnframes)
    }

    if args.randomize_engagement:
        randomization_settings["engagement_distance"] = (100, 200)

    vecEnv = None
    if args.num_envs == 1:
        # Create a DummyVecEnv around a single monitored env.
        # The algorithms require a vectorized environment to run.
        env = gym.make(args.env)
        env = Monitor(
            TorilleWrapper(env, 100, args.experiment_name, randomization_settings),
            args.experiment_name)
        vecEnv = DummyVecEnv([lambda: env])
    else:
        vecEnv = []

        def make_env():
            env = gym.make(args.env)
            unique_id = str(time.time())[-6:]
            experiment_env_name = args.experiment_name + ("_env%s" % unique_id)
            return Monitor(
                TorilleWrapper(env, 100, experiment_env_name,
                               randomization_settings),
                experiment_env_name)

        for i in range(args.num_envs):
            vecEnv.append(make_env)
        vecEnv = SubprocVecEnv(vecEnv)

    steps_per_env = args.steps_per_batch // args.num_envs

    # Standard 2 x 64 network with sigmoid activations
    policy_kwargs = dict(act_fun=tf.nn.sigmoid, net_arch=[64, 64])

    model = None
    if args.agent == "ppo":
        model = PPO2(MlpPolicy, vecEnv, policy_kwargs=policy_kwargs,
                     ent_coef=args.ent_coef, n_steps=steps_per_env, verbose=1)
    elif args.agent == "trpo":
        model = TRPO(MlpPolicy, vecEnv, policy_kwargs=policy_kwargs,
                     entcoeff=args.ent_coef, timesteps_per_batch=steps_per_env,
                     verbose=1)

    model.learn(total_timesteps=args.timesteps)
def trpo(env_id, timesteps, policy="MlpPolicy", log_interval=None,
         tensorboard_log=None, seed=None):
    from stable_baselines import TRPO

    env = gym.make(env_id)
    model = TRPO(policy, env, verbose=1, tensorboard_log=tensorboard_log)
    model.learn(total_timesteps=timesteps, log_interval=log_interval)
    save_model_weights(model, "trpo", env_id, policy, seed)
def trpo(env_id, log_dir, timesteps):
    # Create log dir
    os.makedirs(log_dir, exist_ok=True)

    # Create and wrap the environment
    env = gym.make(env_id)
    env = Monitor(env, log_dir, allow_early_resets=True)
    env = DummyVecEnv([lambda: env])

    model = TRPO(MlpPolicy, env, verbose=0)

    # Train the agent
    print("Beginning training episodes with TRPO.")
    model.learn(total_timesteps=timesteps)

    env.close()
def run_model(config, budget):
    """
    Initializes the environment in which the model is evaluated, retrieves the
    values for the current hyperparameter configuration, and initializes and
    trains the given model.

    Parameters:
    --------
    config: ConfigSpace object
        containing sampled values for a given hyperparameter configuration
    budget: float
        how much of a full run is currently used to estimate the mean loss

    Returns:
    --------
    A metric used to evaluate the performance of the current configuration.
    """
    # Fixed random state
    rand_state = np.random.RandomState(1).get_state()
    np.random.set_state(rand_state)
    seed = np.random.randint(1, 2**31 - 1)
    tf.set_random_seed(seed)
    random.seed(seed)

    env = gym.make('CartPole-v1')
    env = DummyVecEnv([lambda: env])

    # Cast the current hyperparameter values to the types TRPO expects
    config['timesteps_per_batch'] = int(config['timesteps_per_batch'])
    for parameter_name in ['vf_stepsize', 'max_kl', 'gamma', 'lam']:
        config[parameter_name] = float(config[parameter_name])

    # Initialize model
    model = TRPO(MlpPolicy, env, verbose=1,
                 timesteps_per_batch=config['timesteps_per_batch'],
                 vf_stepsize=config['vf_stepsize'],
                 max_kl=config['max_kl'],
                 gamma=config['gamma'],
                 lam=config['lam'])

    # Scale the number of training steps by the fraction of a full run
    # that this budget represents
    total_timesteps = 10000
    budget_steps = int(total_timesteps * budget)
    model.learn(total_timesteps=budget_steps)

    result = evaluate(env, model)
    return result
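# Hedged sketch of the `evaluate(env, model)` helper used above; the original
# is not included here, so this is an assumption: the mean episode reward of
# the trained model over a fixed number of episodes on the vectorized
# single-env CartPole setup built in run_model.
def evaluate(env, model, n_episodes=10):
    """Return the mean episode reward of `model` over `n_episodes`."""
    episode_rewards = []
    for _ in range(n_episodes):
        obs = env.reset()
        done, total = False, 0.0
        while not done:
            action, _ = model.predict(obs)
            obs, reward, dones, _ = env.step(action)
            # VecEnv returns per-env arrays; index the single env
            total += float(reward[0])
            done = bool(dones[0])
        episode_rewards.append(total)
    return sum(episode_rewards) / len(episode_rewards)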
def load_model(path: str, env, desc: str):
    """
    Loads a model from a Stable Baselines checkpoint file into a memory
    representation.

    Args:
        path (str)   : Path to the Stable Baselines checkpoint file
        env (SB Env) : Environment to attach to the loaded model
        desc (str)   : Text description of which model this is

    Returns:
        The loaded model
    """
    if desc == "ddpg":
        return DDPG.load(path, env)
    elif desc == "ppo":
        env = DummyVecEnv([lambda: env])
        return PPO2.load(path, env)
    elif desc == "trpo":
        env = DummyVecEnv([lambda: env])
        return TRPO.load(path, env)
    elif desc == "td3":
        return TD3.load(path, env)
    elif desc == "sac":
        return SAC.load(path, env)
    else:
        raise RuntimeError(f"Model Name {desc} not supported")
def test_models(env):
    # seeds = [1, 2, 3]
    seeds = [1]
    for s in seeds:
        # Load models
        # models = [A2C.load(f'data/models/a2c_{s}'),
        #           ACKTR.load(f'data/models/acktr_{s}'),
        #           DDPG.load(f'data/models/ddpg_{s}'),
        #           PPO2.load(f'data/models/ppo_{s}'),
        #           SAC.load(f'data/models/sac_{s}'),
        #           TD3.load(f'data/models/td3_{s}'),
        #           TRPO.load(f'data/models/trpo_{s}')]
        models = [PPO2.load(f'data/models/ppo_{s}'),
                  SAC.load(f'data/models/sac_{s}'),
                  TD3.load(f'data/models/td3_{s}'),
                  TRPO.load(f'data/models/trpo_{s}')]
        for m in models:
            # run_policy(m, env)
            og_params = m.get_parameters()
            generalization_test(m, env)
            # Re-test after randomly pruning 10% of the policy weights
            for i in range(50):
                params = prune_policy(m.__class__.__name__, og_params, 0.1)
                m.load_parameters(params)
                generalization_test(m, env)
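# Hedged sketch of the `prune_policy` helper used above: zero out a random
# fraction of each parameter tensor to probe robustness. The real
# implementation is not shown, so this body is an assumption; `model_name`
# might be used to select which tensors to prune, and is ignored here.
import numpy as np

def prune_policy(model_name, params, fraction):
    """Return a copy of `params` with `fraction` of each weight tensor zeroed."""
    pruned = {}
    for key, value in params.items():
        arr = np.array(value, copy=True)
        mask = np.random.rand(*arr.shape) < fraction
        pruned[key] = np.where(mask, 0.0, arr)
    return pruned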
def my_compute_data(self, args, env, params, n_episodes):
    env = gym.make('gym_quadcopter:quadcopter-v' + str(args.env))
    re_d = []
    sr_d = []
    for alg, start_index, end_index, step, suffix in params:
        rewards, s_rates = [], []
        for i in range(start_index, end_index, step):
            print("")
            print(f"Working on alg={alg}, start_index={start_index}, "
                  f"end_index={end_index}, step={step}, suffix={suffix}, i={i}")
            path = (f"{self.base_dir}models/{alg}/"
                    f"quadcopter-v{args.env}-{i}{suffix}.pkl")
            print(f"Evaluating model at {path}")
            if not os.path.exists(path):
                print(f"WARNING: File {path} does not exist --> SKIPPING")
                continue
            if alg == "ddpg":
                model = DDPG.load(path)
            elif alg == "ppo":
                model = PPO2.load(path)
            else:
                model = TRPO.load(path)
            r, su = mean_eval(n_episodes, model, env, False, False)
            print(f"Average Success Rate: {su}")
            rewards.append(r)
            s_rates.append(su[0])
        re_d.append(rewards)
        sr_d.append(s_rates)
    return re_d, sr_d
def mainUp(arg):
    test = arg == TEST
    env = fet.FurutaEnvPosTrpoUp(cm.RUN, render=not test)
    # env.setRender(True)
    model = TRPO.load(POLICY_PATH + "trpo_pos_policy_up.zip")

    buf_rew = []
    test_cutoff_count = 0
    test_count = 0
    overspeed = 0
    total_count = 0
    while True:
        test_count += 1
        if test and test_count >= TEST_COUNT_UP:
            print("\n***Average reward: %.3f\tAverage count: %.3f\tShort runs: %d"
                  % (sum(buf_rew) / float(len(buf_rew)),
                     total_count / float(test_count),
                     test_cutoff_count - overspeed))
            break
        obs, done = env.reset(), False
        episode_rew = 0
        count = 0
        while not done:
            action, _ = model.predict(obs)
            obs, rew, done, _ = env.step(action)
            if speedCheck(obs):
                overspeed += 1
            episode_rew += rew
            count += 1
            total_count += 1
        buf_rew.append(episode_rew)
        if test and count <= TEST_CUTOFF_MAX:
            test_cutoff_count += 1
        print("Episode average reward: %.3f\tCount: %d"
              % (episode_rew / count, count))
def test(model_path: str, exp_config: dict):
    test_env, _ = init_env(exp_config)

    if ALG == 'ddpg':
        model = DDPG.load(model_path, env=test_env)
    elif ALG == 'trpo':
        model = TRPO.load(model_path, env=test_env)
    elif ALG == 'ppo2':
        model = PPO2.load(model_path, env=test_env)
    elif ALG == 'her':
        # model = HER.load(model_path, env=test_env)
        raise NotImplementedError()
    else:
        raise ValueError(f'Unknown algorithm "{ALG}"!')

    monitor = test_env.envs[0]  # type: Monitor
    assert isinstance(monitor, Monitor)
    raw_env = monitor.unwrapped  # type: GaussianPendulumEnv
    assert isinstance(raw_env, GaussianPendulumEnv)

    raw_env.configure(
        seed=42,
        mass_mean=(0.05, 1.5),
        mass_stdev=(0.01, 0.15),
        embed_knowledge=exp_config.get('embed_knowledge', False),
        perfect_knowledge=exp_config.get('perfect_knowledge', False),
        gym_env=test_env)

    runs = np.zeros((TEST_RUNS, 4))
    fixed_masses = np.linspace(0.030, 1.600, TEST_RUNS)
    for test_ep in range(runs.shape[0]):
        obs = test_env.reset()
        if TEST_LINSPACE_MASS:
            p = raw_env.physical_props
            raw_env.physical_props = p[0], fixed_masses[test_ep], p[2]
        mass_distr_params = raw_env.mass_distr_params.copy()
        sampled_mass = raw_env.physical_props[1]
        while True:
            action, states = model.predict(obs, deterministic=True)
            obs, rewards, dones, info = test_env.step(action)
            rewards_by_episode = monitor.episode_rewards
            episode = len(rewards_by_episode)
            if episode != test_ep:
                break
        last_tot_reward = rewards_by_episode[-1]
        runs[test_ep, :] = (mass_distr_params[0], mass_distr_params[1],
                            sampled_mass, last_tot_reward)

    avg_reward = runs[:, 3].mean()
    print(f'Avg. test reward: {avg_reward}\n')

    return runs
def test_action_mask_run_trpo(vec_env, policy, env_class):
    env = vec_env([env_class])

    model = TRPO(policy, env, verbose=0)

    obs, done, action_masks = env.reset(), [False], []
    while not done[0]:
        action, _states = model.predict(obs, action_mask=action_masks)
        obs, _, done, infos = env.step(action)

        # Collect the mask each sub-env reports for the next step
        action_masks.clear()
        for info in infos:
            env_action_mask = info.get('action_mask')
            action_masks.append(env_action_mask)

    env.close()
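# Hedged sketch of an environment exposing `action_mask` through `info`, which
# is the contract test_action_mask_run_trpo expects (note that predict() with
# an `action_mask` kwarg comes from an action-masking fork of stable-baselines,
# not upstream). This is a hypothetical minimal env, not the original test env.
import gym
import numpy as np
from gym import spaces

class MaskedDiscreteEnv(gym.Env):
    """Toy env: 3 discrete actions; the third one is only valid on odd steps."""

    def __init__(self):
        self.action_space = spaces.Discrete(3)
        self.observation_space = spaces.Box(-1.0, 1.0, shape=(2,),
                                            dtype=np.float32)
        self.steps = 0

    def reset(self):
        self.steps = 0
        return np.zeros(2, dtype=np.float32)

    def step(self, action):
        self.steps += 1
        done = self.steps >= 10
        mask = np.array([1, 1, self.steps % 2], dtype=np.int8)
        return np.zeros(2, dtype=np.float32), 0.0, done, {'action_mask': mask}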
def test(testing_data, model_file, result):
    model = TRPO.load(model_file)

    # Set up the testing environment
    stock_test_data = StocksData.read_csv(testing_data)
    stocks_test_env = StocksEnv(stock_test_data, bars_count=10,
                                reset_on_close=False)
    obs = stocks_test_env.reset()

    # Set up vars for recording results
    result_df = pandas.DataFrame([], columns=['date', 'open', 'action', 'reward'])
    net_reward = 0.0

    while True:
        action, _states = model.predict(obs)
        obs, reward, done, info = stocks_test_env.step(action)

        # Print and record the date, opening price, action taken, and reward
        df = pandas.DataFrame([[stock_test_data.date[int(info["offset"])],
                                stock_test_data.open[int(info["offset"])],
                                Actions(action).name,
                                reward]],
                              columns=['date', 'open', 'action', 'reward'])
        print(df)
        result_df = result_df.append(df, ignore_index=True)
        net_reward += reward

        # At the end of the episode, record results and exit
        if done:
            print('Net Reward: ', net_reward)
            result_df.to_csv(result, index=False)
            break
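# Hedged sketch of the `Actions` enum referenced above. It follows the
# convention of the StocksEnv from Lapan's RL examples (Skip/Buy/Close); the
# real definition lives with StocksEnv, so treat this as an assumption.
import enum

class Actions(enum.Enum):
    Skip = 0
    Buy = 1
    Close = 2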
def trpo(env, seed):
    return TRPO('MlpPolicy', env, vf_iters=5, vf_stepsize=0.001, verbose=1,
                tensorboard_log="./data/runs", seed=seed)
def train(params):
    env = FlattenObservation(gym.make(params.get("environment")))
    exp_name = params.get("model_name") + "_train_" + params.get("environment")
    log_dir = './logs/' + exp_name
    expert_name = 'expert_{0}'.format(exp_name)

    if params.get("expert_name") == 'TRPO':
        print("Loading TRPO Model")
        model = TRPO(MlpPolicy, env, verbose=1, tensorboard_log=log_dir)

    if params.get("expert_name") == 'PPO':
        print("Loading PPO Model")
        model = PPO1(MlpPolicy, env, verbose=1, tensorboard_log=log_dir,
                     entcoeff=params.get("ent_coef"),
                     gamma=params.get("gamma"),
                     optim_batchsize=params.get("batch_size"),
                     clip_param=params.get("clip_range"),
                     lam=params.get("gae_lambda"))

    if params.get("expert_name") == 'TRPO' or params.get("expert_name") == 'PPO':
        print("Training expert trajectories")
        # Train expert controller (if needed) and record expert trajectories.
        generate_expert_traj(model, expert_name,
                             n_timesteps=params.get("expert_timesteps"),
                             n_episodes=params.get("n_episodes"))

    dataset = ExpertDataset(expert_path='{0}.npz'.format(expert_name),
                            traj_limitation=-1,
                            randomize=True,  # if the dataset should be shuffled
                            verbose=1)

    # GAIL defaults are used for all remaining hyperparameters
    model = GAIL('MlpPolicy', env, dataset, verbose=1, tensorboard_log=log_dir)

    if params.get("pre_train") is True:
        print("Pretraining Dataset with Behavioural Cloning")
        model.pretrain(dataset, n_epochs=1000)

    print("Executing GAIL Learning")
    model.learn(total_timesteps=params.get("train_steps"))
    model.save(exp_name)

    env.close()
    del env
def render_to_gif():

    def save_frames_as_gif(frames, path='./',
                           filename='growspace_with_trpo.gif'):
        # Mess with this to change frame size
        plt.figure(figsize=(frames[0].shape[1] / 72.0,
                            frames[0].shape[0] / 72.0), dpi=72)

        patch = plt.imshow(frames[0])
        plt.axis('off')

        def animate(i):
            patch.set_data(frames[i])

        anim = animation.FuncAnimation(plt.gcf(), animate, frames=len(frames),
                                       interval=50)
        anim.save(path + filename, writer='imagemagick', fps=60)

    env = gym.make('GrowSpaceEnv-Control-v0')

    model = TRPO(MlpPolicy, env, verbose=1)
    # model.learn(total_timesteps=2500)
    # model.save("trpo_cartpole")
    # del model  # remove to demonstrate saving and loading
    model = TRPO.load("trpo_cartpole")

    frames = []
    obs = env.reset()
    for _ in range(150):  # while True:
        frames.append(env.render(mode="rgb_array"))
        action, _states = model.predict(obs)
        obs, rewards, done, info = env.step(action)
        # if done:
        #     break
        # env.render()
    env.close()
    save_frames_as_gif(frames)
def train(env_id, num_timesteps, seed):
    """
    Train a TRPO model for the atari environment, for testing purposes

    :param env_id: (str) Environment ID
    :param num_timesteps: (int) The total number of samples
    :param seed: (int) The initial seed for training
    """
    rank = MPI.COMM_WORLD.Get_rank()

    if rank == 0:
        logger.configure()
    else:
        logger.configure(format_strs=[])

    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)
    env = make_atari(env_id)

    env = bench.Monitor(env, logger.get_dir() and
                        os.path.join(logger.get_dir(), str(rank)))
    env.seed(workerseed)

    env = wrap_deepmind(env)
    env.seed(workerseed)

    model = TRPO(CnnPolicy, env, timesteps_per_batch=512, max_kl=0.001,
                 cg_iters=10, cg_damping=1e-3, entcoeff=0.0, gamma=0.98,
                 lam=1, vf_iters=3, vf_stepsize=1e-4)
    model.learn(total_timesteps=int(num_timesteps * 1.1))
    env.close()
    # Free memory
    del env
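# Hedged usage sketch: TRPO in stable-baselines is MPI-aware, so the atari
# train() above is typically launched under mpirun (e.g. `mpirun -np 4 python
# run_atari.py`) so each rank trains a worker. The env id and step count below
# are placeholder assumptions.
def main():
    train(env_id='BreakoutNoFrameskip-v4', num_timesteps=int(1e6), seed=0)

if __name__ == '__main__':
    main()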
def loader(algo, env_name):
    if algo == 'dqn':
        return DQN.load("trained_agents/" + algo + "/" + env_name + ".pkl")
    elif algo == 'ppo2':
        return PPO2.load("trained_agents/" + algo + "/" + env_name + ".pkl")
    elif algo == 'a2c':
        return A2C.load("trained_agents/" + algo + "/" + env_name + ".pkl")
    elif algo == 'acer':
        return ACER.load("trained_agents/" + algo + "/" + env_name + ".pkl")
    elif algo == 'trpo':
        return TRPO.load("trained_agents/" + algo + "/" + env_name + ".pkl")
    else:
        # Fail loudly instead of silently returning None
        raise ValueError("Unknown algorithm: " + algo)
def render_growspace_with_trpo():
    env = gym.make('GrowSpaceEnv-Control-v0')

    model = TRPO(MlpPolicy, env, verbose=1)
    # model.learn(total_timesteps=2500)
    # model.save("trpo_cartpole")
    # del model  # remove to demonstrate saving and loading
    model = TRPO.load("trpo_cartpole")

    obs = env.reset()
    for t in range(150):  # while True:
        print(t)
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        # if dones:
        #     env.reset()
        env.render()
def trpo(env_id, timesteps, policy="MlpPolicy", log_interval=None,
         tensorboard_log=None, seed=None, load_weights=None):
    from stable_baselines import TRPO

    env = gym.make(env_id)
    if load_weights is not None:
        model = TRPO.load(load_weights, env=env, verbose=0)
    else:
        model = TRPO(policy, env, verbose=1, tensorboard_log=tensorboard_log)

    callback = WandbRenderEnvCallback(model_name="trpo", env_name=env_id)

    model.learn(total_timesteps=timesteps, log_interval=log_interval,
                callback=callback)
def mainHybrid(arg):
    test = arg == TEST
    env = fet.FurutaEnvPosTrpo(cm.RUN, render=not test)
    # env.setRender(True)
    modelBal = TRPO.load(POLICY_PATH + "trpo_pos_policy_bal.zip")
    modelUp = TRPO.load(POLICY_PATH + "trpo_pos_policy_up.zip")

    buf_rew = []
    test_cutoff_count = 0
    test_count = 0
    overspeed = 0
    complete_count = 0
    while True:
        test_count += 1
        if test and test_count >= TEST_COUNT_HYBRID:
            print("\n***Average reward: %.3f\tLong runs: %d\tComplete: %d"
                  % (sum(buf_rew) / float(len(buf_rew)),
                     test_cutoff_count - overspeed, complete_count))
            break
        obs, done = env.reset(), False
        episode_rew = 0
        count = 0
        while not done:
            # Swing-up policy far from vertical, balance policy near it
            if abs(obs[2]) > cm.deg2Rad(cm.ANGLE_TERMINAL_MIN_D):
                action, _ = modelUp.predict(obs)
            else:
                action, _ = modelBal.predict(obs)
            obs, rew, done, _ = env.step(action)
            if speedCheck(obs):
                overspeed += 1
            episode_rew += rew
            count += 1
            if count > 999:
                complete_count += 1
        buf_rew.append(episode_rew)
        if test and count >= TEST_CUTOFF_MAX:
            test_cutoff_count += 1
        print("Episode reward: %.3f" % episode_rew)
def f_checkpoints_range_2_mean_performance(
        self, checkpoints: range) -> Tuple[np.ndarray, np.ndarray]:
    logging.debug(
        f"[f_checkpoints_range_2_mean_performance]: checkpoints={checkpoints}")

    rewards = np.zeros(len(checkpoints))
    s_rates = np.zeros(len(checkpoints))

    # Intent
    # - Iterate over this range to load the associated Stable Baselines model checkpoint
    # - Pass that model to the `mean_eval` evaluation function, which evaluates the model on
    #   - a certain number of episodes
    #   - a certain env
    #   - continuous or non-continuous space
    #   - an evaluation returns reward and average success rate
    #
    # Evaluating N checkpoints on M queries and then averaging over M finally
    # yields N rewards and N success rates.

    # NOTE: `i` can range in any way, while `j` iterates over the numpy arrays
    j = 0
    for i in checkpoints:
        path = f"{self.args.training_base_path}/models/quadcopter-{i}{self.args.suffix}"
        logging.debug(f"Evaluating model at {path}")
        if self.args.model['name'] == "ddpg":
            model = DDPG.load(path)
        elif self.args.model['name'] == "ppo":
            model = PPO2.load(path)
        elif self.args.model['name'] == "trpo":
            model = TRPO.load(path)
        elif self.args.model['name'] == "td3":
            model = TD3.load(path)
        elif self.args.model['name'] == "sac":
            model = SAC.load(path)
        logging.debug(
            f"Evaluating Model {self.args.model['name']} for "
            f"{self.args.n_episodes} episodes in {self.args.env} environment "
            f"with continuous={str(self.args.continuous)}")
        rewards_list, success_rates_list = mean_eval(
            num_episodes=self.args.n_episodes,
            checkpoint_id=i,
            model=model,
            env=self.env,
            v=True,
            continuous=self.args.continuous,
            plots_dir=self.args.plots_dir)
        rewards_mean = np.mean(rewards_list)
        success_rates_mean = np.mean(success_rates_list)
        logging.debug(
            f"Evaluation Checkpoint={i} --> Average Reward = {rewards_mean}, "
            f"Average Success Rate = {success_rates_mean}")
        rewards[j] = rewards_mean
        s_rates[j] = success_rates_mean
        j += 1

    return rewards, s_rates
def optimize_agent(trial):
    """
    Train a TRPO model with the sampled hyperparameters and return its mean
    episode reward, which Optuna maximises.
    """
    model_params = optimize_ddpg(trial)
    seed = trial.suggest_int('numpyseed', 1, 429496729)
    np.random.seed(seed)
    original_env = gym.make('rustyblocks-v0')
    original_env.max_invalid_tries = 3
    env = DummyVecEnv([lambda: original_env])
    model = TRPO("MlpPolicy", env, verbose=0, **model_params)
    print("DOING LEARNING trpo")
    original_env.force_progression = False
    model.learn(int(2e5), seed=seed)
    print("DONE LEARNING trpo")
    original_env.max_invalid_tries = -1

    rewards = []
    n_episodes, reward_sum = 0, 0.0

    obs = env.reset()
    original_env.force_progression = True
    original_env.invalid_try_limit = 5000
    while n_episodes < 4:
        action, _ = model.predict(obs)
        obs, reward, done, _ = env.step(action)
        reward_sum += reward

        if done:
            rewards.append(reward_sum)
            reward_sum = 0.0
            n_episodes += 1
            obs = env.reset()

    last_reward = np.mean(rewards)
    trial.report(last_reward)

    return last_reward
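# Hedged sketch of driving optimize_agent with an Optuna study, plus an assumed
# body for the `optimize_ddpg` sampler it calls (despite the name, the sampled
# values are passed to TRPO above). The parameter names, ranges, and trial
# count are illustrative assumptions only.
import optuna

def optimize_ddpg(trial):
    """Assumed sampler: TRPO-compatible hyperparameters."""
    return {
        'gamma': trial.suggest_uniform('gamma', 0.9, 0.9999),
        'max_kl': trial.suggest_loguniform('max_kl', 1e-3, 5e-2),
        'lam': trial.suggest_uniform('lam', 0.9, 1.0),
    }

study = optuna.create_study(direction='maximize')
study.optimize(optimize_agent, n_trials=20)
print("Best hyperparameters:", study.best_params)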
def launch_training(nb_cpu, name_agent, name_env, total_timesteps, text):
    env_name = name_env
    n_cpu = nb_cpu
    policy_kwargs = dict(act_fun=tf.nn.tanh, net_arch=[512, 512])
    print('TB available at := ', tensorboard_log_dir, file=sys.stderr)

    if name_agent == 'A2C':
        env_ = FluidMechanicsEnv()
        env_ = Monitor(env_, console_log_dir, allow_early_resets=True)
        # note: every lambda below returns the same env_ instance
        env = SubprocVecEnv([lambda: env_ for i in range(n_cpu)])
        model = A2C(MlpPolicy, env, n_steps=20, gamma=0.9, verbose=1,
                    tensorboard_log=tensorboard_log_dir,
                    policy_kwargs=policy_kwargs)
        # model = A2C.load("first_test")
        model_name = "A2C_default_Mlp" + text

    elif name_agent == 'PPO2':
        env_ = FluidMechanicsEnv()
        env_ = Monitor(env_, console_log_dir, allow_early_resets=True)
        env = SubprocVecEnv([lambda: env_ for i in range(n_cpu)])
        model = PPO2(MlpPolicy, env, n_steps=80, gamma=0.97, verbose=1,
                     tensorboard_log=tensorboard_log_dir,
                     policy_kwargs=policy_kwargs)
        model_name = "PPO2_default_Mlp" + text

    elif name_agent == 'TRPO':
        env_ = FluidMechanicsEnv()
        env_ = Monitor(env_, console_log_dir, allow_early_resets=True)
        env = DummyVecEnv([lambda: env_ for i in range(n_cpu)])
        model = TRPO(MlpPolicy, env, gamma=0.1, verbose=1,
                     tensorboard_log=tensorboard_log_dir,
                     policy_kwargs=policy_kwargs)
        model_name = "TRPO_default_Mlp" + text

    time = datetime.now().strftime('%Y-%m-%d_%H_%M_%S')
    log_name = f"_model={model_name}_time={time}"
    print('with the following line := ', 'tensorboard --logdir ',
          tensorboard_log_dir + log_name)

    training_log = open(f"{console_log_dir}/{log_name}.log", "a")
    sys.stdout = training_log
    logging.basicConfig(
        level=logging.INFO,
        filename=f"{console_log_dir}/{log_name}.log",
        datefmt='%H:%M:%S',
        format='%(asctime)s,%(msecs)d %(name)s %(levelname)s %(message)s')

    model_file_name = f"{models_log_dir}{log_name}_best.pkl"

    start = datetime.now()
    print("Learning model", file=sys.stderr)
    model.learn(total_timesteps=int(total_timesteps), tb_log_name=log_name,
                callback=callback)
    training_time = datetime.now() - start
    print(f"Training time: {training_time}", file=sys.stderr)

    print("Saving final model", file=sys.stderr)
    model.save(f"{models_log_dir}{log_name}_final.pkl")
def load_model(path: str, algorithm: str):
    from stable_baselines import PPO2, DQN, A2C, ACER, GAIL, TRPO
    if algorithm == 'PPO2':
        return PPO2.load(path)
    if algorithm == 'DQN':
        return DQN.load(path)
    if algorithm == 'A2C':
        return A2C.load(path)
    if algorithm == 'ACER':
        return ACER.load(path)
    if algorithm == 'GAIL':
        return GAIL.load(path)
    if algorithm == 'TRPO':
        return TRPO.load(path)
    return None
def train_trpo(save_model=False):
    wandb.run = config.tensorboard.run
    wandb.tensorboard.patch(save=False, tensorboardX=True)
    env = gym.make(config.env_name)
    model = TRPO("CnnPolicy", env, verbose=1)
    model.learn(total_timesteps=config.num_updates,
                callback=WandbStableBaselines2Callback())
    if save_model:
        model.save(f"trpo_{config.env_name}")
def train(training_data, training_timesteps, model_file):
    stocks_data = StocksData.read_csv(training_data)
    stocks_env = StocksEnv(stocks_data, bars_count=DEFAULT_BARS_COUNT,
                           reset_on_close=False, commission_perc=0.01)
    model = TRPO(MlpPolicy, stocks_env, verbose=1,
                 tensorboard_log="./tensorboard/")
    model.learn(total_timesteps=training_timesteps)
    model.save(model_file)
def main():
    # unpause Simulation so that robot receives data on all topics
    gazebo_connection.GazeboConnection().unpauseSim()
    # create node
    rospy.init_node('pickbot_gym', anonymous=True, log_level=rospy.FATAL)

    env = gym.make('Pickbot-v0')
    model = TRPO.load("pickbot_model_trpo_discrete_2019-03-11 10:22:01")

    while True:
        obs, done = env.reset(), False
        episode_rew = 0
        while not done:
            # query the policy at every step of the episode
            action, _states = model.predict(obs)
            obs, rewards, done, info = env.step(action)
            episode_rew += rewards
        print("Episode reward", episode_rew)
def main():
    # unpause Simulation so that robot receives data on all topics
    gazebo_connection.GazeboConnection().unpauseSim()
    # create node
    rospy.init_node('pickbot_gym', anonymous=True, log_level=rospy.FATAL)

    env = gym.make('Pickbot-v0')
    model = TRPO(MlpPolicy, env, verbose=1)
    model.learn(total_timesteps=200000)

    print("Saving model to pickbot_model_trpo_discrete_" + timestamp + ".pkl")
    model.save("pickbot_model_trpo_discrete_" + timestamp)
def create_trpo(self):
    return TRPO(MlpPolicy, self.env, gamma=0.99, timesteps_per_batch=1024,
                max_kl=0.01, cg_iters=10, lam=0.98, entcoeff=0.0,
                cg_damping=0.01, vf_stepsize=0.0003, vf_iters=3, verbose=0,
                tensorboard_log=config.ROOT_DIR + config.LOG_PATH,
                _init_setup_model=True, policy_kwargs=None,
                full_tensorboard_log=False, seed=None, n_cpu_tf_sess=None)