def run_model(params, rollout_size=50, num_steps=50):
    """Perform the training operation.

    Parameters
    ----------
    params : dict
        flow-specific parameters (see flow/utils/registry.py)
    rollout_size : int
        length of a single rollout
    num_steps : int
        total number of training steps

    Returns
    -------
    stable_baselines.*
        the trained model
    """
    constructor = env_constructor(params, version=0)()
    env = DummyVecEnv([lambda: constructor])
    model = TRPO(
        'MlpPolicy',
        env,
        verbose=2,
        timesteps_per_batch=rollout_size,
        gamma=0.999,
        policy_kwargs={"net_arch": [100, 50, 25]},
    )
    model.learn(total_timesteps=num_steps)
    return model

def train_trpo(seed):
    """Test TRPO on the uav_env (cartesian, discrete)."""
    # Default signature, for reference:
    # TRPO(policy, env, gamma=0.99, timesteps_per_batch=1024, max_kl=0.01,
    #      cg_iters=10, lam=0.98, entcoeff=0.0, cg_damping=0.01,
    #      vf_stepsize=0.0003, vf_iters=3, verbose=0, tensorboard_log=None,
    #      _init_setup_model=True)
    algo = 'TRPO'
    num_timesteps = 3000000

    env = set_up_env(seed)

    global best_mean_reward, n_steps
    best_mean_reward, n_steps = -np.inf, 0

    # Tested with: timesteps_per_batch=1024
    model = TRPO(policy=MlpPolicy, env=env, gamma=0.99, timesteps_per_batch=128,
                 max_kl=0.01, cg_iters=10, lam=0.98, entcoeff=0.0,
                 cg_damping=0.01, vf_stepsize=0.0003, vf_iters=3, verbose=0,
                 tensorboard_log="./logs/{}/tensorboard/{}/".format(
                     EXPERIMENT_NATURE, algo))

    model.learn(total_timesteps=num_timesteps, callback=callback, seed=seed,
                log_interval=500, tb_log_name="seed_{}".format(seed))

    model = TRPO.load(log_dir + 'best_model.pkl')

    evaluation = evaluate_model(env, model, 100)

    os.makedirs('./logs/{}/csv/{}/'.format(EXPERIMENT_NATURE, algo),
                exist_ok=True)
    os.rename('/tmp/gym/monitor.csv',
              "./logs/{}/csv/{}/seed_{}.csv".format(EXPERIMENT_NATURE, algo, seed))

    env.close()
    del model, env
    gc.collect()
    return evaluation

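# The `callback`, `set_up_env` and `evaluate_model` helpers above are not
# shown in this snippet. A minimal sketch of a best-model-saving `callback`,
# following the standard stable-baselines pattern and assuming `log_dir`
# is the directory the Monitor wrapper writes its monitor.csv to:
from stable_baselines.results_plotter import load_results, ts2xy

def callback(_locals, _globals):
    """Save the model whenever the mean reward of the last 100 episodes improves."""
    global best_mean_reward, n_steps
    # check every 1000 calls
    if (n_steps + 1) % 1000 == 0:
        x, y = ts2xy(load_results(log_dir), 'timesteps')
        if len(x) > 0:
            mean_reward = np.mean(y[-100:])
            if mean_reward > best_mean_reward:
                best_mean_reward = mean_reward
                # `self` is the model being trained
                _locals['self'].save(log_dir + 'best_model.pkl')
    n_steps += 1
    return True
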
def launch_training(nb_cpu, name_agent, name_env, total_timesteps, text):
    env_name = name_env
    #n_cpu = 8
    n_cpu = nb_cpu
    policy_kwargs = dict(act_fun=tf.nn.tanh, net_arch=[512, 512])
    print('TB available at := ', tensorboard_log_dir, file=sys.stderr)

    if name_agent == 'A2C':
        env_ = FluidMechanicsEnv()
        env_ = Monitor(env_, console_log_dir, allow_early_resets=True)
        env = SubprocVecEnv([lambda: env_ for i in range(n_cpu)])
        model = A2C(MlpPolicy, env, n_steps=20, gamma=0.9, verbose=1,
                    tensorboard_log=tensorboard_log_dir,
                    policy_kwargs=policy_kwargs)
        #model = A2C.load("first_test")
        model_name = "A2C_default_Mlp" + text
    elif name_agent == 'PPO2':
        env_ = FluidMechanicsEnv()
        env_ = Monitor(env_, console_log_dir, allow_early_resets=True)
        env = SubprocVecEnv([lambda: env_ for i in range(n_cpu)])
        model = PPO2(MlpPolicy, env, n_steps=80, gamma=0.97, verbose=1,
                     tensorboard_log=tensorboard_log_dir,
                     policy_kwargs=policy_kwargs)
        #model = A2C.load("first_test")
        model_name = "PPO2_default_Mlp" + text
    elif name_agent == 'TRPO':
        env_ = FluidMechanicsEnv()
        env_ = Monitor(env_, console_log_dir, allow_early_resets=True)
        env = DummyVecEnv([lambda: env_ for i in range(n_cpu)])
        model = TRPO(MlpPolicy, env, gamma=0.1, verbose=1,
                     tensorboard_log=tensorboard_log_dir,
                     policy_kwargs=policy_kwargs)
        #model = A2C.load("first_test")
        model_name = "TRPO_default_Mlp" + text

    time = datetime.now().strftime('%Y-%m-%d_%H_%M_%S')
    log_name = f"_model={model_name}_time={time}"
    print('with the following line := ', 'tensorboard --logdir ',
          tensorboard_log_dir + log_name)

    training_log = open(f"{console_log_dir}/{log_name}.log", "a")
    sys.stdout = training_log
    logging.basicConfig(level=logging.INFO,
                        filename=f"{console_log_dir}/{log_name}.log",
                        datefmt='%H:%M:%S',
                        format='%(asctime)s,%(msecs)d %(name)s %(levelname)s %(message)s')

    model_file_name = f"{models_log_dir}{log_name}_best.pkl"

    start = datetime.now()
    print("Learning model", file=sys.stderr)
    model.learn(total_timesteps=int(total_timesteps), tb_log_name=log_name,
                callback=callback)
    training_time = datetime.now() - start
    print(f"Training time: {training_time}", file=sys.stderr)

    print("Saving final model", file=sys.stderr)
    model.save(f"{models_log_dir}{log_name}_final.pkl")

def run_experiment(args):
    randomization_settings = {
        "engagement_distance": (100, 100),
        "turnframes": (args.turnframes, args.turnframes)
    }

    if args.randomize_engagement:
        randomization_settings["engagement_distance"] = (100, 200)

    vecEnv = None
    if args.num_envs == 1:
        # Create dummyvecenv
        env = gym.make(args.env)
        env = Monitor(
            TorilleWrapper(env, 100, args.experiment_name, randomization_settings),
            args.experiment_name)
        # The algorithms require a vectorized environment to run
        vecEnv = DummyVecEnv([lambda: env])
    else:
        vecEnv = []

        def make_env():
            env = gym.make(args.env)
            unique_id = str(time.time())[-6:]
            experiment_env_name = args.experiment_name + ("_env%s" % unique_id)
            return Monitor(
                TorilleWrapper(env, 100, experiment_env_name,
                               randomization_settings),
                experiment_env_name)

        for i in range(args.num_envs):
            vecEnv.append(make_env)
        vecEnv = SubprocVecEnv(vecEnv)

    steps_per_env = args.steps_per_batch // args.num_envs

    # Standard 2 x 64 network with sigmoid activations
    policy_kwargs = dict(act_fun=tf.nn.sigmoid, net_arch=[64, 64])

    model = None
    if args.agent == "ppo":
        model = PPO2(MlpPolicy, vecEnv, policy_kwargs=policy_kwargs,
                     ent_coef=args.ent_coef, n_steps=steps_per_env, verbose=1)
    elif args.agent == "trpo":
        model = TRPO(MlpPolicy, vecEnv, policy_kwargs=policy_kwargs,
                     entcoeff=args.ent_coef, timesteps_per_batch=steps_per_env,
                     verbose=1)

    model.learn(total_timesteps=args.timesteps)

def train_trpo(save_model=False):
    wandb.run = config.tensorboard.run
    wandb.tensorboard.patch(save=False, tensorboardX=True)
    env = gym.make(config.env_name)
    model = TRPO("CnnPolicy", env, verbose=1)
    model.learn(total_timesteps=config.num_updates,
                callback=WandbStableBaselines2Callback())
    if save_model:
        model.save(f"trpo_{config.env_name}")

def train(training_data, training_timesteps, model_file):
    stocks_data = StocksData.read_csv(training_data)
    stocks_env = StocksEnv(stocks_data, bars_count=DEFAULT_BARS_COUNT,
                           reset_on_close=False, commission_perc=0.01)
    model = TRPO(MlpPolicy, stocks_env, verbose=1,
                 tensorboard_log="./tensorboard/")
    model.learn(total_timesteps=training_timesteps)
    model.save(model_file)

def main():
    # unpause Simulation so that robot receives data on all topics
    gazebo_connection.GazeboConnection().unpauseSim()

    # create node
    rospy.init_node('pickbot_gym', anonymous=True, log_level=rospy.FATAL)

    env = gym.make('Pickbot-v0')
    model = TRPO(MlpPolicy, env, verbose=1)
    model.learn(total_timesteps=200000)

    print("Saving model to pickbot_model_trpo_discrete_" + timestamp + ".pkl")
    model.save("pickbot_model_trpo_discrete_" + timestamp)

def trpo(env_id, timesteps, policy="MlpPolicy", log_interval=None,
         tensorboard_log=None, seed=None):
    from stable_baselines import TRPO

    env = gym.make(env_id)
    model = TRPO(policy, env, verbose=1, tensorboard_log=tensorboard_log)
    model.learn(total_timesteps=timesteps, log_interval=log_interval)
    save_model_weights(model, "trpo", env_id, policy, seed)

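# `save_model_weights` is project-specific and not shown here; a hypothetical
# minimal version that derives a filename from its arguments and saves:
def save_model_weights(model, algo, env_id, policy, seed=None):
    parts = [algo, env_id, policy] + ([] if seed is None else [str(seed)])
    model.save("_".join(parts))
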
def train(params):
    env = FlattenObservation(gym.make(params.get("environment")))
    exp_name = params.get("model_name") + "_train_" + params.get("environment")
    log_dir = './logs/' + exp_name
    expert_name = 'expert_{0}'.format(exp_name)

    if params.get("expert_name") == 'TRPO':
        print("Loading TRPO Model")
        model = TRPO(MlpPolicy, env, verbose=1, tensorboard_log=log_dir)
    if params.get("expert_name") == 'PPO':
        print("Loading PPO Model")
        model = PPO1(MlpPolicy, env, verbose=1, tensorboard_log=log_dir,
                     entcoeff=params.get("ent_coef"),
                     gamma=params.get("gamma"),
                     optim_batchsize=params.get("batch_size"),
                     clip_param=params.get("clip_range"),
                     lam=params.get("gae_lambda"))

    if params.get("expert_name") == 'TRPO' or params.get("expert_name") == 'PPO':
        print("Training expert trajectories")
        # Train expert controller (if needed) and record expert trajectories.
        generate_expert_traj(model, expert_name,
                             n_timesteps=params.get("expert_timesteps"),
                             n_episodes=params.get("n_episodes"))

    dataset = ExpertDataset(expert_path='{0}.npz'.format(expert_name),
                            traj_limitation=-1,
                            randomize=True,  # if the dataset should be shuffled
                            verbose=1)

    # Check out for defaults
    model = GAIL('MlpPolicy', env, dataset, verbose=1, tensorboard_log=log_dir)

    if params.get("pre_train") is True:
        print("Pretraining Dataset with Behavioural Cloning")
        model.pretrain(dataset, n_epochs=1000)

    print("Executing GAIL Learning")
    model.learn(total_timesteps=params.get("train_steps"))
    model.save(exp_name)
    env.close()
    del env

def trpo(env_id, log_dir, timesteps):
    # Create log dir
    os.makedirs(log_dir, exist_ok=True)

    # Create and wrap the environment
    env = gym.make(env_id)
    env = Monitor(env, log_dir, allow_early_resets=True)
    env = DummyVecEnv([lambda: env])

    model = TRPO(MlpPolicy, env, verbose=0)

    # Train the agent
    print("Beginning training episodes with TRPO.")
    model.learn(total_timesteps=timesteps)
    env.close()

def run_model(config, budget):
    """
    Initializes the environment in which the model is evaluated,
    retrieves the values for the current hyperparameter configuration,
    initializes and trains the given model.

    Parameters:
    --------
    config: ConfigSpace object containing sampled values for a given
        hyperparameter configuration
    budget: how much of a full run is currently used to estimate mean loss

    Returns:
    --------
    A metric used to evaluate the performance of the current configuration.
    """
    # Fixed random state
    rand_state = np.random.RandomState(1).get_state()
    np.random.set_state(rand_state)
    seed = np.random.randint(1, 2**31 - 1)
    tf.set_random_seed(seed)
    random.seed(seed)

    env = gym.make('CartPole-v1')
    env = DummyVecEnv([lambda: env])

    # Get all the current hyperparameter values
    config['timesteps_per_batch'] = config['timesteps_per_batch']
    for parameter_name in ['vf_stepsize', 'max_kl', 'gamma', 'lam']:
        config[parameter_name] = float(config[parameter_name])

    # Initialize model
    model = TRPO(MlpPolicy, env, verbose=1,
                 timesteps_per_batch=config['timesteps_per_batch'],
                 vf_stepsize=config['vf_stepsize'],
                 max_kl=config['max_kl'],
                 gamma=config['gamma'],
                 lam=config['lam'])

    total_timesteps = 10000
    # I am not sure this is the right way to do it
    budget_steps = int(total_timesteps * budget)
    model.learn(total_timesteps=budget_steps)

    result = evaluate(env, model)
    return result

def run_model(hyperparams, iteration):
    """
    This is the most important function of this script. Initializes the
    environment in which the model is evaluated, retrieves the values for
    the current hyperparameter configuration, initializes and trains
    the given model.

    Parameters:
    --------
    hyperparams: dictionary containing sampled values for a given
        hyperparameter configuration
    iteration: the iteration of running Bayesian optimization,
        i.e. configuration number

    Returns:
    --------
    A metric used to evaluate the performance of the current configuration.
    """
    # Fixed random state
    rand_state = np.random.RandomState(1).get_state()
    np.random.set_state(rand_state)
    seed = np.random.randint(1, 2**31 - 1)
    tf.set_random_seed(seed)
    random.seed(seed)

    env = gym.make('CartPole-v1')
    env = DummyVecEnv([lambda: env])

    # Get all the current hyperparameter values
    hyperparams['timesteps_per_batch'] = hyperparams['timesteps_per_batch']
    for parameter_name in ['vf_stepsize', 'max_kl', 'gamma', 'lam']:
        hyperparams[parameter_name] = float(hyperparams[parameter_name])

    # Initialize model
    model = TRPO(MlpPolicy, env, verbose=1,
                 timesteps_per_batch=hyperparams['timesteps_per_batch'],
                 vf_stepsize=hyperparams['vf_stepsize'],
                 max_kl=hyperparams['max_kl'],
                 gamma=hyperparams['gamma'],
                 lam=hyperparams['lam'])

    model.learn(total_timesteps=10000)
    model.save("trpo_cartpole_" + str(iteration))

    result = evaluate(env, model)
    return result

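# `evaluate(env, model)` is not defined in either hyperparameter-search
# snippet above; a minimal sketch under the assumption that it returns the
# mean reward over a few evaluation episodes on the vectorized CartPole env:
def evaluate(env, model, n_episodes=10):
    episode_rewards = []
    for _ in range(n_episodes):
        obs = env.reset()
        done = False
        total_reward = 0.0
        while not done:
            action, _states = model.predict(obs)
            obs, rewards, dones, _ = env.step(action)
            total_reward += rewards[0]  # vec envs return arrays
            done = dones[0]
        episode_rewards.append(total_reward)
    return float(np.mean(episode_rewards))
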
def train(env_id, num_timesteps, seed):
    """
    Train TRPO model for the atari environment, for testing purposes

    :param env_id: (str) Environment ID
    :param num_timesteps: (int) The total number of samples
    :param seed: (int) The initial seed for training
    """
    rank = MPI.COMM_WORLD.Get_rank()

    if rank == 0:
        logger.configure()
    else:
        logger.configure(format_strs=[])

    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)
    env = make_atari(env_id)

    env = bench.Monitor(env, logger.get_dir() and
                        os.path.join(logger.get_dir(), str(rank)))
    env.seed(workerseed)

    env = wrap_deepmind(env)
    env.seed(workerseed)

    model = TRPO(CnnPolicy, env, timesteps_per_batch=512, max_kl=0.001,
                 cg_iters=10, cg_damping=1e-3, entcoeff=0.0, gamma=0.98,
                 lam=1, vf_iters=3, vf_stepsize=1e-4)
    model.learn(total_timesteps=int(num_timesteps * 1.1))
    env.close()
    # Free memory
    del env

def trpo(env_id, timesteps, policy="MlpPolicy", log_interval=None,
         tensorboard_log=None, seed=None, load_weights=None):
    from stable_baselines import TRPO

    env = gym.make(env_id)
    if load_weights is not None:
        model = TRPO.load(load_weights, env=env, verbose=0)
    else:
        model = TRPO(policy, env, verbose=1, tensorboard_log=tensorboard_log)

    callback = WandbRenderEnvCallback(model_name="trpo", env_name=env_id)

    model.learn(total_timesteps=timesteps, log_interval=log_interval,
                callback=callback)

def train_trpo(env_id, num_timesteps, seed, policy='cnn'):
    # env_id: type str, identifies each environment uniquely
    # num_timesteps: number of timesteps to run the algorithm
    # seed: initial random seed
    # policy: one of 'cnn', 'lstm', 'lnlstm' or 'mlp'

    # set up the environment
    rank = MPI.COMM_WORLD.Get_rank()
    sseed = seed + 10000 * rank
    set_global_seeds(sseed)
    env = make_atari(env_id)
    env.seed(sseed)
    env = wrap_deepmind(env)
    env.seed(sseed)

    # define policies
    policy = {
        'cnn': CnnPolicy,
        'lstm': CnnLstmPolicy,
        'lnlstm': CnnLnLstmPolicy,
        'mlp': MlpPolicy
    }[policy]

    # define TRPO class object
    model = TRPO(policy=policy, env=env, timesteps_per_batch=1024, max_kl=0.01,
                 cg_iters=10, cg_damping=1e-3, entcoeff=0.0, gamma=0.99, lam=1,
                 vf_iters=3, vf_stepsize=1e-4, verbose=1)

    # Train TRPO for num_timesteps
    model.learn(total_timesteps=num_timesteps)

    # save the hyperparameters and weights
    model.save('trpo' + env_id)
    env.close()
    # free the memory
    del model

def run(env_name, algorithm, seed):
    env_name_map = {
        'halfcheetah': 'HalfCheetah-v2',
        'hopper': 'Hopper-v2',
        'ant': 'Ant-v2',
        'walker': 'Walker2d-v2'
    }
    env = DummyVecEnv([lambda: gym.make(env_name_map[env_name])])

    if algorithm == 'ppo':
        model = PPO2('MlpPolicy', env, learning_rate=1e-3, verbose=1)
    elif algorithm == 'trpo':
        model = TRPO('MlpPolicy', env, max_kl=0.01, verbose=1)
    elif algorithm == 'sac':
        model = SAC('MlpPolicy', env, learning_rate=1e-3, verbose=1)
    else:
        raise NotImplementedError()

    filepath = '%s_%s_%d.pkl' % (env_name, algorithm, seed)
    model.learn(total_timesteps=100000, seed=seed)
    model.save(filepath)

def train(game, num_timesteps, num_envs, dir_name, model_name, prev_model_name):
    dir_name = get_valid_filename(dir_name)
    model_name = get_valid_filename(model_name)
    log_dir = f"logs/{dir_name}/{model_name}-training"
    model_dir = f"models/{dir_name}"
    os.makedirs(log_dir, exist_ok=True)
    os.makedirs(model_dir, exist_ok=True)

    env = make_vec_envs(game, False, num_envs)
    prev_model_path = f"{model_dir}/{prev_model_name}.zip"
    if prev_model_name is not None and os.path.exists(prev_model_path):
        model = TRPO.load(prev_model_path, env=env)
        model.tensorboard_log = log_dir
    else:
        model = TRPO(policy="MlpPolicy", env=env, gamma=0.8, verbose=1,
                     tensorboard_log=log_dir)
    model.learn(num_timesteps)
    model.save(f"{model_dir}/{model_name}.zip")
    env.close()

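# `make_vec_envs` is not defined in this snippet; a hypothetical sketch that
# builds `num_envs` copies of the game in a SubprocVecEnv (the second
# argument is assumed to be a render flag, unused here):
import gym
from stable_baselines.common.vec_env import SubprocVecEnv

def make_vec_envs(game, render, num_envs):
    return SubprocVecEnv([lambda: gym.make(game) for _ in range(num_envs)])
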
def optimize_agent(trial):
    """
    Train the model and optimise.

    Optuna maximises the negative log likelihood, so we need to negate
    the reward here.
    """
    model_params = optimize_ddpg(trial)
    seed = trial.suggest_int('numpyseed', 1, 429496729)
    np.random.seed(seed)
    original_env = gym.make('rustyblocks-v0')
    original_env.max_invalid_tries = 3
    env = DummyVecEnv([lambda: original_env])
    model = TRPO("MlpPolicy", env, verbose=0, **model_params)
    print("DOING LEARNING trpo")
    original_env.force_progression = False
    model.learn(int(2e5), seed=seed)
    print("DONE LEARNING trpo")
    original_env.max_invalid_tries = -1

    rewards = []
    n_episodes, reward_sum = 0, 0.0

    obs = env.reset()
    original_env.force_progression = True
    original_env.invalid_try_limit = 5000
    while n_episodes < 4:
        action, _ = model.predict(obs)
        obs, reward, done, _ = env.step(action)
        reward_sum += reward

        if done:
            rewards.append(reward_sum)
            reward_sum = 0.0
            n_episodes += 1
            obs = env.reset()

    last_reward = np.mean(rewards)
    trial.report(last_reward)

    return last_reward

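# A sketch of how `optimize_agent` would typically be driven (standard
# Optuna API; note that `optimize_ddpg` above is assumed to return TRPO
# keyword arguments despite its name):
import optuna

study = optuna.create_study(direction='maximize')
study.optimize(optimize_agent, n_trials=20)
print(study.best_params)
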
def tst():
    def _init_openmpi():
        """Pre-load libmpi.dll and register OpenMPI distribution."""
        import os
        import ctypes
        if os.name != 'nt' or 'OPENMPI_HOME' in os.environ:
            return
        try:
            openmpi_home = os.path.abspath(os.path.dirname(__file__))
            openmpi_bin = os.path.join(openmpi_home, 'bin')
            os.environ['OPENMPI_HOME'] = openmpi_home
            os.environ['PATH'] = ';'.join((openmpi_bin, os.environ['PATH']))
            ctypes.cdll.LoadLibrary(os.path.join(openmpi_bin, 'libmpi.dll'))
        except Exception:
            pass

    _init_openmpi()

    import gym
    from stable_baselines.common.policies import MlpPolicy, CnnPolicy
    from stable_baselines import TRPO

    env = gym.make('BreakoutNoFrameskip-v4')  # 'CartPole-v1'
    model = TRPO(CnnPolicy, env, timesteps_per_batch=1024, verbose=1)
    model.learn(total_timesteps=25000)
    model.save("trpo_cartpole")

    del model  # remove to demonstrate saving and loading

    model = TRPO.load("trpo_cartpole")

    obs = env.reset()
    while True:
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        env.render()

def train(env, file, steps, arch):
    start = time.time()
    #env.setRender(False)

    # create the learning agent
    model = TRPO(
        env=env,
        policy=MlpPolicy,
        policy_kwargs=dict(net_arch=arch),
        n_cpu_tf_sess=None
    )

    # train the agent on the environment
    model.learn(
        total_timesteps=steps,
        log_interval=10,
        #log_dir=".",
        #record_video=False
    )

    # save trained model
    model.save(POLICY_PATH + file, cloudpickle=True)
    print("Duration: %.1f" % ((time.time() - start) / 60))

             max_kl=0.01, cg_iters=10, lam=0.98, entcoeff=0.0,
             cg_damping=0.01, vf_stepsize=0.0003, vf_iters=3,
             tensorboard_log=None, _init_setup_model=True, policy_kwargs=None,
             full_tensorboard_log=False, seed=None, n_cpu_tf_sess=1)

# model = TRPO(MlpPolicy, env, verbose=1, gamma=0.91, timesteps_per_batch=1000,
#              max_kl=0.05, cg_iters=10, lam=0.9, entcoeff=0.001,
#              cg_damping=0.05, vf_stepsize=0.0003, vf_iters=3,
#              tensorboard_log=None, _init_setup_model=True, policy_kwargs=None,
#              full_tensorboard_log=False, seed=None, n_cpu_tf_sess=1)

model.learn(total_timesteps=14200000)
model.save("trpo_quad")
# model = TRPO.load("trpo_quad")

# Enjoy trained agent
obs = env.reset()
for i in range(1000):
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    print(action)
    print(obs[2])
    print(info['z'])
    # print(i)
    # print(dones)
    env.render()

for env, expName in [
    (
        BSS_Controller_Supply_Direction(
            env_settings_init, budget,
            open(letter + "/v4_stepsBudget" + str(budget) + ".csv", 'a+')),
        "v4"
    ),
    (
        BSS_Controller_Supply_Direction_Prediction(
            env_settings_init, budget,
            open(letter + "/v6_stepsBudget" + str(budget) + ".csv", 'a+')),
        "v6"
    )
]:
    accumulatedRew = 0
    iterations = 0
    outFile = open(letter + "/" + expName + "_perfBudget" + str(budget) + ".csv",
                   'a+')
    agent = TRPO(MlpPolicy, env)
    state = env.reset()
    start = time.time()
    print("Beginning to learn " + expName)
    agent.learn(learnSteps)
    print(time.time() - start)
    print("\tDone Learning")
    for _ in range(evaluationLen):
        action = agent.predict(state)
        state, reward, done, info = env.step(action[0])
        accumulatedRew += reward
        iterations += 1
        if done:
            outFile.write(str("%.4f" % (accumulatedRew / iterations)) + "," +
                          str(env.getBudget()) + "\n")
            accumulatedRew = 0
            iterations = 0
            # keep the fresh initial state for the next episode
            state = env.reset()
    outFile.close()
    env.close()

def main(game, num_timesteps, num_episodes, dir_name, model_name, policy,
         discount=0.99, batch_size=1024):
    dir_name = get_valid_filename(dir_name)
    model_name = get_valid_filename(model_name)
    eval_log_dir = f"logs/{dir_name}/{model_name}"
    tr_log_dir = f"{eval_log_dir}-training"
    model_dir = f"models/{dir_name}"
    os.makedirs(eval_log_dir, exist_ok=True)
    os.makedirs(tr_log_dir, exist_ok=True)
    os.makedirs(model_dir, exist_ok=True)

    env = make_vec_env(game)
    env.seed(309)
    model = TRPO(policy=policy, env=env, gamma=discount,
                 timesteps_per_batch=batch_size, verbose=1, seed=309,
                 tensorboard_log=tr_log_dir, n_cpu_tf_sess=1)
    model.learn(total_timesteps=num_timesteps)
    model.save(f"{model_dir}/{model_name}")

    eps_done = 0
    ep_rewards = np.array([0] * num_episodes)
    curr_rewards = 0
    obs = env.reset()
    while eps_done != num_episodes:
        if eps_done % 10 == 0:
            print(f"Episodes completed: {eps_done} / {num_episodes}", end="\r")
        # For vectorised environments, they are automatically reset when done,
        # so returned obs would be the start state of next episode
        action, _ = model.predict(obs)
        obs, reward, done, info = env.step(action)
        env.render(mode="human")
        curr_rewards += reward[0]
        if done[0]:
            ep_rewards[eps_done] = curr_rewards
            curr_rewards = 0
            eps_done += 1
    print("All episodes completed")
    env.close()

    mean = ep_rewards.mean()
    std_dev = ep_rewards.std()
    # Outliers: outside of 3 standard deviations
    outlier_threshold_upper = mean + 3 * std_dev
    outlier_threshold_lower = mean - 3 * std_dev
    trimmed_rewards = np.array([
        rew for rew in ep_rewards
        if outlier_threshold_lower <= rew <= outlier_threshold_upper
    ])
    avg_reward = trimmed_rewards.mean()
    print(f"Average score over {num_episodes} games: {avg_reward:.2f}")

    summary_writer = tf.summary.FileWriter(eval_log_dir)
    sess = tf.Session()
    rew_var = tf.Variable(0, dtype=tf.int64)
    rew_val = tf.summary.scalar(f"Reward / Episode ({model_name})", rew_var)
    for i in range(num_episodes):
        rew = ep_rewards[i]
        sess.run(rew_var.assign(rew))
        summary_writer.add_summary(sess.run(rew_val), i)
    avg_var = tf.Variable(0.0, dtype=tf.float64)
    avg_val = tf.summary.scalar(f"Trimmed Average ({model_name})", avg_var)
    sess.run(avg_var.assign(avg_reward))
    summary_writer.add_summary(sess.run(avg_val), 0)
    summary_writer.flush()
    summary_writer.close()
    sess.close()

    proj = np.eye(rep_model.enc_dim)
    return ew.TorchEncoderWrapper(pol_env, encnet, proj)

print("Training policy...")
pol_env = DummyVecEnv([make_policy_env])

# nonlinear policy trained by PPO
#model = PPO2(MlpPolicy, pol_env, verbose=0)
# linear policy trained by TRPO
pol_kwargs = {
    "net_arch": [dict(vf=[64, 64], pi=[])],
    "feature_extraction": "mlp",
    "act_fun": tf.keras.activations.linear
}
model = TRPO(FFP, pol_env, verbose=0, policy_kwargs=pol_kwargs)
model.learn(total_timesteps=pol_timesteps)

# evaluate the policy
print("Evaluating policy...")
n_evals = 5
eval_rollout = int(200 / 3)
eval_rewards = []
for _ in range(n_evals):
    obs = pol_env.reset()
    rollout_rewards = []
    for _ in range(eval_rollout):
        action, _states = model.predict(obs)
        obs, rewards, dones, info = pol_env.step(action)
        rollout_rewards.append(rewards / 3)
    eval_rewards.append(np.mean(rollout_rewards))
print("Mean eval step reward: {}".format(np.mean(eval_rewards)))

def train(model_path: str):
    env, raw_env = init_env()
    raw_env.gravity = 98
    model = TRPO(MlpPolicy, env, verbose=1)
    model.learn(total_timesteps=300_000)
    model.save(model_path)

env = gym.make('UR5Gripper-v0')
# Create the vectorized environment
# env = SubprocVecEnv([make_env(env_id, i) for i in range(num_cpu)])
env = Monitor(env, log_dir, allow_early_resets=True)
# env = SubprocVecEnv([make_mujoco_env(env_id, i) for i in range(num_cpu)])
# env = SubprocVecEnv([lambda: env])
env = DummyVecEnv([lambda: env])
# env = SubprocVecEnv([lambda: gym.make('UR5Gripper-v0') for i in range(num_cpu)])

# Add some param noise for exploration
param_noise = AdaptiveParamNoiseSpec(initial_stddev=0.1,
                                     desired_action_stddev=0.1)
# Because we use parameter noise, we should use a MlpPolicy with layer normalization
# model = DDPG(MlpPolicy, env, param_noise=param_noise, verbose=1, tensorboard_log=log_dir)
# model = PPO2(MlpPolicy, env, verbose=1)
# model = SAC(MlpPolicy, env, verbose=1, tensorboard_log=log_dir)
model = TRPO(MlpPolicy, env, verbose=1, tensorboard_log=log_dir)

# Random Agent, before training
mean_reward_before_train = evaluate(model, num_steps=1000)

# Train the agent
model.learn(total_timesteps=int(1e7), callback=callback)
mean_reward_after_train = evaluate(model, num_steps=1000)

obs = env.reset()
for _ in range(1000):
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    env.render()

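# `evaluate` is not shown above; a minimal sketch following the evaluation
# helper from the stable-baselines getting-started docs (assumes the
# vectorized-env API, where rewards and dones come back as arrays):
def evaluate(model, num_steps=1000):
    """Run the agent for num_steps and return the mean per-episode reward."""
    env = model.get_env()
    episode_rewards = [0.0]
    obs = env.reset()
    for _ in range(num_steps):
        action, _states = model.predict(obs)
        obs, reward, done, _ = env.step(action)
        episode_rewards[-1] += reward[0]
        if done[0]:
            obs = env.reset()
            episode_rewards.append(0.0)
    mean_reward = np.mean(episode_rewards)
    print("Mean reward:", mean_reward, "Num episodes:", len(episode_rewards))
    return mean_reward
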
# Create log dir
log_dir = "./tmp/deeprmsca-TRPO/"
os.makedirs(log_dir, exist_ok=True)
callback = SaveOnBestTrainingRewardCallback(check_freq=100, log_dir=log_dir)

env = gym.make('DeepRMSCA-v0', **env_args)

# logs will be saved in log_dir/monitor.csv
# in this case, on top of the usual monitored things, we also monitor service
# and bit rate blocking probabilities
env = Monitor(env, log_dir + 'training',
              info_keywords=('service_blocking_rate_since_reset',
                             'bit_rate_blocking_rate_since_reset'))

# the neural network has five layers with 128 neurons each
policy_args = dict(net_arch=5 * [128], act_fun=tf.nn.elu)

# note: stable-baselines' TRPO has no `learning_rate` argument; the value
# function step size is controlled by `vf_stepsize` instead
agent = TRPO(MlpPolicy, env, verbose=0,
             tensorboard_log="./tb/TRPO-DeepRMSCA-v0/",
             policy_kwargs=policy_args, gamma=.95, vf_stepsize=10e-5)

agent.learn(total_timesteps=100000, callback=callback)

results_plotter.plot_results([log_dir], 1e5, results_plotter.X_TIMESTEPS,
                             "DeepRMSCA TRPO")

import matplotlib.pyplot as plt

def moving_average(values, window):
    """
    Smooth values by doing a moving average
    :param values: (numpy array)
    :param window: (int)
    :return: (numpy array)
    """
    weights = np.repeat(1.0, window) / window
    return np.convolve(values, weights, 'valid')

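# `moving_average` above is typically paired with a learning-curve plotting
# helper like the following, a sketch based on the stable-baselines docs
# pattern (`load_results` and `ts2xy` are real stable-baselines utilities;
# `plt` and `np` are the imports already used above):
from stable_baselines.results_plotter import load_results, ts2xy

def plot_learning_curve(log_folder, title='Learning Curve'):
    """Plot the smoothed reward curve recorded by the Monitor wrapper."""
    x, y = ts2xy(load_results(log_folder), 'timesteps')
    y = moving_average(y, window=50)
    x = x[len(x) - len(y):]  # truncate x to match the smoothed y
    plt.figure(title)
    plt.plot(x, y)
    plt.xlabel('Number of Timesteps')
    plt.ylabel('Rewards')
    plt.title(title)
    plt.show()
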
feature_extraction="mlp", **_kwargs) device = torch.device("cuda") #env = gym.make('CartPole-v1') log_dir = "/home/mason/perls2/projects/rl_policy_env/policy_log/" env = RLPolicyEnv('projects/rl_policy_env/rl_policy.yaml', False, "TemplateEnv") env = Monitor(env, log_dir) timestep_count = 2000 * 101 #policy = FeedForwardPolicy(net_arch=[128, 128]) model = TRPO(MlpPolicy, env, verbose=1) model.learn(total_timesteps=timestep_count) #model.save("trpo_cartpole") #del model # remove to demonstrate saving and loading #model = TRPO.load("trpo_cartpole") ep_rewards = np.array(env.episode_rewards) ep_lengths = np.array(env.episode_lengths) ep_mean_rewards = ep_rewards / ep_lengths EPISODE_COUNT = 20 save_loc = log_dir np.save(os.path.join(save_loc, "mean_rewards_arr.npy"), ep_mean_rewards)
from stable_baselines.common.policies import MlpPolicy
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines import TRPO

import mujoco_py
import pybullet
import pybullet_data
import pybullet_envs

if __name__ == "__main__":
    # multiprocess environment
    # for now, it doesn't make sense to have multiple environments
    n_cpu = 1
    env = DummyVecEnv([lambda: gym.make('Swimmer-v2') for i in range(n_cpu)])

    #model = PPO2.load("ppo2_hopper", env=env, verbose=1, tensorboard_log='./tf_logs/hopper')
    model = TRPO(MlpPolicy, env, verbose=1, tensorboard_log='./tf_logs')
    for i in range(100):
        model.learn(total_timesteps=250000, reset_num_timesteps=False)
        model.save("model/gym_swimmer/ppo2_swimmer_test_gym_step" + str(i))

    # del model  # remove to demonstrate saving and loading
    # model = PPO2.load("ppo2_cartpole")
    #
    # # Enjoy trained agent
    # obs = env.reset()
    # while True:
    #     action, _states = model.predict(obs)
    #     obs, rewards, dones, info = env.step(action)
    #     env.render()

    print('Model chosen not available, check spelling or if it is supported')

# Using only one expert trajectory
# you can specify `traj_limitation=-1` for using the whole dataset
dataset = ExpertDataset(expert_path='./pretrain/dummy_quadruped.npz',
                        traj_limitation=-1, batch_size=128)

model.pretrain(dataset, n_epochs=args['pt'])

if args['pretrainVisualization']:
    # Test the pre-trained model
    env = model.get_env()
    obs = env.reset()

    reward_sum = 0.0
    for _ in range(1000):
        action, _ = model.predict(obs)
        obs, reward, done, _ = env.step(action)
        reward_sum += reward
        env.render()
        if done:
            print(reward_sum)
            reward_sum = 0.0
            obs = env.reset()

# As an option, you can train the RL agent
model.learn(total_timesteps=args['timesteps'])
model.save('./pretrain/Preentrenado_{} bs, {} timesteps'.format(
    args['bs'], args['timesteps']))