def launch_training(nb_cpu, name_agent, name_env, total_timesteps, text):
    env_name = name_env
    # n_cpu = 8
    n_cpu = nb_cpu
    policy_kwargs = dict(act_fun=tf.nn.tanh, net_arch=[512, 512])
    print('TB available at := ', tensorboard_log_dir, file=sys.stderr)

    if name_agent == 'A2C':
        env_ = FluidMechanicsEnv()
        env_ = Monitor(env_, console_log_dir, allow_early_resets=True)
        env = SubprocVecEnv([lambda: env_ for i in range(n_cpu)])
        model = A2C(MlpPolicy, env, n_steps=20, gamma=0.9, verbose=1,
                    tensorboard_log=tensorboard_log_dir,
                    policy_kwargs=policy_kwargs)
        # model = A2C.load("first_test")
        model_name = "A2C_default_Mlp" + text
    elif name_agent == 'PPO2':
        env_ = FluidMechanicsEnv()
        env_ = Monitor(env_, console_log_dir, allow_early_resets=True)
        env = SubprocVecEnv([lambda: env_ for i in range(n_cpu)])
        model = PPO2(MlpPolicy, env, n_steps=80, gamma=0.97, verbose=1,
                     tensorboard_log=tensorboard_log_dir,
                     policy_kwargs=policy_kwargs)
        # model = A2C.load("first_test")
        model_name = "PPO2_default_Mlp" + text
    elif name_agent == 'TRPO':
        env_ = FluidMechanicsEnv()
        env_ = Monitor(env_, console_log_dir, allow_early_resets=True)
        env = DummyVecEnv([lambda: env_ for i in range(n_cpu)])
        model = TRPO(MlpPolicy, env, gamma=0.1, verbose=1,
                     tensorboard_log=tensorboard_log_dir,
                     policy_kwargs=policy_kwargs)
        # model = A2C.load("first_test")
        model_name = "TRPO_default_Mlp" + text

    time = datetime.now().strftime('%Y-%m-%d_%H_%M_%S')
    log_name = f"_model={model_name}_time={time}"
    print('with the following line := ', 'tensorboard --logdir ',
          tensorboard_log_dir + log_name)

    training_log = open(f"{console_log_dir}/{log_name}.log", "a")
    sys.stdout = training_log
    logging.basicConfig(level=logging.INFO,
                        filename=f"{console_log_dir}/{log_name}.log",
                        datefmt='%H:%M:%S',
                        format='%(asctime)s,%(msecs)d %(name)s %(levelname)s %(message)s')
    model_file_name = f"{models_log_dir}{log_name}_best.pkl"

    start = datetime.now()
    print("Learning model", file=sys.stderr)
    model.learn(total_timesteps=int(total_timesteps), tb_log_name=log_name,
                callback=callback)
    training_time = datetime.now() - start
    print(f"Training time: {training_time}", file=sys.stderr)

    print("Saving final model", file=sys.stderr)
    model.save(f"{models_log_dir}{log_name}_final.pkl")
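# Note on the SubprocVecEnv usage above: every lambda closes over the same
# Monitor-wrapped env_ instance, so all workers end up sharing one environment
# object and one monitor file. A minimal sketch of the usual per-worker
# factory fix (the make_env name and per-rank monitor path are assumptions,
# not part of the original code):
def make_env(rank):
    def _init():
        env = FluidMechanicsEnv()
        # give each worker its own environment and monitor file
        return Monitor(env, "{}/worker_{}".format(console_log_dir, rank),
                       allow_early_resets=True)
    return _init

# env = SubprocVecEnv([make_env(i) for i in range(n_cpu)])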
def train_trpo(save_model=False):
    wandb.run = config.tensorboard.run
    wandb.tensorboard.patch(save=False, tensorboardX=True)
    env = gym.make(config.env_name)
    model = TRPO("CnnPolicy", env, verbose=1)
    model.learn(total_timesteps=config.num_updates,
                callback=WandbStableBaselines2Callback())
    if save_model:
        model.save(f"trpo_{config.env_name}")
def train(training_data, training_timesteps, model_file):
    stocks_data = StocksData.read_csv(training_data)
    stocks_env = StocksEnv(stocks_data, bars_count=DEFAULT_BARS_COUNT,
                           reset_on_close=False, commission_perc=0.01)
    model = TRPO(MlpPolicy, stocks_env, verbose=1,
                 tensorboard_log="./tensorboard/")
    model.learn(total_timesteps=training_timesteps)
    model.save(model_file)
def main():
    # unpause the simulation so that the robot receives data on all topics
    gazebo_connection.GazeboConnection().unpauseSim()
    # create the node
    rospy.init_node('pickbot_gym', anonymous=True, log_level=rospy.FATAL)
    env = gym.make('Pickbot-v0')
    model = TRPO(MlpPolicy, env, verbose=1)
    model.learn(total_timesteps=200000)
    print("Saving model to pickbot_model_trpo_discrete_" + timestamp + ".pkl")
    model.save("pickbot_model_trpo_discrete_" + timestamp)
def train(params):
    env = FlattenObservation(gym.make(params.get("environment")))
    exp_name = params.get("model_name") + "_train_" + params.get("environment")
    log_dir = './logs/' + exp_name
    expert_name = 'expert_{0}'.format(exp_name)

    if params.get("expert_name") == 'TRPO':
        print("Loading TRPO Model")
        model = TRPO(MlpPolicy, env, verbose=1, tensorboard_log=log_dir)
    if params.get("expert_name") == 'PPO':
        print("Loading PPO Model")
        model = PPO1(MlpPolicy, env, verbose=1, tensorboard_log=log_dir,
                     entcoeff=params.get("ent_coef"),
                     gamma=params.get("gamma"),
                     optim_batchsize=params.get("batch_size"),
                     clip_param=params.get("clip_range"),
                     lam=params.get("gae_lambda"))
    if params.get("expert_name") == 'TRPO' or params.get("expert_name") == 'PPO':
        print("Training expert trajectories")
        # Train the expert controller (if needed) and record expert trajectories.
        generate_expert_traj(model, expert_name,
                             n_timesteps=params.get("expert_timesteps"),
                             n_episodes=params.get("n_episodes"))

    dataset = ExpertDataset(expert_path='{0}.npz'.format(expert_name),
                            traj_limitation=-1,
                            randomize=True,  # whether the dataset should be shuffled
                            verbose=1)
    model = GAIL('MlpPolicy', env, dataset, verbose=1,
                 tensorboard_log=log_dir)  # check the docs for defaults

    if params.get("pre_train") is True:
        print("Pretraining Dataset with Behavioural Cloning")
        model.pretrain(dataset, n_epochs=1000)

    print("Executing GAIL Learning")
    model.learn(total_timesteps=params.get("train_steps"))
    model.save(exp_name)
    env.close()
    del env
def run_model(hyperparams, iteration):
    """
    This is the most important function of this script. It initializes the
    environment in which the model is evaluated, retrieves the values for the
    current hyperparameter configuration, and initializes and trains the
    given model.

    Parameters
    ----------
    hyperparams: dict
        Sampled values for a given hyperparameter configuration.
    iteration: int
        The iteration of running Bayesian optimization, i.e. the
        configuration number.

    Returns
    -------
    A metric used to evaluate the performance of the current configuration.
    """
    # Fixed random state
    rand_state = np.random.RandomState(1).get_state()
    np.random.set_state(rand_state)
    seed = np.random.randint(1, 2**31 - 1)
    tf.set_random_seed(seed)
    random.seed(seed)

    env = gym.make('CartPole-v1')
    env = DummyVecEnv([lambda: env])

    # Get all the current hyperparameter values
    # (the original line was a no-op self-assignment; an int cast matches the
    # float casts below and what TRPO expects for this parameter)
    hyperparams['timesteps_per_batch'] = int(hyperparams['timesteps_per_batch'])
    for parameter_name in ['vf_stepsize', 'max_kl', 'gamma', 'lam']:
        hyperparams[parameter_name] = float(hyperparams[parameter_name])

    # Initialize the model
    model = TRPO(MlpPolicy, env, verbose=1,
                 timesteps_per_batch=hyperparams['timesteps_per_batch'],
                 vf_stepsize=hyperparams['vf_stepsize'],
                 max_kl=hyperparams['max_kl'],
                 gamma=hyperparams['gamma'],
                 lam=hyperparams['lam'])
    model.learn(total_timesteps=10000)
    model.save("trpo_cartpole_" + str(iteration))

    result = evaluate(env, model)
    return result
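# `evaluate` is called above but never defined in this snippet. A minimal
# sketch of what it might look like, assuming it should return the mean
# episode reward over a few rollouts (the body and the n_episodes default
# are assumptions, not the original implementation):
def evaluate(env, model, n_episodes=10):
    episode_rewards = []
    for _ in range(n_episodes):
        obs = env.reset()
        done = False
        total_reward = 0.0
        while not done:
            action, _states = model.predict(obs)
            obs, rewards, dones, _info = env.step(action)
            total_reward += rewards[0]  # DummyVecEnv returns arrays
            done = dones[0]
        episode_rewards.append(total_reward)
    return np.mean(episode_rewards)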
def train_trpo(env_id, num_timesteps, seed, policy='cnn'):
    # env_id: str, identifies each environment uniquely
    # num_timesteps: number of timesteps to run the algorithm
    # seed: initial random seed
    # policy: policy-architecture key; the original referenced `policy`
    # without defining it, so it is added here as a parameter

    # set up the environment (the original created and seeded the raw Atari
    # env twice; a single wrapped env suffices)
    rank = MPI.COMM_WORLD.Get_rank()
    sseed = seed + 10000 * rank
    set_global_seeds(sseed)
    env = wrap_deepmind(make_atari(env_id))
    env.seed(sseed)

    # define policies
    policy = {
        'cnn': CnnPolicy,
        'lstm': CnnLstmPolicy,
        'lnlstm': CnnLnLstmPolicy,
        'mlp': MlpPolicy,
    }[policy]

    # define the TRPO model (`cg_dampling` and `ent_coef` corrected to the
    # stable-baselines keyword names `cg_damping` and `entcoeff`)
    model = TRPO(policy=policy, env=env, timesteps_per_batch=1024,
                 max_kl=0.01, cg_iters=10, cg_damping=1e-3, entcoeff=0.0,
                 gamma=0.99, lam=1, vf_iters=3, vf_stepsize=1e-4, verbose=1)

    # train TRPO for num_timesteps
    model.learn(total_timesteps=num_timesteps)
    # save the hyperparameters and weights
    model.save('trpo' + env_id)
    env.close()
    # free the memory
    del model
def run(env_name, algorithm, seed):
    env_name_map = {
        'halfcheetah': 'HalfCheetah-v2',
        'hopper': 'Hopper-v2',
        'ant': 'Ant-v2',
        'walker': 'Walker2d-v2',
    }
    env = DummyVecEnv([lambda: gym.make(env_name_map[env_name])])
    if algorithm == 'ppo':
        model = PPO2('MlpPolicy', env, learning_rate=1e-3, verbose=1)
    elif algorithm == 'trpo':
        model = TRPO('MlpPolicy', env, max_kl=0.01, verbose=1)
    elif algorithm == 'sac':
        model = SAC('MlpPolicy', env, learning_rate=1e-3, verbose=1)
    else:
        raise NotImplementedError()
    filepath = '%s_%s_%d.pkl' % (env_name, algorithm, seed)
    model.learn(total_timesteps=100000, seed=seed)
    model.save(filepath)
def train(game, num_timesteps, num_envs, dir_name, model_name, prev_model_name):
    dir_name = get_valid_filename(dir_name)
    model_name = get_valid_filename(model_name)
    log_dir = f"logs/{dir_name}/{model_name}-training"
    model_dir = f"models/{dir_name}"
    os.makedirs(log_dir, exist_ok=True)
    os.makedirs(model_dir, exist_ok=True)

    env = make_vec_envs(game, False, num_envs)
    prev_model_path = f"{model_dir}/{prev_model_name}.zip"
    if prev_model_name is not None and os.path.exists(prev_model_path):
        model = TRPO.load(prev_model_path, env=env)
        model.tensorboard_log = log_dir
    else:
        model = TRPO(policy="MlpPolicy", env=env, gamma=0.8, verbose=1,
                     tensorboard_log=log_dir)
    model.learn(num_timesteps)
    model.save(f"{model_dir}/{model_name}.zip")
    env.close()
def tst():
    def _init_openmpi():
        """Pre-load libmpi.dll and register the OpenMPI distribution."""
        import os
        import ctypes
        if os.name != 'nt' or 'OPENMPI_HOME' in os.environ:
            return
        try:
            openmpi_home = os.path.abspath(os.path.dirname(__file__))
            openmpi_bin = os.path.join(openmpi_home, 'bin')
            os.environ['OPENMPI_HOME'] = openmpi_home
            os.environ['PATH'] = ';'.join((openmpi_bin, os.environ['PATH']))
            ctypes.cdll.LoadLibrary(os.path.join(openmpi_bin, 'libmpi.dll'))
        except Exception:
            pass

    _init_openmpi()

    import gym
    from stable_baselines.common.policies import MlpPolicy, CnnPolicy
    from stable_baselines import TRPO

    env = gym.make('BreakoutNoFrameskip-v4')  # 'CartPole-v1'
    model = TRPO(CnnPolicy, env, timesteps_per_batch=1024, verbose=1)
    model.learn(total_timesteps=25000)
    model.save("trpo_cartpole")
    del model  # remove to demonstrate saving and loading
    model = TRPO.load("trpo_cartpole")

    obs = env.reset()
    while True:
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        env.render()
def train(env, file, steps, arch):
    start = time.time()
    # env.setRender(False)

    # create the learning agent
    model = TRPO(
        env=env,
        policy=MlpPolicy,
        policy_kwargs=dict(net_arch=arch),
        n_cpu_tf_sess=None,
    )

    # train the agent on the environment
    model.learn(
        total_timesteps=steps,
        log_interval=10,
        # log_dir=".",
        # record_video=False
    )

    # save the trained model
    model.save(POLICY_PATH + file, cloudpickle=True)
    print("Duration: %.1f" % ((time.time() - start) / 60))
obs = pol_env.reset()
rollout_rewards = []
for _ in range(eval_rollout):
    action, _states = model.predict(obs)
    obs, rewards, dones, info = pol_env.step(action)
    rollout_rewards.append(rewards / 3)
eval_rewards.append(np.mean(rollout_rewards))
print("Mean eval step reward: {}".format(np.mean(eval_rewards)))

# update the policy and sampler objects
pol = EncoderPolicy(TorchStateEncoder(encnet), model)
sampler = srt.PolicyTrajectorySampler(env, pol, T)

# save stuff
torch.save(rep_model, "./repnet")
model.save("./model")

# train the model more?
"""
repmodel = torch.load("./repnet")
encnet = repmodel.encoder
# model = PPO2.load("./model")

def make_policy_env():
    repeats = 3
    pol_env = RestartablePendulumEnv(repeats=repeats, pixels=True)  # can specify cost="dm_control"
    pol_env = TimeLimit(pol_env, max_episode_steps=int(200 / repeats))  # only run the environment for 200 true steps
    proj = np.eye(rep_model.enc_dim)
    return ew.TorchEncoderWrapper(pol_env, encnet, proj)

print("Training policy linear...")
def train(model_path: str):
    env, raw_env = init_env()
    raw_env.gravity = 98
    model = TRPO(MlpPolicy, env, verbose=1)
    model.learn(total_timesteps=300_000)
    model.save(model_path)
    print('Chosen model not available; check spelling or whether it is supported')

# Using only one expert trajectory
# (you can specify traj_limitation=-1 to use the whole dataset)
dataset = ExpertDataset(expert_path='./pretrain/dummy_quadruped.npz',
                        traj_limitation=-1, batch_size=128)
model.pretrain(dataset, n_epochs=args['pt'])

if args['pretrainVisualization']:
    # Test the pre-trained model
    env = model.get_env()
    obs = env.reset()
    reward_sum = 0.0
    for _ in range(1000):
        action, _ = model.predict(obs)
        obs, reward, done, _ = env.step(action)
        reward_sum += reward
        env.render()
        if done:
            print(reward_sum)
            reward_sum = 0.0
            obs = env.reset()

# As an option, you can train the RL agent
model.learn(total_timesteps=args['timesteps'])
model.save('./pretrain/Preentrenado_{} bs, {} timesteps'.format(
    args['bs'], args['timesteps']))
def main():
    # parameters for the gym_carla environment
    params = {
        'number_of_vehicles': 8,
        'number_of_walkers': 0,
        'display_size': 256,  # screen size of bird-eye render
        'max_past_step': 1,  # the number of past steps to draw
        'dt': 0.1,  # time interval between two frames
        'discrete': True,  # whether to use discrete control space
        'continuous_accel_range': [-3.0, 3.0],  # continuous acceleration range
        'ego_vehicle_filter': 'vehicle.lincoln*',  # filter for defining ego vehicle
        'port': 2000,  # connection port
        'town': 'Town06',  # which town to simulate
        'task_mode': 'acc_1',  # mode of the task, [random, roundabout (only for Town03)]
        'max_time_episode': 1000,  # maximum timesteps per episode
        'max_waypt': 12,  # maximum number of waypoints
        'obs_range': 32,  # observation range (meter)
        'lidar_bin': 0.125,  # bin size of lidar sensor (meter)
        'd_behind': 12,  # distance behind the ego vehicle (meter)
        'out_lane_thres': 2.0,  # threshold for out of lane
        'desired_speed': 16.67,  # desired speed (m/s)
        'max_ego_spawn_times': 200,  # maximum times to spawn ego vehicle
        'display_route': True,  # whether to render the desired route
        'pixor_size': 64,  # size of the pixor labels
        'pixor': False,  # whether to output PIXOR observation
        'RGB_cam': True,  # whether to use RGB camera sensor
    }
    solver_params = {
        'layers': [64, 64, 64],
        'alpha': 0.001,
        'gamma': 0.99,
        'epsilon': 0.1,
        'replay_memory_size': 500000,
        'update_target_estimator_every': 10000,
        'batch_size': 64,
    }

    # Set up the gym-carla environment
    env = gym.make('carla-v0', params=params)
    # check_env(env)
    obs = env.reset()

    checkpoint_callback = CheckpointCallback(save_freq=5000,
                                             save_path='./trpo_checkpoint/',
                                             name_prefix='trpo_check')
    # model = DQN.load("./trpo_checkpoint/trpo_check_200_steps.zip", env=env, tensorboard_log="./trpo")
    model = TRPO(MlpPolicy, env, verbose=1, tensorboard_log="./trpo")
    model.learn(total_timesteps=35000, tb_log_name="35k-with-checkpoint",
                callback=checkpoint_callback)
    model.save("trpo_carla")

    del model  # remove to demonstrate saving and loading
    model = TRPO.load("trpo_carla")

    obs = env.reset()
    for i in range(100):
        while True:
            action, _states = model.predict(obs)
            obs, rewards, dones, info = env.step(action)
            if dones:
                obs = env.reset()
                break
             max_kl=0.01, cg_iters=10, lam=0.98, entcoeff=0.0,
             cg_damping=0.01, vf_stepsize=0.0003, vf_iters=3,
             tensorboard_log=None, _init_setup_model=True,
             policy_kwargs=None, full_tensorboard_log=False,
             seed=None, n_cpu_tf_sess=1)
# model = TRPO(MlpPolicy, env, verbose=1, gamma=0.91, timesteps_per_batch=1000,
#              max_kl=0.05, cg_iters=10, lam=0.9, entcoeff=0.001,
#              cg_damping=0.05, vf_stepsize=0.0003, vf_iters=3,
#              tensorboard_log=None, _init_setup_model=True,
#              policy_kwargs=None, full_tensorboard_log=False,
#              seed=None, n_cpu_tf_sess=1)

model.learn(total_timesteps=14200000)
model.save("trpo_quad")
# model = TRPO.load("trpo_quad")

# Enjoy the trained agent
obs = env.reset()
for i in range(1000):
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    print(action)
    print(obs[2])
    print(info['z'])
    # print(i)
    # print(dones)
    env.render()
worker_id = 10
num_env = 2
env_id = "/home/jim/projects/unity_ray/basic_env_linux/basic_env_linux"
env = UnityEnv(env_id, worker_id=worker_id, use_visual=False)

# Create the log dir
time_int = int(time.time())
log_dir = "stable_results/basic_env_{}/".format(time_int)
os.makedirs(log_dir, exist_ok=True)

env = Monitor(env, log_dir, allow_early_resets=True)
env = DummyVecEnv([lambda: env])  # The algorithms require a vectorized environment to run
# env = SubprocVecEnv([make_env(env_id, log_dir, i + worker_id) for i in range(num_env)])

model = TRPO(MlpPolicy, env, verbose=1)
model.learn(total_timesteps=20000)
model.save(log_dir + "model")

# evaluate the agent
episodes = 100
ep_r = []
ep_l = []
for e in range(episodes):
    obs = env.reset()
    total_r = 0.
    total_l = 0.
    while True:
        action, _states = model.predict(obs)
        obs, rewards, dones, infos = env.step(action)
        total_l += 1.
        total_r += rewards
        if dones:
            # The source was truncated here; this completion follows the
            # ep_r / ep_l lists initialized above: record the episode
            # return and length, then move on to the next episode.
            ep_r.append(total_r)
            ep_l.append(total_l)
            break
environment = 'Swimmer-v2'
path = 'Results/' + environment + '_seed=' + str(seed) + '_run=' + \
    str(run) + '_total_timesteps=' + str(total_timesteps) + \
    '_trpo_episode_reward.npy'
pathmodel = 'Results/' + environment + '_seed=' + str(seed) + '_run=' + \
    str(run) + '_total_timesteps=' + str(total_timesteps) + '_trpo'

env = gym.make(environment)
env = DummyVecEnv([lambda: env])
# Automatically normalize the input features
env = VecNormalize(env, norm_obs=True, norm_reward=False, clip_obs=10.)

model = TRPO(MlpPolicy, env, verbose=1)
model.learn(total_timesteps=total_timesteps, path=path, seed=seed)
model.save(pathmodel)

# Don't forget to save the running average when saving the agent
log_dir = "/tmp/"
env.save_running_average(log_dir)

'''
del model  # remove to demonstrate saving and loading

model = TRPO.load("")
obs = env.reset()
while True:
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    env.render()
'''
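# A hedged follow-up sketch for restoring the normalization statistics
# alongside the agent, using the older stable-baselines VecNormalize API
# that the save_running_average call above implies (assumes the same
# log_dir and pathmodel as above):
# env = DummyVecEnv([lambda: gym.make(environment)])
# env = VecNormalize(env, norm_obs=True, norm_reward=False, clip_obs=10.)
# env.load_running_average(log_dir)
# model = TRPO.load(pathmodel, env=env)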
env_dict = {
    'id': 'prescan-without-matlabengine-v0',
    'verbose': True,
    'host': '172.21.217.140',
    'nget': 150,
}
env = gym.make(**env_dict)
env = DummyVecEnv([lambda: env])

model = TRPO(MlpPolicy, env, verbose=1)
try:
    model.learn(total_timesteps=50000)
except Exception:  # avoid a bare except, which would also swallow KeyboardInterrupt
    print('Error!')
model.save(save_load)

'''
del model  # remove to demonstrate saving and loading
model = TRPO.load(save_load)

obs = env.reset()
while True:
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    env.render()
'''
        brsEngine = DubinsCar_brs_engine()
        brsEngine.reset_variables()
    elif args['gym_env'] == 'PlanarQuadEnv-v0':
        brsEngine = Quadrotor_brs_engine()
        brsEngine.reset_variables()
    else:
        raise ValueError("invalid environment name for ttr reward!")
    # You have to assign the engine!
    env.brsEngine = brsEngine
elif args['reward_type'] in ['hand_craft', 'distance', 'distance_lambda_10',
                             'distance_lambda_1', 'distance_lambda_0.1']:
    pass
else:
    raise ValueError("wrong type of reward")

# ----------------------------------------------------------------------------
args['RUN_DIR'] = RUN_DIR
args['MODEL_DIR'] = MODEL_DIR
args['FIGURE_DIR'] = FIGURE_DIR
args['RESULT_DIR'] = RESULT_DIR

# make the necessary directories
maybe_mkdir(RUN_DIR)
maybe_mkdir(MODEL_DIR)
maybe_mkdir(FIGURE_DIR)
maybe_mkdir(RESULT_DIR)

model = TRPO(MlpPolicy, env, verbose=1, **args)
# 600 epochs, each epoch 1024 steps; every 30 epochs, do an evaluation.
model.learn(total_timesteps=1024 * 601)
model.save(MODEL_DIR)
from stable_baselines import TRPO
import mujoco_py
from snake_env.gym_swimmer_env import SwimmerLocomotionEnv

# multiprocess environment
# n_cpu = 4
# env = SubprocVecEnv([lambda: gym.make('CartPole-v1') for i in range(n_cpu)])

fixed_path = [(-0.2 * i, 0) for i in range(30)]
use_random_path = False
robot_k = 1.0
robot_link_length = 0.3
gamma = 0.995

if __name__ == "__main__":
    # multiprocess environment
    # (for now, it doesn't make sense to have multiple environments)
    n_cpu = 1
    env = SubprocVecEnv([
        lambda: SwimmerLocomotionEnv(path=fixed_path,
                                     random_path=use_random_path,
                                     use_hard_path=False,
                                     robot_link_length=robot_link_length)
        for i in range(n_cpu)
    ])
    # model = PPO2.load("ppo2_hopper", env=env, verbose=1, tensorboard_log='./tf_logs/hopper')
    model = TRPO(MlpPolicy, env, verbose=1, tensorboard_log='./tf_logs')
    for i in range(100):
        model.learn(total_timesteps=250000, reset_num_timesteps=False)
        model.save("real_trpo_swimmer_traj_following")
def main(game, num_timesteps, num_episodes, dir_name, model_name, policy,
         discount=0.99, batch_size=1024):
    dir_name = get_valid_filename(dir_name)
    model_name = get_valid_filename(model_name)
    eval_log_dir = f"logs/{dir_name}/{model_name}"
    tr_log_dir = f"{eval_log_dir}-training"
    model_dir = f"models/{dir_name}"
    os.makedirs(eval_log_dir, exist_ok=True)
    os.makedirs(tr_log_dir, exist_ok=True)
    os.makedirs(model_dir, exist_ok=True)

    env = make_vec_env(game)
    env.seed(309)
    model = TRPO(policy=policy, env=env, gamma=discount,
                 timesteps_per_batch=batch_size, verbose=1, seed=309,
                 tensorboard_log=tr_log_dir, n_cpu_tf_sess=1)
    model.learn(total_timesteps=num_timesteps)
    model.save(f"{model_dir}/{model_name}")

    eps_done = 0
    ep_rewards = np.array([0] * num_episodes)
    curr_rewards = 0
    obs = env.reset()
    while eps_done != num_episodes:
        if eps_done % 10 == 0:
            print(f"Episodes completed: {eps_done} / {num_episodes}", end="\r")
        # Vectorised environments are automatically reset when done,
        # so the returned obs is the start state of the next episode
        action, _ = model.predict(obs)
        obs, reward, done, info = env.step(action)
        env.render(mode="human")
        curr_rewards += reward[0]
        if done[0]:
            ep_rewards[eps_done] = curr_rewards
            curr_rewards = 0
            eps_done += 1
    print("All episodes completed")
    env.close()

    mean = ep_rewards.mean()
    std_dev = ep_rewards.std()
    # Outliers: outside of 3 standard deviations
    outlier_threshold_upper = mean + 3 * std_dev
    outlier_threshold_lower = mean - 3 * std_dev
    trimmed_rewards = np.array([
        rew for rew in ep_rewards
        if outlier_threshold_lower <= rew <= outlier_threshold_upper
    ])
    avg_reward = trimmed_rewards.mean()
    print(f"Average score over {num_episodes} games: {avg_reward:.2f}")

    summary_writer = tf.summary.FileWriter(eval_log_dir)
    sess = tf.Session()
    rew_var = tf.Variable(0, dtype=tf.int64)
    rew_val = tf.summary.scalar(f"Reward / Episode ({model_name})", rew_var)
    for i in range(num_episodes):
        rew = ep_rewards[i]
        sess.run(rew_var.assign(rew))
        summary_writer.add_summary(sess.run(rew_val), i)
    avg_var = tf.Variable(0.0, dtype=tf.float64)
    avg_val = tf.summary.scalar(f"Trimmed Average ({model_name})", avg_var)
    sess.run(avg_var.assign(avg_reward))
    summary_writer.add_summary(sess.run(avg_val), 0)
    summary_writer.flush()
    summary_writer.close()
    sess.close()
                 param_noise=param_noise, action_noise=action_noise,
                 tensorboard_log='./pretrain/DDPG/')
elif choosenModel == 'PPO_2':
    from stable_baselines.common import make_vec_env
    from stable_baselines.common.policies import MlpPolicy
    from stable_baselines import PPO2
    # make_vec_env() is used for a multiprocess environment
    env = make_vec_env('gym_quadruped:quadruped-v0', n_envs=4)
    check_dir('./pretrain/PPO/')
    model = PPO2(MlpPolicy, env, verbose=1, tensorboard_log='./pretrain/PPO/')
else:
    print('Chosen model not available; check spelling or whether it is supported')

if args['baseModel'] is not None:
    print("Using trained model {}".format(args['baseModel']))
    model.load(args['baseModel'])
else:
    print("Training model from scratch")

# This loop saves the model between training rounds, to avoid losing progress
# and to be able to pick quadruped abilities from different stages
for i in range(5):
    model.learn(total_timesteps=args['timesteps'])
    model.save("./TRPO/millon/largo3/trpo_{}_{} timesteps".format(
        i, args['timesteps']))
import gym
from stable_baselines.common.policies import MlpPolicy, MlpLstmPolicy, MlpLnLstmPolicy
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines import TRPO

env = gym.make('CartPole-v1')  # the CartPole balancing task
env = DummyVecEnv([lambda: env])

model = TRPO(MlpPolicy, env, verbose=1)  # use a fully connected (MLP) policy
model.learn(total_timesteps=25000)  # train
model.save("trpo_cartpole")

obs = env.reset()
while True:
    action, _states = model.predict(obs)  # predict
    obs, rewards, dones, info = env.step(action)  # step the environment
    env.render()  # render
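# Optional follow-up (a sketch, not part of the original example): reload the
# saved CartPole model and score it with stable-baselines' evaluate_policy
# helper; the episode count of 20 is an arbitrary choice.
from stable_baselines.common.evaluation import evaluate_policy

model = TRPO.load("trpo_cartpole")
mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=20)
print("mean reward: {:.1f} +/- {:.1f}".format(mean_reward, std_reward))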
                                     use_hard_path=False,
                                     robot_link_length=robot_link_length)
        for i in range(n_cpu)
    ])

    if resume:
        print("resuming training")
        model = TRPO.load("ppo2_swimmer", env=env, verbose=1,
                          tensorboard_log='./tf_logs/swimmer')
    else:
        print("not resuming")
        # two layers of size 64
        model = TRPO(MlpPolicy, env, verbose=1, gamma=gamma,
                     tensorboard_log='./tf_logs/swimmer')

    # # first, create the dataset
    # if pre_train:
    #     model.pretrain()

    for i in range(100):
        model.learn(total_timesteps=250000, reset_num_timesteps=False)
        model.save("trpo_swimmer")

    # del model  # remove to demonstrate saving and loading

    # # these are for testing
    # model = PPO2.load("ppo2_swimmer")
    # env = SwimmerLocomotionEnv(
    #     path=fixed_path,
    #     random_path=use_random_path,
    #     use_hard_path=False,
    #     robot_link_length=robot_link_length,
    #     robot_k=robot_k,
    #     record_trajectory=True)
    # # Testing purpose (should be in a separate file)
import gym  # needed for gym.make below
from stable_baselines.common.policies import MlpPolicy
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines import TRPO
import mujoco_py
import pybullet
import pybullet_data
import pybullet_envs

if __name__ == "__main__":
    # multiprocess environment
    # (for now, it doesn't make sense to have multiple environments)
    n_cpu = 1
    env = DummyVecEnv([lambda: gym.make('Swimmer-v2') for i in range(n_cpu)])
    # model = PPO2.load("ppo2_hopper", env=env, verbose=1, tensorboard_log='./tf_logs/hopper')
    model = TRPO(MlpPolicy, env, verbose=1, tensorboard_log='./tf_logs')
    for i in range(100):
        model.learn(total_timesteps=250000, reset_num_timesteps=False)
        model.save("model/gym_swimmer/ppo2_swimmer_test_gym_step" + str(i))

    # del model  # remove to demonstrate saving and loading
    # model = PPO2.load("ppo2_cartpole")

    # # Enjoy the trained agent
    # obs = env.reset()
    # while True:
    #     action, _states = model.predict(obs)
    #     obs, rewards, dones, info = env.step(action)
    #     env.render()
    start_time = time.time()
    if alg == 0:
        model = TRPO('MlpPolicy', 'gym_pursuitevasion_small:pursuitevasion_small-v0', verbose=1)
    elif alg == 1:
        model = DQN('MlpPolicy', 'gym_pursuitevasion_small:pursuitevasion_small-v0', verbose=1)
    elif alg == 2:
        model = ACKTR('MlpPolicy', 'gym_pursuitevasion_small:pursuitevasion_small-v0', verbose=1)
    elif alg == 3:
        model = ACER('MlpPolicy', 'gym_pursuitevasion_small:pursuitevasion_small-v0', verbose=1)
    elif alg == 4:
        model = A2C('MlpPolicy', 'gym_pursuitevasion_small:pursuitevasion_small-v0', verbose=1)
    elif alg == 5:
        model = PPO1('MlpPolicy', 'gym_pursuitevasion_small:pursuitevasion_small-v0', verbose=1)

    # Note: in practice, you need to train for 1M steps to have a working policy
    model.learn(total_timesteps=int(args.total_iters))
    model.save('{}_iters{}_{}_pursuitevasion_small'.format(
        algo_list[alg], int(args.total_iters), str(now.strftime('%Y%m%d'))))
    end_time = time.time()
    print('Training time for algorithm {}: {:.2f}s = {:.2f}min = {:.4f}hrs'.format(
        algo_list[alg], end_time - start_time,
        (end_time - start_time) / 60, (end_time - start_time) / 3600))
    print('Trained using RL')
else:  # test
    print('Testing {} learnt policy from model file {} for {} games!'.format(
        algo_list[alg], args.model, int(args.num_test)))
    start_time = time.time()
    if alg == 0:
        model = TRPO.load(args.model)
    elif alg == 1:
        model = DQN.load(args.model)
    elif alg == 2:
        model = ACKTR.load(args.model)
    elif alg == 3: