def train(self, world_model_path):
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        losses = []
        all_rewards = []
        save_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='env_model')
        saver = tf.train.Saver(var_list=save_vars)
        train_writer = tf.summary.FileWriter('./env_logs/train/', graph=sess.graph)
        summary_op = tf.summary.merge_all()

        if self.n_envs == 1:
            envs = make_env()()
        else:
            envs = [make_env() for i in range(self.n_envs)]
            envs = SubprocVecEnv(envs)

        for idx, states, actions, rewards, next_states, dones in tqdm(
                self.generate_data(envs), total=self.max_ep_len):
            actions = np.array(actions)
            actions = np.reshape(actions, (-1, 1))
            if self.has_rewards:
                target_reward = reward_to_target(rewards)
                loss, reward_loss, state_loss, summary, _ = sess.run(
                    [self.loss, self.reward_loss, self.state_loss, summary_op, self.opt],
                    feed_dict={
                        self.states_ph: states,
                        self.actions_ph: actions,
                        self.target_states: next_states,
                        self.target_rewards: target_reward
                    })
            else:
                loss, summary, _ = sess.run(
                    [self.loss, summary_op, self.opt],
                    feed_dict={
                        self.states_ph: states,
                        self.actions_ph: actions,
                        self.target_states: next_states,
                    })

            if idx % self.log_interval == 0:
                if self.has_rewards:
                    print('%i => Loss : %.4f, Reward Loss : %.4f, Image Loss : %.4f'
                          % (idx, loss, reward_loss, state_loss))
                else:
                    print('%i => Loss : %.4f' % (idx, loss))
                saver.save(sess, '{}/env_model.ckpt'.format(world_model_path))
                print('Environment model saved')

            train_writer.add_summary(summary, idx)

        envs.close()
def create_env(n_envs, eval_env=False):
    """
    Create the environment and wrap it if necessary

    :param n_envs: (int)
    :param eval_env: (bool) Whether it is an environment used for evaluation or not
    :return: (Union[gym.Env, VecEnv])
    """
    global hyperparams

    # Do not log eval env (issue with writing the same file)
    log_dir = None if eval_env else save_path

    if is_atari:
        if args.verbose > 0:
            print("Using Atari wrapper")
        env = make_atari_env(env_id, num_env=n_envs, seed=args.seed)
        # Frame-stacking with 4 frames
        env = VecFrameStack(env, n_stack=4)
    elif algo_ in ['dqn', 'ddpg']:
        if hyperparams.get('normalize', False):
            print("WARNING: normalization not supported yet for DDPG/DQN")
        env = gym.make(env_id)
        env.seed(args.seed)
        if env_wrapper is not None:
            env = env_wrapper(env)
    else:
        if n_envs == 1:
            env = DummyVecEnv([make_env(env_id, 0, args.seed,
                                        wrapper_class=env_wrapper, log_dir=log_dir)])
        else:
            # env = SubprocVecEnv([make_env(env_id, i, args.seed) for i in range(n_envs)])
            # On most env, SubprocVecEnv does not help and is quite memory hungry
            env = DummyVecEnv([make_env(env_id, i, args.seed, log_dir=log_dir,
                                        wrapper_class=env_wrapper) for i in range(n_envs)])
        if normalize:
            if args.verbose > 0:
                if len(normalize_kwargs) > 0:
                    print("Normalization activated: {}".format(normalize_kwargs))
                else:
                    print("Normalizing input and reward")
            env = VecNormalize(env, **normalize_kwargs)
    # Optional Frame-stacking
    if hyperparams.get('frame_stack', False):
        n_stack = hyperparams['frame_stack']
        env = VecFrameStack(env, n_stack)
        print("Stacking {} frames".format(n_stack))
        del hyperparams['frame_stack']
    return env
def _train(env_id, agent, model_params, total_steps, is_evaluation=False):
    if is_evaluation:
        # evaluate_policy() must only take one environment
        envs = SubprocVecEnv([make_env(env_id)])
    else:
        envs = SubprocVecEnv([make_env(env_id) for _ in range(NUM_CPU)])
    # Normalize the envs during training and evaluation
    envs = VecNormalize(envs)

    # Load pretrained model during training.
    if not is_evaluation and os.path.exists(agent + '_' + env_id):
        if agent == 'ppo2':
            model = PPO2.load(agent + '_' + env_id)
        elif agent == 'a2c':
            model = A2C.load(agent + '_' + env_id)
    else:
        if agent == 'ppo2':
            model = PPO2(MlpLstmPolicy, envs, nminibatches=1, verbose=1, **model_params)
        elif agent == 'a2c':
            model = A2C(MlpLstmPolicy, envs, verbose=1, **model_params)
    model.learn(total_timesteps=total_steps)
    return envs, model
def create_env(n_envs, eval_env=False, no_log=False):
    global hyperparams, env_kwargs
    log_dir = None if eval_env or no_log else save_path
    if n_envs == 1:
        env = DummyVecEnv([make_env(env_id, 0, seed, wrapper_class=env_wrapper,
                                    log_dir=log_dir, env_kwargs=env_kwargs)])
    else:
        env = DummyVecEnv([make_env(env_id, i, seed, wrapper_class=env_wrapper,
                                    log_dir=log_dir, env_kwargs=env_kwargs)
                           for i in range(n_envs)])
    if normalize:
        local_normalize_kwargs = {'norm_reward': False}
        env = VecNormalize(env, **local_normalize_kwargs)
    return env
def create_env(n_envs, eval_env=False, no_log=False):
    """
    Create the environment and wrap it if necessary

    :param n_envs: (int)
    :param eval_env: (bool) Whether it is an environment used for evaluation or not
    :param no_log: (bool) Do not log training when doing hyperparameter optim
        (issue with writing the same file)
    :return: (Union[gym.Env, VecEnv])
    """
    global hyperparams
    global env_kwargs

    # Do not log eval env (issue with writing the same file)
    log_dir = None if eval_env or no_log else save_path

    if n_envs == 1:
        env = SubprocVecEnv([make_env(env_id, 0, args.seed, wrapper_class=env_wrapper,
                                      log_dir=log_dir, env_kwargs=env_kwargs)])
    else:
        # env = SubprocVecEnv([make_env(env_id, i, args.seed) for i in range(n_envs)])
        # On most env, SubprocVecEnv does not help and is quite memory hungry
        env = SubprocVecEnv([make_env(env_id, i, args.seed, log_dir=log_dir,
                                      env_kwargs=env_kwargs, wrapper_class=env_wrapper)
                             for i in range(n_envs)])
    if normalize:
        # Copy to avoid changing default values by reference
        local_normalize_kwargs = normalize_kwargs.copy()
        # Do not normalize reward for env used for evaluation
        if eval_env:
            if len(local_normalize_kwargs) > 0:
                local_normalize_kwargs["norm_reward"] = False
            else:
                local_normalize_kwargs = {"norm_reward": False}
        if args.verbose > 0:
            if len(local_normalize_kwargs) > 0:
                print(f"Normalization activated: {local_normalize_kwargs}")
            else:
                print("Normalizing input and reward")
        env = VecNormalize(env, **local_normalize_kwargs)
    # Optional Frame-stacking
    if hyperparams.get("frame_stack", False):
        n_stack = hyperparams["frame_stack"]
        env = VecFrameStack(env, n_stack)
        print(f"Stacking {n_stack} frames")
    if is_image_space(env.observation_space):
        if args.verbose > 0:
            print("Wrapping into a VecTransposeImage")
        env = VecTransposeImage(env)
    return env
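# --- Hedged usage sketch (not from the source repo): how the create_env() above might be
# called to build a training env plus a separate eval env whose reward normalization is
# disabled. Assumes the module-level globals it reads (env_id, args, hyperparams, normalize,
# normalize_kwargs, env_wrapper, env_kwargs, save_path) are already defined, and that
# stable-baselines3 is the RL library in use; the algorithm/policy choice is hypothetical.
from stable_baselines3 import PPO

train_env = create_env(n_envs=8)
eval_env = create_env(n_envs=1, eval_env=True)  # norm_reward is forced to False here
model = PPO("MlpPolicy", train_env, verbose=1)  # hypothetical algorithm/policy choice
model.learn(total_timesteps=100_000)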
def create_env(n_envs, eval_env=False):
    """
    Create the environment and wrap it if necessary

    :param n_envs: (int)
    :param eval_env: (bool) Whether it is an environment used for evaluation or not
    :return: (Union[gym.Env, VecEnv])
    """
    global hyperparams
    global env_kwargs

    # Do not log eval env (issue with writing the same file)
    log_dir = None if eval_env else save_path

    if n_envs == 1:
        env = DummyVecEnv([make_env(env_id, 0, args.seed, wrapper_class=env_wrapper,
                                    log_dir=log_dir, env_kwargs=env_kwargs)])
    else:
        # env = SubprocVecEnv([make_env(env_id, i, args.seed) for i in range(n_envs)])
        # On most env, SubprocVecEnv does not help and is quite memory hungry
        env = DummyVecEnv([make_env(env_id, i, args.seed, log_dir=log_dir,
                                    env_kwargs=env_kwargs, wrapper_class=env_wrapper)
                           for i in range(n_envs)])
    if normalize:
        if args.verbose > 0:
            if len(normalize_kwargs) > 0:
                print(f"Normalization activated: {normalize_kwargs}")
            else:
                print("Normalizing input and reward")
        env = VecNormalize(env, **normalize_kwargs)
    # Optional Frame-stacking
    if hyperparams.get('frame_stack', False):
        n_stack = hyperparams['frame_stack']
        env = VecFrameStack(env, n_stack)
        print(f"Stacking {n_stack} frames")
    if is_image_space(env.observation_space):
        if args.verbose > 0:
            print("Wrapping into a VecTransposeImage")
        env = VecTransposeImage(env)
    return env
def create_env(n_envs):
    """
    Create the environment and wrap it if necessary

    :param n_envs: (int)
    :return: (gym.Env)
    """
    global hyperparams

    if is_atari:
        if args.verbose > 0:
            print("Using Atari wrapper")
        env = make_atari_env(env_id, num_env=n_envs, seed=args.seed)
        # Frame-stacking with 4 frames
        env = VecFrameStack(env, n_stack=4)
    elif args.algo in ['dqn', 'ddpg']:
        if hyperparams.get('normalize', False):
            print("WARNING: normalization not supported yet for DDPG/DQN")
        # No env_wrapper applied for now as not using make_env()
        env = gym.make(env_id)
        env.seed(args.seed)
    else:
        if n_envs == 1:
            env = DummyVecEnv([make_env(env_id, 0, args.seed, wrapper_class=env_wrapper)])
        else:
            # env = SubprocVecEnv([make_env(env_id, i, args.seed) for i in range(n_envs)])
            # On most env, SubprocVecEnv does not help and is quite memory hungry
            env = DummyVecEnv([make_env(env_id, i, args.seed, wrapper_class=env_wrapper)
                               for i in range(n_envs)])
        if normalize:
            if args.verbose > 0:
                print("Normalizing input and return")
            env = VecNormalize(env, **normalize_kwargs)
    # Optional Frame-stacking
    if hyperparams.get('frame_stack', False):
        n_stack = hyperparams['frame_stack']
        env = VecFrameStack(env, n_stack)
        print("Stacking {} frames".format(n_stack))
        del hyperparams['frame_stack']
    return env
def play(args, **kwargs):
    """ play mode """
    print(args.dir)
    assert args.dir, 'Please provide directory where checkpoint file is located'
    kwargs['normalize'] = True
    # use env.setup() after session creation to apply mean/std to obs and rewards
    normed_env = U.make_env(**kwargs)
    model = ImpalaModel(observation_shape=normed_env.observation_space.shape,
                        n_actions=normed_env.action_space.n,
                        learning_rate=0.01,
                        entropy_scale=0.0)
    # max_steps = 10000
    # hooks = [tf.train.StopAtStepHook(last_step=max_steps)]  # , PyProcessHook()]
    print('Restore from:', args.dir)
    with tf.train.SingularMonitoredSession(checkpoint_dir=args.dir) as sess:
        normed_env.setup(session=sess)  # restore values for running mean/std
        print('Restored from global step:', sess.run(model.global_step))
        try:
            done = False
            obs = normed_env.reset()
            print(obs)
            while not done:
                normed_env.render()
                action, _ = model.get_action_and_prob(session=sess, observation=obs)
                obs, reward, done, info = normed_env.step(action)
        except KeyboardInterrupt:
            print('got KeyboardInterrupt')
        finally:
            pass
def main(args):
    # Create directories
    if not os.path.exists("./logs"):
        os.makedirs("./logs")

    # Set logs
    log = set_log(args)

    # Create env
    env = make_env(log, args)

    # Set seeds
    random.seed(args.seed)
    np.random.seed(args.seed)
    env.seed(args.seed)

    # Visualize environment
    observations = env.reset()
    for _ in range(args.ep_max_timesteps):
        env.render()

        prey_action = env.action_space.sample()
        predator1_action = env.action_space.sample()
        predator2_action = env.action_space.sample()
        actions = [prey_action, predator1_action, predator2_action]

        observations, reward, done, _ = env.step(actions)
        if done:
            break
def main():
    """Run DQN until the environment throws an exception."""
    env = AllowBacktracking(make_env(stack=False, scale_rew=False))
    env = BatchedFrameStack(BatchedGymEnv([[env]]), num_images=4, concat=False)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101
    with tf.Session(config=config) as sess:
        dqn = DQN(*rainbow_models(sess,
                                  env.action_space.n,
                                  gym_space_vectorizer(env.observation_space),
                                  min_val=-200,
                                  max_val=200))
        player = NStepPlayer(BatchedPlayer(env, dqn.online_net), 4)
        optim, optimize = dqn.optimize(learning_rate=0.0001)
        sess.run(tf.global_variables_initializer())
        dqn.train(num_steps=3000000,  # Make sure an exception arrives before we stop.
                  player=player,
                  replay_buffer=PrioritizedReplayBuffer(500000, 0.5, 0.4, epsilon=0.1),
                  optimize_op=optimize,
                  train_interval=1,
                  target_interval=1024,
                  batch_size=16,
                  min_buffer_size=20000)
def __init__(self, cfg):
    self.work_dir = os.getcwd()
    print(f'workspace: {self.work_dir}')

    self.cfg = cfg

    self.logger = Logger(self.work_dir,
                         save_tb=cfg.log_save_tb,
                         log_frequency=cfg.log_frequency,
                         agent=cfg.agent.name)

    utils.set_seed_everywhere(cfg.seed)
    self.device = torch.device(cfg.device)
    self.train_envs, self.test_envs = utils.make_env(cfg)

    cfg.agent.params.obs_dim = self.train_envs[0].observation_space.shape[0] + cfg.noise_dims
    cfg.agent.params.action_dim = self.train_envs[0].action_space.shape[0]
    if cfg.agent.name != 'sac':
        cfg.agent.params.num_envs = cfg.num_train_envs
    cfg.agent.params.action_range = [
        float(self.train_envs[0].action_space.low.min()),
        float(self.train_envs[0].action_space.high.max())
    ]
    self.agent = hydra.utils.instantiate(cfg.agent)
    self.agent.seq_len = cfg.seq_len

    self.replay_buffer = MultiEnvReplayBuffer((cfg.agent.params.obs_dim,),  # hard coded
                                              self.train_envs[0].action_space.shape,
                                              int(cfg.replay_buffer_capacity),
                                              self.device,
                                              num_envs=cfg.num_train_envs,
                                              seq_len=cfg.seq_len)

    self.video_recorder = VideoRecorder(self.work_dir if cfg.save_video else None)
    self.step = [0] * cfg.num_train_envs
def main(_):
    env_name = XMAGICAL_EMBODIMENT_TO_ENV_NAME[FLAGS.embodiment]
    env = utils.make_env(env_name, seed=0)

    # Reward learning wrapper.
    if FLAGS.config.reward_wrapper.pretrained_path is not None:
        env = utils.wrap_learned_reward(env, FLAGS.config)

    viewer = KeyboardEnvInteractor(action_dim=env.action_space.shape[0])

    env.reset()
    obs = env.render("rgb_array")
    viewer.imshow(obs)

    i = [0]
    rews = []

    def step(action):
        obs, rew, done, info = env.step(action)
        rews.append(rew)
        if obs.ndim != 3:
            obs = env.render("rgb_array")
        if done:
            print(f"Done, score {info['eval_score']:.2f}/1.00")
            print("Episode metrics: ")
            for k, v in info["episode"].items():
                print(f"\t{k}: {v}")
            if FLAGS.exit_on_done:
                return
        i[0] += 1
        return obs

    viewer.run_loop(step)

    utils.plot_reward(rews)
def process_func(proc_idx, params, replay_buffer, model, state_normalizer, goal_normalizer, log_func):
    env = make_env(params, proc_idx)
    w = Worker(proc_idx, params, env, replay_buffer, model,
               state_normalizer, goal_normalizer, log_func)
    print(f"Spawning worker with id: {proc_idx}")
    w.loop()
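# --- Hedged usage sketch (assumption, not from the source): launching several workers that
# each run process_func() in their own process. The shared arguments (replay_buffer, model,
# the normalizers, log_func) are placeholders and would have to be process-safe objects
# (e.g. built with torch.multiprocessing or shared memory) in a real run.
import multiprocessing as mp

def spawn_workers(n_workers, params, replay_buffer, model,
                  state_normalizer, goal_normalizer, log_func):
    procs = []
    for proc_idx in range(n_workers):
        p = mp.Process(target=process_func,
                       args=(proc_idx, params, replay_buffer, model,
                             state_normalizer, goal_normalizer, log_func))
        p.start()
        procs.append(p)
    for p in procs:
        p.join()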
def fit():
    env, args = make_env()
    env.render()
    # Assumes all agents share the same model
    policy = MARL_MBPO(args)
    agents = [policy for i in range(env.n)]
    rewards = []
    for time_step in tqdm(range(args.time_steps)):
        if time_step % args.maximum_episode_length == 0:
            observations = env.reset()
        # Make a copy; changed in environment transition
        initial_obs = copy.deepcopy(observations)
        actions = []
        for i, observation in enumerate(observations):
            actions.append(agents[i].action(observation))
        observations, rewards, done, _ = env.step(actions)
        # Make a copy; changed in environment transition
        next_obs = copy.deepcopy(observations)
        # Store into the buffer
        policy.model_buffer.store(initial_obs, actions, next_obs, rewards)
        env.render()
        if time_step > args.batch_size:
            policy.train()
def __init__(self, cfg):
    self.work_dir = os.getcwd()
    print(f'workspace: {self.work_dir}')

    self.cfg = cfg

    self.logger = Logger(self.work_dir,
                         save_tb=cfg.log_save_tb,
                         log_frequency=cfg.log_frequency,
                         agent=cfg.agent.name)

    utils.set_seed_everywhere(cfg.seed)
    self.device = torch.device(cfg.device)
    self.env = utils.make_env(cfg)

    cfg.agent.params.obs_dim = self.env.observation_space.shape[0]
    cfg.agent.params.action_dim = self.env.action_space.shape[0]
    cfg.agent.params.action_range = [
        float(self.env.action_space.low.min()),
        float(self.env.action_space.high.max())
    ]
    self.agent = hydra.utils.instantiate(cfg.agent)

    self.replay_buffer = ReplayBuffer(self.env.observation_space.shape,
                                      self.env.action_space.shape,
                                      int(cfg.replay_buffer_capacity),
                                      self.device)

    self.video_recorder = VideoRecorder(self.work_dir if cfg.save_video else None)
    self.step = 0
def test(seed, model_filename, vec_filename, train, test, body_info=0, render=False):
    print("Testing:")
    print(f" Seed {seed}, model {model_filename} vec {vec_filename}")
    print(f" Train on {train}, test on {test}, w/ bodyinfo {body_info}")
    eval_env = utils.make_env(render=render, robot_body=test, body_info=body_info)
    eval_env = DummyVecEnv([eval_env])
    eval_env = VecNormalize.load(vec_filename, eval_env)
    eval_env.norm_reward = False
    eval_env.seed(seed)
    model = PPO.load(model_filename)

    obs = eval_env.reset()
    if render:
        eval_env.env_method("set_view")
    distance_x = 0
    # print(obs)
    total_reward = 0
    for step in range(1000):
        action, _states = model.predict(obs, deterministic=True)
        obs, reward, done, info = eval_env.step(action)
        if done:
            break
        else:
            # the last observation will be after reset, so skip the last
            distance_x = eval_env.envs[0].robot.body_xyz[0]
        total_reward += reward[0]
        if render:
            time.sleep(0.01)

    eval_env.close()
    print(f"train {train}, test {test}, body_info {body_info}, step {step}, "
          f"total_reward {total_reward}, distance_x {distance_x}")
    return total_reward, distance_x
def measure_change_through_time(path, env_name, policy, rep_params):
    env = make_env(env_name, 1, rep_params['seed'], max_path_length=rep_params['max_path_length'])

    global metrics
    metrics = ['CCA']

    sanity_task = env.sample_tasks(1)
    with torch.no_grad():
        env.set_task(sanity_task[0])
        env.seed(rep_params['seed'])
        env.reset()
        env_task = Runner(env)
        sanity_ep = env_task.run(policy, episodes=1)

    init_change_m = defaultdict(list)
    init_change_v = defaultdict(list)
    adapt_change_m = defaultdict(list)
    adapt_change_v = defaultdict(list)

    checkpoints = path + '/model_checkpoints/'
    i = 0
    file_list = os.listdir(checkpoints)
    file_list = [file for file in file_list if 'baseline' not in file]
    models_list = {}
    for file in file_list:
        n_file = file.split('_')[-1]
        n_file = n_file.split('.')[0]
        n_file = int(n_file)
        models_list[n_file] = f'model_{n_file}.pt'

    prev_policy = policy
    for key in sorted(models_list.keys()):
        model_chckpnt = models_list[key]
        if i > 40:
            break
        i += 1

        print(f'Loading {model_chckpnt} ...')
        chckpnt_policy = DiagNormalPolicy(9, 4)
        chckpnt_policy.load_state_dict(torch.load(os.path.join(checkpoints, model_chckpnt)))
        chckpnt_policy = MAML(chckpnt_policy, lr=rep_params['inner_lr'])

        mean, variance = episode_mean_var(sanity_ep, policy, chckpnt_policy, layer=6)
        a_mean, a_variance = episode_mean_var(sanity_ep, prev_policy, chckpnt_policy, layer=6)
        init_change_m['CCA'] += [mean['CCA']]
        init_change_v['CCA'] += [variance['CCA']]
        adapt_change_m['CCA'] += [a_mean['CCA']]
        adapt_change_v['CCA'] += [a_variance['CCA']]

        prev_policy = chckpnt_policy

    for metric in metrics:
        plot_sim_across_steps(init_change_m[metric], init_change_v[metric], metric=metric,
                              title='Similarity between init and adapted (in %)')
    for metric in metrics:
        difference = [1 - x for x in adapt_change_m[metric]]
        plot_sim_across_steps(difference, adapt_change_v[metric], metric=metric,
                              title='Representation difference after each step (in %)')
def main():
    script_start = str(datetime.datetime.now()).replace(':', '-').replace(' ', 'T')
    args = utils.parse_args()
    args.script_start = script_start

    args_path = Path(f'args_{script_start}.json')
    with open(args_path, 'w') as f:
        json.dump(vars(args), f, indent=4)

    utils.setup(args.use_sb3, args.debug_nans)

    eval_seed = args.seed
    if eval_seed is not None:
        eval_seed += args.num_envs

    # ---------------- TRAINING STARTS HERE ----------------

    # Set up gym environment
    env = utils.make_env(args, include_norm=True)
    # Set up model
    model = setup_model(args, env)

    callbacks = []
    utils.append_callback(callbacks, utils.create_save_callback(args))
    utils.append_callback(callbacks, utils.create_eval_callback(args))

    dry_run(model, env, int(args.warmup_steps))

    env.seed(args.seed)

    start_time = time.perf_counter()
    # Train the model (need to put at least 100k steps to see something)
    model.learn(total_timesteps=int(args.steps), callback=callbacks)
    duration = time.perf_counter() - start_time
    print(f'Training took {duration} seconds.')
    # env.envs[0].plot_rewards()
    print('Number of episodes in each environment:',
          [env_.num_episodes for env_ in env.envs])

    model_fname = Path(f'sdc_model_{args.model_class.lower()}_'
                       f'{args.policy_class.lower()}_{script_start}.zip')
    model.save(str(model_fname))
    env_fname = Path(f'sdc_env_{script_start}.pkl')
    utils.save_env(env_fname, env)

    # delete trained model to demonstrate loading, not really necessary
    # del model

    # ---------------- TESTING STARTS HERE ----------------
    fig_path = Path(f'results_{script_start}.pdf')
    run_tests(model, args, seed=eval_seed, fig_path=fig_path)
def make_envs(procs, env_name, seed, extrap_min, extrap_max):
    envs = []
    for i in range(procs):
        env = utils.make_env(env_name, seed + 100000 * i,
                             {"extrapolate_min": extrap_min, "extrapolate_max": extrap_max})
        envs.append(env)
    env = ParallelEnv(envs)
    print("Environments loaded\n")
    return env
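# --- Hedged usage sketch (assumption): building 16 parallel environments with make_envs()
# above. "MyGridEnv-v0" is a placeholder environment id, and the extrapolation bounds are
# purely illustrative values.
envs = make_envs(procs=16, env_name="MyGridEnv-v0", seed=1,
                 extrap_min=0.0, extrap_max=1.0)
obs = envs.reset()  # one observation per process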
def main(num_games=10, load_checkpoint=False, env_name='PongNoFrameskip-v4'):
    env = make_env(env_name)
    best_score = -np.inf
    agent = DQNAgent(gamma=0.99, epsilon=1.0, lr=0.0001,
                     input_dims=env.observation_space.shape,
                     n_actions=env.action_space.n, mem_size=20000,
                     eps_min=0.1, batch_size=32, replace=1000,
                     eps_dec=1e-5, chkpt_dir='models/',
                     algo='DQNAgent', env_name=env_name)

    if load_checkpoint:
        agent.load_models()

    fname = agent.algo + '_' + agent.env_name + '_lr' + str(agent.lr) + '_' \
            + str(num_games) + 'games'
    figure_file = 'plots/' + fname + '.png'

    n_steps = 0
    scores, eps_history, steps_array = [], [], []

    for i in range(num_games):
        done = False
        observation = env.reset()
        score = 0
        while not done:
            action = agent.choose_action(observation)
            observation_, reward, done, info = env.step(action)
            score += reward

            if not load_checkpoint:
                agent.store_transition(observation, action, reward, observation_, int(done))
                agent.learn()
            observation = observation_
            n_steps += 1

        scores.append(score)
        steps_array.append(n_steps)

        avg_score = np.mean(scores[-100:])
        print('episode: ', i, 'score: ', score,
              ' average score %.1f' % avg_score, 'best score %.2f' % best_score,
              'epsilon %.2f' % agent.epsilon, 'steps', n_steps)

        if avg_score > best_score:
            # if not load_checkpoint:
            #     agent.save_models()
            best_score = avg_score

        eps_history.append(agent.epsilon)
        if load_checkpoint and n_steps >= 18000:
            break

    x = [i + 1 for i in range(len(scores))]
    plot_learning_curve(steps_array, scores, eps_history, figure_file)
def create_env(n_envs, eval_env=False):
    if algo in ['a2c', 'acer', 'acktr', 'ppo2']:
        if n_envs > 1:
            env = SubprocVecEnv([make_env(env_id, i, args.seed, log_dir=monitor_path,
                                          wrapper_class=env_wrapper, env_kwargs=env_kwargs)
                                 for i in range(n_envs)])
        else:
            env = DummyVecEnv([make_env(env_id, 0, args.seed, log_dir=monitor_path,
                                        wrapper_class=env_wrapper, env_kwargs=env_kwargs)])
        env = DummyVecEnv([lambda: gym.make(env_id, **env_kwargs)])
        if env_wrapper is not None:
            env = env_wrapper(env)
    elif (algo in ['dqn', 'her', 'sac', 'td3']) and n_envs > 1:
        raise ValueError("Error: {} does not support multiprocessing!".format(algo))
    elif (algo in ['ddpg', 'ppo1', 'trpo', 'gail']) and n_envs > 1:
        raise ValueError("Error: {} uses MPI for multiprocessing!".format(algo))
    else:
        env = make_vec_env(env_id, n_envs=n_envs, seed=args.seed,
                           monitor_dir=monitor_path, wrapper_class=env_wrapper,
                           env_kwargs=env_kwargs)

    if args.normalize:
        # choose from multiple options
        # env = VecNormalize(env, clip_obs=np.inf)
        env = VecNormalize(env, norm_reward=False, clip_obs=np.inf)
        # env = VecNormalize(env, norm_reward=False, clip_obs=np.inf, **normalize_kwargs)
    return env
def main():
    args = parse_args()
    env = make_env(args.env)
    model = get_model(args.policy_ckpt_dir)
    if args.reward_predictor_ckpt_dir:
        reward_predictor = get_reward_predictor(args.reward_predictor_ckpt_dir)
    else:
        reward_predictor = None
    run_agent(env, model, reward_predictor, args.frame_interval_ms)
def __call__(self, config):
    # Set random seeds: PyTorch, numpy.random, random
    set_global_seeds(seed=config['seed'])

    # Create environment and seed it
    env = make_env(seed=config['seed'], monitor=False, monitor_dir=None)
    # Create environment specification
    env_spec = EnvSpec(env)  # TODO: integrate within make_env globally

    # Create device
    device = torch.device('cuda' if config['cuda'] else 'cpu')

    # Create logger
    logger = Logger(name='logger')

    # Create policy
    network = MLP(config=config)
    policy = CategoricalPolicy(network=network, env_spec=env_spec)
    policy.network = policy.network.to(device)

    # Create optimizer
    optimizer = optim.Adam(policy.network.parameters(), lr=config['lr'])
    # Learning rate scheduler
    max_epoch = config['train_iter']  # max number of lr decays; note where lr_scheduler is stepped
    lambda_f = lambda epoch: 1 - epoch / max_epoch  # decay learning rate each training epoch
    lr_scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda_f)

    # Create agent
    agent_class = ActorCriticAgent  # REINFORCEAgent
    agent = agent_class(policy=policy, optimizer=optimizer, config=config,
                        lr_scheduler=lr_scheduler, device=device)

    # Create runner
    runner = Runner(agent=agent, env=env, gamma=config['gamma'])

    # Create engine
    engine = Engine(agent=agent, runner=runner, config=config, logger=logger)

    # Training
    train_output = engine.train()
    np.save('logs/returns_ActorCritic', train_output)

    return None
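# --- Hedged usage sketch (assumption): a minimal config dict covering only the keys that
# __call__() above reads directly (seed, cuda, lr, train_iter, gamma). The MLP network,
# agent, and engine may read additional keys not shown here, and the values are illustrative.
config = {
    'seed': 0,
    'cuda': False,      # run on CPU
    'lr': 1e-3,         # Adam learning rate
    'train_iter': 100,  # also the horizon of the linear lr decay
    'gamma': 0.99,      # discount factor
}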
def test(test_n, seed, model_filename, vec_filename, train, test, test_as_class=0,
         render=False, save_file="default.yml"):
    print("Testing:")
    total_rewards = []
    distance_xs = []
    for i in range(test_n):
        print(f" Seed {seed + i}, model {model_filename} vec {vec_filename}")
        print(f" Train on {train}, test on {test}, w/ bodyinfo {test_as_class}")
        eval_env = utils.make_env(render=render, wrapper=None, robot_body=test,
                                  body_info=test_as_class)
        eval_env = DummyVecEnv([eval_env])
        eval_env = VecNormalize.load(vec_filename, eval_env)
        eval_env.norm_reward = False
        eval_env.seed(seed + i)
        model = PPO.load(model_filename)

        obs = eval_env.reset()
        if render:
            eval_env.env_method("set_view")
        distance_x = 0
        # print(obs)
        total_reward = 0
        for step in range(1000):
            action, _states = model.predict(obs, deterministic=True)
            obs, reward, done, info = eval_env.step(action)
            if done:
                break
            else:
                # the last observation will be after reset, so skip the last
                distance_x = eval_env.envs[0].robot.body_xyz[0]
            total_reward += reward[0]
            if render:
                time.sleep(0.01)

        eval_env.close()
        print(f"train {train}, test {test}, test_as_class {test_as_class}, step {step}, "
              f"total_reward {total_reward}, distance_x {distance_x}")
        total_rewards.append(total_reward)
        distance_xs.append(distance_x)

    # cast to plain floats so yaml does not serialize numpy types
    total_rewards = [float(x) for x in total_rewards]
    distance_xs = [float(x) for x in distance_xs]

    data = {
        "title": "test",
        "train": train,
        "test": test,
        "total_reward": total_rewards,
        "distance_x": distance_xs,
    }
    with open(save_file, "w") as f:
        yaml.dump(data, f)
def _train(env_id, model_params, total_epochs, use_sigmoid_layer=False, is_evaluation=False):
    if is_evaluation:
        # evaluate_policy() must only take one environment
        envs = SubprocVecEnv([make_env(env_id)])
    else:
        envs = SubprocVecEnv([make_env(env_id) for _ in range(NUM_CPU)])
    # Normalize the envs during training and evaluation
    envs = VecNormalize(envs)

    # activation fn: use tanh for delta hedging and relu for mean reversion
    # learning rate: use 1e-7 for delta hedging and 1e-5 for mean reversion
    if use_sigmoid_layer:
        model = PPO2(SigmoidMlpPolicy, envs, n_steps=1, nminibatches=1,
                     learning_rate=lambda f: f * 1e-5, verbose=1,
                     policy_kwargs=dict(act_fun=tf.nn.relu), **model_params)
    else:
        model = PPO2(MlpLstmPolicy, envs, n_steps=1, nminibatches=1,
                     learning_rate=lambda f: f * 1e-5, verbose=1,
                     policy_kwargs=dict(act_fun=tf.nn.relu), **model_params)
    model.learn(total_timesteps=total_epochs * L)
    return envs, model
def __init__(self):
    self.env = make_env(scenario_name='scenarios/new_env')  # 'simple_spread'
    self.num_agents = self.env.n
    self.agents = [DDPGAgent(self.env, agent_id, actor_lr=0.0, critic_lr=0.0, gamma=1.0)
                   for agent_id in range(self.num_agents)]
    for agent in self.agents:
        # agent.actor.load_state_dict(torch.load('./saved_weights/actor_3000.weights',
        #                                        map_location=torch.device('cpu')))
        # agent.critic.load_state_dict(torch.load('./saved_weights/critic_3000.weights',
        #                                         map_location=torch.device('cpu')))
        pass
    self.reset()
def sanity_check(env_name, model_1, model_2, rep_params):
    # Sample a sanity batch
    env = make_env(env_name, 1, rep_params['seed'], max_path_length=rep_params['max_path_length'])
    env.active_env.random_init = False

    sanity_task = env.sample_tasks(1)

    with torch.no_grad():
        env.set_task(sanity_task[0])
        env.seed(rep_params['seed'])
        env.reset()
        env_task = Runner(env)
        init_sanity_ep = env_task.run(model_1, episodes=1)

        env.set_task(sanity_task[0])
        env.seed(rep_params['seed'])
        env.reset()
        env_task = Runner(env)
        adapt_sanity_ep = env_task.run(model_2, episodes=1)
        env_task.reset()
        adapt_2_sanity_ep = env_task.run(model_2, episodes=1)

        init_san_rew = init_sanity_ep.reward().sum().item()
        adapt_san_rew = adapt_sanity_ep.reward().sum().item()
        adapt_2_san_rew = adapt_2_sanity_ep.reward().sum().item()
        # print(f'Why are these not equal? They should be equal: '
        #       f'{init_san_rew}={adapt_san_rew}={adapt_2_san_rew}')
        # assert (init_san_rew == adapt_san_rew), "Environment initial states are random"

        init_sanity_state = init_sanity_ep[0].state

        init_rep_sanity = model_1.get_representation(init_sanity_state)
        init_rep_sanity_2 = model_1.get_representation(init_sanity_state, layer=3)

        adapt_rep_sanity = model_2.get_representation(init_sanity_state)
        adapt_rep_sanity_2 = model_2.get_representation(init_sanity_state, layer=3)

        init_rep_array = init_rep_sanity.detach().numpy()
        init_rep_2_array = init_rep_sanity_2.detach().numpy()
        adapt_rep_array = adapt_rep_sanity.detach().numpy()
        adapt_rep_2_array = adapt_rep_sanity_2.detach().numpy()

        print(f'Are the representations of the two models for the same state identical? '
              f'{np.array_equal(init_rep_array, adapt_rep_array)}')

        assert np.array_equal(init_rep_array, adapt_rep_array), "Representations not identical"
        assert np.array_equal(init_rep_2_array, adapt_rep_2_array), "Representations not identical"
def _eval_model(model, env_id, ob_shape, num_eps, plot=False):
    test_env = SubprocVecEnv([make_env(env_id)])
    sharpe_ratios = []
    for episode in range(num_eps):
        # Padding zeros to the test env to match the shape of the training env.
        zero_completed_obs = np.zeros((NUM_CPU,) + ob_shape)
        zero_completed_obs[0, :] = test_env.reset()
        state = None
        for _ in range(L):
            action, state = model.predict(zero_completed_obs, state=state, deterministic=True)
            zero_completed_obs[0, :], reward, done, _ = test_env.env_method('step', action[0], indices=0)[0]
        sharpe_ratios.append(test_env.env_method('get_sharpe_ratio', indices=0)[0])
        if plot:
            test_env.env_method('render', indices=0)
    test_env.close()
    # Return the average sharpe ratio
    return sum(sharpe_ratios) / len(sharpe_ratios)
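# --- Hedged usage sketch (assumption): pairing _eval_model() with the _train() variant above
# that takes total_epochs. The env id is a placeholder, model_params would normally come from
# a hyperparameter search, and ob_shape must match a single training env's observation shape.
train_envs, model = _train('TradingEnv-v0', model_params={}, total_epochs=10)
avg_sharpe = _eval_model(model, 'TradingEnv-v0',
                         ob_shape=train_envs.observation_space.shape,
                         num_eps=5, plot=False)
print(f'Average Sharpe ratio over 5 evaluation episodes: {avg_sharpe:.3f}')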
def create_env(env_params):
    global hyperparams

    if algo_ in ['dqn']:
        env = gym.make(env_id, env_params=env_params)
        env.seed(args.seed)
        if env_wrapper is not None:
            env = env_wrapper(env)
    else:
        env = DummyVecEnv([make_env(env_id, 0, args.seed,
                                    wrapper_class=env_wrapper, env_params=env_params)])
        if normalize:
            if args.verbose > 0:
                if len(normalize_kwargs) > 0:
                    print("Normalization activated: {}".format(normalize_kwargs))
                else:
                    print("Normalizing input and reward")
            env = VecNormalize(env, **normalize_kwargs)
    return env
def eval_policy(policy, env_name, seed, eval_episodes=10):
    eval_env, _, _, _ = utils.make_env(env_name, atari_preprocessing)
    eval_env.seed(seed + 100)

    avg_reward = 0.
    for _ in range(eval_episodes):
        state, done = eval_env.reset(), False
        while not done:
            action = policy.select_action(np.array(state), eval=True)
            state, reward, done, _ = eval_env.step(action)
            avg_reward += reward

    avg_reward /= eval_episodes

    print("---------------------------------------")
    print(f"Evaluation over {eval_episodes} episodes: {avg_reward:.3f}")
    print("---------------------------------------")
    return avg_reward
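# --- Hedged usage sketch (assumption): calling eval_policy() periodically from a training
# loop and keeping the returns for later inspection; `policy`, `env_name`, `seed`, and
# `eval_freq` are assumed to exist in the surrounding training script.
evaluations = [eval_policy(policy, env_name, seed)]
# inside the training loop:
# if (t + 1) % eval_freq == 0:
#     evaluations.append(eval_policy(policy, env_name, seed))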