def train(self, sess=None):
    sess = self.sess
    created_session = True if (sess is None) else False
    if sess is None:
        sess = tf.Session()
        sess.__enter__()
    sess.run(tf.global_variables_initializer())
    self.start_worker()
    start_time = time.time()
    for itr in range(self.start_itr, self.n_itr):
        itr_start_time = time.time()
        with logger.prefix('itr #%d | ' % itr):
            logger.log("Obtaining samples...")
            logger.log("Collecting both agent and oracle samples...")
            paths, agent_only_paths = self.obtain_samples(itr, self.oracle_policy)
            logger.log("Processing samples...")
            samples_data = self.process_samples(itr, paths)
            agent_samples_data = self.process_agent_samples(itr, agent_only_paths)
            logger.log("Logging diagnostics...")
            self.log_diagnostics(paths)
            self.log_diagnostics(agent_only_paths)
            # Optimize the policy based on the collected samples.
            logger.log("Optimizing policy...")
            self.optimize_agent_policy(itr, agent_samples_data)
            self.optimize_policy(itr, samples_data)
            logger.log("Saving snapshot...")
            params = self.get_itr_snapshot(itr, samples_data)
            if self.store_paths:
                params["paths"] = samples_data["paths"]
            logger.save_itr_params(itr, params)
            logger.log("Saved")
            logger.record_tabular('Time', time.time() - start_time)
            logger.record_tabular('ItrTime', time.time() - itr_start_time)
            logger.dump_tabular(with_prefix=False)
            if self.plot:
                rollout(self.env, self.policy, animated=True,
                        max_path_length=self.max_path_length)
                if self.pause_for_plot:
                    input("Plotting evaluation run: Press Enter to "
                          "continue...")
    self.shutdown_worker()
    if created_session:
        sess.close()
def eval_performance(policy, env, period, max_path_length, num_rollouts, seed=0):
    # Fix the policy period, run the rollouts, and aggregate the returns.
    ext.set_seed(seed)
    returns = []
    if isinstance(policy, HierarchicalPolicyRandomTime):
        with policy.fix_period(period):
            for _ in trange(num_rollouts):
                returns.append(np.sum(
                    rollout(env, policy, max_path_length=max_path_length)['rewards']))
    else:
        policy.period = period
        for _ in trange(num_rollouts):
            returns.append(np.sum(
                rollout(env, policy, max_path_length=max_path_length)['rewards']))
    return returns
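# Hedged usage sketch (not part of the original sources): one way eval_performance
# above might be called to compare several candidate periods. The period values,
# rollout settings, and the helper name compare_periods are illustrative assumptions.
def compare_periods(policy, env, periods=(1, 5, 10), max_path_length=500,
                    num_rollouts=20, seed=0):
    results = {}
    for p in periods:
        rets = eval_performance(policy, env, period=p,
                                max_path_length=max_path_length,
                                num_rollouts=num_rollouts, seed=seed)
        # Record the mean and spread of the undiscounted returns for this period.
        results[p] = (np.mean(rets), np.std(rets))
    return results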
def validate(self, itr, objs):
    summaries = []
    keys = objs.keys()
    if 'samples_data' in keys:
        summaries += self._summarize_samples_data(objs['samples_data'])
    if 'env' in keys:
        # Extract the relevant wrapped environments.
        normalized_env = hgail.misc.utils.extract_wrapped_env(objs['env'], NormalizedEnv)
        if normalized_env is None:
            normalized_env = hgail.misc.utils.extract_wrapped_env(
                objs['env'], VectorizedNormalizedEnv)
        julia_env = hgail.misc.utils.extract_wrapped_env(objs['env'], JuliaEnv)
        summaries += self._summarize_obs_mean_std(
            normalized_env._obs_mean,
            np.sqrt(normalized_env._obs_var),
            self.obs_mean,
            self.obs_std,
            julia_env.obs_names())
    # Render a trajectory; this must save to file on its own.
    if self.render and 'env' in keys and 'policy' in keys and (itr % self.render_every) == 0:
        if objs['env'].vectorized:
            vectorized_render_rollout(objs['env'], objs['policy'], max_path_length=200)
        else:
            rollout(objs['env'], objs['policy'], animated=True, max_path_length=200)
    self.write_summaries(itr, summaries)
def _worker_start():
    env = None
    policy = None
    max_length = None
    try:
        while True:
            msgs = {}
            # Only fetch the last message of each type.
            while True:
                try:
                    msg = queue.get_nowait()
                    msgs[msg[0]] = msg[1:]
                except Empty:
                    break
            if 'stop' in msgs:
                break
            elif 'update' in msgs:
                env, policy = msgs['update']
                # env.start_viewer()
            elif 'demo' in msgs:
                param_values, max_length = msgs['demo']
                policy.set_param_values(param_values)
                rollout(env, policy, max_path_length=max_length,
                        animated=True, speedup=5)
            else:
                if max_length:
                    rollout(env, policy, max_path_length=max_length,
                            animated=True, speedup=5)
    except KeyboardInterrupt:
        pass
def generate_expert_dp():
    env = TfEnv(normalize(InvertedPendulumEnv()))
    policy = GaussianMLPPolicy(
        name="expert_policy",
        env_spec=env.spec,
        # The neural network policy has two hidden layers, each with 64 hidden units.
        hidden_sizes=(64, 64),
        std_hidden_sizes=(64, 64),
        adaptive_std=True,
    )
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=100,
        n_itr=64,
        discount=0.995,
        step_size=0.01,
        optimizer=ConjugateGradientOptimizer(
            hvp_approach=FiniteDifferenceHvp(base_eps=1e-5)),
        gae_lambda=0.97,
    )
    with tf.Session() as sess:
        algo.train(sess=sess)
        t = rollout(env=env, agent=policy, max_path_length=100, animated=False)
        print(sum(t['rewards']))
        with open('expert_dp.pickle', 'wb') as handle:
            pickle.dump(policy, handle)
        while True:
            rollout(env=env, agent=policy, max_path_length=100, animated=False)
def train(self, sess=None):
    created_session = True if (sess is None) else False
    if sess is None:
        sess = tf.Session()
        sess.__enter__()
    sess.run(tf.global_variables_initializer())
    self.start_worker()
    start_time = time.time()
    AvgDisReturn = []
    AvgReturn = []
    for itr in range(self.start_itr, self.n_itr):
        itr_start_time = time.time()
        with logger.prefix('itr #%d | ' % itr):
            logger.log("Obtaining samples...")
            paths = self.obtain_samples(itr)
            logger.log("Processing samples...")
            samples_data = self.process_samples(itr, paths)
            logger.log("Logging diagnostics...")
            self.log_diagnostics(paths)
            logger.log("Optimizing policy...")
            self.optimize_policy(itr, samples_data)
            logger.log("Saving snapshot...")
            params = self.get_itr_snapshot(itr, samples_data)
            if self.store_paths:
                params["paths"] = samples_data["paths"]
            logger.save_itr_params(itr, params)
            logger.log("Saved")
            logger.record_tabular('Time', time.time() - start_time)
            logger.record_tabular('ItrTime', time.time() - itr_start_time)
            # Track the running curves reported by the logger.
            AvgDisReturn.append(float(dict(logger._tabular)["AverageDiscountedReturn"]))
            AvgReturn.append(float(dict(logger._tabular)["AverageReturn"]))
            logger.dump_tabular(with_prefix=False)
            if self.plot:
                rollout(self.env, self.policy, animated=True,
                        max_path_length=self.max_path_length)
                if self.pause_for_plot:
                    input("Plotting evaluation run: Press Enter to "
                          "continue...")
    store("AvgDisReturn.dat", AvgDisReturn)
    store("AvgReturn.dat", AvgReturn)
    self.shutdown_worker()
    if created_session:
        sess.close()
def train(self, sess=None):
    created_session = True if (sess is None) else False
    if sess is None:
        sess = tf.Session()
        sess.__enter__()
    global_step = tf.train.get_or_create_global_step()
    global_step_inc = global_step.assign_add(1)
    sess.run(tf.global_variables_initializer())
    self.start_worker()
    start_time = time.time()
    total_timesteps = 0
    for itr in range(self.start_itr, self.n_itr):
        itr_start_time = time.time()
        with logger.prefix('itr #%d | ' % itr):
            logger.log("Obtaining samples...")
            with _MeasureTime('ObtainSamplesTime'):
                paths = self.obtain_samples(itr)
            logger.log("Processing samples...")
            with _MeasureTime('ProcessPathsTime'):
                self.process_paths(paths)
            with _MeasureTime('ProcessSamplesTime'):
                samples_data = self.process_samples(itr, paths)
            timesteps = len(samples_data['observations'])
            total_timesteps += timesteps
            logger.log("Logging diagnostics...")
            self.log_diagnostics(paths)
            logger.log("Optimizing policy...")
            with _MeasureTime('OptimizePolicyTime'):
                self.optimize_policy(itr, samples_data)
            logger.log("Saving snapshot...")
            params = self.get_itr_snapshot(itr, samples_data)
            if self.store_paths:
                params["paths"] = samples_data["paths"]
            logger.save_itr_params(itr, params)
            logger.log("Saved")
            logger.record_tabular('Time', time.time() - start_time)
            logger.record_tabular('ItrTime', time.time() - itr_start_time)
            logger.record_tabular('Timesteps', timesteps)
            logger.record_tabular('TotalTimesteps', total_timesteps)
            logger.dump_tabular(with_prefix=False)
            if self.plot:
                rollout(self.env, self.policy, animated=True,
                        max_path_length=self.max_path_length)
                if self.pause_for_plot:
                    input("Plotting evaluation run: Press Enter to "
                          "continue...")
            sess.run(global_step_inc)
    self.shutdown_worker()
    if created_session:
        sess.close()
def optimize(self, iter=0):
    # Collect paths.
    n_starts = len(self.start_states)
    for itr in range(self.algo_alice.n_itr):
        paths_alice = []
        paths_bob = []
        new_start_states = []
        for i in range(self.num_rollouts):
            self.env_alice.update_start_generator(
                FixedStateGenerator(self.start_states[i % n_starts]))
            paths_alice.append(
                rollout(self.env_alice, self.policy_alice,
                        max_path_length=self.max_path_length, animated=False))
            alice_end_obs = paths_alice[i]['observations'][-1]
            new_start_state = self.env_alice._obs2start_transform(alice_end_obs)
            new_start_states.append(new_start_state)
            self.env_bob.update_start_generator(FixedStateGenerator(new_start_state))
            paths_bob.append(
                rollout(self.env_bob, self.policy_bob,
                        max_path_length=self.max_path_length, animated=False))
        # Update rewards.
        paths_alice, paths_bob = self.update_rewards(
            paths_alice=paths_alice, paths_bob=paths_bob, gamma=self.gamma)
        # Optimize the policies.
        if self.optimize_alice:
            self.algo_alice.start_worker()
            self.algo_alice.init_opt()
            training_samples_alice = self.algo_alice.sampler.process_samples(
                itr=iter, paths=paths_alice)
            self.algo_alice.optimize_policy(itr=iter, samples_data=training_samples_alice)
        if self.optimize_bob:
            self.algo_bob.start_worker()
            self.algo_bob.init_opt()
            training_samples_bob = self.algo_bob.sampler.process_samples(
                itr=iter, paths=paths_bob)
            self.algo_bob.optimize_policy(itr=iter, samples_data=training_samples_bob)
    return np.array(new_start_states)
def train(self, sess=None):
    created_session = True if (sess is None) else False
    if sess is None:
        sess = tf.Session()
        sess.__enter__()
    sess.run(tf.global_variables_initializer())
    self.start_worker()
    start_time = time.time()
    for itr in range(self.start_itr, self.n_itr):
        itr_start_time = time.time()
        with logger.prefix('itr #%d | ' % itr):
            logger.log("Obtaining samples...")
            paths = self.obtain_samples(itr)
            logger.log("Processing samples...")
            samples_data = self.process_samples(itr, paths)
            logger.log("Logging diagnostics...")
            self.log_diagnostics(paths)
            logger.log("Optimizing policy...")
            self.optimize_policy(itr, samples_data, self._wandb_dict)
            logger.log("Saving snapshot...")
            params = self.get_itr_snapshot(itr, samples_data)
            if self.store_paths:
                params["paths"] = samples_data["paths"]
            logger.save_itr_params(itr, params)
            logger.log("Saved")
            logger.record_tabular('Time', time.time() - start_time)
            logger.record_tabular('ItrTime', time.time() - itr_start_time)
            logger.dump_tabular(with_prefix=False)
            if self.plot:
                rollout(self.env, self.policy, animated=True,
                        max_path_length=self.max_path_length)
                if self.pause_for_plot:
                    input("Plotting evaluation run: Press Enter to "
                          "continue...")
            if self._render:
                fn = self._gif_header + str(itr) + '.gif'
                # Obtain the gym env from the rllab env wrapper.
                render_env(self.env.wrapped_env.env, path=self._gif_dir, filename=fn)
                if self._log_wandb:
                    full_fn = os.path.join(os.getcwd(), self._gif_dir, fn)
                    wandb.log({"video": wandb.Video(full_fn, fps=60, format="gif")})
            if self._log_wandb:
                wandb.log(self._wandb_dict)
    self.shutdown_worker()
    if created_session:
        sess.close()
def _worker_collect_one_path(G, max_path_length, scope=None):
    G = _get_scoped_G(G, scope)
    path = rollout(G.env, G.policy, max_path_length)
    if 'broke_sim' in path['env_infos']:
        # Resample if the simulation broke during the rollout.
        while path['env_infos']['broke_sim'][-1]:
            path = rollout(G.env, G.policy, max_path_length)
    return [path], len(path["rewards"])
def collect_demo(G, demo_seed, analogy_seed, target_seed, env_cls, demo_policy_cls, horizon):
    demo_env = env_cls(seed=demo_seed, target_seed=target_seed)
    analogy_env = env_cls(seed=analogy_seed, target_seed=target_seed)
    demo_path = rollout(demo_env, demo_policy_cls(demo_env), max_path_length=horizon)
    analogy_path = rollout(analogy_env, demo_policy_cls(analogy_env), max_path_length=horizon)
    return demo_path, analogy_path, demo_env, analogy_env
def simulate_policy(args):
    with tf.Session():
        data = joblib.load(args.file)
        if 'algo' in data.keys():
            policy = data['algo'].policy
            env = data['algo'].env
        else:
            policy = data['policy']
            env = data['env']
        while True:
            rollout(env, policy, max_path_length=args.max_path_length,
                    animated=True, speedup=args.speedup)
def train(self, sess=None, interaction_policy=None, log_dir=None):
    created_session = True if (sess is None) else False
    if sess is None:
        sess = tf.Session()
        sess.__enter__()
    sess.run(tf.global_variables_initializer())
    # Load the tf models in the interaction policy.
    interaction_policy.load_models()
    self.start_worker()
    start_time = time.time()
    for itr in range(self.start_itr, self.n_itr):
        itr_start_time = time.time()
        with logger.prefix('itr #%d | ' % itr):
            logger.log("Obtaining samples...")
            paths = self.obtain_samples(itr)
            logger.log("Processing samples...")
            samples_data = self.process_samples(itr, paths)
            logger.log("Logging diagnostics...")
            self.log_diagnostics(paths)
            logger.log("Optimizing policy...")
            self.optimize_policy(itr, samples_data)
            logger.log("Saving snapshot...")
            params = self.get_itr_snapshot(itr, samples_data)
            if self.store_paths:
                params["paths"] = samples_data["paths"]
            logger.save_itr_params(itr, params)
            logger.log("Saved")
            logger.record_tabular('Time', time.time() - start_time)
            logger.record_tabular('ItrTime', time.time() - itr_start_time)
            logger.dump_tabular(with_prefix=False)
            if self.plot:
                rollout(self.env, self.policy, animated=True,
                        max_path_length=self.max_path_length)
                if self.pause_for_plot:
                    input("Plotting evaluation run: Press Enter to "
                          "continue...")
            if itr % 200 == 0 and log_dir is not None:
                pickle.dump(self, open(log_dir + "/algo_itr_" + str(itr) + ".p", "wb"))
    self.shutdown_worker()
    if created_session:
        sess.close()
def _worker_rollout_policy(G, args):
    sample_std = args["sample_std"].flatten()
    cur_mean = args["cur_mean"].flatten()
    n_evals = args["n_evals"]
    K = len(cur_mean)
    params = np.random.standard_normal(K) * sample_std + cur_mean
    G.policy.set_param_values(params)
    paths, returns, undiscounted_returns = [], [], []
    for _ in range(n_evals):
        path = rollout(G.env, G.policy, args["max_path_length"])
        path["returns"] = discount_cumsum(path["rewards"], args["discount"])
        path["undiscounted_return"] = sum(path["rewards"])
        paths.append(path)
        returns.append(path["returns"])
        undiscounted_returns.append(path["undiscounted_return"])
    result_path = {'full_paths': paths}
    result_path['undiscounted_return'] = _get_stderr_lb(undiscounted_returns)
    result_path['returns'] = _get_stderr_lb_varyinglens(returns)
    # n_evals does not count toward the criteria below, since it is multiple
    # evaluations of a single parameter set.
    if args["criterion"] == "samples":
        inc = len(path["rewards"])
    elif args["criterion"] == "paths":
        inc = 1
    else:
        raise NotImplementedError
    return (params, result_path), inc
def compute_alice_reward(self, next_obs):
    alice_end_obs = next_obs
    if self.start_generation:
        bob_start_state = self._obs2start_transform(alice_end_obs)
        self.env_bob.update_start_generator(FixedStateGenerator(bob_start_state))
    else:
        bob_goal_state = self._obs2goal_transform(alice_end_obs)
        self.env_bob.update_goal_generator(FixedStateGenerator(bob_goal_state))
    path_bob = rollout(
        self.env_bob,
        self.policy_bob,
        max_path_length=max(5, self.max_path_length - self.time),
        # animated=False,
    )
    t_alice = self.time
    t_bob = path_bob['rewards'].shape[0]
    reward = self.gamma * max(
        0, self.alice_bonus + t_bob - self.alice_factor * t_alice)
    return reward
def test_rand_step_adv(env, protag_policy, path_length=100, n_traj=5, render=False):
    paths = []
    sum_rewards = 0.0
    characteristic_length = path_length / 5
    step_size = path_length / 10
    for _ in range(n_traj):
        adv_policy = StepControlPolicy(
            env_spec=env.spec,
            characteristic_length=characteristic_length,
            step_size=step_size,
            is_random_mag=True,
            is_protagonist=False,
        )
        path = rollout(env, protag_policy, path_length,
                       adv_agent=adv_policy, animated=render, test=True)
        sum_rewards += path['rewards'].sum()
        paths.append(path)
    avg_rewards = sum_rewards / n_traj
    return avg_rewards
def main():
    args = parse_arguments()
    profiler = cProfile.Profile()
    data = joblib.load(args.file)
    policy = data['policy']
    env = data['env']
    plt.ion()
    # Set a fixed random seed.
    np.random.seed(9)
    # Sample one rollout under the profiler.
    profiler.enable()
    path = rollout(env, policy, max_path_length=args.max_path_length,
                   animated=args.render, speedup=args.speedup,
                   always_return_paths=True)
    profiler.disable()
    # Policy analysis.
    profile_code(profiler)
    plot_curve(path['env_infos']['dist'], 'Distance', 'm')
    plot_curve(path['env_infos']['vel'], 'Velocity', 'm/s')
    plot_distribution(path['env_infos']['dist'], 'Distance', 'm')
    plot_distribution(path['env_infos']['vel'], 'Velocity', 'm/s')
    # Block until a key is pressed.
    sys.stdout.write("Press <enter> to continue: ")
    input()
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('policy_file', type=str)
    parser.add_argument('--vid', type=str, default='/tmp/madrl.mp4')
    parser.add_argument('--verbose', action='store_true', default=False)
    parser.add_argument('--n_steps', type=int, default=200)
    parser.add_argument('--map_file', type=str, default='')
    args = parser.parse_args()

    policy_dir = osp.dirname(args.policy_file)
    params_file = osp.join(policy_dir, 'params.json')
    # Load the training parameters.
    with open(params_file) as data_file:
        train_args = json.load(data_file)
    print('Loading parameters from {} in {}'.format(policy_dir, 'params.json'))

    with tf.Session() as sess:
        data = joblib.load(args.policy_file)
        policy = data['policy']
        env = data['env']
        if train_args['control'] == 'centralized':
            paths = rollout(env, policy, max_path_length=args.n_steps, animated=True)
        elif train_args['control'] == 'decentralized':
            paths = decrollout(env, policy, max_path_length=args.n_steps, animated=True)
def evaluate_state(state, env, policy, horizon, n_traj=1, full_path=False,
                   key='rewards', as_goals=True, aggregator=(np.sum, np.mean)):
    aggregated_data = []
    paths = []
    if as_goals:
        env.update_goal_generator(FixedStateGenerator(state))
    else:
        env.update_start_generator(FixedStateGenerator(state))
    for j in range(n_traj):
        paths.append(rollout(env, policy, horizon))
        if key in paths[-1]:
            aggregated_data.append(aggregator[0](paths[-1][key]))
        else:
            aggregated_data.append(aggregator[0](paths[-1]['env_infos'][key]))
    mean_reward = aggregator[1](aggregated_data)
    if full_path:
        return mean_reward, paths
    return mean_reward
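# Hedged usage sketch (not part of the original source): evaluate_state above is
# typically mapped over a batch of candidate states. The helper name and the
# [low, high] band used to keep states of intermediate difficulty are
# illustrative assumptions, not values taken from the source.
def filter_states_by_difficulty(states, env, policy, horizon, n_traj=3,
                                low=0.1, high=0.9):
    kept = []
    for state in states:
        mean_reward = evaluate_state(state, env, policy, horizon, n_traj=n_traj)
        # Keep states that are neither trivially solved nor hopeless.
        if low <= mean_reward <= high:
            kept.append(state)
    return kept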
def _worker_collect_one_path(G, max_path_length, itr, obs_mean, obs_std, act_mean, act_std):
    # Path rollout.
    path = rollout(G.env, G.policy, max_path_length)

    # Computing intrinsic rewards.
    # ----------------------------
    # Save the original (extrinsic) reward.
    path['rewards_extrinsic'] = np.array(path['rewards'])
    if itr > 0:
        # Compute the intrinsic reward as the dynamics model's surprise on each
        # normalized (observation, action) -> next-observation transition.
        obs = (path['observations'] - obs_mean) / (obs_std + 1e-8)
        act = (path['actions'] - act_mean) / (act_std + 1e-8)
        rew = path['rewards']
        # inputs = (o, a), target = o'
        obs_nxt = np.vstack([obs[1:]])
        _inputs = np.hstack([obs[:-1], act[:-1]])
        _targets = obs_nxt
        surprise = np.zeros(rew.shape)
        surprise[:len(_inputs)] = G.dynamics.surprise_fn(_inputs, _targets)
        # The last observation has no next observation, so reuse the
        # second-to-last surprise value.
        surprise[-1] = surprise[-2]
        # Stuff it in the path.
        path['surprise'] = surprise
    # ----------------------------
    return path, len(path["rewards"])
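# Hedged follow-up sketch (not from the original source): downstream, the per-step
# surprise stored above is commonly folded into the training reward with a bonus
# coefficient. The coefficient name eta and the simple additive form are
# illustrative assumptions about how such a bonus might be applied.
def add_surprise_bonus(path, eta=0.01):
    bonus = path.get('surprise', np.zeros_like(path['rewards_extrinsic']))
    # Train on extrinsic reward plus a scaled exploration bonus.
    path['rewards'] = path['rewards_extrinsic'] + eta * bonus
    return path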
def test_expert_reacher():
    with tf.Session() as sess:
        env = TfEnv(normalize(ReacherEnv()))
        expert = load_expert_reacher(env, sess)
        while True:
            t = rollout(env=env, agent=expert, max_path_length=50, animated=True)
            print(np.mean(sum(t['rewards'])))
def ant_evaluate(env, policy, init_state=None, max_path_length=2000,
                 animated=True, speedup=2):
    if init_state is not None:
        if len(init_state) == 2:
            # The first two positions are the COM; fill in the remaining
            # default joint state.
            init_state.extend([0.55, 1, 0, 0, 0, 0, 1, 0, -1, 0, -1, 0, 1])
        env.update_start_generator(FixedStateGenerator(init_state))
    path = rollout(env, policy, max_path_length=max_path_length,
                   animated=animated, speedup=speedup)
    print("Trajectory length: {}".format(len(path["rewards"])))
    print("Success: {}".format(path["rewards"][-1]))
    return path["rewards"][-1]
def get_max_reward(env, policy, num_trajs=200):
    best_reward = 0.0
    for i in range(num_trajs):
        info = rollout(env, policy)
        print("Finished traj", i)
        best_reward = max(best_reward, np.sum(info['rewards']))
    print("Max reward: ", best_reward)
def get_velocities(policy, env, max_path_length, num_rollouts, seed=0):
    ext.set_seed(seed)
    angles = []
    for _ in trange(num_rollouts):
        rollout_result = rollout(env, policy, max_path_length=max_path_length)
        angles.append(rollout_result['env_infos']['joint_angles'])
    return angles
def rollout_row(train_config_num, env_ind, env, q):
    mean_rollouts = np.zeros(len(phi_configs))
    std_rollouts = np.zeros(len(phi_configs))
    # Iterate over test configurations.
    for test_config_num, test_config in enumerate(phi_configs):
        print("train config num : {}".format(train_config_num))
        print("test config num : {}".format(test_config_num))
        rollouts = []
        # Iterate over agents.
        for agent_num in range(num_agents):
            real_config_num = train_config_num - 1
            if train_config_num == 0:
                real_config_num = "nominal"
            file_str = '../policies_curriculum/{}/policy_{}_config_{}_agent_{}'.format(
                dynamic_environments[env_ind], dynamic_environments[env_ind],
                real_config_num, agent_num)
            # Read in the agent's policy.
            policy = loadModel(file_str)
            if train_config_num == 0:
                # Set the configuration for the nominal policy.
                policy.set_config(test_config)
                curriculum = None
            else:
                # The policy config is set through the curriculum; with only one
                # element, this is guaranteed to be the config during rollouts.
                assert isinstance(policy, CurriculumPolicy)
                curriculum = [test_config]
            cum_rewards = []
            for i in range(num_rollouts):
                rollout_dict = rollout(env=env, agent=policy,
                                       max_path_length=env.horizon,
                                       curriculum=curriculum)
                cum_rewards.append(np.sum(rollout_dict["rewards"]))
            rollouts.append(cum_rewards)
        mean_rollouts[test_config_num] = np.mean(rollouts)
        std_rollouts[test_config_num] = np.std(rollouts)
        q.put((train_config_num, test_config_num,
               mean_rollouts[test_config_num], std_rollouts[test_config_num]))
    # Write to file in case something goes wrong with multiprocessing.
    saveModel([mean_rollouts, std_rollouts],
              'rollouts_{}_config_{}'.format(dynamic_environments[env_ind],
                                             train_config_num))
    print("GOT HERE {}".format(train_config_num))
    return
def train(self, sess=None):
    created_session = True if (sess is None) else False
    if sess is None:
        sess = tf.Session(config=get_session_config())
        sess.__enter__()
    sess.run(tf.global_variables_initializer())
    self.start_worker()
    start_time = time.time()
    for itr in range(self.start_itr, self.n_itr):
        itr_start_time = time.time()
        with logger.prefix('itr #%d | ' % itr):
            logger.log("Obtaining samples...")
            paths = self.obtain_samples(itr)
            logger.log("Processing samples...")
            samples_data = self.process_samples(itr, paths)
            logger.log("Logging diagnostics...")
            self.log_diagnostics(paths)
            logger.log("Optimizing policy...")
            self.optimize_policy(itr, samples_data)
            logger.log("Saving snapshot...")
            params = self.get_itr_snapshot(itr, samples_data)
            if self.store_paths:
                params["paths"] = samples_data["paths"]
            save_itr_params_pickle(itr, params)
            prune_old_snapshots(itr, keep_every=self.snap_keep_every,
                                keep_latest=self.snap_keep_latest)
            logger.log("Saved")
            logger.record_tabular('Time', time.time() - start_time)
            logger.record_tabular('ItrTime', time.time() - itr_start_time)
            logger.dump_tabular(with_prefix=False)
            if self.plot:
                rollout(self.env, self.policy, animated=True,
                        max_path_length=self.max_path_length)
                if self.pause_for_plot:
                    input("Plotting evaluation run: Press Enter to "
                          "continue...")
    self.shutdown_worker()
    if created_session:
        sess.close()
def simulate_policy(args):
    with tf.Session() as sess:
        data = joblib.load(args.file)
        if 'algo' in data.keys():
            policy = data['algo'].policy
            # env = data['algo'].env
        else:
            policy = data['policy']
            # env = data['env']

        SIM_TIMESTEP = 0.01
        FRAME_SKIP = 1
        DT = SIM_TIMESTEP * FRAME_SKIP
        env_params = dict(
            is_render=True,
            obs_with_img=False,
            active_joints='RA',
            control_mode='tasktorque',
            sim_timestep=SIM_TIMESTEP,
            frame_skip=FRAME_SKIP,
            obs_distances=False,
            balance_cost_weight=2.0,
            fall_cost_weight=2.0,
            tgt_cost_weight=2.0,
            balance_done_cost=2.0,  # TODO: keep the same balance weight
            tgt_done_reward=2.0,
            ctrl_cost_weight=1.0e-1,
            use_log_distances=True,
            log_alpha_pos=1e-4,
            log_alpha_ori=1e-4,
            goal_tolerance=0.05,
            min_obj_height=0.60,
            max_obj_height=1.20,
            max_obj_distance=0.20,
            max_time=None,
        )
        env = normalize(CentauroTrayEnv(**env_params))

        with policy.deterministic(args.deterministic):
            while True:
                path = rollout(env, policy, max_path_length=args.max_path_length,
                               animated=True, speedup=args.speedup)
                input("Press a key to re-sample...")
def visualizer_rllab(args):
    """Visualizer for rllab experiments.

    This function takes args (see create_parser below for details on what can be
    fed to this visualizer) and renders the experiment associated with it.
    """
    # Extract the flow environment.
    data = joblib.load(args.file)
    policy = data['policy']
    env = data['env']

    # FIXME(ev, ak): only one of these should be needed.
    # unwrapped_env = env._wrapped_env._wrapped_env.env.unwrapped
    # unwrapped_env = env.wrapped_env.env.env.unwrapped
    # If this doesn't work, try one of the variants above.
    unwrapped_env = env._wrapped_env.env.unwrapped

    # Set sumo to make a video.
    sim_params = unwrapped_env.sim_params
    sim_params.emission_path = './test_time_rollout/' if args.gen_emission else None
    if args.no_render:
        sim_params.render = False
    else:
        sim_params.render = True
    unwrapped_env.restart_simulation(sim_params=sim_params, render=sim_params.render)

    # Load data into arrays.
    rew = []
    for j in range(args.num_rollouts):
        # Run a single rollout of the experiment.
        path = rollout(env=env, agent=policy)
        # Collect the rewards from the rollout.
        new_rewards = path['rewards']
        # Print the cumulative reward of the most recent rollout.
        print('Round {}, return: {}'.format(j, sum(new_rewards)))
        rew.append(sum(new_rewards))
    # Print the average cumulative reward across rollouts.
    print('Average, std return: {}, {}'.format(np.mean(rew), np.std(rew)))

    # If prompted, convert the emission file into a csv file.
    if args.gen_emission:
        dir_path = os.path.dirname(os.path.realpath(__file__))
        emission_filename = '{0}-emission.xml'.format(unwrapped_env.scenario.name)
        emission_path = '{0}/test_time_rollout/{1}'.format(dir_path, emission_filename)
        emission_to_csv(emission_path)
def _worker_collect_one_path(G, max_path_length, include_original_frames, scope=None):
    G = _get_scoped_G(G, scope)
    path = rollout(G.env, G.policy, max_path_length,
                   include_original_frames=include_original_frames)
    return path, len(path["rewards"])
def sample_return(G, params, max_path_length, discount):
    # Of course we make the strong assumption that there is no race condition.
    G.policy.set_param_values(params)
    path = rollout(G.env, G.policy, max_path_length)
    path["returns"] = discount_cumsum(path["rewards"], discount)
    path["undiscounted_return"] = sum(path["rewards"])
    return path
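# For reference, a minimal sketch of the discount_cumsum helper used above and in
# the parameter-sampling workers: the discounted return-to-go at each timestep,
# returns[t] = sum_{k >= t} discount**(k - t) * rewards[k]. rllab-style code often
# implements this with scipy.signal.lfilter; this plain-NumPy version is an
# equivalent assumption, not the original implementation.
def discount_cumsum_ref(rewards, discount):
    rewards = np.asarray(rewards, dtype=np.float64)
    out = np.zeros_like(rewards)
    running = 0.0
    # Accumulate from the end of the trajectory backwards.
    for t in reversed(range(len(rewards))):
        running = rewards[t] + discount * running
        out[t] = running
    return out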
def train(self, sess=None):
    created_session = True if (sess is None) else False
    if sess is None:
        sess = tf.Session()
        sess.__enter__()
    sess.run(tf.global_variables_initializer())
    self.start_worker()
    start_time = time.time()
    for itr in range(self.start_itr, self.n_itr):
        itr_start_time = time.time()
        with logger.prefix('itr #%d | ' % itr):
            logger.log("Obtaining samples...")
            paths = self.obtain_samples(itr)
            logger.log("Processing samples...")
            samples_data = self.process_samples(itr, paths)
            logger.log("Logging diagnostics...")
            self.log_diagnostics(paths)
            logger.log("Optimizing policy...")
            self.optimize_policy(itr, samples_data)
            logger.log("Saving snapshot...")
            params = self.get_itr_snapshot(itr, samples_data)
            if self.store_paths:
                params["paths"] = samples_data["paths"]
            logger.save_itr_params(itr, params)
            logger.log("Saved")
            logger.record_tabular('Time', time.time() - start_time)
            logger.record_tabular('ItrTime', time.time() - itr_start_time)
            logger.dump_tabular(with_prefix=False)
            if self.plot:
                rollout(self.env, self.policy, animated=True,
                        max_path_length=self.max_path_length)
                if self.pause_for_plot:
                    input("Plotting evaluation run: Press Enter to "
                          "continue...")
    self.shutdown_worker()
    if created_session:
        sess.close()
def _worker_rollout_policy(G, args):
    sample_std = args["sample_std"].flatten()
    cur_mean = args["cur_mean"].flatten()
    K = len(cur_mean)
    params = np.random.standard_normal(K) * sample_std + cur_mean
    G.policy.set_param_values(params)
    path = rollout(G.env, G.policy, args["max_path_length"])
    path["returns"] = discount_cumsum(path["rewards"], args["discount"])
    path["undiscounted_return"] = sum(path["rewards"])
    if args["criterion"] == "samples":
        inc = len(path["rewards"])
    elif args["criterion"] == "paths":
        inc = 1
    else:
        raise NotImplementedError
    return (params, path), inc
                    help='Speedup')
parser.add_argument('--loop', type=int, default=1, help='# of loops')
args = parser.parse_args()

policy = None
env = None
while True:
    if ':' in args.file:
        # Fetch the file using ssh.
        os.system("rsync -avrz %s /tmp/%s.pkl" % (args.file, filename))
        data = joblib.load("/tmp/%s.pkl" % filename)
        if policy:
            new_policy = data['policy']
            policy.set_param_values(new_policy.get_param_values())
            path = rollout(env, policy, max_path_length=args.max_length,
                           animated=True, speedup=args.speedup)
        else:
            policy = data['policy']
            env = data['env']
            path = rollout(env, policy, max_path_length=args.max_length,
                           animated=True, speedup=args.speedup)
    else:
        data = joblib.load(args.file)
        policy = data['policy']
        env = data['env']
        path = rollout(env, policy, max_path_length=args.max_length,
                       animated=True, speedup=args.speedup)
        # print('Total reward: ', sum(path["rewards"]))
    args.loop -= 1
    if ':' not in args.file:
        if args.loop <= 0:
def _worker_collect_one_path(G, max_path_length, scope=None):
    G = _get_scoped_G(G, scope)
    path = rollout(G.env, G.policy, max_path_length)
    return path, len(path["rewards"])
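# All of the workers above rely on rollout(env, agent, max_path_length) returning a
# path dictionary with per-timestep arrays. Below is a minimal sketch (not the
# original implementation) of such a rollout loop; it assumes the rllab-style
# env/policy interfaces (env.reset(), env.step(a), agent.get_action(o),
# agent.reset()) and omits the rendering, speedup, and frame options used by some
# of the snippets in this section.
def rollout_sketch(env, agent, max_path_length):
    observations, actions, rewards, agent_infos, env_infos = [], [], [], [], []
    o = env.reset()
    agent.reset()
    path_length = 0
    while path_length < max_path_length:
        a, agent_info = agent.get_action(o)
        next_o, r, done, env_info = env.step(a)
        observations.append(o)
        actions.append(a)
        rewards.append(r)
        agent_infos.append(agent_info)
        env_infos.append(env_info)
        path_length += 1
        o = next_o
        if done:
            break
    # Per-timestep arrays; the real sampler additionally flattens observations and
    # stacks the info dicts.
    return dict(
        observations=np.array(observations),
        actions=np.array(actions),
        rewards=np.array(rewards),
        agent_infos=agent_infos,
        env_infos=env_infos,
    )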
                    help='Max length of rollout')
parser.add_argument('--speedup', type=float, default=1, help='Speedup')
parser.add_argument('--video_filename', type=str, help='path to the output video file')
parser.add_argument('--prompt', type=bool, default=False,
                    help='Whether or not to prompt for more sim')
args = parser.parse_args()

max_tries = 10
tri = 0
while True:
    tri += 1
    with tf.Session() as sess:
        data = joblib.load(args.file)
        policy = data['policy']
        env = data['env']
        while True:
            path = rollout(env, policy, max_path_length=args.max_path_length,
                           animated=True, speedup=args.speedup,
                           video_filename=args.video_filename)
            if args.prompt:
                if not query_yes_no('Continue simulation?'):
                    break
            else:
                break
    if len(path['rewards']) < args.max_path_length and tri >= max_tries:
        tf.reset_default_graph()
        continue
    break
def _worker_collect_one_path(G, max_path_length):
    path = rollout(G.env, G.policy, max_path_length)
    return path, len(path["rewards"])
def _worker_collect_one_path(G, max_path_length, itr, normalize_reward,
                             reward_mean, reward_std, kl_batch_size,
                             n_itr_update, use_replay_pool, obs_mean, obs_std,
                             act_mean, act_std, second_order_update):
    # Path rollout.
    path = rollout(G.env, G.policy, max_path_length)

    # Computing intrinsic rewards.
    # ----------------------------
    # Save the original reward.
    path['rewards_orig'] = np.array(path['rewards'])

    if itr > 0:
        # Compute the intrinsic reward by updating the dynamics model on each
        # minibatch of observations, calculating the KL divergence of the new
        # params to the old ones, and undoing this operation.
        obs = (path['observations'] - obs_mean) / (obs_std + 1e-8)
        act = (path['actions'] - act_mean) / (act_std + 1e-8)
        rew = path['rewards']
        # inputs = (o, a), target = o'
        obs_nxt = np.vstack([obs[1:]])
        _inputs = np.hstack([obs[:-1], act[:-1]])
        _targets = obs_nxt
        # The KL vector assumes the same shape as the reward.
        kl = np.zeros(rew.shape)
        for j in range(int(np.ceil(obs.shape[0] / float(kl_batch_size)))):
            # Save old params for every update.
            G.dynamics.save_old_params()
            start = j * kl_batch_size
            end = np.minimum((j + 1) * kl_batch_size, obs.shape[0] - 1)
            if second_order_update:
                # We do a line search over the best step sizes using
                # step_size * invH * grad.
                # best_loss_value = np.inf
                for step_size in [0.01]:
                    G.dynamics.save_old_params()
                    loss_value = G.dynamics.train_update_fn(
                        _inputs[start:end], _targets[start:end], step_size)
                    kl_div = np.clip(loss_value, 0, 1000)
                    # If using a replay pool, undo the updates.
                    if use_replay_pool:
                        G.dynamics.reset_to_old_params()
            else:
                # Update model weights based on the current minibatch.
                for _ in range(n_itr_update):
                    G.dynamics.train_update_fn(_inputs[start:end], _targets[start:end])
                # Calculate the current minibatch KL.
                kl_div = np.clip(float(G.dynamics.f_kl_div_closed_form()), 0, 1000)
            for k in range(start, end):
                kl[k] = kl_div
            # If using a replay pool, undo the updates.
            if use_replay_pool:
                G.dynamics.reset_to_old_params()
        # The last element in the KL vector needs to be replaced by the second-to-last
        # one because the actual last observation has no next observation.
        kl[-1] = kl[-2]
        # Stuff it in the path.
        path['KL'] = kl
    # ----------------------------
    return path, len(path["rewards"])