def train(self):
    memory = ReplayMem(
        obs_dim=self.env.observation_space.flat_dim,
        act_dim=self.env.action_space.flat_dim,
        memory_size=self.memory_size)
    itr = 0
    path_length = 0
    path_return = 0
    end = False
    obs = self.env.reset()
    for epoch in xrange(self.n_epochs):
        logger.push_prefix("epoch #%d | " % epoch)
        logger.log("Training started")
        for epoch_itr in pyprind.prog_bar(range(self.epoch_length)):
            # run the policy
            if end:
                # reset the environment and strategy when an episode ends
                obs = self.env.reset()
                self.strategy.reset()
                # self.policy.reset()
                self.strategy_path_returns.append(path_return)
                path_length = 0
                path_return = 0
            # note: the action is sampled from the policy, not the target policy
            act = self.strategy.get_action(obs, self.policy)
            nxt, rwd, end, _ = self.env.step(act)

            path_length += 1
            path_return += rwd

            if not end and path_length >= self.max_path_length:
                end = True
                # only store the artificial horizon-terminal transition if the flag is set
                if self.include_horizon_terminal:
                    memory.add_sample(obs, act, rwd, end)
            else:
                memory.add_sample(obs, act, rwd, end)

            obs = nxt

            if memory.size >= self.memory_start_size:
                for update_time in xrange(self.n_updates_per_sample):
                    batch = memory.get_batch(self.batch_size)
                    self.do_update(itr, batch)

            itr += 1

        logger.log("Training finished")
        if memory.size >= self.memory_start_size:
            self.evaluate(epoch, memory)
        logger.dump_tabular(with_prefix=False)
        logger.pop_prefix()
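# The training loop above only relies on three things from `ReplayMem`:
# `add_sample`, `get_batch`, and `size`. Below is a minimal ring-buffer sketch
# of that interface; it is an illustrative stand-in, not the actual rllab
# implementation, and the attribute names are assumptions.
import numpy as np


class MinimalReplayMem(object):
    def __init__(self, obs_dim, act_dim, memory_size):
        self._obs = np.zeros((memory_size, obs_dim))
        self._act = np.zeros((memory_size, act_dim))
        self._rwd = np.zeros(memory_size)
        self._end = np.zeros(memory_size, dtype=bool)
        self._capacity = memory_size
        self._top = 0
        self.size = 0

    def add_sample(self, obs, act, rwd, end):
        # Overwrite the oldest entry once the buffer is full.
        self._obs[self._top] = obs
        self._act[self._top] = act
        self._rwd[self._top] = rwd
        self._end[self._top] = end
        self._top = (self._top + 1) % self._capacity
        self.size = min(self.size + 1, self._capacity)

    def get_batch(self, batch_size):
        # Uniformly sample from the transitions stored so far.
        idx = np.random.randint(0, self.size, batch_size)
        return dict(observations=self._obs[idx], actions=self._act[idx],
                    rewards=self._rwd[idx], terminals=self._end[idx])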
def run_experiment(argv):
    default_log_dir = config.LOG_DIR
    now = datetime.datetime.now(dateutil.tz.tzlocal())
    # avoid name clashes when running distributed jobs
    rand_id = str(uuid.uuid4())[:5]
    timestamp = now.strftime('%Y_%m_%d_%H_%M_%S_%f_%Z')
    default_exp_name = 'experiment_%s_%s' % (timestamp, rand_id)

    parser = argparse.ArgumentParser()
    parser.add_argument('--n_parallel', type=int, default=1,
                        help='Number of parallel workers to perform rollouts. '
                             '0 => don\'t start any workers')
    parser.add_argument('--exp_name', type=str, default=default_exp_name,
                        help='Name of the experiment.')
    parser.add_argument('--log_dir', type=str, default=None,
                        help='Path to save the log and iteration snapshot.')
    parser.add_argument('--snapshot_mode', type=str, default='all',
                        help='Mode to save the snapshot. Can be either "all" '
                             '(all iterations will be saved), "last" (only '
                             'the last iteration will be saved), "gap" (every '
                             '`snapshot_gap` iterations are saved), or "none" '
                             '(do not save snapshots)')
    parser.add_argument('--snapshot_gap', type=int, default=1,
                        help='Gap between snapshot iterations.')
    parser.add_argument('--tabular_log_file', type=str, default='progress.csv',
                        help='Name of the tabular log file (in csv).')
    parser.add_argument('--text_log_file', type=str, default='debug.log',
                        help='Name of the text log file (in pure text).')
    parser.add_argument('--params_log_file', type=str, default='params.json',
                        help='Name of the parameter log file (in json).')
    parser.add_argument('--variant_log_file', type=str, default='variant.json',
                        help='Name of the variant log file (in json).')
    parser.add_argument('--resume_from', type=str, default=None,
                        help='Name of the pickle file to resume experiment from.')
    parser.add_argument('--plot', type=ast.literal_eval, default=False,
                        help='Whether to plot the iteration results')
    parser.add_argument('--log_tabular_only', type=ast.literal_eval, default=False,
                        help='Whether to only print the tabular log information (in a horizontal format)')
    parser.add_argument('--seed', type=int, help='Random seed for numpy')
    parser.add_argument('--args_data', type=str, help='Pickled data for stub objects')
    parser.add_argument('--variant_data', type=str, help='Pickled data for variant configuration')
    parser.add_argument('--use_cloudpickle', type=ast.literal_eval, default=False)
    args = parser.parse_args(argv[1:])

    if args.seed is not None:
        set_seed(args.seed)

    if args.n_parallel > 0:
        from rllab.sampler import parallel_sampler
        parallel_sampler.initialize(n_parallel=args.n_parallel)
        if args.seed is not None:
            parallel_sampler.set_seed(args.seed)

    if args.plot:
        from rllab.plotter import plotter
        plotter.init_worker()

    if args.log_dir is None:
        log_dir = osp.join(default_log_dir, args.exp_name)
    else:
        log_dir = args.log_dir
    tabular_log_file = osp.join(log_dir, args.tabular_log_file)
    text_log_file = osp.join(log_dir, args.text_log_file)
    params_log_file = osp.join(log_dir, args.params_log_file)

    if args.variant_data is not None:
        variant_data = pickle.loads(base64.b64decode(args.variant_data))
        variant_log_file = osp.join(log_dir, args.variant_log_file)
        logger.log_variant(variant_log_file, variant_data)
    else:
        variant_data = None

    if not args.use_cloudpickle:
        logger.log_parameters_lite(params_log_file, args)

    logger.add_text_output(text_log_file)
    logger.add_tabular_output(tabular_log_file)
    prev_snapshot_dir = logger.get_snapshot_dir()
    prev_mode = logger.get_snapshot_mode()
    logger.set_snapshot_dir(log_dir)
    logger.set_snapshot_mode(args.snapshot_mode)
    logger.set_snapshot_gap(args.snapshot_gap)
    logger.set_log_tabular_only(args.log_tabular_only)
    logger.push_prefix("[%s] " % args.exp_name)

    if args.resume_from is not None:
        data = joblib.load(args.resume_from)
        assert 'algo' in data
        algo = data['algo']
        algo.train()
    else:
        # read from stdin
        if args.use_cloudpickle:
            import cloudpickle
            method_call = cloudpickle.loads(base64.b64decode(args.args_data))
            method_call(variant_data)
        else:
            data = pickle.loads(base64.b64decode(args.args_data))
            maybe_iter = concretize(data)
            if is_iterable(maybe_iter):
                for _ in maybe_iter:
                    pass

    logger.set_snapshot_mode(prev_mode)
    logger.set_snapshot_dir(prev_snapshot_dir)
    logger.remove_tabular_output(tabular_log_file)
    logger.remove_text_output(text_log_file)
    logger.pop_prefix()
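# For reference, the `--args_data` consumed above is a base64-encoded pickled
# callable (or stub object). A hypothetical driver for the cloudpickle path
# could look like the sketch below; the entry-point script name is an
# assumption, and rllab's own `run_experiment_lite` normally performs this
# encoding for you.
import base64
import subprocess

import cloudpickle


def my_experiment(variant):
    # `variant` is whatever was passed via --variant_data (None if absent).
    print("variant received:", variant)


args_data = base64.b64encode(cloudpickle.dumps(my_experiment)).decode("ascii")
subprocess.check_call([
    "python", "run_experiment_lite.py",   # hypothetical script name
    "--use_cloudpickle", "True",
    "--n_parallel", "1",
    "--snapshot_mode", "last",
    "--args_data", args_data,
])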
def main():
    now = datetime.datetime.now(dateutil.tz.tzlocal())
    rand_id = str(uuid.uuid4())[:5]
    timestamp = now.strftime('%Y_%m_%d_%H_%M_%S_%f_%Z')
    default_exp_name = 'experiment_%s_%s' % (timestamp, rand_id)

    parser = argparse.ArgumentParser()
    parser.add_argument('--exp_name', type=str, default=default_exp_name,
                        help='Name of the experiment.')
    parser.add_argument('--discount', type=float, default=0.95)
    parser.add_argument('--gae_lambda', type=float, default=0.99)
    parser.add_argument('--n_iter', type=int, default=250)
    parser.add_argument('--sampler_workers', type=int, default=1)
    parser.add_argument('--max_traj_len', type=int, default=250)
    parser.add_argument('--update_curriculum', action='store_true', default=False)
    parser.add_argument('--n_timesteps', type=int, default=8000)
    parser.add_argument('--control', type=str, default='centralized')
    parser.add_argument('--buffer_size', type=int, default=1)
    parser.add_argument('--n_good', type=int, default=3)
    parser.add_argument('--n_hostage', type=int, default=5)
    parser.add_argument('--n_bad', type=int, default=5)
    parser.add_argument('--n_coop_save', type=int, default=2)
    parser.add_argument('--n_coop_avoid', type=int, default=2)
    parser.add_argument('--n_sensors', type=int, default=20)
    parser.add_argument('--sensor_range', type=float, default=0.2)
    parser.add_argument('--save_reward', type=float, default=3)
    parser.add_argument('--hit_reward', type=float, default=-1)
    parser.add_argument('--encounter_reward', type=float, default=0.01)
    parser.add_argument('--bomb_reward', type=float, default=-10.)
    parser.add_argument('--recurrent', action='store_true', default=False)
    parser.add_argument('--baseline_type', type=str, default='linear')
    parser.add_argument('--policy_hidden_sizes', type=str, default='128,128')
    parser.add_argument('--baselin_hidden_sizes', type=str, default='128,128')
    parser.add_argument('--max_kl', type=float, default=0.01)
    parser.add_argument('--log_dir', type=str, required=False)
    parser.add_argument('--tabular_log_file', type=str, default='progress.csv',
                        help='Name of the tabular log file (in csv).')
    parser.add_argument('--text_log_file', type=str, default='debug.log',
                        help='Name of the text log file (in pure text).')
    parser.add_argument('--params_log_file', type=str, default='params.json',
                        help='Name of the parameter log file (in json).')
    parser.add_argument('--seed', type=int, help='Random seed for numpy')
    parser.add_argument('--args_data', type=str, help='Pickled data for stub objects')
    parser.add_argument('--snapshot_mode', type=str, default='all',
                        help='Mode to save the snapshot. Can be either "all" '
                             '(all iterations will be saved), "last" (only '
                             'the last iteration will be saved), or "none" '
                             '(do not save snapshots)')
    parser.add_argument('--log_tabular_only', type=ast.literal_eval, default=False,
                        help='Whether to only print the tabular log information (in a horizontal format)')
    args = parser.parse_args()

    parallel_sampler.initialize(n_parallel=args.sampler_workers)
    if args.seed is not None:
        set_seed(args.seed)
        parallel_sampler.set_seed(args.seed)

    args.hidden_sizes = tuple(map(int, args.policy_hidden_sizes.split(',')))
    centralized = True if args.control == 'centralized' else False

    env = ContinuousHostageWorld(args.n_good, args.n_hostage, args.n_bad,
                                 args.n_coop_save, args.n_coop_avoid,
                                 n_sensors=args.n_sensors,
                                 sensor_range=args.sensor_range,
                                 save_reward=args.save_reward,
                                 hit_reward=args.hit_reward,
                                 encounter_reward=args.encounter_reward,
                                 bomb_reward=args.bomb_reward)

    env = RLLabEnv(StandardizedEnv(env), mode=args.control)
    if args.buffer_size > 1:
        env = ObservationBuffer(env, args.buffer_size)

    if args.recurrent:
        policy = GaussianGRUPolicy(env_spec=env.spec, hidden_sizes=args.hidden_sizes)
    else:
        policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=args.hidden_sizes)

    if args.baseline_type == 'linear':
        baseline = LinearFeatureBaseline(env_spec=env.spec)
    else:
        baseline = ZeroBaseline(env_spec=env.spec)

    # logger
    default_log_dir = config.LOG_DIR
    if args.log_dir is None:
        log_dir = osp.join(default_log_dir, args.exp_name)
    else:
        log_dir = args.log_dir
    tabular_log_file = osp.join(log_dir, args.tabular_log_file)
    text_log_file = osp.join(log_dir, args.text_log_file)
    params_log_file = osp.join(log_dir, args.params_log_file)

    logger.log_parameters_lite(params_log_file, args)
    logger.add_text_output(text_log_file)
    logger.add_tabular_output(tabular_log_file)
    prev_snapshot_dir = logger.get_snapshot_dir()
    prev_mode = logger.get_snapshot_mode()
    logger.set_snapshot_dir(log_dir)
    logger.set_snapshot_mode(args.snapshot_mode)
    logger.set_log_tabular_only(args.log_tabular_only)
    logger.push_prefix("[%s] " % args.exp_name)

    algo = TRPO(env=env,
                policy=policy,
                baseline=baseline,
                batch_size=args.n_timesteps,
                max_path_length=args.max_traj_len,
                n_itr=args.n_iter,
                discount=args.discount,
                step_size=args.max_kl,
                mode=args.control,)
    algo.train()
def run_experiment(argv):
    default_log_dir = config.LOG_DIR
    now = datetime.datetime.now(dateutil.tz.tzlocal())
    # avoid name clashes when running distributed jobs
    rand_id = str(uuid.uuid4())[:5]
    timestamp = now.strftime('%Y_%m_%d_%H_%M_%S_%f_%Z')
    default_exp_name = 'experiment_%s_%s' % (timestamp, rand_id)

    parser = argparse.ArgumentParser()
    parser.add_argument('--n_parallel', type=int, default=1,
                        help='Number of parallel workers to perform rollouts.')
    parser.add_argument('--exp_name', type=str, default=default_exp_name,
                        help='Name of the experiment.')
    parser.add_argument('--log_dir', type=str, default=default_log_dir,
                        help='Path to save the log and iteration snapshot.')
    parser.add_argument('--snapshot_mode', type=str, default='all',
                        help='Mode to save the snapshot. Can be either "all" '
                             '(all iterations will be saved), "last" (only '
                             'the last iteration will be saved), or "none" '
                             '(do not save snapshots)')
    parser.add_argument('--tabular_log_file', type=str, default='progress.csv',
                        help='Name of the tabular log file (in csv).')
    parser.add_argument('--text_log_file', type=str, default='debug.log',
                        help='Name of the text log file (in pure text).')
    parser.add_argument('--params_log_file', type=str, default='params.json',
                        help='Name of the parameter log file (in json).')
    parser.add_argument('--plot', type=ast.literal_eval, default=False,
                        help='Whether to plot the iteration results')
    parser.add_argument('--log_tabular_only', type=ast.literal_eval, default=False,
                        help='Whether to only print the tabular log information (in a horizontal format)')
    parser.add_argument('--seed', type=int, help='Random seed for numpy')
    parser.add_argument('--args_data', type=str, help='Pickled data for stub objects')
    args = parser.parse_args(argv[1:])

    from sandbox.vime.sampler import parallel_sampler_expl as parallel_sampler
    parallel_sampler.initialize(n_parallel=args.n_parallel)

    if args.seed is not None:
        set_seed(args.seed)
        parallel_sampler.set_seed(args.seed)

    if args.plot:
        from rllab.plotter import plotter
        plotter.init_worker()

    # read from stdin
    data = pickle.loads(base64.b64decode(args.args_data))

    log_dir = args.log_dir
    # exp_dir = osp.join(log_dir, args.exp_name)
    tabular_log_file = osp.join(log_dir, args.tabular_log_file)
    text_log_file = osp.join(log_dir, args.text_log_file)
    params_log_file = osp.join(log_dir, args.params_log_file)

    logger.log_parameters_lite(params_log_file, args)
    logger.add_text_output(text_log_file)
    logger.add_tabular_output(tabular_log_file)
    prev_snapshot_dir = logger.get_snapshot_dir()
    prev_mode = logger.get_snapshot_mode()
    logger.set_snapshot_dir(log_dir)
    logger.set_snapshot_mode(args.snapshot_mode)
    logger.set_log_tabular_only(args.log_tabular_only)
    logger.push_prefix("[%s] " % args.exp_name)

    maybe_iter = concretize(data)
    if is_iterable(maybe_iter):
        for _ in maybe_iter:
            pass

    logger.set_snapshot_mode(prev_mode)
    logger.set_snapshot_dir(prev_snapshot_dir)
    logger.remove_tabular_output(tabular_log_file)
    logger.remove_text_output(text_log_file)
    logger.pop_prefix()
def train(self):
    parallel_sampler.populate_task(self.env, self.policy)
    if self.plot:
        plotter.init_plot(self.env, self.policy)

    cur_std = self.init_std
    cur_mean = self.policy.get_param_values()
    # K = cur_mean.size
    n_best = max(1, int(self.n_samples * self.best_frac))

    for itr in range(self.n_itr):
        # sample around the current distribution
        extra_var_mult = max(1.0 - itr / self.extra_decay_time, 0)
        sample_std = np.sqrt(
            np.square(cur_std) + np.square(self.extra_std) * extra_var_mult)
        if self.batch_size is None:
            criterion = 'paths'
            threshold = self.n_samples
        else:
            criterion = 'samples'
            threshold = self.batch_size
        infos = stateful_pool.singleton_pool.run_collect(
            _worker_rollout_policy,
            threshold=threshold,
            args=(dict(cur_mean=cur_mean,
                       sample_std=sample_std,
                       max_path_length=self.max_path_length,
                       discount=self.discount,
                       criterion=criterion),))
        xs = np.asarray([info[0] for info in infos])
        paths = [info[1] for info in infos]

        fs = np.array([path['returns'][0] for path in paths])
        print((xs.shape, fs.shape))
        best_inds = (-fs).argsort()[:n_best]
        best_xs = xs[best_inds]
        cur_mean = best_xs.mean(axis=0)
        cur_std = best_xs.std(axis=0)
        best_x = best_xs[0]

        logger.push_prefix('itr #%d | ' % itr)
        logger.record_tabular('Iteration', itr)
        logger.record_tabular('CurStdMean', np.mean(cur_std))
        undiscounted_returns = np.array(
            [path['undiscounted_return'] for path in paths])
        logger.record_tabular('AverageReturn', np.mean(undiscounted_returns))
        logger.record_tabular('StdReturn', np.std(undiscounted_returns))
        logger.record_tabular('MaxReturn', np.max(undiscounted_returns))
        logger.record_tabular('MinReturn', np.min(undiscounted_returns))
        logger.record_tabular('AverageDiscountedReturn', np.mean(fs))
        logger.record_tabular('AvgTrajLen',
                              np.mean([len(path['returns']) for path in paths]))
        logger.record_tabular('NumTrajs', len(paths))
        self.policy.set_param_values(best_x)
        self.env.log_diagnostics(paths)
        self.policy.log_diagnostics(paths)
        logger.save_itr_params(itr, dict(
            itr=itr,
            policy=self.policy,
            env=self.env,
            cur_mean=cur_mean,
            cur_std=cur_std,
        ))
        logger.dump_tabular(with_prefix=False)
        logger.pop_prefix()
        if self.plot:
            plotter.update_plot(self.policy, self.max_path_length)

    parallel_sampler.terminate_task()
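# The heart of the loop above is the cross-entropy update: rank the sampled
# parameter vectors by return, keep the top `best_frac` fraction, and refit
# the Gaussian to those elites. A self-contained sketch on a toy objective
# (all names and values below are illustrative, not part of the original code):
import numpy as np

np.random.seed(0)
n_samples, best_frac = 20, 0.2
cur_mean, cur_std = np.zeros(3), np.ones(3)

# Sample candidate parameter vectors and pretend we evaluated their returns.
xs = cur_mean + cur_std * np.random.randn(n_samples, 3)
fs = -np.sum(np.square(xs - 1.0), axis=1)   # toy objective, maximized at all-ones

n_best = max(1, int(n_samples * best_frac))
best_inds = (-fs).argsort()[:n_best]        # indices of the highest returns
best_xs = xs[best_inds]

cur_mean = best_xs.mean(axis=0)             # refit the sampling distribution
cur_std = best_xs.std(axis=0)
print(cur_mean, cur_std)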
def _train(self, env, policy, pool):
    """Perform RL training.

    Args:
        env (`rllab.Env`): Environment used for training
        policy (`Policy`): Policy used for training
        pool (`PoolBase`): Sample pool to add samples to
    """
    self._init_training(env, policy, pool)

    with self._sess.as_default():
        observation = env.reset()
        policy.reset()

        path_length = 0
        path_return = 0
        last_path_return = 0
        max_path_return = -np.inf
        n_episodes = 0

        gt.rename_root('RLAlgorithm')
        gt.reset()
        gt.set_def_unique(False)

        for epoch in gt.timed_for(range(self._n_epochs + 1), save_itrs=True):
            logger.push_prefix('Epoch #%d | ' % epoch)

            if self.iter_callback is not None:
                self.iter_callback(locals(), globals())

            for t in range(self._epoch_length):
                iteration = t + epoch * self._epoch_length

                action, _ = policy.get_action(observation)
                next_ob, reward, terminal, info = env.step(action)
                path_length += 1
                path_return += reward

                self.pool.add_sample(
                    observation,
                    action,
                    reward,
                    terminal,
                    next_ob,
                )

                if terminal or path_length >= self._max_path_length:
                    observation = env.reset()
                    policy.reset()
                    path_length = 0
                    max_path_return = max(max_path_return, path_return)
                    last_path_return = path_return
                    path_return = 0
                    n_episodes += 1
                else:
                    observation = next_ob

                gt.stamp('sample')

                if self.pool.size >= self._min_pool_size:
                    for i in range(self._n_train_repeat):
                        batch = self.pool.random_batch(self._batch_size)
                        self._do_training(iteration, batch)

                gt.stamp('train')

            self._evaluate(epoch)

            params = self.get_snapshot(epoch)
            logger.save_itr_params(epoch, params)
            times_itrs = gt.get_times().stamps.itrs

            eval_time = times_itrs['eval'][-1] if epoch > 1 else 0
            total_time = gt.get_times().total
            logger.record_tabular('time-train', times_itrs['train'][-1])
            logger.record_tabular('time-eval', eval_time)
            logger.record_tabular('time-sample', times_itrs['sample'][-1])
            logger.record_tabular('time-total', total_time)
            logger.record_tabular('epoch', epoch)
            logger.record_tabular('episodes', n_episodes)
            logger.record_tabular('max-path-return', max_path_return)
            logger.record_tabular('last-path-return', last_path_return)
            logger.record_tabular('pool-size', self.pool.size)

            logger.dump_tabular(with_prefix=False)
            logger.pop_prefix()

            gt.stamp('eval')

        env.terminate()
now = datetime.datetime.now(dateutil.tz.tzlocal())
# avoid name clashes when running distributed jobs
rand_id = str(uuid.uuid4())[:5]
timestamp = now.strftime('%Y_%m_%d_%H_%M_%S_%f_%Z')
default_exp_name = 'experiment_%s_%s' % (timestamp, rand_id)

log_dir = os.path.join(default_log_dir, default_exp_name)
tabular_log_file = os.path.join(log_dir, 'progress.csv')
text_log_file = os.path.join(log_dir, 'debug.log')
params_log_file = os.path.join(log_dir, 'params.json')

logger.add_text_output(text_log_file)
logger.add_tabular_output(tabular_log_file)
logger.set_snapshot_dir(log_dir)
logger.set_snapshot_mode('last')
logger.set_log_tabular_only(False)
logger.push_prefix("[%s] " % default_exp_name)

last_snapshot_dir = '/home/sliay/Documents/rllab/data/local/experiment/experiment_2016_07_07_498itr'
data = joblib.load(os.path.join(last_snapshot_dir, 'params.pkl'))
policy = data['policy']
env = data['env']
baseline = data['baseline']
# env = normalize(GymEnv("VREP-v0", record_video=False))
# policy = GaussianMLPPolicy(
#     env_spec=env.spec,
#     # The neural network policy should have two hidden layers, each with 32 hidden units.
#     hidden_sizes=(128, 128)
# )
# print('policy initialization')
# print(policy.get_param_values())
def _train(self, env, policy, initial_exploration_policy, sub_level_policies_paths, pool, g): """Perform RL training. Args: env (`rllab.Env`): Environment used for training policy (`Policy`): Policy used for training initial_exploration_policy ('Policy'): Policy used for exploration If None, then all exploration is done using policy pool (`PoolBase`): Sample pool to add samples to """ '''self._init_training(env, policy, pool) if initial_exploration_policy is None: self.sampler.initialize(env, policy,sub_level_policies, pool) initial_exploration_done = True else: self.sampler.initialize(env, initial_exploration_policy,sub_level_policies, pool) initial_exploration_done = False''' with self._sess.as_default(): gt.rename_root('RLAlgorithm') gt.reset() gt.set_def_unique(False) #loading-low-level-policies sub_level_policies = [] for p in range(0, len(sub_level_policies_paths)): with tf.variable_scope(str(p), reuse=False): policy_snapshot = joblib.load(sub_level_policies_paths[p]) sub_level_policies.append(policy_snapshot["policy"]) self._init_training(env, policy, pool) if initial_exploration_policy is None: self.sampler.initialize(env, policy, sub_level_policies, pool) initial_exploration_done = True else: self.sampler.initialize(env, initial_exploration_policy, sub_level_policies, pool) initial_exploration_done = False for epoch in gt.timed_for(range(self._n_epochs + 1), save_itrs=True): logger.push_prefix('Epoch #%d | ' % epoch) for t in range(self._epoch_length): # TODO.codeconsolidation: Add control interval to sampler if not initial_exploration_done: if self._epoch_length * epoch >= self._n_initial_exploration_steps: self.sampler.set_policy(policy) initial_exploration_done = True self.sampler.sample(initial_exploration_done, g) if not self.sampler.batch_ready(): continue gt.stamp('sample') for i in range(self._n_train_repeat): self._do_training(iteration=t + epoch * self._epoch_length, batch=self.sampler.random_batch()) gt.stamp('train') self._evaluate(epoch, initial_exploration_done, sub_level_policies, g) params = self.get_snapshot(epoch) logger.save_itr_params(epoch, params) times_itrs = gt.get_times().stamps.itrs eval_time = times_itrs['eval'][-1] if epoch > 1 else 0 total_time = gt.get_times().total logger.record_tabular('time-train', times_itrs['train'][-1]) logger.record_tabular('time-eval', eval_time) logger.record_tabular('time-sample', times_itrs['sample'][-1]) logger.record_tabular('time-total', total_time) logger.record_tabular('epoch', epoch) self.sampler.log_diagnostics() logger.dump_tabular(with_prefix=False) logger.pop_prefix() gt.stamp('eval') self.sampler.terminate()
log_dir = "./Data/lstm_" + game_name + '_' + str(mask_num) tabular_log_file = osp.join(log_dir, "progress.csv") text_log_file = osp.join(log_dir, "debug.log") params_log_file = osp.join(log_dir, "params.json") pkl_file = osp.join(log_dir, "params.pkl") logger.add_text_output(text_log_file) logger.add_tabular_output(tabular_log_file) prev_snapshot_dir = logger.get_snapshot_dir() prev_mode = logger.get_snapshot_mode() logger.set_snapshot_dir(log_dir) logger.set_snapshot_mode("gaplast") logger.set_snapshot_gap(1000) logger.set_log_tabular_only(False) logger.push_prefix("[%s] " % (game_name + '_' + str(mask_num))) from Algo import parallel_sampler parallel_sampler.initialize(n_parallel=1) parallel_sampler.set_seed(0) policy = CategoricalLSTMPolicy( env_spec=env.spec, name="lstm", ) baseline = LinearFeatureBaseline(env_spec=env.spec) with tf.Session() as sess: # writer = tf.summary.FileWriter(logdir=log_dir,)
def _train(self, env, policy, pool): """When training our policy expects an augmented observation.""" self._init_training(env, policy, pool) with self._sess.as_default(): env._wrapped_env.env.initialize(seed_task=SEED_TASK) observation = env.reset() policy.reset() log_p_z_episode = [] # Store log_p_z for this episode path_length = 0 path_return = 0 last_path_return = 0 max_path_return = -np.inf n_episodes = 0 self.prev_n_episodes = 0 if self._learn_p_z: log_p_z_list = [ deque(maxlen=self._max_path_length) for _ in range(self._num_skills) ] gt.rename_root('RLAlgorithm') gt.reset() gt.set_def_unique(False) for epoch in gt.timed_for(range(self._n_epochs + 1), save_itrs=True): path_length_list = [] z = self._sample_z() aug_obs = utils.concat_obs_z(observation, z, self._num_skills) for t in range(self._epoch_length): iteration = t + epoch * self._epoch_length action, _ = policy.get_action(aug_obs) if self._learn_p_z: (obs, _) = utils.split_aug_obs(aug_obs, self._num_skills) feed_dict = { self._discriminator._obs_pl: obs[None], self._discriminator._action_pl: action[None] } logits = tf_utils.get_default_session().run( self._discriminator._output_t, feed_dict)[0] log_p_z = np.log(utils._softmax(logits)[z]) if self._learn_p_z: log_p_z_list[z].append(log_p_z) next_ob, reward, terminal, info = env.step(action) aug_next_ob = utils.concat_obs_z(next_ob, z, self._num_skills) path_length += 1 path_return += reward self._pool.add_sample( aug_obs, action, reward, terminal, aug_next_ob, ) if terminal or path_length >= self._max_path_length: path_length_list.append(path_length) # print("\n===RESET", epoch, n_episodes, "===", self._epoch_length, path_length, "===", # # env._wrapped_env.env.nstep_internal, # datetime.datetime.now()) env._wrapped_env.env.initialize(seed_task=SEED_TASK) observation = env.reset() policy.reset() log_p_z_episode = [] path_length = 0 max_path_return = max(max_path_return, path_return) last_path_return = path_return path_return = 0 n_episodes += 1 # EPOCH IS DONE epoch if not epoch % 10: logger.log("Epoch: {:4} | Episodes: {}".format( epoch, n_episodes), with_prefix=False) if not n_episodes % self.eval_freq or \ n_episodes >= EPISODE_LIMIT or \ epoch >= self._n_epochs: # is_final = epoch >= self._n_epochs \ # or n_episodes >= EPISODE_LIMIT self.sample_skills_to_bd(n_epoch=epoch, n_episodes=n_episodes) # Make snapshot params = self.get_snapshot(epoch) logger.save_itr_params(epoch, params) gt.stamp('behaviours') else: aug_obs = aug_next_ob gt.stamp('sample') if self._pool.size >= self._min_pool_size: for i in range(self._n_train_repeat): batch = self._pool.random_batch(self._batch_size) self._do_training(iteration, batch) gt.stamp('train') # Terminate after 1000000 episodes if n_episodes >= EPISODE_LIMIT: break else: continue break if self._learn_p_z: print('learning p(z)') for z in range(self._num_skills): if log_p_z_list[z]: print( '\t skill = %d, min=%.2f, max=%.2f, mean=%.2f, len=%d' % (z, np.min( log_p_z_list[z]), np.max(log_p_z_list[z]), np.mean( log_p_z_list[z]), len(log_p_z_list[z]))) log_p_z = [ np.mean(log_p_z) if log_p_z else np.log(1.0 / self._num_skills) for log_p_z in log_p_z_list ] print('log_p_z: %s' % log_p_z) self._p_z = utils._softmax(log_p_z) logger.push_prefix('Epoch #%d | ' % epoch) self._evaluate(epoch) params = self.get_snapshot(epoch) logger.save_itr_params(epoch, params) times_itrs = gt.get_times().stamps.itrs eval_time = times_itrs['eval'][-1] if epoch > 1 else 0 total_time = gt.get_times().total logger.record_tabular('time-train', times_itrs['train'][-1]) 
logger.record_tabular('time-eval', eval_time) logger.record_tabular('time-sample', times_itrs['sample'][-1]) logger.record_tabular('time-total', total_time) logger.record_tabular('epoch', epoch) logger.record_tabular('episodes', n_episodes) logger.record_tabular('max-path-return', max_path_return) logger.record_tabular('last-path-return', last_path_return) logger.record_tabular('pool-size', self._pool.size) logger.record_tabular('path-length', np.mean(path_length_list)) logger.dump_tabular(with_prefix=False) logger.pop_prefix() gt.stamp('eval') env.terminate()
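# The skill-conditioned loops above assume `utils.concat_obs_z` appends a
# one-hot encoding of the sampled skill z to the observation, and
# `utils.split_aug_obs` undoes it. A minimal stand-in (illustrative only, not
# the project's actual utils module) could look like:
import numpy as np


def concat_obs_z(obs, z, num_skills):
    # Append a one-hot skill indicator to the flat observation.
    one_hot = np.zeros(num_skills)
    one_hot[z] = 1.0
    return np.concatenate([np.asarray(obs).ravel(), one_hot])


def split_aug_obs(aug_obs, num_skills):
    # Recover (obs, one_hot_z) from an augmented observation.
    return aug_obs[:-num_skills], aug_obs[-num_skills:]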
def train(self):
    # This seems like a rather sequential method
    pool = SimpleReplayPool(
        max_pool_size=self.replay_pool_size,
        observation_dim=self.env.observation_space.flat_dim,
        action_dim=self.env.action_space.flat_dim,
    )
    self.start_worker()

    self.init_opt()
    itr = 0
    path_length = 0
    path_return = 0
    terminal = False
    observation = self.env.reset()

    sample_policy = pickle.loads(pickle.dumps(self.policy))

    for epoch in range(self.n_epochs):
        logger.push_prefix('epoch #%d | ' % epoch)
        logger.log("Training started")
        for epoch_itr in pyprind.prog_bar(range(self.epoch_length)):
            # Execute policy
            if terminal:  # or path_length > self.max_path_length:
                # Note that if the last time step ends an episode, the very
                # last state and observation will be ignored and not added
                # to the replay pool
                observation = self.env.reset()
                self.es.reset()
                sample_policy.reset()
                self.es_path_returns.append(path_return)
                path_length = 0
                path_return = 0
            action = self.es.get_action(itr, observation, policy=sample_policy)  # qf=qf)

            next_observation, reward, terminal, _ = self.env.step(action)
            path_length += 1
            path_return += reward

            if not terminal and path_length >= self.max_path_length:
                terminal = True
                # only include the terminal transition in this case if the
                # flag was set
                if self.include_horizon_terminal_transitions:
                    pool.add_sample(observation, action,
                                    reward * self.scale_reward, terminal)
            else:
                pool.add_sample(observation, action,
                                reward * self.scale_reward, terminal)

            observation = next_observation

            if pool.size >= self.min_pool_size:
                for update_itr in range(self.n_updates_per_sample):
                    # Train policy
                    batch = pool.random_batch(self.batch_size)
                    self.do_training(itr, batch)
                sample_policy.set_param_values(self.policy.get_param_values())

            itr += 1

        logger.log("Training finished")
        if pool.size >= self.min_pool_size:
            self.evaluate(epoch, pool)
            params = self.get_epoch_snapshot(epoch)
            logger.save_itr_params(epoch, params)
        logger.dump_tabular(with_prefix=False)
        logger.pop_prefix()
        if self.plot:
            self.update_plot()
            if self.pause_for_plot:
                input("Plotting evaluation run: Press Enter to "
                      "continue...")
    self.env.terminate()
    self.policy.terminate()
def train(self):
    # Bayesian neural network (BNN) initialization.
    # ------------------------------------------------
    batch_size = 1  # Redundant
    n_batches = 5  # Hardcode or annealing scheme \pi_i.

    # MDP observation and action dimensions.
    obs_dim = np.prod(self.env.observation_space.shape)
    act_dim = np.prod(self.env.action_space.shape)

    logger.log("Building BNN model (eta={}) ...".format(self.eta))
    start_time = time.time()

    self.bnn = bnn.BNN(
        n_in=(obs_dim + act_dim),
        n_hidden=self.unn_n_hidden,
        n_out=obs_dim,
        n_batches=n_batches,
        layers_type=self.unn_layers_type,
        trans_func=lasagne.nonlinearities.rectify,
        out_func=lasagne.nonlinearities.linear,
        batch_size=batch_size,
        n_samples=self.snn_n_samples,
        prior_sd=self.prior_sd,
        use_reverse_kl_reg=self.use_reverse_kl_reg,
        reverse_kl_reg_factor=self.reverse_kl_reg_factor,
        # stochastic_output=self.stochastic_output,
        second_order_update=self.second_order_update,
        learning_rate=self.unn_learning_rate,
        compression=self.compression,
        information_gain=self.information_gain)

    logger.log("Model built ({:.1f} sec).".format((time.time() - start_time)))

    if self.use_replay_pool:
        self.pool = SimpleReplayPool(
            max_pool_size=self.replay_pool_size,
            observation_shape=self.env.observation_space.shape,
            action_dim=act_dim)
    # ------------------------------------------------

    self.start_worker()
    self.init_opt()
    episode_rewards = []
    episode_lengths = []
    for itr in xrange(self.start_itr, self.n_itr):
        logger.push_prefix('itr #%d | ' % itr)

        paths = self.obtain_samples(itr)
        samples_data = self.process_samples(itr, paths)

        # Exploration code
        # ----------------
        if self.use_replay_pool:
            # Fill replay pool.
            logger.log("Fitting dynamics model using replay pool ...")
            for path in samples_data['paths']:
                path_len = len(path['rewards'])
                for i in xrange(path_len):
                    obs = path['observations'][i]
                    act = path['actions'][i]
                    rew = path['rewards'][i]
                    term = (i == path_len - 1)
                    self.pool.add_sample(obs, act, rew, term)

            # Now we train the dynamics model using the replay self.pool; only
            # if self.pool is large enough.
            if self.pool.size >= self.min_pool_size:
                obs_mean, obs_std, act_mean, act_std = self.pool.mean_obs_act()
                _inputss = []
                _targetss = []
                for _ in xrange(self.n_updates_per_sample):
                    batch = self.pool.random_batch(self.pool_batch_size)
                    obs = (batch['observations'] - obs_mean) / (obs_std + 1e-8)
                    next_obs = (batch['next_observations'] - obs_mean) / (obs_std + 1e-8)
                    act = (batch['actions'] - act_mean) / (act_std + 1e-8)
                    _inputs = np.hstack([obs, act])
                    _targets = next_obs
                    _inputss.append(_inputs)
                    _targetss.append(_targets)

                old_acc = 0.
                for _inputs, _targets in zip(_inputss, _targetss):
                    _out = self.bnn.pred_fn(_inputs)
                    old_acc += np.mean(np.square(_out - _targets))
                old_acc /= len(_inputss)

                for _inputs, _targets in zip(_inputss, _targetss):
                    self.bnn.train_fn(_inputs, _targets)

                new_acc = 0.
                for _inputs, _targets in zip(_inputss, _targetss):
                    _out = self.bnn.pred_fn(_inputs)
                    new_acc += np.mean(np.square(_out - _targets))
                new_acc /= len(_inputss)

                logger.record_tabular('BNN_DynModelSqLossBefore', old_acc)
                logger.record_tabular('BNN_DynModelSqLossAfter', new_acc)
        # ----------------

        self.env.log_diagnostics(paths)
        self.policy.log_diagnostics(paths)
        self.baseline.log_diagnostics(paths)
        self.optimize_policy(itr, samples_data)
        logger.log("saving snapshot...")
        params = self.get_itr_snapshot(itr, samples_data)
        paths = samples_data["paths"]
        if self.store_paths:
            params["paths"] = paths
        episode_rewards.extend(sum(p["rewards"]) for p in paths)
        episode_lengths.extend(len(p["rewards"]) for p in paths)
        params["episode_rewards"] = np.array(episode_rewards)
        params["episode_lengths"] = np.array(episode_lengths)
        params["algo"] = self
        logger.save_itr_params(itr, params)
        logger.log("saved")
        logger.dump_tabular(with_prefix=False)
        logger.pop_prefix()
        if self.plot:
            self.update_plot()
            if self.pause_for_plot:
                raw_input("Plotting evaluation run: Press Enter to "
                          "continue...")

    self.shutdown_worker()
def run_experiment(argv): default_log_dir = config.LOG_DIR now = datetime.datetime.now(dateutil.tz.tzlocal()) # avoid name clashes when running distributed jobs rand_id = str(uuid.uuid4())[:5] timestamp = now.strftime('%Y_%m_%d_%H_%M_%S_%f_%Z') default_exp_name = 'experiment_%s_%s' % (timestamp, rand_id) parser = argparse.ArgumentParser() parser.add_argument('--n_parallel', type=int, default=1, help='Number of parallel workers to perform rollouts.') parser.add_argument('--exp_name', type=str, default=default_exp_name, help='Name of the experiment.') parser.add_argument('--log_dir', type=str, default=default_log_dir, help='Path to save the log and iteration snapshot.') parser.add_argument('--snapshot_mode', type=str, default='all', help='Mode to save the snapshot. Can be either "all" ' '(all iterations will be saved), "last" (only ' 'the last iteration will be saved), or "none" ' '(do not save snapshots)') parser.add_argument('--tabular_log_file', type=str, default='progress.csv', help='Name of the tabular log file (in csv).') parser.add_argument('--text_log_file', type=str, default='debug.log', help='Name of the text log file (in pure text).') parser.add_argument('--params_log_file', type=str, default='params.json', help='Name of the parameter log file (in json).') parser.add_argument('--plot', type=ast.literal_eval, default=False, help='Whether to plot the iteration results') parser.add_argument( '--log_tabular_only', type=ast.literal_eval, default=False, help= 'Whether to only print the tabular log information (in a horizontal format)' ) parser.add_argument('--seed', type=int, help='Random seed for numpy') parser.add_argument('--args_data', type=str, help='Pickled data for stub objects') parser.add_argument('--use_cloudpickle', type=bool, help='NOT USED') args = parser.parse_args(argv[1:]) from sandbox.vime.sampler import parallel_sampler_expl as parallel_sampler parallel_sampler.initialize(n_parallel=args.n_parallel) if args.seed is not None: set_seed(args.seed) parallel_sampler.set_seed(args.seed) if args.plot: from rllab.plotter import plotter plotter.init_worker() # read from stdin data = pickle.loads(base64.b64decode(args.args_data)) log_dir = args.log_dir # exp_dir = osp.join(log_dir, args.exp_name) tabular_log_file = osp.join(log_dir, args.tabular_log_file) text_log_file = osp.join(log_dir, args.text_log_file) params_log_file = osp.join(log_dir, args.params_log_file) logger.log_parameters_lite(params_log_file, args) logger.add_text_output(text_log_file) logger.add_tabular_output(tabular_log_file) prev_snapshot_dir = logger.get_snapshot_dir() prev_mode = logger.get_snapshot_mode() logger.set_snapshot_dir(log_dir) logger.set_snapshot_mode(args.snapshot_mode) logger.set_log_tabular_only(args.log_tabular_only) logger.push_prefix("[%s] " % args.exp_name) maybe_iter = concretize(data) if is_iterable(maybe_iter): for _ in maybe_iter: pass logger.set_snapshot_mode(prev_mode) logger.set_snapshot_dir(prev_snapshot_dir) logger.remove_tabular_output(tabular_log_file) logger.remove_text_output(text_log_file) logger.pop_prefix()
plotter.init_worker()

tabular_log_file_fullpath = osp.join(log_dir, tabular_log_file)
text_log_file_fullpath = osp.join(log_dir, text_log_file)
# params_log_file_fullpath = osp.join(log_dir, params_log_file)
params_all_log_file_fullpath = osp.join(log_dir, params_all_log_file)

# logger.log_parameters_lite(params_log_file, args)
logger.add_text_output(text_log_file_fullpath)
logger.add_tabular_output(tabular_log_file_fullpath)
prev_snapshot_dir = logger.get_snapshot_dir()
prev_mode = logger.get_snapshot_mode()
logger.set_snapshot_dir(log_dir)
logger.set_snapshot_mode(snapshot_mode)
logger.set_log_tabular_only(log_tabular_only)
logger.push_prefix("[%s] " % exp_name)

############################################################
## Dumping config
with open(params_all_log_file_fullpath, 'w') as yaml_file:
    yaml_file.write(yaml.dump(params, default_flow_style=False))

############################################################
## RUNNING THE EXPERIMENT
logger.log('Running the experiment ...')

if params['use_hide']:
    if params['use_hide_alg'] == 1:
        if params['batch_size_uniform'] is not None and params['batch_size_uniform'] > 0:
            logger.log(
                'WARNING: Training with uniform sampling. Testing is done BEFORE the optimization !!!!'
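# For reference, `yaml.dump(..., default_flow_style=False)` writes the config
# in block style, one key per line. A tiny illustration with hypothetical
# values for the keys used above:
import yaml

params = {
    'use_hide': True,
    'use_hide_alg': 1,
    'batch_size_uniform': None,
}
print(yaml.dump(params, default_flow_style=False))
# batch_size_uniform: null
# use_hide: true
# use_hide_alg: 1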
log_dir = "./Data/obs_1goal20step0stay_1_kdist_01_keep3" tabular_log_file = osp.join(log_dir, "progress.csv") text_log_file = osp.join(log_dir, "debug.log") params_log_file = osp.join(log_dir, "params.json") pkl_file = osp.join(log_dir, "params.pkl") logger.add_text_output(text_log_file) logger.add_tabular_output(tabular_log_file) prev_snapshot_dir = logger.get_snapshot_dir() prev_mode = logger.get_snapshot_mode() logger.set_snapshot_dir(log_dir) logger.set_snapshot_mode("gaplast") logger.set_snapshot_gap(100) logger.set_log_tabular_only(False) logger.push_prefix("[%s] " % "FixMapStartState") from Algo import parallel_sampler parallel_sampler.initialize(n_parallel=1) parallel_sampler.set_seed(0) with tf.Session() as sess: params = joblib.load(log_dir+'/params.pkl') itr=params['itr'] policy=params['policy'] baseline=params['baseline'] env=params['env'] rewards=params['rewards'] algo = VPG_t( env=env,
log_dir = "Data/Ad05RTheta2RSteer2FR" tabular_log_file = osp.join(log_dir, "progress.csv") text_log_file = osp.join(log_dir, "debug.log") params_log_file = osp.join(log_dir, "params.json") pkl_file = osp.join(log_dir, "params.pkl") logger.add_text_output(text_log_file) logger.add_tabular_output(tabular_log_file) prev_snapshot_dir = logger.get_snapshot_dir() prev_mode = logger.get_snapshot_mode() logger.set_snapshot_dir(log_dir) logger.set_snapshot_mode("gaplast") logger.set_snapshot_gap(1000) logger.set_log_tabular_only(False) logger.push_prefix("[%s] " % "RARL-TF-1") import samplers.lowlevel.rarl_parallel_sampler as parallel_sampler parallel_sampler.initialize(n_parallel=1) parallel_sampler.set_seed(0) #env = normalize(MultilaneEnv(),1,True,True,0.001,0.001) #env = normalize(MultilaneEnv()) env = TfEnv(JustEgoEnv(port=9412)) obs1_dim = 4 obs2_dim = 4 action1_dim = 2 action2_dim = 2 spec1 = EnvSpec(
def main():
    now = datetime.datetime.now(dateutil.tz.tzlocal())
    rand_id = str(uuid.uuid4())[:5]
    timestamp = now.strftime('%Y_%m_%d_%H_%M_%S_%f_%Z')
    default_exp_name = 'experiment_%s_%s' % (timestamp, rand_id)

    parser = argparse.ArgumentParser()
    parser.add_argument('--exp_name', type=str, default=default_exp_name,
                        help='Name of the experiment.')
    parser.add_argument('--discount', type=float, default=0.99)
    parser.add_argument('--gae_lambda', type=float, default=1.0)
    parser.add_argument('--reward_scale', type=float, default=1.0)
    parser.add_argument('--n_iter', type=int, default=250)
    parser.add_argument('--sampler_workers', type=int, default=1)
    parser.add_argument('--max_traj_len', type=int, default=250)
    parser.add_argument('--update_curriculum', action='store_true', default=False)
    parser.add_argument('--n_timesteps', type=int, default=8000)
    parser.add_argument('--control', type=str, default='centralized')

    parser.add_argument('--rectangle', type=str, default='10,10')
    parser.add_argument('--map_type', type=str, default='rectangle')
    parser.add_argument('--n_evaders', type=int, default=5)
    parser.add_argument('--n_pursuers', type=int, default=2)
    parser.add_argument('--obs_range', type=int, default=3)
    parser.add_argument('--n_catch', type=int, default=2)
    parser.add_argument('--urgency', type=float, default=0.0)
    parser.add_argument('--pursuit', dest='train_pursuit', action='store_true')
    parser.add_argument('--evade', dest='train_pursuit', action='store_false')
    parser.set_defaults(train_pursuit=True)
    parser.add_argument('--surround', action='store_true', default=False)
    parser.add_argument('--constraint_window', type=float, default=1.0)
    parser.add_argument('--sample_maps', action='store_true', default=False)
    parser.add_argument('--map_file', type=str, default='../maps/map_pool.npy')
    parser.add_argument('--flatten', action='store_true', default=False)
    parser.add_argument('--reward_mech', type=str, default='global')
    parser.add_argument('--catchr', type=float, default=0.1)
    parser.add_argument('--term_pursuit', type=float, default=5.0)

    parser.add_argument('--recurrent', type=str, default=None)
    parser.add_argument('--policy_hidden_sizes', type=str, default='128,128')
    parser.add_argument('--baselin_hidden_sizes', type=str, default='128,128')
    parser.add_argument('--baseline_type', type=str, default='linear')

    parser.add_argument('--conv', action='store_true', default=False)

    parser.add_argument('--max_kl', type=float, default=0.01)

    parser.add_argument('--log_dir', type=str, required=False)
    parser.add_argument('--tabular_log_file', type=str, default='progress.csv',
                        help='Name of the tabular log file (in csv).')
    parser.add_argument('--text_log_file', type=str, default='debug.log',
                        help='Name of the text log file (in pure text).')
    parser.add_argument('--params_log_file', type=str, default='params.json',
                        help='Name of the parameter log file (in json).')
    parser.add_argument('--seed', type=int, help='Random seed for numpy')
    parser.add_argument('--args_data', type=str, help='Pickled data for stub objects')
    parser.add_argument('--snapshot_mode', type=str, default='all',
                        help='Mode to save the snapshot. Can be either "all" '
                             '(all iterations will be saved), "last" (only '
                             'the last iteration will be saved), or "none" '
                             '(do not save snapshots)')
    parser.add_argument('--log_tabular_only', type=ast.literal_eval, default=False,
                        help='Whether to only print the tabular log information (in a horizontal format)')
    args = parser.parse_args()

    parallel_sampler.initialize(n_parallel=args.sampler_workers)
    if args.seed is not None:
        set_seed(args.seed)
        parallel_sampler.set_seed(args.seed)

    args.hidden_sizes = tuple(map(int, args.policy_hidden_sizes.split(',')))

    if args.sample_maps:
        map_pool = np.load(args.map_file)
    else:
        if args.map_type == 'rectangle':
            env_map = TwoDMaps.rectangle_map(*map(int, args.rectangle.split(',')))
        elif args.map_type == 'complex':
            env_map = TwoDMaps.complex_map(*map(int, args.rectangle.split(',')))
        else:
            raise NotImplementedError()
        map_pool = [env_map]

    env = PursuitEvade(map_pool, n_evaders=args.n_evaders, n_pursuers=args.n_pursuers,
                       obs_range=args.obs_range, n_catch=args.n_catch,
                       train_pursuit=args.train_pursuit, urgency_reward=args.urgency,
                       surround=args.surround, sample_maps=args.sample_maps,
                       constraint_window=args.constraint_window,
                       flatten=args.flatten,
                       reward_mech=args.reward_mech,
                       catchr=args.catchr,
                       term_pursuit=args.term_pursuit)

    env = RLLabEnv(
        StandardizedEnv(env, scale_reward=args.reward_scale, enable_obsnorm=False),
        mode=args.control)

    if args.recurrent:
        if args.conv:
            feature_network = ConvNetwork(
                input_shape=env.spec.observation_space.shape,
                output_dim=5,
                conv_filters=(8, 16, 16),
                conv_filter_sizes=(3, 3, 3),
                conv_strides=(1, 1, 1),
                conv_pads=('VALID', 'VALID', 'VALID'),
                hidden_sizes=(64,),
                hidden_nonlinearity=NL.rectify,
                output_nonlinearity=NL.softmax)
        else:
            feature_network = MLP(
                input_shape=(env.spec.observation_space.flat_dim +
                             env.spec.action_space.flat_dim,),
                output_dim=5,
                hidden_sizes=(128, 128, 128),
                hidden_nonlinearity=NL.tanh,
                output_nonlinearity=None)
        if args.recurrent == 'gru':
            policy = CategoricalGRUPolicy(env_spec=env.spec,
                                          feature_network=feature_network,
                                          hidden_dim=int(args.policy_hidden_sizes))
    elif args.conv:
        feature_network = ConvNetwork(
            input_shape=env.spec.observation_space.shape,
            output_dim=5,
            conv_filters=(8, 16, 16),
            conv_filter_sizes=(3, 3, 3),
            conv_strides=(1, 1, 1),
            conv_pads=('valid', 'valid', 'valid'),
            hidden_sizes=(64,),
            hidden_nonlinearity=NL.rectify,
            output_nonlinearity=NL.softmax)
        policy = CategoricalMLPPolicy(env_spec=env.spec, prob_network=feature_network)
    else:
        policy = CategoricalMLPPolicy(env_spec=env.spec, hidden_sizes=args.hidden_sizes)

    if args.baseline_type == 'linear':
        baseline = LinearFeatureBaseline(env_spec=env.spec)
    else:
        baseline = ZeroBaseline(env_spec=env.spec)

    # logger
    default_log_dir = config.LOG_DIR
    if args.log_dir is None:
        log_dir = osp.join(default_log_dir, args.exp_name)
    else:
        log_dir = args.log_dir
    tabular_log_file = osp.join(log_dir, args.tabular_log_file)
    text_log_file = osp.join(log_dir, args.text_log_file)
    params_log_file = osp.join(log_dir, args.params_log_file)

    logger.log_parameters_lite(params_log_file, args)
    logger.add_text_output(text_log_file)
    logger.add_tabular_output(tabular_log_file)
    prev_snapshot_dir = logger.get_snapshot_dir()
    prev_mode = logger.get_snapshot_mode()
    logger.set_snapshot_dir(log_dir)
    logger.set_snapshot_mode(args.snapshot_mode)
    logger.set_log_tabular_only(args.log_tabular_only)
    logger.push_prefix("[%s] " % args.exp_name)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=args.n_timesteps,
        max_path_length=args.max_traj_len,
        n_itr=args.n_iter,
        discount=args.discount,
        gae_lambda=args.gae_lambda,
        step_size=args.max_kl,
        mode=args.control,)
    algo.train()
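# A hypothetical launch of the script above (assuming it is saved as
# run_pursuit.py; the file name and flag values are illustrative only):
import subprocess

subprocess.check_call([
    "python", "run_pursuit.py",
    "--control", "decentralized",
    "--n_pursuers", "4",
    "--n_evaders", "10",
    "--recurrent", "gru",
    "--policy_hidden_sizes", "128",
    "--n_iter", "500",
    "--seed", "42",
])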
log_dir = "Data/Ad05RTheta2RSteer2FR" tabular_log_file = osp.join(log_dir, "progress.csv") text_log_file = osp.join(log_dir, "debug.log") params_log_file = osp.join(log_dir, "params.json") pkl_file = osp.join(log_dir, "params.pkl") logger.add_text_output(text_log_file) logger.add_tabular_output(tabular_log_file) prev_snapshot_dir = logger.get_snapshot_dir() prev_mode = logger.get_snapshot_mode() logger.set_snapshot_dir(log_dir) logger.set_snapshot_mode("gaplast") logger.set_snapshot_gap(1000) logger.set_log_tabular_only(False) logger.push_prefix("[%s] " % "JustEgo_Baseline") from rllab.sampler import parallel_sampler parallel_sampler.initialize(n_parallel=1) parallel_sampler.set_seed(0) env = TfEnv(JustEgoEnv(port=9421)) obs1_dim = 4 action1_dim = 2 policy = GaussianMLPPolicy( env_spec=env.spec, name="BaselinePolicy", learn_std=True, init_std=0.1,
def train(self):
    pool = SimpleReplayPool(
        max_pool_size=self.replay_pool_size,
        observation_dim=self.env.observation_space.flat_dim,
        action_dim=self.env.action_space.flat_dim,
    )
    self.start_worker()
    self.init_opt()
    itr = 0
    path_length = 0
    path_return = 0
    terminal = False
    observation = self.env.reset()

    sample_policy = pickle.loads(pickle.dumps(self.policy))
    # self.experiment_space = self.env.action_space

    for epoch in xrange(self.n_epochs):
        logger.push_prefix('epoch #%d | ' % epoch)
        logger.log("Training started")
        for epoch_itr in pyprind.prog_bar(xrange(self.epoch_length)):
            # Execute policy
            if terminal:
                observation = self.env.reset()
                self.es.reset()
                sample_policy.reset()
                self.es_path_returns.append(path_return)
                path_length = 0
                path_return = 0
            action = self.es.get_action(itr, observation, policy=sample_policy)  # qf=qf)

            next_observation, reward, terminal, _ = self.env.step(action, observation)
            path_length += 1
            path_return += reward

            if not terminal and path_length >= self.max_path_length:
                terminal = True
                # only store the horizon-terminal transition if the flag is set
                if self.include_horizon_terminal_transitions:
                    pool.add_sample(
                        self.env.observation_space.flatten(observation),
                        self.env.action_space.flatten(action),
                        reward * self.scale_reward,
                        terminal
                    )
            else:
                pool.add_sample(
                    self.env.observation_space.flatten(observation),
                    self.env.action_space.flatten(action),
                    reward * self.scale_reward,
                    terminal
                )

            observation = next_observation

            if pool.size >= self.min_pool_size:
                for update_itr in xrange(self.n_updates_per_sample):
                    # Train policy
                    batch = pool.random_batch(self.batch_size)
                    self.do_training(itr, batch)
                sample_policy.set_param_values(self.policy.get_param_values())

            itr += 1
            self.pool = pool

        logger.log("Training finished")
        if pool.size >= self.min_pool_size:
            self.evaluate(epoch, pool)
            params = self.get_epoch_snapshot(epoch)
            logger.save_itr_params(epoch, params)
        logger.dump_tabular(with_prefix=False)
        logger.pop_prefix()
        if self.plot:
            self.update_plot()
            if self.pause_for_plot:
                raw_input("Plotting evaluation run: Press Enter to "
                          "continue...")
    self.env.terminate()
    self.policy.terminate()
def _train(self, env, policy, pool): """When training our policy expects an augmented observation.""" self._init_training(env, policy, pool) with self._sess.as_default(): observation = env.reset() policy.reset() log_p_z_episode = [] # Store log_p_z for this episode path_length = 0 path_return = 0 last_path_return = 0 max_path_return = -np.inf n_episodes = 0 if self._learn_p_z: log_p_z_list = [ deque(maxlen=self._max_path_length) for _ in range(self._num_skills) ] gt.rename_root('RLAlgorithm') gt.reset() gt.set_def_unique(False) for epoch in gt.timed_for(range(self._n_epochs + 1), save_itrs=True): logger.push_prefix('Epoch #%d | ' % epoch) path_length_list = [] z = self._sample_z() aug_obs = utils.concat_obs_z(observation, z, self._num_skills) for t in range(self._epoch_length): iteration = t + epoch * self._epoch_length action, _ = policy.get_action(aug_obs) if self._learn_p_z: (obs, _) = utils.split_aug_obs(aug_obs, self._num_skills) feed_dict = { self._discriminator._obs_pl: obs[None], self._discriminator._action_pl: action[None] } logits = tf_utils.get_default_session().run( self._discriminator._output_t, feed_dict)[0] log_p_z = np.log(utils._softmax(logits)[z]) if self._learn_p_z: log_p_z_list[z].append(log_p_z) next_ob, reward, terminal, info = env.step(action) aug_next_ob = utils.concat_obs_z(next_ob, z, self._num_skills) path_length += 1 path_return += reward self._pool.add_sample( aug_obs, action, reward, terminal, aug_next_ob, ) if terminal or path_length >= self._max_path_length: path_length_list.append(path_length) observation = env.reset() policy.reset() log_p_z_episode = [] path_length = 0 max_path_return = max(max_path_return, path_return) last_path_return = path_return path_return = 0 n_episodes += 1 else: aug_obs = aug_next_ob gt.stamp('sample') if self._pool.size >= self._min_pool_size: for i in range(self._n_train_repeat): batch = self._pool.random_batch(self._batch_size) self._do_training(iteration, batch) gt.stamp('train') if self._learn_p_z: print('learning p(z)') for z in range(self._num_skills): if log_p_z_list[z]: print( '\t skill = %d, min=%.2f, max=%.2f, mean=%.2f, len=%d' % (z, np.min( log_p_z_list[z]), np.max(log_p_z_list[z]), np.mean( log_p_z_list[z]), len(log_p_z_list[z]))) log_p_z = [ np.mean(log_p_z) if log_p_z else np.log(1.0 / self._num_skills) for log_p_z in log_p_z_list ] print('log_p_z: %s' % log_p_z) self._p_z = utils._softmax(log_p_z) self._evaluate(epoch) params = self.get_snapshot(epoch) logger.save_itr_params(epoch, params) times_itrs = gt.get_times().stamps.itrs eval_time = times_itrs['eval'][-1] if epoch > 1 else 0 total_time = gt.get_times().total logger.record_tabular('time-train', times_itrs['train'][-1]) logger.record_tabular('time-eval', eval_time) logger.record_tabular('time-sample', times_itrs['sample'][-1]) logger.record_tabular('time-total', total_time) logger.record_tabular('epoch', epoch) logger.record_tabular('episodes', n_episodes) logger.record_tabular('max-path-return', max_path_return) logger.record_tabular('last-path-return', last_path_return) logger.record_tabular('pool-size', self._pool.size) logger.record_tabular('path-length', np.mean(path_length_list)) logger.dump_tabular(with_prefix=False) logger.pop_prefix() gt.stamp('eval') env.terminate()
def run_experiment(argv): default_log_dir = config.LOG_DIR now = datetime.datetime.now(dateutil.tz.tzlocal()) # avoid name clashes when running distributed jobs rand_id = str(uuid.uuid4())[:5] timestamp = now.strftime('%Y_%m_%d_%H_%M_%S_%f_%Z') default_exp_name = 'experiment_%s_%s' % (timestamp, rand_id) parser = argparse.ArgumentParser() parser.add_argument('--n_parallel', type=int, default=1, help='Number of parallel workers to perform rollouts. 0 => don\'t start any workers') parser.add_argument( '--exp_name', type=str, default=default_exp_name, help='Name of the experiment.') parser.add_argument('--log_dir', type=str, default=None, help='Path to save the log and iteration snapshot.') parser.add_argument('--snapshot_mode', type=str, default='all', help='Mode to save the snapshot. Can be either "all" ' '(all iterations will be saved), "last" (only ' 'the last iteration will be saved), "gap" (every' '`snapshot_gap` iterations are saved), or "none" ' '(do not save snapshots)') parser.add_argument('--snapshot_gap', type=int, default=1, help='Gap between snapshot iterations.') parser.add_argument('--tabular_log_file', type=str, default='progress.csv', help='Name of the tabular log file (in csv).') parser.add_argument('--tabular_log_file2', type=str, default=None, help='Second tabular log file (in csv).') parser.add_argument('--text_log_file', type=str, default='debug.log', help='Name of the text log file (in pure text).') parser.add_argument('--params_log_file', type=str, default='params.json', help='Name of the parameter log file (in json).') parser.add_argument('--variant_log_file', type=str, default='variant.json', help='Name of the variant log file (in json).') parser.add_argument('--resume_from', type=str, default=None, help='Name of the pickle file to resume experiment from.') parser.add_argument('--plot', type=ast.literal_eval, default=False, help='Whether to plot the iteration results') parser.add_argument('--log_tabular_only', type=ast.literal_eval, default=False, help='Whether to only print the tabular log information (in a horizontal format)') parser.add_argument('--seed', type=int, help='Random seed for numpy') parser.add_argument('--args_data', type=str, help='Pickled data for stub objects') parser.add_argument('--variant_data', type=str, help='Pickled data for variant configuration') parser.add_argument('--use_cloudpickle', type=ast.literal_eval, default=False) args = parser.parse_args(argv[1:]) if args.seed is not None: set_seed(args.seed) if args.n_parallel > 0: from rllab.sampler import parallel_sampler parallel_sampler.initialize(n_parallel=args.n_parallel) if args.seed is not None: parallel_sampler.set_seed(args.seed) logger.log("debug1") if args.plot: logger.log("args.plot") from rllab.plotter import plotter logger.log("args.plot") plotter.init_worker() logger.log("debug2") if args.log_dir is None: logger.log("args.logdir1") log_dir = osp.join(default_log_dir, args.exp_name) else: logger.log("args.logdir2") log_dir = args.log_dir tabular_log_file = osp.join(log_dir, args.tabular_log_file) tabular_log_file2 = osp.join(log_dir, args.tabular_log_file2) if args.tabular_log_file2 is not None else osp.join(log_dir,"progress %s.csv" %args.exp_name) text_log_file = osp.join(log_dir, args.text_log_file) params_log_file = osp.join(log_dir, args.params_log_file) logger.log("debug3") if args.variant_data is not None: variant_data = pickle.loads(base64.b64decode(args.variant_data)) variant_log_file = osp.join(log_dir, args.variant_log_file) logger.log_variant(variant_log_file, 
variant_data) else: variant_data = None logger.log("debug4") if not args.use_cloudpickle: logger.log_parameters_lite(params_log_file, args) logger.log("debug5") logger.add_text_output(text_log_file) logger.add_tabular_output(tabular_log_file) logger.add_tabular_output(tabular_log_file2) prev_snapshot_dir = logger.get_snapshot_dir() prev_mode = logger.get_snapshot_mode() logger.set_snapshot_dir(log_dir) logger.set_snapshot_mode(args.snapshot_mode) logger.set_snapshot_gap(args.snapshot_gap) logger.set_log_tabular_only(args.log_tabular_only) logger.push_prefix("[%s] " % args.exp_name) logger.log("debug6") if args.resume_from is not None: data = joblib.load(args.resume_from) assert 'algo' in data algo = data['algo'] algo.train() else: # read from stdin logger.log("debug6.5") if args.use_cloudpickle: logger.log("debug6.5.1") import cloudpickle method_call = cloudpickle.loads(base64.b64decode(args.args_data)) method_call(variant_data) else: logger.log("debug6.5.2") data = pickle.loads(base64.b64decode(args.args_data)) logger.log("debug6.5.2.1") logger.log(str(args.args_data)) maybe_iter = concretize(data) logger.log("debug6.5.2.2") if is_iterable(maybe_iter): logger.log("debug6.5.2.3") for _ in maybe_iter: pass logger.log("debug7") logger.set_snapshot_mode(prev_mode) logger.set_snapshot_dir(prev_snapshot_dir) logger.remove_tabular_output(tabular_log_file) logger.remove_tabular_output(tabular_log_file2) logger.remove_text_output(text_log_file) logger.pop_prefix() logger.log("debug8")
env_spec=env.spec, # The neural network policy has a single hidden layer with 32 hidden units. hidden_sizes=(32, )) baseline = LinearFeatureBaseline(env_spec=env.spec) # logger LOG_DIR = 'walker_gru_test' tabular_log_file = osp.join(LOG_DIR, 'progress.csv') text_log_file = osp.join(LOG_DIR, 'debug.log') params_log_file = osp.join(LOG_DIR, 'params.json') logger.add_text_output(text_log_file) logger.add_tabular_output(tabular_log_file) logger.set_snapshot_dir(LOG_DIR) logger.set_snapshot_mode('last') logger.set_log_tabular_only(False) logger.push_prefix("[%s] " % 'Walker') algo = TRPO(env=env, policy=policy, baseline=baseline, batch_size=1200, max_path_length=500, n_itr=500, discount=0.99, step_size=0.01, mode='centralized') algo.train()
def train(self): # This seems like a rather sequential method pool = SimpleReplayPool( max_pool_size=self.replay_pool_size, observation_dim=self.env.observation_space.flat_dim, action_dim=self.env.action_space.flat_dim, ) self.start_worker() self.init_opt() itr = 0 path_length = 0 path_return = 0 terminal = False observation = self.env.reset() sample_policy = pickle.loads(pickle.dumps(self.policy)) for epoch in range(self.n_epochs): logger.push_prefix('epoch #%d | ' % epoch) logger.log("Training started") for epoch_itr in pyprind.prog_bar(range(self.epoch_length)): # Execute policy if terminal: # or path_length > self.max_path_length: # Note that if the last time step ends an episode, the very # last state and observation will be ignored and not added # to the replay pool observation = self.env.reset() self.es.reset() sample_policy.reset() self.es_path_returns.append(path_return) path_length = 0 path_return = 0 action = self.es.get_action(itr, observation, policy=sample_policy) # qf=qf) next_observation, reward, terminal, _ = self.env.step(action) path_length += 1 path_return += reward if not terminal and path_length >= self.max_path_length: terminal = True # only include the terminal transition in this case if the flag was set if self.include_horizon_terminal_transitions: pool.add_sample(observation, action, reward * self.scale_reward, terminal) else: pool.add_sample(observation, action, reward * self.scale_reward, terminal) observation = next_observation if pool.size >= self.min_pool_size: for update_itr in range(self.n_updates_per_sample): # Train policy batch = pool.random_batch(self.batch_size) self.do_training(itr, batch) sample_policy.set_param_values(self.policy.get_param_values()) itr += 1 logger.log("Training finished") if pool.size >= self.min_pool_size: self.evaluate(epoch, pool) params = self.get_epoch_snapshot(epoch) logger.save_itr_params(epoch, params) logger.dump_tabular(with_prefix=False) logger.pop_prefix() if self.plot: self.update_plot() if self.pause_for_plot: input("Plotting evaluation run: Press Enter to " "continue...") self.env.terminate() self.policy.terminate()
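Transitions are stored flat in a SimpleReplayPool and later drawn with random_batch; the pool implementation is not part of this listing. The sketch below shows the minimal ring-buffer behaviour the loop relies on (add_sample(obs, action, reward, terminal), random_batch(batch_size), and .size). It is a simplification, not rllab's actual SimpleReplayPool: next observations are recovered from the slot following each sampled index, and terminal flags are what mask the bootstrap across episode boundaries.

import numpy as np

class MinimalReplayPool(object):
    """Sketch of a fixed-size FIFO replay pool with the interface used above."""

    def __init__(self, max_pool_size, observation_dim, action_dim):
        self.max_pool_size = max_pool_size
        self.observations = np.zeros((max_pool_size, observation_dim))
        self.actions = np.zeros((max_pool_size, action_dim))
        self.rewards = np.zeros(max_pool_size)
        self.terminals = np.zeros(max_pool_size, dtype=bool)
        self.top = 0    # next slot to write; oldest entry is overwritten when full
        self.size = 0   # number of valid entries

    def add_sample(self, observation, action, reward, terminal):
        self.observations[self.top] = observation
        self.actions[self.top] = action
        self.rewards[self.top] = reward
        self.terminals[self.top] = terminal
        self.top = (self.top + 1) % self.max_pool_size
        self.size = min(self.size + 1, self.max_pool_size)

    def random_batch(self, batch_size):
        # The next observation of transition i is the observation stored at i + 1,
        # so skip the most recently written slot (its successor is not valid yet).
        assert self.size > 1
        newest = (self.top - 1) % self.max_pool_size
        indices = []
        while len(indices) < batch_size:
            index = np.random.randint(0, self.size)
            if index != newest:
                indices.append(index)
        indices = np.asarray(indices)
        next_indices = (indices + 1) % self.max_pool_size
        return dict(
            observations=self.observations[indices],
            actions=self.actions[indices],
            rewards=self.rewards[indices],
            terminals=self.terminals[indices],
            next_observations=self.observations[next_indices],
        )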
def run_task(*_): # normalize() makes sure that the actions for the environment lie # within the range [-1, 1] (only works for environments with continuous actions) env = normalize( GymEnv(env_name="LunarLanderContinuous-v2", force_reset=True)) # env = normalize(GymEnv(env_name="CartPole-v0", force_reset=True)) # Initialize a neural network policy with two hidden layers of 64 units each policy = GaussianMLPPolicy(env.spec, hidden_sizes=(64, 64)) # policy = CategoricalMLPPolicy(env.spec, hidden_sizes=(64, 64)) # We will collect 3 trajectories per iteration N = 3 # Each trajectory will have at most 400 time steps T = 400 # Number of iterations n_itr = 1000 # Set the discount factor for the problem discount = 0.99 # Learning rate for the gradient update learning_rate = 0.001 # Construct the computation graph # Create a Theano variable for storing the observations # We could have simply written `observations_var = TT.matrix('observations')` instead for this example. However, # doing it in a slightly more abstract way allows us to delegate to the environment for handling the correct data # type for the variable. For instance, for an environment with discrete observations, we might want to use integer # types if the observations are represented as one-hot vectors. observations_var = env.observation_space.new_tensor_variable( 'observations', # It should have 1 extra dimension since we want to represent a list of observations extra_dims=1) actions_var = env.action_space.new_tensor_variable('actions', extra_dims=1) returns_var = TT.vector('returns') # policy.dist_info_sym returns a dictionary, whose values are symbolic expressions for quantities related to the # distribution of the actions. For a Gaussian policy, it contains the mean and the logarithm of the standard deviation. dist_info_vars = policy.dist_info_sym(observations_var) # policy.distribution returns a distribution object under rllab.distributions. It contains many utilities for computing # distribution-related quantities, given the computed dist_info_vars. Below we use dist.log_likelihood_sym to compute # the symbolic log-likelihood. For this example, the corresponding distribution is an instance of the class # rllab.distributions.DiagonalGaussian dist = policy.distribution # Note that we negate the objective, since most optimizers assume a minimization problem surr = -TT.mean( dist.log_likelihood_sym(actions_var, dist_info_vars) * returns_var) # Get the list of trainable parameters. params = policy.get_params(trainable=True) grads = theano.grad(surr, params) f_train = theano.function( inputs=[observations_var, actions_var, returns_var], outputs=None, updates=adam(grads, params, learning_rate=learning_rate), allow_input_downcast=True) for epoch in range(n_itr): ################################################################## logger.push_prefix('Epoch #%d | ' % (epoch)) logger.log("Training started") ################################################################## paths = [] for _ in range(N): observations = [] actions = [] rewards = [] observation = env.reset() for _ in range(T): # policy.get_action() returns a pair of values. The second one is a dictionary whose values contain # sufficient statistics for the action distribution. It should at least contain entries that would be # returned by calling policy.dist_info(), which is the non-symbolic analog of policy.dist_info_sym(). # Storing these statistics is useful, e.g., when forming importance sampling ratios. In our case it is # not needed.
action, _ = policy.get_action(observation) # Recall that the last entry of the tuple stores diagnostic information about the environment. In our # case it is not needed. next_observation, reward, terminal, _ = env.step(action) observations.append(observation) actions.append(action) rewards.append(reward) observation = next_observation if terminal: # Finish rollout if terminal state reached break # We need to compute the empirical return for each time step along the # trajectory (return to go) returns = [] return_so_far = 0 for t in range(len(rewards) - 1, -1, -1): return_so_far = rewards[t] + discount * return_so_far returns.append(return_so_far) # The returns are stored backwards in time, so we need to reverse them returns = returns[::-1] paths.append( dict(observations=np.array(observations), actions=np.array(actions), rewards=np.array(rewards), returns=np.array(returns))) observations = np.concatenate([p["observations"] for p in paths]) actions = np.concatenate([p["actions"] for p in paths]) returns = np.concatenate([p["returns"] for p in paths]) f_train(observations, actions, returns) print('Average Return:', np.mean([sum(p["rewards"]) for p in paths])) ############################################################################ logger.log("Training finished") # Record the tabular entries before dumping them; dumping first would delay them by one # epoch and silently drop the final epoch's statistics. logger.record_tabular('Epoch', epoch) logger.record_tabular('Steps', epoch * N * T) logger.record_tabular('AverageReturn', np.mean(returns)) logger.record_tabular('StdReturn', np.std(returns)) logger.record_tabular('MaxReturn', np.max(returns)) logger.record_tabular('MinReturn', np.min(returns)) logger.save_itr_params(epoch, params) logger.dump_tabular(with_prefix=False) logger.pop_prefix()
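The inner loop that builds returns walks the reward sequence backwards and accumulates reward + discount * return_so_far. Pulled out as a standalone helper it is easy to sanity-check; this function is not part of the original script, just a restatement of that loop.

import numpy as np

def discounted_returns_to_go(rewards, discount):
    """Return-to-go for each time step: G_t = r_t + discount * G_{t+1}."""
    returns = np.zeros(len(rewards))
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + discount * running
        returns[t] = running
    return returns

# Example: three rewards of 1 with discount 0.99.
print(discounted_returns_to_go([1.0, 1.0, 1.0], 0.99))
# [1 + 0.99*(1 + 0.99*1), 1 + 0.99*1, 1] = [2.9701, 1.99, 1.0]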
def setup(self, env, policy, start_itr): if not self.args.algo == 'thddpg': # Baseline if self.args.baseline_type == 'linear': baseline = LinearFeatureBaseline(env_spec=env.spec) elif self.args.baseline_type == 'zero': baseline = ZeroBaseline(env_spec=env.spec) else: raise NotImplementedError(self.args.baseline_type) if self.args.control == 'concurrent': baseline = [baseline for _ in range(len(env.agents))] # Logger default_log_dir = config.LOG_DIR if self.args.log_dir is None: log_dir = osp.join(default_log_dir, self.args.exp_name) else: log_dir = self.args.log_dir tabular_log_file = osp.join(log_dir, self.args.tabular_log_file) text_log_file = osp.join(log_dir, self.args.text_log_file) params_log_file = osp.join(log_dir, self.args.params_log_file) logger.log_parameters_lite(params_log_file, self.args) logger.add_text_output(text_log_file) logger.add_tabular_output(tabular_log_file) prev_snapshot_dir = logger.get_snapshot_dir() prev_mode = logger.get_snapshot_mode() logger.set_snapshot_dir(log_dir) logger.set_snapshot_mode(self.args.snapshot_mode) logger.set_log_tabular_only(self.args.log_tabular_only) logger.push_prefix("[%s] " % self.args.exp_name) if self.args.algo == 'tftrpo': algo = MATRPO(env=env, policy_or_policies=policy, baseline_or_baselines=baseline, batch_size=self.args.batch_size, start_itr=start_itr, max_path_length=self.args.max_path_length, n_itr=self.args.n_iter, discount=self.args.discount, gae_lambda=self.args.gae_lambda, step_size=self.args.step_size, optimizer=ConjugateGradientOptimizer( hvp_approach=FiniteDifferenceHvp(base_eps=1e-5)) if self.args.recurrent else None, ma_mode=self.args.control) elif self.args.algo == 'thddpg': qfunc = thContinuousMLPQFunction(env_spec=env.spec) if self.args.exp_strategy == 'ou': es = OUStrategy(env_spec=env.spec) elif self.args.exp_strategy == 'gauss': es = GaussianStrategy(env_spec=env.spec) else: raise NotImplementedError() algo = thDDPG(env=env, policy=policy, qf=qfunc, es=es, batch_size=self.args.batch_size, max_path_length=self.args.max_path_length, epoch_length=self.args.epoch_length, min_pool_size=self.args.min_pool_size, replay_pool_size=self.args.replay_pool_size, n_epochs=self.args.n_iter, discount=self.args.discount, scale_reward=0.01, qf_learning_rate=self.args.qfunc_lr, policy_learning_rate=self.args.policy_lr, eval_samples=self.args.eval_samples, mode=self.args.control) return algo
def train(self): with tf.Session() as sess: sess.run(tf.global_variables_initializer()) # This seems like a rather sequential method pool = SimpleReplayPool( max_pool_size=self.replay_pool_size, observation_dim=self.env.observation_space.flat_dim, action_dim=self.env.action_space.flat_dim, replacement_prob=self.replacement_prob, ) self.start_worker() self.init_opt() # This initializes the optimizer parameters sess.run(tf.global_variables_initializer()) itr = 0 path_length = 0 path_return = 0 terminal = False initial = False observation = self.env.reset() with tf.variable_scope("sample_policy"): sample_policy = Serializable.clone(self.policy) for epoch in range(self.n_epochs): logger.push_prefix('epoch #%d | ' % epoch) logger.log("Training started") train_qf_itr, train_policy_itr = 0, 0 # updated_q_network, updated_policy_network, _, _, end_trajectory_action, end_trajectory_state = self.lp.lp_exploration() # Don't need to set the values because we're actually using the same policy/qf already # self.qf.set_param_values(updated_q_network.get_param_values()) # self.policy.set_param_values(updated_policy_network.get_param_values()) # observation = end_trajectory_state for epoch_itr in pyprind.prog_bar(range(self.epoch_length)): # Execute policy if terminal: # or path_length > self.max_path_length: # Note that if the last time step ends an episode, the very # last state and observation will be ignored and not added # to the replay pool observation = self.env.reset() self.es.reset() sample_policy.reset() self.es_path_returns.append(path_return) path_length = 0 path_return = 0 initial = True else: initial = False action = self.es.get_action(itr, observation, policy=sample_policy) # qf=qf) next_observation, reward, terminal, _ = self.env.step(action) path_length += 1 path_return += reward if not terminal and path_length >= self.max_path_length: terminal = True # only include the terminal transition in this case if the flag was set if self.include_horizon_terminal_transitions: pool.add_sample(observation, action, reward * self.scale_reward, terminal, initial) else: pool.add_sample(observation, action, reward * self.scale_reward, terminal, initial) observation = next_observation if pool.size >= self.min_pool_size: for update_itr in range(self.n_updates_per_sample): # Train policy batch = pool.random_batch(self.batch_size) itrs = self.do_training(itr, batch) train_qf_itr += itrs[0] train_policy_itr += itrs[1] sample_policy.set_param_values(self.policy.get_param_values()) itr += 1 logger.log("Training finished") logger.log("Trained qf %d steps, policy %d steps"%(train_qf_itr, train_policy_itr)) if pool.size >= self.min_pool_size: self.evaluate(epoch, pool) params = self.get_epoch_snapshot(epoch) logger.save_itr_params(epoch, params) logger.dump_tabular(with_prefix=False) logger.pop_prefix() if self.plot: self.update_plot() if self.pause_for_plot: input("Plotting evaluation run: Press Enter to " "continue...") self.env.terminate() self.policy.terminate()
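self.do_training(itr, batch) is where the Q-function and policy updates happen; its body is not shown in this listing. In DDPG those gradient steps are normally followed by a soft ("Polyak") update of target networks, so a plausible sketch of that step, assuming parameters are exposed as flat vectors via get_param_values / set_param_values (the same interface used for sample_policy above), is given below. The names target_qf, target_policy, and soft_target_tau are hypothetical.

def soft_target_update(source, target, tau=0.001):
    """Polyak averaging: target <- tau * source + (1 - tau) * target.
    Assumes rllab-style get_param_values()/set_param_values() on both networks."""
    new_values = (tau * source.get_param_values()
                  + (1.0 - tau) * target.get_param_values())
    target.set_param_values(new_values)

# Hypothetical use inside do_training, after the Q-function and policy gradient steps:
# soft_target_update(self.qf, self.target_qf, tau=self.soft_target_tau)
# soft_target_update(self.policy, self.target_policy, tau=self.soft_target_tau)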
def train(self): cur_std = self.sigma0 cur_mean = self.policy.get_param_values() es = cma_es_lib.CMAEvolutionStrategy( cur_mean, cur_std) parallel_sampler.populate_task(self.env, self.policy) if self.plot: plotter.init_plot(self.env, self.policy) itr = 0 while itr < self.n_itr and not es.stop(): if self.batch_size is None: # Sample from multivariate normal distribution. xs = es.ask() xs = np.asarray(xs) # For each sample, do a rollout. infos = ( stateful_pool.singleton_pool.run_map(sample_return, [(x, self.max_path_length, self.discount) for x in xs])) else: cum_len = 0 infos = [] xss = [] done = False while not done: sbs = stateful_pool.singleton_pool.n_parallel * 2 # Sample from multivariate normal distribution. # You want to ask for sbs samples here. xs = es.ask(sbs) xs = np.asarray(xs) xss.append(xs) sinfos = stateful_pool.singleton_pool.run_map( sample_return, [(x, self.max_path_length, self.discount) for x in xs]) for info in sinfos: infos.append(info) cum_len += len(info['returns']) if cum_len >= self.batch_size: xs = np.concatenate(xss) done = True break # Evaluate fitness of samples (negative as it is minimization # problem). fs = - np.array([info['returns'][0] for info in infos]) # When batching, you could have generated too many samples compared # to the actual evaluations. So we cut it off in this case. xs = xs[:len(fs)] # Update CMA-ES params based on sample fitness. es.tell(xs, fs) logger.push_prefix('itr #%d | ' % itr) logger.record_tabular('Iteration', itr) logger.record_tabular('CurStdMean', np.mean(cur_std)) undiscounted_returns = np.array( [info['undiscounted_return'] for info in infos]) logger.record_tabular('AverageReturn', np.mean(undiscounted_returns)) logger.record_tabular('StdReturn', np.std(undiscounted_returns)) logger.record_tabular('MaxReturn', np.max(undiscounted_returns)) logger.record_tabular('MinReturn', np.min(undiscounted_returns)) logger.record_tabular('AverageDiscountedReturn', np.mean(fs)) logger.record_tabular('AvgTrajLen', np.mean([len(info['returns']) for info in infos])) self.env.log_diagnostics(infos) self.policy.log_diagnostics(infos) logger.save_itr_params(itr, dict( itr=itr, policy=self.policy, env=self.env, )) logger.dump_tabular(with_prefix=False) if self.plot: plotter.update_plot(self.policy, self.max_path_length) logger.pop_prefix() # Update iteration. itr += 1 # Set final params. self.policy.set_param_values(es.result()[0]) parallel_sampler.terminate_task()
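The loop above follows the standard ask/evaluate/tell pattern of the cma package (imported as cma_es_lib): es.ask() proposes parameter vectors, each is rolled out to obtain a return, and es.tell(xs, fs) updates the search distribution using the negated returns, since CMA-ES minimizes. A stripped-down version of the same pattern on a toy objective, assuming the cma package is installed, might look like this:

import numpy as np
import cma  # same package as cma_es_lib above

def toy_objective(x):
    # Stand-in for a policy rollout: a quadratic we want to maximize at x = 1.
    return -np.sum((x - 1.0) ** 2)

es = cma.CMAEvolutionStrategy(np.zeros(5), 0.5, {'maxiter': 200, 'verbose': -9})
while not es.stop():
    xs = es.ask()                                      # sample candidate parameter vectors
    fs = [-toy_objective(np.asarray(x)) for x in xs]   # negate: CMA-ES minimizes
    es.tell(xs, fs)                                    # update the search distribution
print('best x:', es.best.x)  # best solution found; the code above uses es.result()[0],
                             # whose exact form depends on the cma version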
def main(): now = datetime.datetime.now(dateutil.tz.tzlocal()) rand_id = str(uuid.uuid4())[:5] timestamp = now.strftime('%Y_%m_%d_%H_%M_%S_%f_%Z') default_exp_name = 'experiment_%s_%s' % (timestamp, rand_id) parser = argparse.ArgumentParser() parser.add_argument('--exp_name', type=str, default=default_exp_name, help='Name of the experiment.') parser.add_argument('--discount', type=float, default=0.95) parser.add_argument('--gae_lambda', type=float, default=0.99) parser.add_argument('--reward_scale', type=float, default=1.0) parser.add_argument('--enable_obsnorm', action='store_true', default=False) parser.add_argument('--chunked', action='store_true', default=False) parser.add_argument('--n_iter', type=int, default=250) parser.add_argument('--sampler_workers', type=int, default=1) parser.add_argument('--max_traj_len', type=int, default=250) parser.add_argument('--update_curriculum', action='store_true', default=False) parser.add_argument('--anneal_step_size', type=int, default=0) parser.add_argument('--n_timesteps', type=int, default=8000) parser.add_argument('--control', type=str, default='centralized') parser.add_argument('--buffer_size', type=int, default=1) parser.add_argument('--radius', type=float, default=0.015) parser.add_argument('--n_evaders', type=int, default=10) parser.add_argument('--n_pursuers', type=int, default=8) parser.add_argument('--n_poison', type=int, default=10) parser.add_argument('--n_coop', type=int, default=4) parser.add_argument('--n_sensors', type=int, default=30) parser.add_argument('--sensor_range', type=str, default='0.2') parser.add_argument('--food_reward', type=float, default=5) parser.add_argument('--poison_reward', type=float, default=-1) parser.add_argument('--encounter_reward', type=float, default=0.05) parser.add_argument('--reward_mech', type=str, default='local') parser.add_argument('--recurrent', type=str, default=None) parser.add_argument('--baseline_type', type=str, default='linear') parser.add_argument('--policy_hidden_sizes', type=str, default='128,128') parser.add_argument('--baseline_hidden_sizes', type=str, default='128,128') parser.add_argument('--max_kl', type=float, default=0.01) parser.add_argument('--log_dir', type=str, required=False) parser.add_argument('--tabular_log_file', type=str, default='progress.csv', help='Name of the tabular log file (in csv).') parser.add_argument('--text_log_file', type=str, default='debug.log', help='Name of the text log file (in pure text).') parser.add_argument('--params_log_file', type=str, default='params.json', help='Name of the parameter log file (in json).') parser.add_argument('--seed', type=int, help='Random seed for numpy') parser.add_argument('--args_data', type=str, help='Pickled data for stub objects') parser.add_argument('--snapshot_mode', type=str, default='all', help='Mode to save the snapshot. 
Can be either "all" ' '(all iterations will be saved), "last" (only ' 'the last iteration will be saved), or "none" ' '(do not save snapshots)') parser.add_argument( '--log_tabular_only', type=ast.literal_eval, default=False, help='Whether to only print the tabular log information (in a horizontal format)') args = parser.parse_args() parallel_sampler.initialize(n_parallel=args.sampler_workers) if args.seed is not None: set_seed(args.seed) parallel_sampler.set_seed(args.seed) args.hidden_sizes = tuple(map(int, args.policy_hidden_sizes.split(','))) centralized = True if args.control == 'centralized' else False sensor_range = np.array(map(float, args.sensor_range.split(','))) if len(sensor_range) == 1: sensor_range = sensor_range[0] else: assert sensor_range.shape == (args.n_pursuers,) env = MAWaterWorld(args.n_pursuers, args.n_evaders, args.n_coop, args.n_poison, radius=args.radius, n_sensors=args.n_sensors, food_reward=args.food_reward, poison_reward=args.poison_reward, encounter_reward=args.encounter_reward, reward_mech=args.reward_mech, sensor_range=sensor_range, obstacle_loc=None) env = TfEnv( RLLabEnv( StandardizedEnv(env, scale_reward=args.reward_scale, enable_obsnorm=args.enable_obsnorm), mode=args.control)) if args.buffer_size > 1: env = ObservationBuffer(env, args.buffer_size) if args.recurrent: feature_network = MLP( name='feature_net', input_shape=(env.spec.observation_space.flat_dim + env.spec.action_space.flat_dim,), output_dim=16, hidden_sizes=(128, 64, 32), hidden_nonlinearity=tf.nn.tanh, output_nonlinearity=None) if args.recurrent == 'gru': policy = GaussianGRUPolicy(env_spec=env.spec, feature_network=feature_network, hidden_dim=int(args.policy_hidden_sizes), name='policy') elif args.recurrent == 'lstm': policy = GaussianLSTMPolicy(env_spec=env.spec, feature_network=feature_network, hidden_dim=int(args.policy_hidden_sizes), name='policy') else: policy = GaussianMLPPolicy( name='policy', env_spec=env.spec, hidden_sizes=tuple(map(int, args.policy_hidden_sizes.split(','))), min_std=10e-5) if args.baseline_type == 'linear': baseline = LinearFeatureBaseline(env_spec=env.spec) elif args.baseline_type == 'mlp': raise NotImplementedError() # baseline = GaussianMLPBaseline( # env_spec=env.spec, hidden_sizes=tuple(map(int, args.baseline_hidden_sizes.split(',')))) else: baseline = ZeroBaseline(env_spec=env.spec) # logger default_log_dir = config.LOG_DIR if args.log_dir is None: log_dir = osp.join(default_log_dir, args.exp_name) else: log_dir = args.log_dir tabular_log_file = osp.join(log_dir, args.tabular_log_file) text_log_file = osp.join(log_dir, args.text_log_file) params_log_file = osp.join(log_dir, args.params_log_file) logger.log_parameters_lite(params_log_file, args) logger.add_text_output(text_log_file) logger.add_tabular_output(tabular_log_file) prev_snapshot_dir = logger.get_snapshot_dir() prev_mode = logger.get_snapshot_mode() logger.set_snapshot_dir(log_dir) logger.set_snapshot_mode(args.snapshot_mode) logger.set_log_tabular_only(args.log_tabular_only) logger.push_prefix("[%s] " % args.exp_name) algo = TRPO( env=env, policy=policy, baseline=baseline, batch_size=args.n_timesteps, max_path_length=args.max_traj_len, #max_path_length_limit=args.max_path_length_limit, update_max_path_length=args.update_curriculum, anneal_step_size=args.anneal_step_size, n_itr=args.n_iter, discount=args.discount, gae_lambda=args.gae_lambda, step_size=args.max_kl, optimizer=ConjugateGradientOptimizer(hvp_approach=FiniteDifferenceHvp(base_eps=1e-5)) if args.recurrent else None, mode=args.control if 
not args.chunked else 'chunk_{}'.format(args.control),) algo.train()
def train(self): pool = SimpleReplayPool( max_pool_size=self.replay_pool_size, observation_dim=self.env.observation_space.flat_dim, action_dim=self.env.action_space.flat_dim, ) self.start_worker() self.init_opt() itr = 0 path_length = 0 path_return = 0 terminal = False observation = self.env.reset() sample_policy = pickle.loads(pickle.dumps(self.policy)) # Loop over the training epochs for epoch in range(self.n_epochs): print("Number of Training Epochs", epoch) logger.push_prefix('epoch #%d | ' % epoch) logger.log("Training started") print("Q Network Weights before LP", self.qf.get_param_values(regularizable=True)) updated_q_network, updated_policy_network, _, _, end_trajectory_action, end_trajectory_state = self.lp.lp_exploration( ) self.qf = updated_q_network self.policy = updated_policy_network observation = end_trajectory_state for epoch_itr in pyprind.prog_bar(range(self.epoch_length)): # Execute policy if terminal: # or path_length > self.max_path_length: # Note that if the last time step ends an episode, the very # last state and observation will be ignored and not added # to the replay pool observation = self.env.reset() self.es.reset() sample_policy.reset() self.es_path_returns.append(path_return) path_length = 0 path_return = 0 initial = True else: initial = False action = self.es.get_action(itr, observation, policy=sample_policy) # qf=qf) # next state and reward based on the chosen action next_observation, reward, terminal, _ = self.env.step(action) path_length += 1 # accumulate the episode return path_return += reward if not terminal and path_length >= self.max_path_length: terminal = True # only include the terminal transition in this case if the flag was set if self.include_horizon_terminal_transitions: pool.add_sample(observation, action, reward * self.scale_reward, terminal, initial) else: pool.add_sample(observation, action, reward * self.scale_reward, terminal, initial) observation = next_observation if pool.size >= self.min_pool_size: for update_itr in range(self.n_updates_per_sample): # print ("Replay Buffer, iterations", update_itr) # Train policy batch = pool.random_batch(self.batch_size) self.do_training(itr, batch) sample_policy.set_param_values( self.policy.get_param_values()) itr += 1 logger.log("Training finished") print("DDPG, Updated Q Network Weights", self.qf.get_param_values(regularizable=True)) if pool.size >= self.min_pool_size: self.evaluate(epoch, pool) params = self.get_epoch_snapshot(epoch) logger.save_itr_params(epoch, params) logger.dump_tabular(with_prefix=False) logger.pop_prefix() if self.plot: self.update_plot() if self.pause_for_plot: input("Plotting evaluation run: Press Enter to " "continue...") self.env.terminate() self.policy.terminate()