Esempio n. 1
0
 def log_diagnostics(self, paths, log_prefix='Gather', *args, **kwargs):
     """Log the outer (gather) returns, then delegate to the wrapped env.

     The per-path sum of ``env_infos['outer_rew']`` is recorded under the
     ``<log_prefix>_`` tabular prefix. Each path is then shallow-copied with
     its observations truncated to the first
     ``self.wrapped_env.observation_space.flat_dim`` columns, and those
     copies are handed to ``self.wrapped_env.log_diagnostics`` under the
     ``wrapped_`` prefix. If the first path carries ``inner_rew``, the mean
     of the per-path inner returns is logged as ``AverageReturn``.

     :param paths: sequence of path dicts; ``paths[0]`` is indexed, so the
         sequence must be non-empty.
     :param log_prefix: tabular prefix (an underscore is appended).
     :param args: accepted but unused.
     :param kwargs: accepted but unused.
     """
     outer_returns = [sum(p['env_infos']['outer_rew']) for p in paths]
     with logger.tabular_prefix(log_prefix + '_'):
         logger.record_tabular_misc_stat(
             'Return', outer_returns, placement='front')

     stripped_paths = []
     for original in paths:
         # Shallow copy so the caller's path dict keeps its full observations.
         trimmed = dict(original)
         # The column slice assumes 2-D (time, features) observations; it
         # breaks if the robot observations are not flat vectors.
         trimmed['observations'] = trimmed['observations'][
             :, :self.wrapped_env.observation_space.flat_dim]
         stripped_paths.append(trimmed)

     with logger.tabular_prefix('wrapped_'):
         first = paths[0]
         if 'env_infos' in first and 'inner_rew' in first['env_infos']:
             inner_returns = [
                 np.sum(p['env_infos']['inner_rew']) for p in paths
             ]
             logger.record_tabular('AverageReturn', np.mean(inner_returns))
         # See swimmer_env.py for a sketch of the maze plotting.
         self.wrapped_env.log_diagnostics(stripped_paths)
    def log_diagnostics(self, paths):
        """Log base/sampler diagnostics plus per-latent return statistics.

        Always forwards ``paths`` to ``BatchPolopt.log_diagnostics`` and to
        ``self.sampler.log_diagnostics``.

        Without a latent (``self.policy.latent_dim`` falsy) the only extra
        output is an optional zero-std rollout logged as
        ``Deterministic_TrueReturn``.

        With a latent, and only when ``self.log_individual_latents`` is set
        and the policy does not resample, paths are bucketed by the latent of
        their first step and, per latent, the mean/std/max of the summed
        ``true_rewards`` are logged (an empty bucket is logged as a single
        zero return). Aggregate max/min/std over the per-latent means are
        logged under ``all_lat_``. With ``self.log_hierarchy`` the number of
        consecutive full-length 10-step rollouts (at most 50) is logged as
        ``Hierarchy``.

        :param paths: iterable of path dicts containing
            ``agent_infos['latents']`` and ``true_rewards``.
        """
        BatchPolopt.log_diagnostics(self, paths)
        self.sampler.log_diagnostics(paths)

        if not self.policy.latent_dim:
            if self.log_deterministic:
                with self.policy.set_std_to_0():
                    det_path = rollout(self.env, self.policy, self.max_path_length)
                logger.record_tabular('Deterministic_TrueReturn', np.sum(det_path["rewards"]))
            return

        # Per-latent logging is only valid for finite discrete latents.
        if not self.log_individual_latents or self.policy.resample:
            return

        # One bucket per latent index, in index order.
        paths_by_latent = collections.OrderedDict(
            (key, []) for key in range(self.policy.latent_dim))
        for path in paths:
            first_latent = path['agent_infos']['latents'][0]
            # from_onehot returns one axis less than its input.
            paths_by_latent[int(from_onehot(first_latent))].append(path)

        avg_return_per_latent = []
        for latent_key, latent_paths in paths_by_latent.items():
            label = str(latent_key)
            with logger.tabular_prefix(label), logger.prefix(label):
                if latent_paths:
                    true_returns = [sum(p["true_rewards"]) for p in latent_paths]
                else:
                    # Default for a latent that no path used.
                    true_returns = [0]
                mean_return = np.mean(true_returns)
                avg_return_per_latent.append(mean_return)
                logger.record_tabular('Avg_TrueReturn', mean_return)
                logger.record_tabular('Std_TrueReturn', np.std(true_returns))
                logger.record_tabular('Max_TrueReturn', np.max(true_returns))
                if self.log_deterministic:
                    fixed_latent = from_index(latent_key, self.policy.latent_dim)
                    with self.policy.fix_latent(fixed_latent), self.policy.set_std_to_0():
                        det_path = rollout(self.env, self.policy, self.max_path_length)
                        logger.record_tabular('Deterministic_TrueReturn', np.sum(det_path["rewards"]))

        with logger.tabular_prefix('all_lat_'), logger.prefix('all_lat_'):
            logger.record_tabular('MaxAvgReturn', np.max(avg_return_per_latent))
            logger.record_tabular('MinAvgReturn', np.min(avg_return_per_latent))
            logger.record_tabular('StdAvgReturn', np.std(avg_return_per_latent))

        if self.log_hierarchy:
            segment_length = 10
            completed_segments = 0
            segment = rollout(self.env, self.policy, max_path_length=segment_length, animated=False)
            if len(segment['rewards']) == segment_length:
                completed_segments = 1
                # Up to 49 more segments, continuing without an env reset.
                for _ in range(49):
                    segment = rollout(self.env, self.policy, max_path_length=segment_length,
                                      animated=False, reset_start_rollout=False)
                    if len(segment['rewards']) < segment_length:
                        break
                    completed_segments += 1
            logger.record_tabular('Hierarchy', completed_segments)
    def evaluate_performance(env):
        """Evaluate the enclosing scope's ``policy`` on ``env`` and log it.

        When ``v['unif_starts']`` is set, the policy is tested from each of
        two fixed start positions and the mean reward / success over them is
        recorded under the ``Outer_`` prefix. Otherwise a single test from
        start ``[0, 0]`` is run. Afterwards a new report row is started and
        the start generator is reset to ``uniform_start_generator``.

        Relies on closure variables: ``v``, ``policy``, ``sampling_res``,
        ``outer_iter``, ``report`` and ``uniform_start_generator``.

        :param env: environment exposing ``update_start_generator``.
        :return: success value of the last ``test_and_plot_policy`` call.
            NOTE(review): in the ``unif_starts`` branch this is the last
            start only, not the logged mean - confirm this is intended.
        """
        if v['unif_starts']:
            room_starts = np.array([[-2, -2], [-13, -13]])
            rewards_per_start = []
            success_per_start = []
            for start in room_starts:
                env.update_start_generator(FixedStateGenerator(np.array(start)))
                start_reward, scs = test_and_plot_policy(
                    policy, env,
                    horizon=v['horizon'], max_reward=v['max_reward'],
                    sampling_res=sampling_res, n_traj=v['n_traj'],
                    itr=outer_iter, report=report,
                    limit=v['goal_range'], center=v['goal_center'],
                    using_gym=True, noise=v['action_noise'],
                    n_processes=8, log=False)
                rewards_per_start.append(start_reward)
                success_per_start.append(scs)
            with logger.tabular_prefix('Outer_'):
                logger.record_tabular('iter', outer_iter)
                logger.record_tabular('MeanRewards', np.mean(rewards_per_start))
                logger.record_tabular('Success', np.mean(success_per_start))
        else:
            env.update_start_generator(FixedStateGenerator(np.array([0, 0])))
            _, scs = test_and_plot_policy(
                policy, env,
                horizon=v['horizon'], max_reward=v['max_reward'],
                sampling_res=sampling_res, n_traj=v['n_traj'],
                itr=outer_iter, report=report,
                limit=v['goal_range'], center=v['goal_center'],
                using_gym=True, noise=v['action_noise'],
                n_processes=8)

        report.new_row()

        # Restore the generator that was swapped out for evaluation.
        env.update_start_generator(uniform_start_generator)

        return scs
Esempio n. 4
0
def test_and_plot_policy(policy,
                         env,
                         as_goals=True,
                         visualize=True,
                         sampling_res=1,
                         n_traj=1,
                         max_reward=1,
                         itr=0,
                         report=None,
                         center=None,
                         limit=None,
                         bounds=None):
    """Test ``policy`` on ``env``, plot a success heatmap and log the result.

    Runs ``test_policy``, plots the per-state success with ``plot_heatmap``,
    records ``iter`` / ``MeanRewards`` / ``Success`` under the ``Outer_``
    tabular prefix, and optionally adds the heatmap image to ``report``.

    :param policy: policy forwarded to ``test_policy``.
    :param env: environment; its ``wrapped_env`` chain is searched for a
        ``_maze_id`` attribute to pass to the plot.
    :param as_goals: forwarded positionally to ``test_policy``.
    :param visualize: forwarded positionally to ``test_policy``.
    :param sampling_res: forwarded to ``test_policy``.
    :param n_traj: forwarded to ``test_policy``.
    :param max_reward: accepted but not used in this function.
    :param itr: iteration number to log and to caption the image with.
    :param report: optional report object with ``add_image``.
    :param center: forwarded to ``test_policy`` and ``plot_heatmap``.
    :param limit: forwarded to ``plot_heatmap``.
    :param bounds: forwarded to ``test_policy``.
    :return: ``(mean_rewards, success)`` - the means of the per-state average
        total rewards and success values.
    """
    # The fifth value returned by test_policy is not used here.
    avg_tot_rewards, avg_success, states, spacing, _unused = test_policy(
        policy,
        env,
        as_goals,
        visualize,
        center=center,
        sampling_res=sampling_res,
        n_traj=n_traj,
        bounds=bounds)

    # Unwrap until something carries a maze id or the chain ends.
    layer = env
    while True:
        if hasattr(layer, '_maze_id') or not hasattr(layer, 'wrapped_env'):
            break
        layer = layer.wrapped_env
    maze_id = getattr(layer, '_maze_id', None)

    plot_heatmap(avg_success,
                 states,
                 spacing=spacing,
                 show_heatmap=False,
                 maze_id=maze_id,
                 center=center,
                 limit=limit)
    reward_img = save_image()

    mean_rewards = np.mean(avg_tot_rewards)
    success = np.mean(avg_success)

    with logger.tabular_prefix('Outer_'):
        for key, value in (('iter', itr),
                           ('MeanRewards', mean_rewards),
                           ('Success', success)):
            logger.record_tabular(key, value)

    if report is not None:
        caption = (
            'policy performance\n itr: {} \nmean_rewards: {} \nsuccess: {}'.
            format(itr, mean_rewards, success))
        report.add_image(reward_img, caption)
    return mean_rewards, success
Esempio n. 5
0
 def log_diagnostics(self, all_paths):
     """Log per-partition diagnostics for env, policy and baseline.

     ``all_paths`` is zipped with ``self.env_partitions``,
     ``self.local_policies`` and ``self.local_baselines``; for the i-th group
     each of the three components logs its diagnostics under the tabular
     prefix ``str(i)``.

     :param all_paths: one collection of paths per partition.
     """
     groups = zip(self.env_partitions, self.local_policies,
                  self.local_baselines, all_paths)
     for index, group in enumerate(groups):
         env, policy, baseline, paths = group
         with logger.tabular_prefix(str(index)):
             for component in (env, policy, baseline):
                 component.log_diagnostics(paths)
Esempio n. 6
0
 def log_diagnostics(self, paths, log_prefix='Gather', *args, **kwargs):
     """Record gather returns and forward stripped paths to the inner env.

     Logs the per-path sum of ``env_infos['outer_rew']`` under the
     ``<log_prefix>_`` tabular prefix. Then, under ``wrapped_``, logs the
     mean inner return as ``AverageReturn`` (only if the first path has
     ``env_infos['inner_rew']``) and calls
     ``self.wrapped_env.log_diagnostics`` with copies of the paths whose
     observations keep only the first
     ``self.wrapped_env.observation_space.flat_dim`` columns.

     :param paths: non-empty sequence of path dicts (``paths[0]`` is read).
     :param log_prefix: prefix for the gather statistics.
     :param args: accepted but unused.
     :param kwargs: accepted but unused.
     """
     with logger.tabular_prefix(log_prefix + '_'):
         logger.record_tabular_misc_stat(
             'Return',
             [sum(path['env_infos']['outer_rew']) for path in paths],
             placement='front')

     # Copy each path, replacing only the observations. The column slice
     # breaks if the robot observations are not flat vectors.
     stripped_paths = [
         {**path,
          'observations': path['observations'][
              :, :self.wrapped_env.observation_space.flat_dim]}
         for path in paths
     ]

     with logger.tabular_prefix('wrapped_'):
         head = paths[0]
         if 'env_infos' in head and 'inner_rew' in head['env_infos']:
             per_path_inner = [np.sum(path['env_infos']['inner_rew']) for path in paths]
             logger.record_tabular('AverageReturn', np.mean(per_path_inner))
         # See swimmer_env.py for a sketch of the maze plotting.
         self.wrapped_env.log_diagnostics(stripped_paths)
Esempio n. 7
0
File: diayn.py Progetto: sumitsk/sac
    def _evaluate(self, epoch):
        """Evaluate the best single-option policy and log the results.

        Does nothing when ``self._eval_n_episodes < 1``. Every
        ``self._find_best_skill_interval`` epochs the single-option policy is
        refreshed via ``self._get_best_single_option_policy()``; between
        refreshes the previously selected (possibly stale) one is reused.
        Rollouts are collected on ``self._eval_env`` and return /
        episode-length statistics are recorded under the
        ``best_single_option_policy/`` prefix. Finally a random batch from
        the pool is passed to ``self.log_diagnostics``.

        During evaluation the policy expects an un-augmented observation.

        :param epoch: The epoch number.
        :return: None
        """
        if self._eval_n_episodes < 1:
            return

        if epoch % self._find_best_skill_interval == 0:
            self._single_option_policy = self._get_best_single_option_policy()

        policy = self._single_option_policy
        tag = 'best_single_option_policy' + '/'

        with logger.tabular_prefix(tag), logger.prefix(tag):
            with self._policy.deterministic(self._eval_deterministic):
                # Rendering only adds two keyword arguments to the call.
                render_kwargs = {}
                if self._eval_render:
                    render_kwargs = {'render': True,
                                     'render_mode': 'rgb_array'}
                paths = rollouts(self._eval_env,
                                 policy,
                                 self._max_path_length,
                                 self._eval_n_episodes,
                                 **render_kwargs)

            total_returns = [path['rewards'].sum() for path in paths]
            episode_lengths = [len(path['rewards']) for path in paths]

            # (tabular key, reducer, values), in the order they are recorded.
            statistics = (
                ('return-average', np.mean, total_returns),
                ('return-min', np.min, total_returns),
                ('return-max', np.max, total_returns),
                ('return-std', np.std, total_returns),
                ('episode-length-avg', np.mean, episode_lengths),
                ('episode-length-min', np.min, episode_lengths),
                ('episode-length-max', np.max, episode_lengths),
                ('episode-length-std', np.std, episode_lengths),
            )
            for key, reducer, values in statistics:
                logger.record_tabular(key, reducer(values))

            self._eval_env.log_diagnostics(paths)

        batch = self._pool.random_batch(self._batch_size)
        self.log_diagnostics(batch)
    def log_diagnostics(self, paths, prefix=''):
        """Log forward-progress statistics and plot visitations.

        For each path the progress is the Euclidean norm of the difference
        between the last and first entries of ``env_infos['com']``
        (presumably the centre of mass - confirm against the env). The
        average, max, min and std over all paths are recorded under
        ``prefix``, then ``self.plot_visitations`` is called.

        :param paths: iterable of path dicts with ``env_infos['com']``.
        :param prefix: tabular prefix, also forwarded to the plot.
        """
        displacements = []
        for path in paths:
            com = path["env_infos"]['com']
            displacements.append(np.linalg.norm(com[-1] - com[0]))

        statistics = (
            ('AverageForwardProgress', np.mean),
            ('MaxForwardProgress', np.max),
            ('MinForwardProgress', np.min),
            ('StdForwardProgress', np.std),
        )
        with logger.tabular_prefix(prefix):
            for key, reducer in statistics:
                logger.record_tabular(key, reducer(displacements))

        self.plot_visitations(paths, prefix=prefix)
    def log_diagnostics(self, paths, *args, **kwargs):
        """Log maze-level returns, then delegate to the wrapped env.

        The per-path sum of ``env_infos['outer_rew']`` is recorded under the
        ``Maze_`` prefix. Each path is shallow-copied; 1-D observation
        arrays are first joined with ``np.concatenate``, then the
        observations are cut to the first
        ``self.wrapped_env.observation_space.flat_dim`` columns. Under the
        ``wrapped_`` prefix the mean per-path sum of
        ``env_infos['inner_rew']`` is logged as ``SuccessRate`` and the
        stripped paths are forwarded to ``self.wrapped_env.log_diagnostics``.

        :param paths: iterable of path dicts.
        :param args: forwarded to the wrapped env's ``log_diagnostics``.
        :param kwargs: forwarded to the wrapped env's ``log_diagnostics``.
        """
        outer_returns = [sum(p['env_infos']['outer_rew']) for p in paths]
        with logger.tabular_prefix('Maze_'):
            logger.record_tabular_misc_stat('Return',
                                            outer_returns,
                                            placement='front')

        stripped_paths = []
        for original in paths:
            trimmed = dict(original)
            observations = trimmed['observations']
            if len(observations.shape) == 1:
                observations = np.concatenate(observations)
            # The column slice breaks if the robot observations are not
            # flat vectors.
            trimmed['observations'] = observations[
                :, :self.wrapped_env.observation_space.flat_dim]
            stripped_paths.append(trimmed)

        with logger.tabular_prefix('wrapped_'):
            inner_returns = [np.sum(p['env_infos']['inner_rew']) for p in paths]
            logger.record_tabular('SuccessRate', np.mean(inner_returns))
            self.wrapped_env.log_diagnostics(stripped_paths, *args, **kwargs)
Esempio n. 10
0
    def train(self, sess=None):
        """Run the training loop over all local samplers.

        If no session is given, one is created with GPU memory growth
        enabled, entered, and all variables are initialised. If a session is
        given, only the variables it reports as uninitialised are
        initialised.

        Each iteration from ``self.start_itr`` to ``self.n_itr`` obtains
        samples from every local sampler, processes them (each under a
        numeric tabular prefix), logs diagnostics, optimises the policy,
        saves a snapshot and dumps the tabular log.

        NOTE(review): a session created here is entered but never
        exited/closed in this method - confirm callers rely on that.

        :param sess: optional existing TensorFlow session.
        """
        if sess is not None:
            uninitialized_names = sess.run(tf.report_uninitialized_variables())
            pending = [tf.get_variable(name) for name in uninitialized_names]
            sess.run(tf.initialize_variables(pending))
        else:
            config = tf.ConfigProto()
            config.gpu_options.allow_growth = True
            sess = tf.Session(config=config)
            sess.__enter__()
            sess.run(tf.initialize_all_variables())

        self.start_worker()
        start_time = time.time()

        for itr in range(self.start_itr, self.n_itr):
            itr_start_time = time.time()

            with logger.prefix('itr #%d | ' % itr):
                logger.log("Obtaining samples...")
                all_paths = [
                    sampler.obtain_samples(itr)
                    for sampler in self.local_samplers
                ]

                logger.log("Processing samples...")
                all_samples_data = []
                sampler_paths = zip(self.local_samplers, all_paths)
                for index, (sampler, paths) in enumerate(sampler_paths):
                    with logger.tabular_prefix(str(index)):
                        processed = sampler.process_samples(itr, paths)
                        all_samples_data.append(processed)

                logger.log("Logging diagnostics...")
                self.log_diagnostics(all_paths)

                logger.log("Optimizing policy...")
                self.optimize_policy(itr, all_samples_data)

                logger.log("Saving snapshot...")
                snapshot = self.get_itr_snapshot(itr, all_samples_data)
                logger.save_itr_params(itr, snapshot)

                logger.log("Saved")
                logger.record_tabular('Time', time.time() - start_time)
                logger.record_tabular('ItrTime', time.time() - itr_start_time)
                logger.dump_tabular(with_prefix=False)

        self.shutdown_worker()
    def train(self, sess=None):
        """Run the training loop, optionally evaluating on a test env.

        A session is created and entered when none is supplied; only a
        session created here is closed at the end. Global variables are
        initialised in either case.

        Each iteration obtains and processes samples, logs diagnostics and
        optimises the policy. If ``self.test_env`` is set, test samples are
        obtained and processed under the ``Test`` tabular prefix. A snapshot
        is saved (including the paths when ``self.store_paths`` is set), the
        tabular log is dumped, and an animated rollout is shown when
        ``self.plot`` is set.

        :param sess: optional existing TensorFlow session.
        """
        created_session = sess is None
        if created_session:
            sess = tf.Session()
            sess.__enter__()

        sess.run(tf.global_variables_initializer())
        self.start_worker()
        start_time = time.time()

        for itr in range(self.start_itr, self.n_itr):
            itr_start_time = time.time()
            with logger.prefix('itr #%d | ' % itr):
                logger.log("Obtaining samples...")
                paths = self.obtain_samples(itr)

                logger.log("Processing samples...")
                samples_data = self.process_samples(itr, paths)

                logger.log("Logging diagnostics...")
                self.log_diagnostics(paths)

                logger.log("Optimizing policy...")
                self.optimize_policy(itr, samples_data)

                if self.test_env is not None:
                    logger.log("Obtaining test samples...")
                    test_paths = self.test_sampler.obtain_samples(itr)
                    with logger.tabular_prefix("Test"):
                        self.test_sampler.process_samples(itr, test_paths)

                logger.log("Saving snapshot...")
                snapshot = self.get_itr_snapshot(itr, samples_data)
                if self.store_paths:
                    snapshot["paths"] = samples_data["paths"]
                logger.save_itr_params(itr, snapshot)
                logger.log("Saved")

                logger.record_tabular('Time', time.time() - start_time)
                logger.record_tabular('ItrTime', time.time() - itr_start_time)
                logger.dump_tabular(with_prefix=False)

                if self.plot:
                    rollout(self.env,
                            self.policy,
                            animated=True,
                            max_path_length=self.max_path_length)
                    if self.pause_for_plot:
                        input("Plotting evaluation run: Press Enter to continue...")

        self.shutdown_worker()
        if created_session:
            sess.close()
    def trainExperts(self, num_training_itrs):
        """Train the expert policies and return their final paths.

        Each iteration obtains samples from every local sampler, processes
        them (each under a numeric tabular prefix, with logging enabled),
        logs diagnostics, optimises the expert policies and dumps the
        tabular log.

        After the last iteration every path from that iteration is annotated
        in place with:

        * ``expert_actions``: the path's actions clipped to ``[-1, 1]``;
        * ``agent_infos``: all-zero ``mean`` and ``log_std`` lists with one
          row per action and one entry per action dimension.

        :param num_training_itrs: number of training iterations to run.
        :return: dict mapping sampler index to that sampler's paths from the
            last iteration; an empty dict if no iteration was run.
        """
        # Defined up front so zero iterations returns {} instead of raising
        # UnboundLocalError below.
        all_paths = []

        for itr in range(num_training_itrs):
            print('############itr_' + str(itr) + '################')

            all_paths = [
                sampler.obtain_samples(itr) for sampler in self.local_samplers
            ]

            all_samples_data = []
            for n, (sampler,
                    paths) in enumerate(zip(self.local_samplers, all_paths)):
                with logger.tabular_prefix(str(n)):
                    all_samples_data.append(
                        sampler.process_samples(itr, paths, log=True))

            logger.log("Logging diagnostics...")
            self.log_diagnostics(all_paths, prefix='')

            logger.log("Optimizing policy...")
            self.optimize_expert_policies(itr, all_samples_data)

            logger.dump_tabular(with_prefix=False)

        for sampler_paths in all_paths:
            for path in sampler_paths:
                actions = path['actions']
                n_steps = len(actions)
                action_dim = len(actions[0])

                # np.clip returns a new array, so no explicit copy is needed
                # to leave path['actions'] untouched.
                path['expert_actions'] = np.clip(actions, -1.0, 1.0)
                # Build each row separately: `[[0.0] * k] * n` would repeat
                # one row object n times, so editing a row would edit all.
                path['agent_infos'] = dict(
                    mean=[[0.0] * action_dim for _ in range(n_steps)],
                    log_std=[[0.0] * action_dim for _ in range(n_steps)])

        return dict(enumerate(all_paths))
Esempio n. 13
0
    def evaluate_performance_plane(test_env):
        """Label ``goals`` on ``test_env`` and log success per threshold.

        Calls ``label_states`` with the enclosing scope's ``goals``,
        ``policy`` and ``v`` settings, then, under the ``Outer_`` prefix,
        records the mean of ``goal_reached_by_eps(paths, eps)`` for each
        threshold as ``Success_<eps>``.

        :param test_env: environment passed to ``label_states``.
        :return: the mean success for the last threshold (0.7).
        """
        thresholds = [0.1, 0.2, 0.3, 0.5, 0.7]

        # Only the paths are needed; the labels are discarded.
        _, eval_paths = label_states(goals,
                                     test_env,
                                     policy,
                                     v['horizon'],
                                     n_traj=v['n_traj'],
                                     key='goal_reached',
                                     n_processes=8,
                                     using_gym=True,
                                     noise=0,
                                     full_path=True)

        with logger.tabular_prefix('Outer_'):
            logger.record_tabular('iter', outer_iter)
            for threshold in thresholds:
                success_rate = np.mean(goal_reached_by_eps(eval_paths, threshold))
                logger.record_tabular('Success_%3.1f' % threshold, success_rate)
        return np.mean(success_rate)
Esempio n. 14
0
    def fit_with_samples(self, paths, samples_data):
        """Fit on processed samples and log the loss before and after.

        Updates the running statistics with the returns and valid masks,
        runs ``self.optimizer.optimize`` on observations/returns/valids, and
        records loss values under the ``Vf.`` tabular prefix. When exact
        before/after losses are not requested, the first/last per-epoch
        losses collected by the optimiser callback are logged instead.

        :param paths: not used by this method.
        :param samples_data: mapping with ``observations``, ``returns`` and
            ``valids`` entries.
        """
        observations = samples_data["observations"]
        returns = samples_data["returns"]
        valids = samples_data["valids"]
        inputs = [observations, returns, valids]

        self.f_update_stats(returns, valids)

        with logger.prefix("Vf | "), logger.tabular_prefix("Vf."):
            if self.log_loss_before:
                logger.log("Computing loss before training")
                loss_before, _ = self.optimizer.loss_diagnostics(inputs)
                logger.log("Computed")

            losses_per_epoch = []

            def collect_loss(loss, diagnostics, *args, **kwargs):
                # Always returns True to the optimizer.
                losses_per_epoch.append(loss)
                return True

            self.optimizer.optimize(inputs, callback=collect_loss)

            if self.log_loss_after:
                logger.log("Computing loss after training")
                loss_after, _ = self.optimizer.loss_diagnostics(inputs)
                logger.log("Computed")

            if not self.log_loss_before:
                # Approximate with the first epoch's loss.
                logger.record_tabular('FirstEpoch.Loss', losses_per_epoch[0])
            else:
                logger.record_tabular('LossBefore', loss_before)

            if not self.log_loss_after:
                logger.record_tabular('LastEpoch.Loss', losses_per_epoch[-1])
            else:
                logger.record_tabular('LossAfter', loss_after)

            if self.log_loss_before and self.log_loss_after:
                logger.record_tabular('dLoss', loss_before - loss_after)
Esempio n. 15
0
def run_task(v):
    """Run one asymmetric self-play start-generation experiment.

    Seeds the RNGs, creates an HTML report, builds the goal/start
    exploration environment around ``Arm3dDiscEnv`` together with a Gaussian
    MLP policy and linear baseline, and loads the collection of feasible
    start states from disk. An "Alice" environment, policy and TRPO instance
    are built to propose starts.

    For every outer iteration the function:

    1. generates new starts with ``generate_starts_alice`` and optionally
       mixes in old starts sampled from the replay collection;
    2. trains the policy with TRPO on those starts;
    3. labels the starts (from the TRPO paths or by fresh rollouts) and logs
       the fraction in each label class;
    4. evaluates on 1000 starts sampled from the feasible-start collection;
    5. appends the starts whose combined label is 1 to the replay
       collection.

    :param v: variant dict of hyper-parameters (``seed``, ``horizon``,
        ``outer_iters``, ...), read by key throughout.
    """
    random.seed(v['seed'])
    np.random.seed(v['seed'])

    logger.log("Initializing report...")
    log_dir = logger.get_snapshot_dir()  # problem with logger module here!!
    report = HTMLReport(osp.join(log_dir, 'report.html'), images_per_row=4)

    report.add_header("{}".format(EXPERIMENT_TYPE))
    report.add_text(format_dict(v))
    report.save()

    inner_env = normalize(Arm3dDiscEnv())

    fixed_goal_generator = FixedStateGenerator(state=v['ultimate_goal'])
    # NOTE(review): the start generator is also seeded with 'ultimate_goal';
    # confirm this is intended rather than a start state.
    fixed_start_generator = FixedStateGenerator(state=v['ultimate_goal'])

    env = GoalStartExplorationEnv(
        env=inner_env,
        start_generator=fixed_start_generator,
        obs2start_transform=lambda x: x[:v['start_size']],
        goal_generator=fixed_goal_generator,
        obs2goal_transform=lambda x: x[-1 * v['goal_size']:],
        terminal_eps=v['terminal_eps'],
        distance_metric=v['distance_metric'],
        extend_dist_rew=v['extend_dist_rew'],
        inner_weight=v['inner_weight'],
        goal_weight=v['goal_weight'],
        terminate_env=True,
    )

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(64, 64),
        # Fix the variance since different goals will require different variances, making this parameter hard to learn.
        learn_std=v['learn_std'],
        adaptive_std=v['adaptive_std'],
        std_hidden_sizes=(16,
                          16),  # this is only used if adaptive_std is true!
        output_gain=v['output_gain'],
        init_std=v['policy_init_std'],
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    # Load the state collection from data_upload. The file is opened in a
    # `with` block so the handle is closed once unpickling finishes.
    # NOTE: pickle must only be used on trusted, project-generated files.
    load_dir = 'data_upload/state_collections/'
    feasible_states_path = osp.join(config.PROJECT_PATH, load_dir,
                                    'disc_all_feasible_states_min.pkl')
    with open(feasible_states_path, 'rb') as feasible_states_file:
        all_feasible_starts = pickle.load(feasible_states_file)
    print("we have %d feasible starts" % all_feasible_starts.size)

    # Replay buffer of starts that were labelled as good in earlier iterations.
    all_starts = StateCollection(distance_threshold=v['coll_eps'])

    # Use asymmetric self-play to run Alice to generate starts for Bob.
    env_alice = AliceEnv(env, env, policy, v['horizon'])

    policy_alice = GaussianMLPPolicy(
        env_spec=env_alice.spec,
        hidden_sizes=(64, 64),
        # Fix the variance since different goals will require different variances, making this parameter hard to learn.
        learn_std=v['learn_std'],
        adaptive_std=v['adaptive_std'],
        std_hidden_sizes=(16,
                          16),  # this is only used if adaptive_std is true!
        output_gain=v['output_gain_alice'],
        init_std=v['policy_init_std_alice'],
    )
    baseline_alice = LinearFeatureBaseline(env_spec=env_alice.spec)

    algo_alice = TRPO(
        env=env_alice,
        policy=policy_alice,
        baseline=baseline_alice,
        batch_size=v['pg_batch_size_alice'],
        max_path_length=v['horizon'],
        n_itr=v['inner_iters_alice'],
        step_size=0.01,
        discount=v['discount_alice'],
        plot=False,
    )

    for outer_iter in range(1, v['outer_iters']):

        logger.log("Outer itr # %i" % outer_iter)
        logger.log("Sampling starts")

        starts = generate_starts_alice(env_bob=env,
                                       env_alice=env_alice,
                                       policy_bob=policy,
                                       policy_alice=policy_alice,
                                       algo_alice=algo_alice,
                                       start_states=[v['start_goal']],
                                       num_new_starts=v['num_new_starts'],
                                       alice_factor=v['alice_factor'],
                                       log_dir=log_dir)

        # Mix in previously successful starts once the replay buffer has any.
        if v['replay_buffer'] and outer_iter > 0 and all_starts.size > 0:
            old_starts = all_starts.sample(v['num_old_starts'])
            starts = np.vstack([starts, old_starts])

        with ExperimentLogger(log_dir,
                              'last',
                              snapshot_mode='last',
                              hold_outter_log=True):
            logger.log("Updating the environment start generator")
            env.update_start_generator(
                UniformListStateGenerator(
                    starts.tolist(),
                    persistence=v['persistence'],
                    with_replacement=v['with_replacement'],
                ))

            logger.log("Training the algorithm")
            algo = TRPO(
                env=env,
                policy=policy,
                baseline=baseline,
                batch_size=v['pg_batch_size'],
                max_path_length=v['horizon'],
                n_itr=v['inner_iters'],
                step_size=0.01,
                discount=v['discount'],
                plot=False,
            )

            trpo_paths = algo.train()

        if v['use_trpo_paths']:
            logger.log("labeling starts with trpo rollouts")
            [starts, labels] = label_states_from_paths(
                trpo_paths,
                n_traj=2,
                key='goal_reached',  # using the min n_traj
                as_goal=False,
                env=env)
            # Flatten the nested per-iteration path lists into one list.
            paths = [path for paths in trpo_paths for path in paths]
        else:
            logger.log("labeling starts manually")
            labels, paths = label_states(starts,
                                         env,
                                         policy,
                                         v['horizon'],
                                         as_goals=False,
                                         n_traj=v['n_traj'],
                                         key='goal_reached',
                                         full_path=True)

        with logger.tabular_prefix("OnStarts_"):
            env.log_diagnostics(paths)
        logger.record_tabular('starts', starts.size)

        start_classes, text_labels = convert_label(labels)
        total_starts = labels.shape[0]
        logger.record_tabular('GenStarts_evaluated', total_starts)
        start_class_frac = OrderedDict(
        )  # this needs to be an ordered dict!! (for the log tabular)
        for k in text_labels.keys():
            frac = np.sum(start_classes == k) / total_starts
            logger.record_tabular('GenStart_frac_' + text_labels[k], frac)
            start_class_frac[text_labels[k]] = frac

        # A start counts as good only if both label columns are set.
        labels = np.logical_and(labels[:, 0],
                                labels[:, 1]).astype(int).reshape((-1, 1))

        logger.log("Labeling on uniform starts")
        with logger.tabular_prefix("Uniform_"):
            unif_starts = all_feasible_starts.sample(1000)
            mean_reward, paths = evaluate_states(unif_starts,
                                                 env,
                                                 policy,
                                                 v['horizon'],
                                                 n_traj=1,
                                                 key='goal_reached',
                                                 as_goals=False,
                                                 full_path=True)
            env.log_diagnostics(paths)

        logger.dump_tabular(with_prefix=True)

        # append new states to list of all starts (replay buffer): Not the low reward ones!!
        logger.log("Appending good goals to replay and generating seeds")
        filtered_raw_starts = [
            start for start, label in zip(starts, labels) if label[0] == 1
        ]
        all_starts.append(filtered_raw_starts)
Esempio n. 16
0
def run_task(v):
    """Run one Goal-GAN curriculum experiment on the Ant maze environment.

    The function seeds the Python and NumPy RNGs, builds the goal-conditioned
    environment, the policy and the baseline, pre-trains a ``StateGAN`` and
    then, for every outer iteration, trains the policy with TRPO on goals
    sampled from the GAN (optionally mixed with goals from the replay
    collection), labels those goals and re-trains the GAN on the labels.
    Heat maps and labelled-goal plots are added to an HTML report written to
    the logger snapshot directory.

    Args:
        v: Mapping of experiment hyper-parameters.  Every ``v[...]`` lookup
            in the body is required.  Entries whose key starts with ``GAN_``
            are forwarded to the GAN as configuration, with the prefix
            removed.

    Returns:
        None.  Results are emitted through ``logger`` and the HTML report.
    """
    random.seed(v['seed'])
    np.random.seed(v['seed'])

    # NOTE(review): `sampling_res` is never assigned in this function; it is
    # presumably a module-level name -- confirm it is defined before use.

    logger.log("Initializing report and plot_policy_reward...")
    log_dir = logger.get_snapshot_dir()  # problem with logger module here!!
    report = HTMLReport(osp.join(log_dir, 'report.html'), images_per_row=3)

    report.add_header("{}".format(EXPERIMENT_TYPE))
    report.add_text(format_dict(v))

    tf_session = tf.Session()

    inner_env = normalize(AntMazeEnv())

    uniform_goal_generator = UniformStateGenerator(
        state_size=v['goal_size'],
        bounds=v['goal_range'],
        center=v['goal_center'],
    )
    env = GoalExplorationEnv(
        env=inner_env,
        goal_generator=uniform_goal_generator,
        obs2goal_transform=lambda x: x[-3:-1],
        terminal_eps=v['terminal_eps'],
        distance_metric=v['distance_metric'],
        extend_dist_rew=v['extend_dist_rew'],
        only_feasible=v['only_feasible'],
        terminate_env=True,
    )

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(64, 64),
        # Fix the variance since different goals will require different
        # variances, making this parameter hard to learn.
        learn_std=v['learn_std'],
        adaptive_std=v['adaptive_std'],
        std_hidden_sizes=(16, 16),  # only used if adaptive_std is true!
        output_gain=v['output_gain'],
        init_std=v['policy_init_std'],
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    # Iteration 0 is the untrained policy; it is only plotted, not trained.
    outer_iter = 0

    logger.log('Generating the Initial Heatmap...')
    test_and_plot_policy(policy, env, max_reward=v['max_reward'],
                         sampling_res=sampling_res, n_traj=v['n_traj'],
                         itr=outer_iter, report=report,
                         limit=v['goal_range'], center=v['goal_center'])

    # GAN
    logger.log("Instantiating the GAN...")
    # Only keys that really start with the prefix are GAN options; a plain
    # substring test would also pick up keys with 'GAN_' in the middle and
    # then strip the wrong four characters from them.
    gan_prefix = 'GAN_'
    gan_configs = {
        key[len(gan_prefix):]: value
        for key, value in v.items() if key.startswith(gan_prefix)
    }
    # Optimizer / initializer *classes* in the config are replaced by
    # instances built from their companion '<key>_stepSize' / '<key>_stddev'
    # entries.  Iterate over a snapshot because entries are reassigned.
    for key, value in list(gan_configs.items()):
        if value is tf.train.AdamOptimizer:
            gan_configs[key] = tf.train.AdamOptimizer(
                gan_configs[key + '_stepSize'])
        if value is tflearn.initializations.truncated_normal:
            gan_configs[key] = tflearn.initializations.truncated_normal(
                stddev=gan_configs[key + '_stddev'])

    gan = StateGAN(
        state_size=v['goal_size'],
        evaluater_size=v['num_labels'],
        state_range=v['goal_range'],
        state_center=v['goal_center'],
        state_noise_level=v['goal_noise_level'],
        generator_layers=v['gan_generator_layers'],
        discriminator_layers=v['gan_discriminator_layers'],
        noise_size=v['gan_noise_size'],
        tf_session=tf_session,
        configs=gan_configs,
    )
    logger.log("pretraining the GAN...")
    if v['smart_init']:
        feasible_goals = generate_initial_goals(
            env, policy, v['goal_range'], goal_center=v['goal_center'],
            horizon=v['horizon'])
        # make them all good goals
        labels = np.ones((feasible_goals.shape[0], 2)).astype(np.float32)
        plot_labeled_states(feasible_goals, labels, report=report,
                            itr=outer_iter, limit=v['goal_range'],
                            center=v['goal_center'])

        dis_loss, gen_loss = gan.pretrain(states=feasible_goals,
                                          outer_iters=v['gan_outer_iters'])
        print("Loss of Gen and Dis: ", gen_loss, dis_loss)
    else:
        gan.pretrain_uniform()

    # log first samples from the GAN
    initial_goals, _ = gan.sample_states_with_noise(v['num_new_goals'])

    logger.log("Labeling the goals")
    labels = label_states(initial_goals, env, policy, v['horizon'],
                          n_traj=v['n_traj'], key='goal_reached')

    plot_labeled_states(initial_goals, labels, report=report, itr=outer_iter,
                        limit=v['goal_range'], center=v['goal_center'])
    report.new_row()

    # Replay buffer of goals that were labelled as good.
    all_goals = StateCollection(distance_threshold=v['coll_eps'])

    for outer_iter in range(1, v['outer_iters']):

        logger.log("Outer itr # %i" % outer_iter)
        # Sample GAN
        logger.log("Sampling goals from the GAN")
        raw_goals, _ = gan.sample_states_with_noise(v['num_new_goals'])

        # outer_iter starts at 1, so only the buffer state needs checking.
        if v['replay_buffer'] and all_goals.size > 0:
            old_goals = all_goals.sample(v['num_old_goals'])
            goals = np.vstack([raw_goals, old_goals])
        else:
            goals = raw_goals

        # if needed label the goals before any update
        if v['label_with_variation']:
            _, old_rewards = label_states(
                goals, env, policy, v['horizon'], as_goals=True,
                n_traj=v['n_traj'], key='goal_reached', full_path=False,
                return_rew=True)

        # The 'last' label keeps only the latest inner log; use outer_iter
        # as the label instead to keep one per outer iteration.
        with ExperimentLogger(log_dir, 'last', snapshot_mode='last',
                              hold_outter_log=True):
            logger.log("Updating the environment goal generator")
            env.update_goal_generator(
                UniformListStateGenerator(
                    goals.tolist(),
                    persistence=v['persistence'],
                    with_replacement=v['with_replacement'],
                )
            )

            logger.log("Training the algorithm")
            algo = TRPO(
                env=env,
                policy=policy,
                baseline=baseline,
                batch_size=v['pg_batch_size'],
                max_path_length=v['horizon'],
                n_itr=v['inner_iters'],
                step_size=0.01,
                plot=False,
            )

            trpo_paths = algo.train()

        if v['use_trpo_paths']:
            logger.log("labeling starts with trpo rollouts")
            # using the min n_traj
            goals, labels = label_states_from_paths(
                trpo_paths, n_traj=2, key='goal_reached', as_goal=True,
                env=env)
            # Flatten the per-iteration lists of paths into one list.
            paths = [path for itr_paths in trpo_paths for path in itr_paths]
        elif v['label_with_variation']:
            labels, paths = label_states(
                goals, env, policy, v['horizon'], as_goals=True,
                n_traj=v['n_traj'], key='goal_reached',
                old_rewards=old_rewards, full_path=True)
        else:
            logger.log("labeling starts manually")
            labels, paths = label_states(
                goals, env, policy, v['horizon'], as_goals=True,
                n_traj=v['n_traj'], key='goal_reached', full_path=True)

        with logger.tabular_prefix("OnStarts_"):
            env.log_diagnostics(paths)

        logger.log('Generating the Heatmap...')
        test_and_plot_policy(policy, env, max_reward=v['max_reward'],
                             sampling_res=sampling_res, n_traj=v['n_traj'],
                             itr=outer_iter, report=report,
                             limit=v['goal_range'], center=v['goal_center'])

        plot_labeled_states(goals, labels, report=report, itr=outer_iter,
                            limit=v['goal_range'], center=v['goal_center'],
                            maze_id=v['maze_id'])

        # Collapse the label columns into a single 0/1 column for the GAN.
        if v['label_with_variation']:
            # this will use only the performance variation for labeling
            labels = np.array(labels[:, -1], dtype=int).reshape((-1, 1))
        else:
            labels = np.logical_and(labels[:, 0],
                                    labels[:, 1]).astype(int).reshape((-1, 1))

        logger.log("Training the GAN")
        gan.train(
            goals, labels,
            v['gan_outer_iters'],
        )

        logger.dump_tabular(with_prefix=False)
        report.new_row()

        # append new goals to list of all goals (replay buffer): Not the low
        # reward ones!!
        filtered_raw_goals = [
            goal for goal, label in zip(goals, labels) if label[0] == 1
        ]
        all_goals.append(filtered_raw_goals)

        if v['add_on_policy']:
            logger.log("sampling on policy")
            feasible_goals = generate_initial_goals(
                env, policy, v['goal_range'], goal_center=v['goal_center'],
                horizon=v['horizon'])
            all_goals.append(feasible_goals)
Esempio n. 17
0
def run_task(v):
    """Train a goal-reaching Ant policy on rejection-sampled uniform goals.

    Each outer iteration draws uniform goal candidates around
    ``v['goal_center']``, labels them with the current policy and keeps the
    candidates whose converted class equals 2 until at least
    ``v['num_new_goals']`` goals are collected.  The policy is then trained
    with TRPO on those goals (optionally mixed with replayed goals), the
    goals are re-labelled from the TRPO rollouts, and the ones labelled 1
    are appended to the replay collection.  Plots go to an HTML report in
    the logger snapshot directory.

    Args:
        v: Mapping of experiment hyper-parameters; ``'sampling_res'`` is
            optional (defaults to 0), every other key read below is required.

    Returns:
        None.
    """
    random.seed(v['seed'])
    np.random.seed(v['seed'])
    sampling_res = v['sampling_res'] if 'sampling_res' in v.keys() else 0
    candidates_per_round = 300

    # Log performance of randomly initialized policy with FIXED goal [0.1, 0.1]
    logger.log("Initializing report and plot_policy_reward...")
    log_dir = logger.get_snapshot_dir()  # problem with logger module here!!
    report = HTMLReport(osp.join(log_dir, 'report.html'), images_per_row=3)

    report.add_header("{}".format(EXPERIMENT_TYPE))
    report.add_text(format_dict(v))

    wrapped_ant = normalize(AntEnv())
    goal_sampler = UniformStateGenerator(
        state_size=v['goal_size'],
        bounds=v['goal_range'],
        center=v['goal_center'],
    )
    env = GoalExplorationEnv(
        env=wrapped_ant,
        goal_generator=goal_sampler,
        obs2goal_transform=lambda obs: obs[-3:-1],
        terminal_eps=v['terminal_eps'],
        distance_metric=v['distance_metric'],
        extend_dist_rew=v['extend_dist_rew'],
        append_transformed_obs=v['append_transformed_obs'],
        append_extra_info=v['append_extra_info'],
        terminate_env=True,
    )

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(64, 64),
        # Fix the variance since different goals will require different
        # variances, making this parameter hard to learn.
        learn_std=v['learn_std'],
        adaptive_std=v['adaptive_std'],
        std_hidden_sizes=(16, 16),  # only used if adaptive_std is true!
        output_gain=v['output_gain'],
        init_std=v['policy_init_std'],
    )

    # The linear baseline is the default; 'g_mlp' swaps in the MLP one.
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    if v['baseline'] == 'g_mlp':
        baseline = GaussianMLPBaseline(env_spec=env.spec)

    # Iteration 0: plot the untrained policy.
    outer_iter = 0

    logger.log('Generating the Initial Heatmap...')
    test_and_plot_policy(
        policy, env,
        max_reward=v['max_reward'],
        sampling_res=sampling_res,
        n_traj=v['n_traj'],
        itr=outer_iter,
        report=report,
        limit=v['goal_range'],
        center=v['goal_center'],
        bounds=v['goal_range'],
    )
    report.new_row()

    all_goals = StateCollection(distance_threshold=v['coll_eps'])
    total_rollouts = 0

    for outer_iter in range(1, v['outer_iters']):
        logger.log("Outer itr # %i" % outer_iter)
        logger.log("Sampling goals")

        # Rejection sampling: keep drawing uniform batches until enough
        # candidates of class 2 have been accumulated.
        goals = np.array([]).reshape((-1, v['goal_size']))
        k = 0
        while goals.shape[0] < v['num_new_goals']:
            print('good goals collected: ', goals.shape[0])
            logger.log("Sampling and labeling the goals: %d" % k)
            k += 1
            lower = np.array(v['goal_center']) - np.array(v['goal_range'])
            upper = np.array(v['goal_center']) + np.array(v['goal_range'])
            candidates = np.random.uniform(
                lower, upper, size=(candidates_per_round, v['goal_size']))
            labels = label_states(
                candidates, env, policy, v['horizon'],
                n_traj=v['n_traj'], key='goal_reached')
            logger.log("Converting the labels")
            candidate_classes, _ = convert_label(labels)
            accepted = candidates[candidate_classes == 2]
            goals = np.concatenate([goals, accepted])
            goals = goals.reshape((-1, v['goal_size']))

        if v['replay_buffer'] and outer_iter > 0 and all_goals.size > 0:
            # todo: replay noise?
            replayed = all_goals.sample(v['num_old_goals'])
            goals = np.vstack([goals, replayed])

        with ExperimentLogger(log_dir, 'last', snapshot_mode='last',
                              hold_outter_log=True):
            logger.log("Updating the environment goal generator")
            list_generator = UniformListStateGenerator(
                goals.tolist(),
                persistence=v['persistence'],
                with_replacement=v['with_replacement'],
            )
            env.update_goal_generator(list_generator)

            logger.log("Training the algorithm")
            algo = TRPO(
                env=env,
                policy=policy,
                baseline=baseline,
                batch_size=v['pg_batch_size'],
                max_path_length=v['horizon'],
                n_itr=v['inner_iters'],
                step_size=0.01,
                plot=False,
            )

            trpo_paths = algo.train()

        logger.log("labeling starts with trpo rollouts")
        # using the min n_traj
        goals, labels = label_states_from_paths(
            trpo_paths, n_traj=2, key='goal_reached', as_goal=True, env=env)
        # Flatten the per-iteration path lists into a single list.
        paths = [single for batch in trpo_paths for single in batch]
        with logger.tabular_prefix("OnStarts_"):
            env.log_diagnostics(paths)

        logger.log('Generating the Heatmap...')
        test_and_plot_policy(
            policy, env,
            max_reward=v['max_reward'],
            sampling_res=sampling_res,
            n_traj=v['n_traj'],
            itr=outer_iter,
            report=report,
            limit=v['goal_range'],
            center=v['goal_center'],
            bounds=v['goal_range'],
        )

        plot_labeled_states(
            goals, labels,
            report=report,
            itr=outer_iter,
            limit=v['goal_range'],
            center=v['goal_center'],
        )

        # A goal counts as good only when both label columns are set.
        both_set = np.logical_and(labels[:, 0], labels[:, 1])
        labels = both_set.astype(int).reshape((-1, 1))

        # rollouts used for labeling (before TRPO itrs):
        labeling_rollouts = k * v['n_traj'] * candidates_per_round
        logger.record_tabular('LabelingRollouts', labeling_rollouts)
        total_rollouts += labeling_rollouts
        logger.record_tabular('TotalLabelingRollouts', total_rollouts)

        logger.dump_tabular(with_prefix=False)
        report.new_row()

        # append new goals to list of all goals (replay buffer): Not the low
        # reward ones!!
        good_goals = []
        for goal, label in zip(goals, labels):
            if label[0] == 1:
                good_goals.append(goal)
        all_goals.append(good_goals)
    def plot_visitations(self,
                         paths,
                         mesh_density=20,
                         visit_prefix='',
                         visit_axis_bound=None,
                         maze=None,
                         scaling=2):
        """Plot which grid cells the centre of mass visited and log coverage.

        The plane is gridded with ``mesh_density`` cells per unit length over
        ``[-furthest, furthest]`` on both axes.  When the paths carry
        non-empty ``'selectors'`` or ``'latents'`` agent infos, cells are
        coloured by the latent index that visited them (cells reached by more
        than one latent are drawn as "Repetitions"); otherwise a plain visit
        count is plotted.  The figure is saved as
        ``<visit_prefix>visitation.png`` and two tabular entries,
        ``VisitationTotal`` and ``VisitationOverlap``, are recorded under
        ``visit_prefix``.

        Args:
            paths: Non-empty list of path dicts.  Each must provide
                ``path['env_infos']['com']`` indexable as ``[:, 0]`` and
                ``[:, 1]`` (x and y of the centre of mass).
            mesh_density: Number of grid cells per unit length.
            visit_prefix: Prefix for the image file name, plot title and
                tabular keys.
            visit_axis_bound: Optional half-width of the plotted area; falls
                back to ``self.visit_axis_bound`` when ``None``.  It is only
                used when it is at least as large as the furthest visited
                coordinate.
            maze: Unused; kept for interface compatibility.
            scaling: Unused; kept for interface compatibility.

        Raises:
            KeyError: If the first path has no ``env_infos['com']`` entry.
        """
        if 'env_infos' not in paths[0] or 'com' not in paths[0]['env_infos']:
            raise KeyError(
                "No 'com' key in your path['env_infos']: please change you step function"
            )
        fig, ax = plt.subplots()

        def _max_abs_coordinate(column):
            """Largest absolute CoM coordinate in ``column``, rounded up."""
            values = np.concatenate(
                [path['env_infos']['com'][:, column] for path in paths])
            # Builtin int: the np.int alias no longer exists in recent NumPy.
            return int(np.ceil(np.max(np.abs(values))))

        # now we will grid the space and check how much of it the policy is
        # covering
        x_max = _max_abs_coordinate(0)
        y_max = _max_abs_coordinate(1)
        furthest = max(x_max, y_max)
        print(
            'THE FUTHEST IT WENT COMPONENT-WISE IS: x_max={}, y_max={}'.format(
                x_max, y_max))
        if visit_axis_bound is None:
            visit_axis_bound = self.visit_axis_bound
        if visit_axis_bound and visit_axis_bound >= furthest:
            furthest = max(furthest, visit_axis_bound)

        delta = 1. / mesh_density
        y, x = np.mgrid[-furthest:furthest + delta:delta,
                        -furthest:furthest + delta:delta]
        grid_shape = (2 * furthest * mesh_density + 1,
                      2 * furthest * mesh_density + 1)

        def _cell_indices(path):
            """Grid indices (x, y) of every CoM sample in ``path``."""
            com = path['env_infos']['com']
            com_x = np.ceil(
                (np.array(com[:, 0]) + furthest) * mesh_density).astype(int)
            com_y = np.ceil(
                (np.array(com[:, 1]) + furthest) * mesh_density).astype(int)
            return list(zip(com_x, com_y))

        agent_infos = paths[0]['agent_infos'] if 'agent_infos' in paths[0] else {}
        has_latents = ('latents' in agent_infos
                       and np.size(agent_infos['latents']))
        has_selectors = ('selectors' in agent_infos
                         and np.size(agent_infos['selectors']))

        if has_latents or has_selectors:
            selectors_name = 'selectors' if 'selectors' in agent_infos else 'latents'
            # Hard-coded number of latents; larger indices are only printed.
            num_latents = 6
            # keys: latent index, values: per-cell visitation counts
            dict_visit = collections.OrderedDict(
                (i, np.zeros(grid_shape)) for i in range(num_latents))

            # keep track of the overlap
            overlap = 0
            # now plot all the paths
            for path in paths:
                # list of all lats by idx
                lats = [
                    np.argmax(lat, axis=-1)
                    for lat in path['agent_infos'][selectors_name]
                ]
                # argmax gives integer scalars for 1-d entries and arrays
                # for 2-d ones; arrays are flattened into one sequence.
                if lats and not isinstance(lats[0], np.integer):
                    lats = np.concatenate(lats)
                # zip stops at the shorter of the two sequences.
                for lat, cell in zip(lats, _cell_indices(path)):
                    if lat >= num_latents:
                        print("lats", lats)
                    else:
                        dict_visit[lat][cell] += 1

            # fix the colors for each latent
            # +2 for the 0 and Repetitions NOT COUNTING THE WALLS
            num_colors = num_latents + 2
            cmap = plt.get_cmap('nipy_spectral', num_colors)
            # matrix whose entries are the latent that was there (or the
            # "Repetitions" colour if several latents visited the cell)
            visitation_by_lat = np.zeros(grid_shape)
            for i, visit in dict_visit.items():
                # transform the map into 0 or i+1
                lat_visit = np.where(visit == 0, visit, i + 1)
                # Count the cells this latent shares with an earlier latent.
                # (Summing np.where(mask) would add up grid indices instead.)
                shared = np.logical_and(visitation_by_lat > 0, lat_visit > 0)
                overlap += np.count_nonzero(shared)
                visitation_by_lat += lat_visit
                # mark overlaps
                visitation_by_lat = np.where(visitation_by_lat <= i + 1,
                                             visitation_by_lat,
                                             num_colors - 1)

            map_plot = ax.pcolormesh(
                x,
                y,
                visitation_by_lat,
                cmap=cmap,
                vmin=0.1,
                vmax=num_latents + 1)  # before 1 (will it affect when no walls?)
            color_len = (num_colors - 1.) / num_colors
            ticks = np.arange(color_len / 2., num_colors - 1, color_len)
            cbar = fig.colorbar(map_plot, ticks=ticks)
            latent_tick_labels = ['latent: ' + str(i) for i in dict_visit]
            cbar.ax.set_yticklabels(['No visitation'] + latent_tick_labels +
                                    ['Repetitions'])
            # still log the total visitation
            visitation_all = np.sum(list(dict_visit.values()), axis=0)
        else:
            visitation_all = np.zeros(grid_shape)
            for path in paths:
                for cell in _cell_indices(path):
                    visitation_all[cell] += 1

            plt.pcolormesh(x, y, visitation_all, vmax=mesh_density)
            # sum of all visitations larger than 1
            overlap = np.sum(
                np.where(visitation_all > 1, visitation_all, 0))
        ax.set_xlim([x[0][0], x[0][-1]])
        ax.set_ylim([y[0][0], y[-1][0]])

        log_dir = logger.get_snapshot_dir()
        exp_name = log_dir.split('/')[-1] if log_dir else '?'
        ax.set_title(visit_prefix + 'visitation: ' + exp_name)

        if log_dir is None:
            # Fallback output directory when the logger has no snapshot dir.
            log_dir = '/home/wr1/rllab/data/local/transfer/'
        # this saves the current figure, here fig
        plt.savefig(osp.join(log_dir, visit_prefix + 'visitation.png'))
        plt.close()

        with logger.tabular_prefix(visit_prefix):
            total_visitation = np.count_nonzero(visitation_all)
            logger.record_tabular('VisitationTotal', total_visitation)
            logger.record_tabular('VisitationOverlap', overlap)

        # Make sure no figure stays alive between calls.
        plt.close('all')
        gc.collect()
Esempio n. 19
0
def run_task(v):
    """Generate start states with "Alice" self-play on a point maze.

    Builds the goal/start exploration environment on top of a normalized
    ``PointMazeEnv``, creates Alice's environment, policy, baseline and TRPO
    instance, and then, for every outer iteration, rebuilds Alice's
    environment, asks ``generate_starts_alice`` for new starts, plots them
    with all-ones labels and logs the mean of the returned ``t_alices``.

    Args:
        v: Mapping of experiment hyper-parameters; ``'sampling_res'`` is
            optional, every other key read below is required.

    Returns:
        None.
    """
    random.seed(v['seed'])
    np.random.seed(v['seed'])
    sampling_res = v['sampling_res'] if 'sampling_res' in v.keys() else 2
    samples_per_cell = 10  # for the oracle rejection sampling

    # Log performance of randomly initialized policy with FIXED goal [0.1, 0.1]
    logger.log("Initializing report and plot_policy_reward...")
    log_dir = logger.get_snapshot_dir()  # problem with logger module here!!
    if log_dir is None:
        log_dir = "/home/davheld/repos/rllab_goal_rl/data/local/debug"
    report = HTMLReport(osp.join(log_dir, 'report.html'), images_per_row=4)

    report.add_header("{}".format(EXPERIMENT_TYPE))
    report.add_text(format_dict(v))

    maze_env = normalize(
        PointMazeEnv(maze_id=v['maze_id'], length=v['maze_length']))

    goal_gen = FixedStateGenerator(state=v['ultimate_goal'])
    start_gen = UniformStateGenerator(
        state_size=v['start_size'],
        bounds=v['start_range'],
        center=v['start_center'],
    )

    env = GoalStartExplorationEnv(
        env=maze_env,
        start_generator=start_gen,
        obs2start_transform=lambda obs: obs[:v['start_size']],
        goal_generator=goal_gen,
        obs2goal_transform=lambda obs: obs[:v['goal_size']],
        terminal_eps=v['terminal_eps'],
        distance_metric=v['distance_metric'],
        extend_dist_rew=v['extend_dist_rew'],
        only_feasible=v['only_feasible'],
        terminate_env=True,
    )

    # initialize all logging arrays on itr0
    outer_iter = 0

    # TODO - show initial states for Alice
    report.new_row()

    ring_spacing = 1
    init_iter = 2

    def make_alice_env():
        """Wrap ``env`` in a fresh AliceFakeEnv with the fixed settings."""
        return AliceFakeEnv(
            env,
            max_path_length=v['alice_horizon'],
            alice_factor=v['alice_factor'],
            alice_bonus=v['alice_bonus'],
            gamma=1,
            stop_threshold=v['stop_threshold'],
            ring_spacing=ring_spacing,
            init_iter=init_iter,
        )

    # Use asymmetric self-play to run Alice to generate starts for Bob.
    # Use a double horizon because the horizon is shared between Alice and Bob.
    env_alice = make_alice_env()

    policy_alice = GaussianMLPPolicy(
        env_spec=env_alice.spec,
        hidden_sizes=(64, 64),
        # Fix the variance since different goals will require different
        # variances, making this parameter hard to learn.
        learn_std=v['learn_std'],
        adaptive_std=v['adaptive_std'],
        std_hidden_sizes=(16, 16),  # only used if adaptive_std is true!
        output_gain=v['output_gain_alice'],
        init_std=v['policy_init_std_alice'],
    )
    baseline_alice = LinearFeatureBaseline(env_spec=env_alice.spec)

    algo_alice = TRPO(
        env=env_alice,
        policy=policy_alice,
        baseline=baseline_alice,
        batch_size=v['pg_batch_size_alice'],
        max_path_length=v['alice_horizon'],
        n_itr=v['inner_iters_alice'],
        step_size=0.01,
        discount=v['discount_alice'],
        plot=False,
    )

    for outer_iter in range(1, v['outer_iters']):

        logger.log("Outer itr # %i" % outer_iter)
        logger.log("Sampling starts")

        print("Init iter: " + str(init_iter))

        # A fresh Alice environment is built for every outer iteration and
        # handed to the existing TRPO instance.
        env_alice = make_alice_env()
        algo_alice.env = env_alice

        starts, t_alices = generate_starts_alice(
            env_alice=env_alice,
            algo_alice=algo_alice,
            start_states=[v['start_goal']],
            num_new_starts=v['num_new_starts'],
            log_dir=log_dir,
        )

        # Make fake labels
        labels = np.ones([len(starts), 2])
        radius = init_iter * ring_spacing
        plot_labeled_states(
            starts, labels,
            report=report,
            itr=outer_iter,
            limit=v['goal_range'],
            center=v['goal_center'],
            maze_id=v['maze_id'],
            summary_string_base='initial starts labels:\n',
            radius=radius,
        )
        report.save()

        with logger.tabular_prefix('Outer_'):
            logger.record_tabular('t_alices', np.mean(t_alices))

        logger.dump_tabular(with_prefix=False)
        report.new_row()
Esempio n. 20
0
    def train(self, sess=None):
        """Run the training loop over all local samplers.

        For every iteration from ``self.start_itr`` up to ``self.n_itr`` the
        method collects paths from each sampler in ``self.local_samplers``,
        processes them, logs diagnostics, optionally optimizes the policy,
        optionally evaluates on the test sampler, records per-sampler and
        pooled success rates, saves a snapshot and dumps the tabular log.

        Args:
            sess: TensorFlow session to use.  When ``None`` a new session
                with GPU memory growth enabled is created, entered and all
                variables are initialized; otherwise only the variables the
                given session reports as uninitialized are initialized.
        """
        if sess is not None:
            pending_names = sess.run(tf.report_uninitialized_variables())
            pending_vars = [tf.get_variable(name) for name in pending_names]
            sess.run(tf.initialize_variables(pending_vars))
        else:
            config = tf.ConfigProto()
            config.gpu_options.allow_growth = True
            sess = tf.Session(config=config)
            # NOTE(review): the session entered here is never exited or
            # closed by this method -- confirm that is intended.
            sess.__enter__()
            sess.run(tf.initialize_all_variables())

        self.start_worker()
        start_time = time.time()

        for itr in range(self.start_itr, self.n_itr):
            itr_start_time = time.time()

            with logger.prefix('itr #%d | ' % itr):
                logger.log("Obtaining samples...")
                all_paths = [
                    sampler.obtain_samples(itr)
                    for sampler in self.local_samplers
                ]

                logger.log("Processing samples...")
                all_samples_data = []
                # Each sampler logs under its own index as tabular prefix.
                for idx, sampler in enumerate(self.local_samplers):
                    if idx >= len(all_paths):
                        break
                    with logger.tabular_prefix(str(idx)):
                        processed = sampler.process_samples(
                            itr, all_paths[idx])
                        all_samples_data.append(processed)

                logger.log("Logging diagnostics...")
                self.log_diagnostics(all_paths)

                if self.should_optimize_policy:
                    logger.log("Optimizing policy...")
                    self.optimize_policy(itr, all_samples_data)

                if self.test_env is not None:
                    logger.log("Obtaining test samples...")
                    test_paths = self.test_sampler.obtain_samples(itr)
                    with logger.tabular_prefix("Test"):
                        test_samples = self.test_sampler.process_samples(
                            itr, test_paths)
                        test_success = test_samples["env_infos"]["success"]
                        logger.record_tabular("TestSuccessRate",
                                              np.mean(test_success))

                # Pool successes over all samplers for the overall rate.
                total_successes = 0.0
                total_trials = 0.0
                for idx, samples_data in enumerate(all_samples_data):
                    outcome = samples_data["env_infos"]["success"]
                    logger.record_tabular("SuccessRate{}".format(idx),
                                          np.mean(outcome))
                    total_successes += np.sum(outcome)
                    total_trials += outcome.shape[0]

                logger.record_tabular("SuccessRate",
                                      total_successes / total_trials)

                logger.log("Saving snapshot...")
                snapshot = self.get_itr_snapshot(itr, all_samples_data)
                logger.save_itr_params(itr, snapshot)

                logger.log("Saved")
                logger.record_tabular('Time', time.time() - start_time)
                logger.record_tabular('ItrTime', time.time() - itr_start_time)
                logger.dump_tabular(with_prefix=False)

        self.shutdown_worker()
Esempio n. 21
0
def run_task(v):
    """Run the start-curriculum experiment on ``Arm3dKeyEnv`` with TRPO.

    Every outer iteration:

    1. samples new starts with ``generate_starts`` from the current seed
       starts and regularizes them through a ``StateCollection``;
    2. optionally mixes in old starts sampled from the replay collection;
    3. trains TRPO with the environment resetting to those starts;
    4. labels the starts (from the TRPO paths or from fresh rollouts) and
       logs diagnostics, including an evaluation on uniformly sampled
       feasible starts;
    5. picks the seed starts of the next iteration according to
       ``v['seed_with']``.

    Args:
        v: Variant dictionary. Keys read here: ``seed``, ``ctrl_cost_coeff``,
            ``ultimate_goal``, ``start_goal``, ``start_size``, ``goal_size``,
            ``terminal_eps``, ``distance_metric``, ``extend_dist_rew``,
            ``inner_weight``, ``goal_weight``, ``policy_hidden_sizes``,
            ``learn_std``, ``adaptive_std``, ``output_gain``,
            ``policy_init_std``, ``baseline``, ``pg_batch_size``,
            ``horizon``, ``inner_iters``, ``discount``, ``coll_eps``,
            ``regularize_starts``, ``kill_radius``, ``brownian_variance``,
            ``brownian_horizon``, ``num_new_starts``, ``outer_iters``,
            ``replay_buffer``, ``num_old_starts``, ``persistence``,
            ``with_replacement``, ``use_trpo_paths``, ``n_traj`` and
            ``seed_with``.

    Raises:
        ValueError: If ``v['baseline']`` is neither ``'linear'`` nor
            ``'g_mlp'``.
    """
    random.seed(v['seed'])
    np.random.seed(v['seed'])

    logger.log("Initializing report...")
    log_dir = logger.get_snapshot_dir()  # problem with logger module here!!
    report = HTMLReport(osp.join(log_dir, 'report.html'), images_per_row=4)

    report.add_header("{}".format(EXPERIMENT_TYPE))
    report.add_text(format_dict(v))

    inner_env = normalize(Arm3dKeyEnv(ctrl_cost_coeff=v['ctrl_cost_coeff']))

    fixed_goal_generator = FixedStateGenerator(state=v['ultimate_goal'])
    fixed_start_generator = FixedStateGenerator(state=v['start_goal'])

    env = GoalStartExplorationEnv(
        env=inner_env,
        start_generator=fixed_start_generator,
        obs2start_transform=lambda x: x[:v['start_size']],
        goal_generator=fixed_goal_generator,
        # the goal is read from the last v['goal_size'] coordinates
        obs2goal_transform=lambda x: x[-v['goal_size']:],
        terminal_eps=v['terminal_eps'],
        distance_metric=v['distance_metric'],
        extend_dist_rew=v['extend_dist_rew'],
        inner_weight=v['inner_weight'],
        goal_weight=v['goal_weight'],
        terminate_env=True,
    )

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=v['policy_hidden_sizes'],
        # Fix the variance since different goals will require different
        # variances, making this parameter hard to learn.
        learn_std=v['learn_std'],
        adaptive_std=v['adaptive_std'],
        std_hidden_sizes=(16, 16),  # only used if adaptive_std is true!
        output_gain=v['output_gain'],
        init_std=v['policy_init_std'],
    )

    if v['baseline'] == 'linear':
        baseline = LinearFeatureBaseline(env_spec=env.spec)
    elif v['baseline'] == 'g_mlp':
        baseline = GaussianMLPBaseline(env_spec=env.spec)
    else:
        # Fail with a clear message instead of an unbound-local error below.
        raise ValueError(
            "unknown baseline %r (expected 'linear' or 'g_mlp')" %
            (v['baseline'], ))

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=v['pg_batch_size'],
        max_path_length=v['horizon'],
        n_itr=v['inner_iters'],
        step_size=0.01,
        discount=v['discount'],
        plot=False,
    )

    # Load the collection of feasible states from data_upload. The file is
    # opened in a context manager so the handle is closed after loading.
    # NOTE: pickle must only be used on trusted files.
    load_dir = 'data_upload/state_collections/'
    feasible_states_path = osp.join(config.PROJECT_PATH, load_dir,
                                    'all_feasible_states.pkl')
    with open(feasible_states_path, 'rb') as pkl_file:
        all_feasible_starts = pickle.load(pkl_file)
    print("we have %d feasible starts" % all_feasible_starts.size)

    # Replay buffer of starts, and scratch collection used to regularize the
    # freshly generated starts of each iteration.
    all_starts = StateCollection(distance_threshold=v['coll_eps'])
    brownian_starts = StateCollection(
        distance_threshold=v['regularize_starts'])

    logger.log(
        'Generating seed starts from the goal (horizon 10, subsample 600 of them)'
    )
    with algo.env.set_kill_outside(radius=v['kill_radius']):
        seed_starts = generate_starts(
            env,
            starts=[v['start_goal']],
            horizon=10,  # this is smaller as they are seeds!
            variance=v['brownian_variance'],
            subsample=v['num_new_starts'])

    for outer_iter in range(1, v['outer_iters']):

        logger.log("Outer itr # %i" % outer_iter)
        logger.log("Sampling starts")

        with algo.env.set_kill_outside(radius=v['kill_radius']):
            starts = generate_starts(algo.env,
                                     starts=seed_starts,
                                     horizon=v['brownian_horizon'],
                                     variance=v['brownian_variance'])
        # regularization of the brownian starts
        brownian_starts.empty()
        brownian_starts.append(starts)
        starts = brownian_starts.sample(size=v['num_new_starts'])

        if v['replay_buffer'] and outer_iter > 0 and all_starts.size > 0:
            old_starts = all_starts.sample(v['num_old_starts'])
            starts = np.vstack([starts, old_starts])

        with ExperimentLogger(log_dir,
                              50 * (outer_iter // 50 + 1),
                              snapshot_mode='last',
                              hold_outter_log=True):
            logger.log("Updating the environment start generator")
            algo.env.update_start_generator(
                UniformListStateGenerator(
                    starts.tolist(),
                    persistence=v['persistence'],
                    with_replacement=v['with_replacement'],
                ))

            logger.log("Training the algorithm")

            algo.current_itr = 0
            # The same TRPO instance is reused across outer iterations; it is
            # flagged as already initialized from the second iteration on.
            trpo_paths = algo.train(already_init=outer_iter > 1)

        if v['use_trpo_paths']:
            logger.log("labeling starts with trpo rollouts")
            starts, labels = label_states_from_paths(
                trpo_paths,
                n_traj=2,
                key='goal_reached',  # using the min n_traj
                as_goal=False,
                env=algo.env)
            # flatten the per-iteration lists of paths into a single list
            paths = [path for itr_paths in trpo_paths for path in itr_paths]
        else:
            logger.log("labeling starts manually")
            labels, paths = label_states(starts,
                                         algo.env,
                                         policy,
                                         v['horizon'],
                                         as_goals=False,
                                         n_traj=v['n_traj'],
                                         key='goal_reached',
                                         full_path=True)

        with logger.tabular_prefix("OnStarts_"):
            algo.env.log_diagnostics(paths)

        logger.record_tabular('brownian_starts', brownian_starts.size)

        start_classes, text_labels = convert_label(labels)
        total_starts = labels.shape[0]
        logger.record_tabular('GenStarts_evaluated', total_starts)
        for class_id, class_name in text_labels.items():
            frac = np.sum(start_classes == class_id) / total_starts
            logger.record_tabular('GenStart_frac_' + class_name, frac)

        # Collapse the two label columns into a single 0/1 column that is 1
        # only when both columns are set.
        labels = np.logical_and(labels[:, 0],
                                labels[:, 1]).astype(int).reshape((-1, 1))

        logger.log("Labeling on uniform starts")
        with logger.tabular_prefix("Uniform_4med_"):
            unif_starts = all_feasible_starts.sample(500)
            # Zero-pad only the feature axis up to start_size. A flat
            # (0, k) pad_width would be broadcast to every axis and would
            # also append k all-zero rows, i.e. spurious start states.
            missing_dims = v['start_size'] - unif_starts.shape[1]
            unif_starts = np.pad(unif_starts, ((0, 0), (0, missing_dims)),
                                 'constant')
            mean_reward, paths = evaluate_states(unif_starts,
                                                 algo.env,
                                                 policy,
                                                 v['horizon'],
                                                 n_traj=1,
                                                 key='goal_reached',
                                                 as_goals=False,
                                                 full_path=True)
            algo.env.log_diagnostics(paths)

        logger.dump_tabular(with_prefix=True)

        # append new states to list of all starts (replay buffer):
        if v['seed_with'] == 'only_goods':
            logger.log("Appending good goals to replay and generating seeds")
            filtered_raw_starts = [
                start for start, label in zip(starts, labels) if label[0] == 1
            ]
            all_starts.append(filtered_raw_starts)
            if filtered_raw_starts:
                seed_starts = filtered_raw_starts
            elif np.sum(start_classes == 0) > np.sum(start_classes == 1):
                # more low reward than high reward: sample from the replay
                seed_starts = all_starts.sample(300)
            else:
                # add a ton of noise if all the states ended up being
                # high_reward!
                with algo.env.set_kill_outside(radius=v['kill_radius']):
                    seed_starts = generate_starts(
                        algo.env,
                        starts=starts,
                        horizon=int(v['horizon'] * 10),
                        subsample=v['num_new_starts'],
                        variance=v['brownian_variance'] * 10)
        elif v['seed_with'] == 'all_previous':
            logger.log("Appending all goals to replay and generating seeds")
            all_starts.append(starts)
            seed_starts = starts
        elif v['seed_with'] == 'on_policy':
            all_starts.append(starts)
            with algo.env.set_kill_outside(radius=v['kill_radius']):
                seed_starts = generate_starts(algo.env,
                                              policy,
                                              horizon=v['horizon'],
                                              subsample=v['num_new_starts'])
Esempio n. 22
0
def run_task(v):
    """Train TRPO on ``AntMazeEnv`` with starts drawn uniformly from a file.

    The environment resets to starts sampled uniformly from the pickled
    collection of feasible starts. After every outer iteration (one TRPO
    training run) the policy is evaluated both on 100 feasible starts sampled
    from that collection and on a fixed list of positions; the results are
    logged and plotted into an HTML report.

    Args:
        v: Variant dictionary. Keys read here: ``seed``, ``ultimate_goal``,
            ``start_size``, ``terminal_eps``, ``distance_metric``,
            ``extend_dist_rew``, ``inner_weight``, ``goal_weight``,
            ``learn_std``, ``adaptive_std``, ``output_gain``,
            ``policy_init_std``, ``baseline``, ``outer_iters``,
            ``pg_batch_size``, ``horizon``, ``inner_iters``, ``discount``,
            ``goal_range``, ``goal_center`` and ``maze_id``.
    """
    random.seed(v['seed'])
    np.random.seed(v['seed'])

    logger.log("Initializing report...")
    log_dir = logger.get_snapshot_dir()  # problem with logger module here!!
    if log_dir is None:
        # NOTE(review): machine-specific fallback directory; it only exists
        # on the original author's machine.
        log_dir = "/home/michael/"
    report = HTMLReport(osp.join(log_dir, 'report.html'), images_per_row=2)

    report.add_header("{}".format(EXPERIMENT_TYPE))
    report.add_text(format_dict(v))

    inner_env = normalize(AntMazeEnv())

    # Both generators are built from the ultimate goal; the start generator
    # is replaced by the uniform one before training begins.
    fixed_goal_generator = FixedStateGenerator(state=v['ultimate_goal'])
    fixed_start_generator = FixedStateGenerator(state=v['ultimate_goal'])

    env = GoalStartExplorationEnv(
        env=inner_env,
        start_generator=fixed_start_generator,
        obs2start_transform=lambda x: x[:v['start_size']],
        goal_generator=fixed_goal_generator,
        obs2goal_transform=lambda x: x[-3:-1],
        terminal_eps=v['terminal_eps'],
        distance_metric=v['distance_metric'],
        extend_dist_rew=v['extend_dist_rew'],
        inner_weight=v['inner_weight'],
        goal_weight=v['goal_weight'],
        terminate_env=True,
    )

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(64, 64),
        # Fix the variance since different goals will require different
        # variances, making this parameter hard to learn.
        learn_std=v['learn_std'],
        adaptive_std=v['adaptive_std'],
        std_hidden_sizes=(16, 16),  # only used if adaptive_std is true!
        output_gain=v['output_gain'],
        init_std=v['policy_init_std'],
    )

    if v["baseline"] == "MLP":
        baseline = GaussianMLPBaseline(env_spec=env.spec)
    else:
        baseline = LinearFeatureBaseline(env_spec=env.spec)

    # Load the feasible starts; the context manager closes the file handle.
    # NOTE: pickle must only be used on trusted files.
    load_dir = 'sandbox/young_clgan/experiments/starts/maze/maze_ant/'
    feasible_starts_path = osp.join(config.PROJECT_PATH, load_dir,
                                    'good_all_feasible_starts.pkl')
    with open(feasible_starts_path, 'rb') as pkl_file:
        all_feasible_starts = pickle.load(pkl_file)
    logger.log("We have %d feasible starts" % all_feasible_starts.size)

    min_reward = 0.1
    max_reward = 0.9
    improvement_threshold = 0
    old_rewards = None  # never updated in this script

    uniform_start_generator = UniformListStateGenerator(
        state_list=all_feasible_starts.state_list)

    # Fixed evaluation states: a list of (x, y) positions, reversed, each
    # followed by the same 13 remaining state coordinates.
    # (presumably the rest of the ant's pose -- TODO confirm)
    xy_positions = [[0, 0], [1, 0], [2, 0], [3, 0], [4, 0], [4, 1], [4, 2],
                    [4, 3], [4, 4], [3, 4], [2, 4], [1, 4]][::-1]
    state_suffix = [0.55, 1, 0, 0, 0, 0, 1, 0, -1, 0, -1, 0, 1]
    init_pos = np.array([xy + state_suffix for xy in xy_positions])

    def _evaluate_and_plot(states, n_traj, itr):
        """Roll out the policy from `states`, log, label and plot them.

        Returns the per-state mean reward given by ``evaluate_states``.
        """
        mean_reward, paths = evaluate_states(states,
                                             env,
                                             policy,
                                             v['horizon'],
                                             n_traj=n_traj,
                                             key='goal_reached',
                                             as_goals=False,
                                             full_path=True)
        env.log_diagnostics(paths)
        labels = compute_labels(
            mean_reward.reshape(-1, 1),
            old_rewards=old_rewards,
            min_reward=min_reward,
            max_reward=max_reward,
            improvement_threshold=improvement_threshold)
        logger.log("Starts labelled")
        plot_labeled_states(states,
                            labels,
                            report=report,
                            itr=itr,
                            limit=v['goal_range'],
                            center=v['goal_center'],
                            maze_id=v['maze_id'],
                            summary_string_base='initial starts labels:\n')
        return mean_reward

    env.update_start_generator(uniform_start_generator)
    for outer_iter in range(1, v['outer_iters']):

        logger.log("Outer itr # %i" % outer_iter)
        logger.log("Sampling starts")

        with ExperimentLogger(log_dir,
                              outer_iter // 50,
                              snapshot_mode='last',
                              hold_outter_log=True):
            logger.log("Updating the environment start generator")
            logger.log("Training the algorithm")
            # A fresh TRPO instance is built for every outer iteration; the
            # policy and baseline objects are shared across iterations.
            algo = TRPO(
                env=env,
                policy=policy,
                baseline=baseline,
                batch_size=v['pg_batch_size'],
                max_path_length=v['horizon'],
                n_itr=v['inner_iters'],
                step_size=0.01,
                discount=v['discount'],
                plot=False,
            )
            algo.train()

        logger.log("Labeling on uniform starts")
        with logger.tabular_prefix("Uniform_"):
            unif_starts = all_feasible_starts.sample(100)
            unif_mean_reward = _evaluate_and_plot(unif_starts,
                                                  n_traj=3,
                                                  itr=outer_iter)
            report.add_text("Success: " + str(np.mean(unif_mean_reward)))

        with logger.tabular_prefix("Fixed_"):
            fixed_mean_reward = _evaluate_and_plot(init_pos,
                                                   n_traj=5,
                                                   itr=outer_iter)
            report.add_text("Fixed Success: " +
                            str(np.mean(fixed_mean_reward)))

        report.new_row()
        report.save()
        logger.record_tabular("Fixed test set_success: ",
                              np.mean(fixed_mean_reward))
        logger.dump_tabular()
Esempio n. 23
0
def run_task(v):
    """Train TRPO on ``AntMazeEnv`` with starts proposed by an "Alice" policy.

    Every outer iteration:

    1. obtains new starts from ``generate_starts_alice`` (optionally
       filtered with ``parallel_check_feasibility``);
    2. optionally mixes in old starts sampled from the replay collection;
    3. trains TRPO ("Bob") with the environment resetting to those starts;
    4. labels the starts from the TRPO paths, plots them and appends the
       ones whose two label columns are both set to the replay collection;
    5. evaluates the policy on 100 sampled feasible starts and on a fixed
       list of positions, logging and plotting the results.

    Args:
        v: Variant dictionary. Keys read here: ``seed``, ``ultimate_goal``,
            ``start_goal``, ``start_size``, ``terminal_eps``,
            ``distance_metric``, ``extend_dist_rew``, ``inner_weight``,
            ``goal_weight``, ``learn_std``, ``adaptive_std``,
            ``output_gain``, ``policy_init_std``, ``baseline``,
            ``alice_horizon``, ``alice_factor``, ``alice_bonus``,
            ``stop_threshold``, ``output_gain_alice``,
            ``policy_init_std_alice``, ``pg_batch_size_alice``,
            ``inner_iters_alice``, ``discount_alice``, ``coll_eps``,
            ``outer_iters``, ``num_new_starts``, ``filter_bad_starts``,
            ``feasibility_path_length``, ``replay_buffer``,
            ``num_old_starts``, ``persistence``, ``with_replacement``,
            ``pg_batch_size``, ``horizon``, ``inner_iters``, ``discount``,
            ``n_traj``, ``goal_range``, ``goal_center`` and ``maze_id``.
    """
    random.seed(v['seed'])
    np.random.seed(v['seed'])

    logger.log("Initializing report...")
    log_dir = logger.get_snapshot_dir()  # problem with logger module here!!
    if log_dir is None:
        # NOTE(review): machine-specific fallback directory; it only exists
        # on the original author's machine.
        log_dir = "/home/michael/"
    report = HTMLReport(osp.join(log_dir, 'report.html'), images_per_row=3)

    report.add_header("{}".format(EXPERIMENT_TYPE))
    report.add_text(format_dict(v))

    inner_env = normalize(AntMazeEnv())

    # Both generators are built from the ultimate goal; the start generator
    # is replaced inside the outer loop before each training run.
    fixed_goal_generator = FixedStateGenerator(state=v['ultimate_goal'])
    fixed_start_generator = FixedStateGenerator(state=v['ultimate_goal'])

    env = GoalStartExplorationEnv(
        env=inner_env,
        start_generator=fixed_start_generator,
        obs2start_transform=lambda x: x[:v['start_size']],
        goal_generator=fixed_goal_generator,
        obs2goal_transform=lambda x: x[-3:-1],
        terminal_eps=v['terminal_eps'],
        distance_metric=v['distance_metric'],
        extend_dist_rew=v['extend_dist_rew'],
        inner_weight=v['inner_weight'],
        goal_weight=v['goal_weight'],
        terminate_env=True,
    )

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(64, 64),
        # Fix the variance since different goals will require different
        # variances, making this parameter hard to learn.
        learn_std=v['learn_std'],
        adaptive_std=v['adaptive_std'],
        std_hidden_sizes=(16, 16),  # only used if adaptive_std is true!
        output_gain=v['output_gain'],
        init_std=v['policy_init_std'],
    )

    if v["baseline"] == "MLP":
        baseline = GaussianMLPBaseline(env_spec=env.spec)
    else:
        baseline = LinearFeatureBaseline(env_spec=env.spec)

    # create Alice
    env_alice = AliceEnv(env_alice=env,
                         env_bob=env,
                         policy_bob=policy,
                         max_path_length=v['alice_horizon'],
                         alice_factor=v['alice_factor'],
                         alice_bonus=v['alice_bonus'],
                         gamma=1,
                         stop_threshold=v['stop_threshold'])

    policy_alice = GaussianMLPPolicy(
        env_spec=env_alice.spec,
        hidden_sizes=(64, 64),
        # Fix the variance since different goals will require different
        # variances, making this parameter hard to learn.
        learn_std=v['learn_std'],
        adaptive_std=v['adaptive_std'],
        std_hidden_sizes=(16, 16),  # only used if adaptive_std is true!
        output_gain=v['output_gain_alice'],
        init_std=v['policy_init_std_alice'],
    )
    # Alice's baseline is fitted on rollouts of env_alice, so it must be
    # built from env_alice's spec (as policy_alice is), not from Bob's env.
    if v["baseline"] == "MLP":
        baseline_alice = GaussianMLPBaseline(env_spec=env_alice.spec)
    else:
        baseline_alice = LinearFeatureBaseline(env_spec=env_alice.spec)

    algo_alice = TRPO(
        env=env_alice,
        policy=policy_alice,
        baseline=baseline_alice,
        batch_size=v['pg_batch_size_alice'],
        max_path_length=v['horizon'],
        n_itr=v['inner_iters_alice'],
        step_size=0.01,
        discount=v['discount_alice'],
        plot=False,
    )

    # Replay collection of starts; states are compared on their first two
    # coordinates only.
    all_starts = StateCollection(distance_threshold=v['coll_eps'],
                                 states_transform=lambda x: x[:, :2])

    # Load the feasible starts; the context manager closes the file handle.
    # NOTE: pickle must only be used on trusted files.
    load_dir = 'sandbox/young_clgan/experiments/starts/maze/maze_ant/'
    feasible_starts_path = osp.join(config.PROJECT_PATH, load_dir,
                                    'good_all_feasible_starts.pkl')
    with open(feasible_starts_path, 'rb') as pkl_file:
        all_feasible_starts = pickle.load(pkl_file)
    logger.log("We have %d feasible starts" % all_feasible_starts.size)

    min_reward = 0.1
    max_reward = 0.9
    improvement_threshold = 0
    old_rewards = None  # never updated in this script

    # Fixed evaluation states: a list of (x, y) positions, reversed, each
    # followed by the same 13 remaining state coordinates.
    # (presumably the rest of the ant's pose -- TODO confirm)
    xy_positions = [[0, 0], [1, 0], [2, 0], [3, 0], [4, 0], [4, 1], [4, 2],
                    [4, 3], [4, 4], [3, 4], [2, 4], [1, 4]][::-1]
    state_suffix = [0.55, 1, 0, 0, 0, 0, 1, 0, -1, 0, -1, 0, 1]
    init_pos = np.array([xy + state_suffix for xy in xy_positions])

    def _evaluate_and_plot(states, n_traj, itr):
        """Roll out the policy from `states`, log, label and plot them.

        Returns the per-state mean reward given by ``evaluate_states``.
        """
        mean_reward, paths = evaluate_states(states,
                                             env,
                                             policy,
                                             v['horizon'],
                                             n_traj=n_traj,
                                             key='goal_reached',
                                             as_goals=False,
                                             full_path=True)
        env.log_diagnostics(paths)
        eval_labels = compute_labels(
            mean_reward.reshape(-1, 1),
            old_rewards=old_rewards,
            min_reward=min_reward,
            max_reward=max_reward,
            improvement_threshold=improvement_threshold)
        logger.log("Starts labelled")
        plot_labeled_states(states,
                            eval_labels,
                            report=report,
                            itr=itr,
                            limit=v['goal_range'],
                            center=v['goal_center'],
                            maze_id=v['maze_id'],
                            summary_string_base='initial starts labels:\n')
        return mean_reward

    for outer_iter in range(1, v['outer_iters']):

        logger.log("Outer itr # %i" % outer_iter)
        logger.log("Sampling starts")

        report.save()

        starts, t_alices = generate_starts_alice(
            env_alice=env_alice,
            algo_alice=algo_alice,
            start_states=[v['start_goal']],
            num_new_starts=v['num_new_starts'],
            log_dir=log_dir)

        if v['filter_bad_starts']:
            logger.log("Prefilter starts: {}".format(len(starts)))
            starts = parallel_check_feasibility(
                env=env,
                starts=starts,
                max_path_length=v['feasibility_path_length'])
            logger.log("Filtered starts: {}".format(len(starts)))

        logger.log("Total number of starts in buffer: {}".format(
            all_starts.size))
        if v['replay_buffer'] and outer_iter > 0 and all_starts.size > 0:
            old_starts = all_starts.sample(v['num_old_starts'])
            starts = np.vstack([starts, old_starts])

        with ExperimentLogger(log_dir,
                              outer_iter // 50,
                              snapshot_mode='last',
                              hold_outter_log=True):
            logger.log("Updating the environment start generator")
            env.update_start_generator(
                UniformListStateGenerator(
                    starts.tolist(),
                    persistence=v['persistence'],
                    with_replacement=v['with_replacement'],
                ))

            logger.log("Training the algorithm")
            # A fresh TRPO instance is built for every outer iteration; the
            # policy and baseline objects are shared across iterations.
            algo = TRPO(
                env=env,
                policy=policy,
                baseline=baseline,
                batch_size=v['pg_batch_size'],
                max_path_length=v['horizon'],
                n_itr=v['inner_iters'],
                step_size=0.01,
                discount=v['discount'],
                plot=False,
            )

            trpo_paths = algo.train()

        with logger.tabular_prefix('Outer_'):
            logger.record_tabular('t_alices', np.mean(t_alices))

        logger.log("Labeling the starts")
        starts, labels = label_states_from_paths(
            trpo_paths,
            n_traj=v['n_traj'],
            key='goal_reached',  # using the min n_traj
            as_goal=False,
            env=env)
        start_classes, text_labels = convert_label(labels)
        plot_labeled_states(starts,
                            labels,
                            report=report,
                            itr=outer_iter,
                            limit=v['goal_range'],
                            center=v['goal_center'],
                            maze_id=v['maze_id'])

        # Collapse the two label columns into a single 0/1 column that is 1
        # only when both columns are set.
        labels = np.logical_and(labels[:, 0],
                                labels[:, 1]).astype(int).reshape((-1, 1))

        # append new states to list of all starts (replay buffer): Not the
        # low reward ones!!
        filtered_raw_starts = [
            start for start, label in zip(starts, labels) if label[0] == 1
        ]
        if not filtered_raw_starts:
            logger.log("Bad Alice!  All goals are high reward!")

        all_starts.append(filtered_raw_starts)

        # Useful plotting and metrics (basic test set)
        logger.log("Labeling on uniform starts")
        with logger.tabular_prefix("Uniform_"):
            unif_starts = all_feasible_starts.sample(100)
            _evaluate_and_plot(unif_starts,
                               n_traj=v['n_traj'],
                               itr=outer_iter)

        with logger.tabular_prefix("Fixed_"):
            fixed_mean_reward = _evaluate_and_plot(init_pos,
                                                   n_traj=5,
                                                   itr=outer_iter)
            report.add_text("Fixed Success: " +
                            str(np.mean(fixed_mean_reward)))

        report.new_row()
        report.save()
        logger.record_tabular("Fixed test set_success: ",
                              np.mean(fixed_mean_reward))
        logger.dump_tabular()
Esempio n. 24
0
def run_task(v):
    """Run the ant-maze start-state curriculum experiment described by ``v``.

    For every outer iteration the function samples new start states with
    ``generate_starts``, optionally mixes in starts replayed from a
    ``StateCollection``, trains the policy with TRPO for ``v['inner_iters']``
    iterations, labels the starts from the training rollouts and chooses the
    seed starts for the next iteration according to ``v['seed_with']``.
    Unless ``v['debug']`` is truthy, the policy is additionally evaluated on
    a fixed list of start states; the result goes to the HTML report, to
    ``init_pos_per_state_mean_return.csv`` in the snapshot directory and to
    the tabular log.

    Args:
        v: Mapping of experiment hyper-parameters. Keys read here: ``seed``,
            ``ultimate_goal``, ``start_goal``, ``start_size``,
            ``terminal_eps``, ``distance_metric``, ``extend_dist_rew``,
            ``inner_weight``, ``goal_weight``, ``learn_std``,
            ``adaptive_std``, ``output_gain``, ``policy_init_std``,
            ``baseline``, ``coll_eps``, ``initial_brownian_horizon``,
            ``brownian_horizon``, ``brownian_variance``,
            ``filter_bad_starts``, ``feasibility_path_length``,
            ``outer_iters``, ``num_new_starts``, ``num_old_starts``,
            ``replay_buffer``, ``persistence``, ``with_replacement``,
            ``pg_batch_size``, ``horizon``, ``inner_iters``, ``discount``,
            ``n_traj``, ``goal_range``, ``goal_center``, ``maze_id``,
            ``seed_with``, ``debug`` and, optionally, ``scratch_dir``.

    Raises:
        ValueError: If ``v['seed_with']`` is neither ``'only_goods'`` nor
            ``'all_previous'``.
        subprocess.CalledProcessError: If an rsync to ``scratch_dir`` fails.
    """
    random.seed(v['seed'])
    np.random.seed(v['seed'])

    logger.log("Initializing report...")
    log_dir = logger.get_snapshot_dir()
    report = HTMLReport(osp.join(log_dir, 'report.html'), images_per_row=1000)

    report.add_header("{}".format(EXPERIMENT_TYPE))
    report.add_text(format_dict(v))

    inner_env = normalize(AntMazeEnv())

    fixed_goal_generator = FixedStateGenerator(state=v['ultimate_goal'])
    # The start generator is replaced at every outer iteration through
    # env.update_start_generator(), so this one only serves until then.
    fixed_start_generator = FixedStateGenerator(state=v['ultimate_goal'])

    env = GoalStartExplorationEnv(
        env=inner_env,
        start_generator=fixed_start_generator,
        obs2start_transform=lambda x: x[:v['start_size']],
        goal_generator=fixed_goal_generator,
        obs2goal_transform=lambda x: x[-3:-1],
        terminal_eps=v['terminal_eps'],
        distance_metric=v['distance_metric'],
        extend_dist_rew=v['extend_dist_rew'],
        inner_weight=v['inner_weight'],
        goal_weight=v['goal_weight'],
        terminate_env=True,
    )

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(64, 64),
        # Fix the variance since different goals will require different variances, making this parameter hard to learn.
        learn_std=v['learn_std'],
        adaptive_std=v['adaptive_std'],
        std_hidden_sizes=(16, 16),  # this is only used if adaptive_std is true!
        output_gain=v['output_gain'],
        init_std=v['policy_init_std'],
    )

    if v["baseline"] == "MLP":
        baseline = GaussianMLPBaseline(env_spec=env.spec)
    else:
        baseline = LinearFeatureBaseline(env_spec=env.spec)

    # Replay buffer of good starts; the transform keeps only the first two
    # columns of each state when the collection compares states.
    all_starts = StateCollection(distance_threshold=v['coll_eps'],
                                 states_transform=lambda x: x[:, :2])

    # initial brownian horizon and size are pretty important
    logger.log("Brownian horizon: {}".format(v['initial_brownian_horizon']))
    seed_starts = generate_starts(
        env,
        starts=[v['start_goal']],
        horizon=v['initial_brownian_horizon'],
        size=15000,
        variance=v['brownian_variance'],
        animated=False,
    )

    if v['filter_bad_starts']:
        logger.log("Prefilter seed starts: {}".format(len(seed_starts)))
        seed_starts = parallel_check_feasibility(
            env=env,
            starts=seed_starts,
            max_path_length=v['feasibility_path_length'])
        logger.log("Filtered seed starts: {}".format(len(seed_starts)))

    # Thresholds handed to compute_labels() for the fixed evaluation set.
    min_reward = 0.1
    max_reward = 0.9
    improvement_threshold = 0
    old_rewards = None

    # Fixed evaluation starts: a list of (x, y) positions, reversed, each
    # followed by the same 13 remaining state entries.
    # NOTE(review): the tail presumably encodes torso height, orientation and
    # joint angles of the ant -- confirm against AntMazeEnv.
    eval_positions = [[0, 0], [1, 0], [2, 0], [3, 0], [4, 0], [4, 1], [4, 2],
                      [4, 3], [4, 4], [3, 4], [2, 4], [1, 4]][::-1]
    state_tail = [0.55, 1, 0, 0, 0, 0, 1, 0, -1, 0, -1, 0, 1]
    init_pos = np.array([xy + state_tail for xy in eval_positions])

    with open(osp.join(log_dir, 'init_pos.json'), 'w') as f:
        json.dump(init_pos.tolist(), f)

    for outer_iter in range(1, v['outer_iters'] + 1):

        logger.log("Outer itr # %i" % outer_iter)
        logger.log("Sampling starts")

        report.save()

        # generate starts from the previous seed starts, which are defined below
        starts = generate_starts(env,
                                 starts=seed_starts,
                                 subsample=v['num_new_starts'],
                                 size=2000,
                                 horizon=v['brownian_horizon'],
                                 variance=v['brownian_variance'])

        # note: this messes with the balance between starts and old_starts!
        if v['filter_bad_starts']:
            logger.log("Prefilter starts: {}".format(len(starts)))
            starts = parallel_check_feasibility(
                env=env,
                starts=starts,
                max_path_length=v['feasibility_path_length'])
            logger.log("Filtered starts: {}".format(len(starts)))

        logger.log("Total number of starts in buffer: {}".format(
            all_starts.size))
        if v['replay_buffer'] and all_starts.size > 0:
            old_starts = all_starts.sample(v['num_old_starts'])
            starts = np.vstack([starts, old_starts])

        # Labeling and plotting the starts before training is skipped on
        # purpose: it takes too much time.

        with ExperimentLogger(log_dir,
                              outer_iter // 50,
                              snapshot_mode='last',
                              hold_outter_log=True):
            logger.log("Updating the environment start generator")
            env.update_start_generator(
                UniformListStateGenerator(
                    starts.tolist(),
                    persistence=v['persistence'],
                    with_replacement=v['with_replacement'],
                ))

            logger.log("Training the algorithm")
            algo = TRPO(
                env=env,
                policy=policy,
                baseline=baseline,
                batch_size=v['pg_batch_size'],
                max_path_length=v['horizon'],
                n_itr=v['inner_iters'],
                step_size=0.01,
                discount=v['discount'],
                plot=False,
            )

            trpo_paths = algo.train()

        logger.log("Labeling the starts")

        starts, labels = label_states_from_paths(trpo_paths,
                                                 n_traj=v['n_traj'],
                                                 key='goal_reached',
                                                 as_goal=False,
                                                 env=env)

        start_classes, _ = convert_label(labels)
        plot_labeled_states(starts,
                            labels,
                            report=report,
                            itr=outer_iter,
                            limit=v['goal_range'],
                            center=v['goal_center'],
                            maze_id=v['maze_id'])

        # A start counts as "good" only when both label columns are set.
        labels = np.logical_and(labels[:, 0],
                                labels[:, 1]).astype(int).reshape((-1, 1))

        filtered_raw_starts = [
            start for start, label in zip(starts, labels) if label[0] == 1
        ]
        all_starts.append(filtered_raw_starts)

        if v['seed_with'] == 'only_goods':
            if filtered_raw_starts:
                logger.log("We have {} good starts!".format(
                    len(filtered_raw_starts)))
                seed_starts = filtered_raw_starts
            elif np.sum(start_classes == 0) > np.sum(start_classes == 1):
                # more low reward than high reward starts
                logger.log(
                    "More bad starts than good starts, sampling seeds from replay buffer"
                )
                seed_starts = all_starts.sample(300)
            else:
                # add a ton of noise if all the states ended up being high reward
                logger.log("More good starts than bad starts, resampling")
                seed_starts = generate_starts(
                    env,
                    starts=starts,
                    horizon=v['horizon'] * 2,
                    subsample=v['num_new_starts'],
                    size=10000,
                    variance=v['brownian_variance'] * 10)

        elif v['seed_with'] == 'all_previous':
            seed_starts = starts
            filtered_raw_starts = starts  # no filtering done
        else:
            raise ValueError(
                "Unknown seed_with: {!r}".format(v['seed_with']))

        # need to put this last! otherwise labels variable gets confused
        logger.log("Labeling on uniform starts")
        if not v["debug"]:
            # Only the fixed evaluation set is used; evaluation on uniformly
            # sampled feasible starts is disabled.
            with logger.tabular_prefix("Fixed_"):
                mean_reward, paths = evaluate_states(init_pos,
                                                     env,
                                                     policy,
                                                     v['horizon'],
                                                     n_traj=5,
                                                     key='goal_reached',
                                                     as_goals=False,
                                                     full_path=True)

                # newline='' is what the csv module requires of the files it
                # writes to; it prevents doubled line endings on Windows.
                with open(osp.join(log_dir,
                                   'init_pos_per_state_mean_return.csv'),
                          'a',
                          newline='') as f:
                    csv.writer(f).writerow([outer_iter] + list(mean_reward))

                env.log_diagnostics(paths)
                mean_rewards = mean_reward.reshape(-1, 1)
                labels = compute_labels(
                    mean_rewards,
                    old_rewards=old_rewards,
                    min_reward=min_reward,
                    max_reward=max_reward,
                    improvement_threshold=improvement_threshold)
                logger.log("Starts labelled")
                plot_labeled_states(
                    init_pos,
                    labels,
                    report=report,
                    itr=outer_iter,
                    limit=v['goal_range'],
                    center=v['goal_center'],
                    maze_id=v['maze_id'],
                    summary_string_base='initial starts labels:\n')
                report.add_text("Fixed Success: " + str(np.mean(mean_reward)))

            report.new_row()
            report.save()
            logger.record_tabular("Fixed test set_success: ",
                                  np.mean(mean_reward))
            logger.dump_tabular()

        # Mirror the snapshot directory to scratch on the first and on every
        # fifth outer iteration. The scratch_dir check must gate the whole
        # condition: `a or b and c` parses as `a or (b and c)`, which used to
        # run the sync on iteration 1 even without a scratch_dir.
        scratch_dir = v.get('scratch_dir', False)
        if scratch_dir and (outer_iter == 1 or outer_iter % 5 == 0):
            # Argument list instead of str.split() so that paths containing
            # whitespace survive intact.
            command = [
                'rsync', '-a', '--delete',
                os.path.join(log_dir, ''),
                os.path.join(scratch_dir, '')
            ]
            print("Running command:\n{}".format(' '.join(command)))
            subprocess.run(command, check=True)

    # Final sync after the last iteration (without --delete).
    scratch_dir = v.get('scratch_dir', False)
    if scratch_dir:
        command = [
            'rsync', '-a',
            os.path.join(log_dir, ''),
            os.path.join(scratch_dir, '')
        ]
        print("Running command:\n{}".format(' '.join(command)))
        subprocess.run(command, check=True)
Esempio n. 25
0
def run_task(v):
    """Train an ant goal-reaching policy on goals proposed through self-play.

    Every outer iteration asks ``generate_states_alice`` for new goals,
    optionally stacks goals replayed from a ``StateCollection`` on top,
    trains the policy with TRPO on that goal set, labels the goals from the
    training rollouts, plots a heatmap and the labelled goals into the HTML
    report, and appends the goals whose two label columns are both set to
    the replay collection.

    Args:
        v: Mapping of experiment hyper-parameters (seed, goal sampling range,
            policy/baseline options, TRPO sizes for both policies, replay
            options, ...). ``sampling_res`` is optional and defaults to 0.
    """
    random.seed(v['seed'])
    np.random.seed(v['seed'])
    sampling_res = v['sampling_res'] if 'sampling_res' in v.keys() else 0

    # Log performance of randomly initialized policy with FIXED goal [0.1, 0.1]
    logger.log("Initializing report and plot_policy_reward...")
    log_dir = logger.get_snapshot_dir()  # problem with logger module here!!
    report = HTMLReport(osp.join(log_dir, 'report.html'), images_per_row=3)
    report.add_header("{}".format(EXPERIMENT_TYPE))
    report.add_text(format_dict(v))

    # --- environment: normalized ant wrapped with uniformly sampled goals ---
    goal_sampler = UniformStateGenerator(
        state_size=v['goal_size'],
        bounds=v['goal_range'],
        center=v['goal_center'],
    )
    # NOTE: the ant env is built before the goal sampler in argument order
    # below would suggest; keep construction order explicit.
    ant_env = normalize(AntEnv())
    env = GoalExplorationEnv(
        env=ant_env,
        goal_generator=goal_sampler,
        obs2goal_transform=lambda x: x[-3:-1],
        terminal_eps=v['terminal_eps'],
        distance_metric=v['distance_metric'],
        extend_dist_rew=v['extend_dist_rew'],
        append_transformed_obs=v['append_transformed_obs'],
        append_extra_info=v['append_extra_info'],
        terminate_env=True,
    )

    # --- the policy being trained and its baseline ---
    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(64, 64),
        # Fix the variance since different goals will require different variances, making this parameter hard to learn.
        learn_std=v['learn_std'],
        adaptive_std=v['adaptive_std'],
        std_hidden_sizes=(16, 16),  # this is only used if adaptive_std is true!
        output_gain=v['output_gain'],
        init_std=v['policy_init_std'],
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)
    if v['baseline'] == 'g_mlp':
        baseline = GaussianMLPBaseline(env_spec=env.spec)

    def plot_heatmap(itr):
        # Same call for the initial and the per-iteration heatmap; only the
        # iteration index differs.
        test_and_plot_policy(
            policy,
            env,
            max_reward=v['max_reward'],
            sampling_res=sampling_res,
            n_traj=v['n_traj'],
            itr=itr,
            report=report,
            limit=v['goal_range'],
            center=v['goal_center'],
            bounds=v['goal_range'],
        )

    # initialize all logging arrays on itr0
    logger.log('Generating the Initial Heatmap...')
    plot_heatmap(0)
    logger.log('Saving to report')
    report.new_row()

    # Replay buffer of goals.
    all_goals = StateCollection(distance_threshold=v['coll_eps'])

    # Use asymmetric self-play to run Alice to generate starts for Bob.
    # Use a double horizon because the horizon is shared between Alice and Bob.
    env_alice = AliceEnv(
        env_alice=env,
        env_bob=env,
        policy_bob=policy,
        max_path_length=v['alice_horizon'],
        alice_factor=v['alice_factor'],
        alice_bonus=v['alice_bonus'],
        gamma=1,
        stop_threshold=v['stop_threshold'],
        start_generation=False,
    )

    policy_alice = GaussianMLPPolicy(
        env_spec=env_alice.spec,
        hidden_sizes=(64, 64),
        # Fix the variance since different goals will require different variances, making this parameter hard to learn.
        learn_std=v['learn_std'],
        adaptive_std=v['adaptive_std'],
        std_hidden_sizes=(16, 16),  # this is only used if adaptive_std is true!
        output_gain=v['output_gain_alice'],
        init_std=v['policy_init_std_alice'],
    )

    baseline_alice = LinearFeatureBaseline(env_spec=env_alice.spec)
    if v['baseline'] == 'g_mlp':
        baseline_alice = GaussianMLPBaseline(env_spec=env_alice.spec)

    algo_alice = TRPO(
        env=env_alice,
        policy=policy_alice,
        baseline=baseline_alice,
        batch_size=v['pg_batch_size_alice'],
        max_path_length=v['alice_horizon'],
        n_itr=v['inner_iters_alice'],
        step_size=0.01,
        plot=False,
    )

    for outer_iter in range(1, v['outer_iters']):
        logger.log("Outer itr # %i" % outer_iter)

        raw_goals, t_alices = generate_states_alice(
            env_alice=env_alice,
            algo_alice=algo_alice,
            num_new_states=v['num_new_goals'],
            log_dir=log_dir,
            start_generation=False,
        )

        # Fresh goals, optionally stacked with goals replayed from the buffer.
        goals = raw_goals
        if v['replay_buffer'] and outer_iter > 0 and all_goals.size > 0:
            replayed_goals = all_goals.sample(v['num_old_goals'])
            goals = np.vstack([raw_goals, replayed_goals])

        with ExperimentLogger(log_dir, 'last', snapshot_mode='last', hold_outter_log=True):
            logger.log("Updating the environment goal generator")
            goal_list_generator = UniformListStateGenerator(
                goals.tolist(),
                persistence=v['persistence'],
                with_replacement=v['with_replacement'],
            )
            env.update_goal_generator(goal_list_generator)

            logger.log("Training the algorithm")
            algo = TRPO(
                env=env,
                policy=policy,
                baseline=baseline,
                batch_size=v['pg_batch_size'],
                max_path_length=v['horizon'],
                n_itr=v['inner_iters'],
                step_size=0.01,
                plot=False,
            )
            all_paths = algo.train()

        # Goals and their labels are recovered from the training rollouts
        # rather than from separate evaluation rollouts.
        goals, labels = label_states_from_paths(all_paths, n_traj=v['n_traj'], key='goal_reached')

        with logger.tabular_prefix('Outer_'):
            logger.record_tabular('t_alices', np.mean(t_alices))

        logger.log('Generating the Heatmap...')
        plot_heatmap(outer_iter)

        plot_labeled_states(
            goals,
            labels,
            report=report,
            itr=outer_iter,
            limit=v['goal_range'],
            center=v['goal_center'],
        )

        # Collapse the two label columns into a single 0/1 column.
        both_set = np.logical_and(labels[:, 0], labels[:, 1])
        labels = both_set.astype(int).reshape((-1, 1))

        logger.dump_tabular(with_prefix=False)
        report.new_row()

        # append new goals to list of all goals (replay buffer): Not the low reward ones!!
        filtered_raw_goals = []
        for goal, label in zip(goals, labels):
            if label[0] == 1:
                filtered_raw_goals.append(goal)
        all_goals.append(filtered_raw_goals)

        if v['add_on_policy']:
            logger.log("sampling on policy")
            feasible_goals = generate_initial_goals(
                env,
                policy,
                v['goal_range'],
                goal_center=v['goal_center'],
                horizon=v['horizon'],
            )
            all_goals.append(feasible_goals)
Esempio n. 26
0
def run_task(v):
    """Run the point-maze start-state curriculum experiment described by ``v``.

    Each outer iteration samples new starts around the current seed starts
    with ``generate_starts``, labels and plots them, optionally stacks starts
    replayed from a ``StateCollection`` on top, trains the policy with TRPO,
    re-labels the starts (from the TRPO rollouts when ``v['use_trpo_paths']``
    is truthy, otherwise with fresh rollouts), plots heatmaps into the HTML
    report, stores the starts whose two label columns are both set in the
    replay collection and finally picks the seed starts for the next
    iteration according to ``v['seed_with']``.

    Args:
        v: Mapping of experiment hyper-parameters (seed, maze id, start and
            goal configuration, policy/baseline options, TRPO sizes, replay
            options, ...). ``sampling_res`` is optional and defaults to 2.

    Raises:
        ValueError: If ``v['seed_with']`` is not one of ``'only_goods'``,
            ``'all_previous'`` or ``'on_policy'``. Such a value used to be
            ignored silently, leaving the seed starts frozen.
    """
    random.seed(v['seed'])
    np.random.seed(v['seed'])
    sampling_res = v['sampling_res'] if 'sampling_res' in v else 2

    # Log performance of randomly initialized policy with FIXED goal [0.1, 0.1]
    logger.log("Initializing report and plot_policy_reward...")
    log_dir = logger.get_snapshot_dir()  # problem with logger module here!!
    report = HTMLReport(osp.join(log_dir, 'report.html'), images_per_row=5)

    report.add_header("{}".format(EXPERIMENT_TYPE))
    report.add_text(format_dict(v))

    inner_env = normalize(PointMazeEnv(maze_id=v['maze_id']))

    fixed_goal_generator = FixedStateGenerator(state=v['ultimate_goal'])
    uniform_start_generator = UniformStateGenerator(state_size=v['start_size'], bounds=v['start_range'],
                                                    center=v['start_center'])

    env = GoalStartExplorationEnv(
        env=inner_env,
        start_generator=uniform_start_generator,
        obs2start_transform=lambda x: x[:v['start_size']],
        goal_generator=fixed_goal_generator,
        obs2goal_transform=lambda x: x[:v['goal_size']],
        terminal_eps=v['terminal_eps'],
        distance_metric=v['distance_metric'],
        extend_dist_rew=v['extend_dist_rew'],
        only_feasible=v['only_feasible'],
        terminate_env=True,
    )

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(64, 64),
        # Fix the variance since different goals will require different variances, making this parameter hard to learn.
        learn_std=v['learn_std'],
        adaptive_std=v['adaptive_std'],
        std_hidden_sizes=(16, 16),  # this is only used if adaptive_std is true!
        output_gain=v['output_gain'],
        init_std=v['policy_init_std'],
    )

    if v['constant_baseline']:
        logger.log("Using constant baseline")
        baseline = ConstantBaseline(env_spec=env.spec, value=1.0)
    else:
        logger.log("Using linear baseline")
        baseline = LinearFeatureBaseline(env_spec=env.spec)

    # initialize all logging arrays on itr0
    outer_iter = 0

    logger.log('Generating the Initial Heatmap...')
    plot_policy_means(policy, env, sampling_res=2, report=report, limit=v['goal_range'], center=v['goal_center'])
    test_and_plot_policy(policy, env, as_goals=False, max_reward=v['max_reward'], sampling_res=sampling_res,
                         n_traj=v['n_traj'],
                         itr=outer_iter, report=report, center=v['goal_center'],
                         limit=v['goal_range'])  # use goal for plot
    report.new_row()

    # Replay buffer of good starts; the first seeds are generated around the goal.
    all_starts = StateCollection(distance_threshold=v['coll_eps'])
    seed_starts = generate_starts(env, starts=[v['ultimate_goal']], subsample=v['num_new_starts'])

    for outer_iter in range(1, v['outer_iters']):

        logger.log("Outer itr # %i" % outer_iter)
        logger.log("Sampling starts")

        starts = generate_starts(env, starts=seed_starts, subsample=v['num_new_starts'],
                                 horizon=v['brownian_horizon'], variance=v['brownian_variance'])
        # Label and plot the freshly sampled starts before any training on them.
        labels = label_states(starts, env, policy, v['horizon'],
                              as_goals=False, n_traj=v['n_traj'], key='goal_reached')
        plot_labeled_states(starts, labels, report=report, itr=outer_iter, limit=v['goal_range'],
                            center=v['goal_center'], maze_id=v['maze_id'],
                            summary_string_base='initial starts labels:\n')
        report.save()

        if v['replay_buffer'] and all_starts.size > 0:
            old_starts = all_starts.sample(v['num_old_starts'])
            starts = np.vstack([starts, old_starts])

        with ExperimentLogger(log_dir, 'last', snapshot_mode='last', hold_outter_log=True):
            logger.log("Updating the environment start generator")
            env.update_start_generator(
                UniformListStateGenerator(
                    starts.tolist(), persistence=v['persistence'], with_replacement=v['with_replacement'],
                )
            )

            logger.log("Training the algorithm")
            algo = TRPO(
                env=env,
                policy=policy,
                baseline=baseline,
                batch_size=v['pg_batch_size'],
                max_path_length=v['horizon'],
                n_itr=v['inner_iters'],
                step_size=0.01,
                discount=v['discount'],
                plot=False,
            )

            trpo_paths = algo.train()

        if v['use_trpo_paths']:
            logger.log("labeling starts with trpo rollouts")
            starts, labels = label_states_from_paths(trpo_paths, n_traj=2, key='goal_reached',  # using the min n_traj
                                                     as_goal=False, env=env)
            # Flatten the nested path lists returned by training.
            paths = [path for path_batch in trpo_paths for path in path_batch]
        else:
            logger.log("labeling starts manually")
            labels, paths = label_states(starts, env, policy, v['horizon'], as_goals=False, n_traj=v['n_traj'],
                                         key='goal_reached', full_path=True)

        with logger.tabular_prefix("OnStarts_"):
            env.log_diagnostics(paths)
        logger.log('Generating the Heatmap...')
        plot_policy_means(policy, env, sampling_res=2, report=report, limit=v['goal_range'], center=v['goal_center'])
        test_and_plot_policy(policy, env, as_goals=False, max_reward=v['max_reward'], sampling_res=sampling_res,
                             n_traj=v['n_traj'],
                             itr=outer_iter, report=report, center=v['goal_center'], limit=v['goal_range'])

        logger.log("Labeling the starts")

        plot_labeled_states(starts, labels, report=report, itr=outer_iter, limit=v['goal_range'],
                            center=v['goal_center'], maze_id=v['maze_id'])

        start_classes, _ = convert_label(labels)

        # A start counts as "good" only when both label columns are set.
        labels = np.logical_and(labels[:, 0], labels[:, 1]).astype(int).reshape((-1, 1))

        logger.dump_tabular(with_prefix=False)
        report.new_row()

        # append new states to list of all starts (replay buffer): Not the low reward ones!!
        filtered_raw_starts = [start for start, label in zip(starts, labels) if label[0] == 1]
        all_starts.append(filtered_raw_starts)

        seed_with = v['seed_with']
        if seed_with == 'only_goods':
            if filtered_raw_starts:
                seed_starts = filtered_raw_starts
            elif np.sum(start_classes == 0) > np.sum(start_classes == 1):  # if more low reward than high reward
                seed_starts = all_starts.sample(300)  # sample them from the replay
            else:
                # add a ton of noise if all the states ended up being high reward
                seed_starts = generate_starts(env, starts=starts, horizon=int(v['horizon'] * 10),
                                              subsample=v['num_new_starts'],
                                              variance=v['brownian_variance'] * 10)
        elif seed_with == 'all_previous':
            seed_starts = starts
        elif seed_with == 'on_policy':
            seed_starts = generate_starts(env, policy, starts=starts, horizon=v['horizon'],
                                          subsample=v['num_new_starts'])
        else:
            # Fail loudly instead of silently reusing stale seed starts.
            raise ValueError("Unknown seed_with: {!r}".format(seed_with))
Esempio n. 27
0
    def process_samples(self, itr, paths):
        """Convert sampled paths into training data, refit the baseline, log stats.

        Each path dict is extended in place with ``"advantages"`` and
        ``"returns"``. Baseline predictions are taken *before* the baseline
        is refitted on the new paths.

        Args:
            itr: Iteration index; only written to the tabular log.
            paths: List of path dicts providing ``"observations"``,
                ``"actions"``, ``"rewards"``, ``"env_infos"`` and
                ``"agent_infos"``.

        Returns:
            dict with the concatenated ``observations``, ``actions``,
            ``rewards``, ``returns``, ``advantages``, ``env_infos`` and
            ``agent_infos`` over all paths, plus the original ``paths``.
        """
        # Prefer the batched prediction API when the baseline offers one.
        predictions = (
            self.baseline.predict_n(paths)
            if hasattr(self.baseline, "predict_n")
            else [self.baseline.predict(p) for p in paths]
        )

        baseline_seqs = []
        return_seqs = []
        for idx, path in enumerate(paths):
            # A trailing 0 stands in for the value after the last step.
            values = np.append(predictions[idx], 0)
            next_values = values[1:]
            current_values = values[:-1]
            # One-step TD residuals, then their (discount * gae_lambda)-discounted
            # cumulative sum: generalized advantage estimation.
            td_residuals = path["rewards"] + \
                self.discount * next_values - \
                current_values
            path["advantages"] = special.discount_cumsum(
                td_residuals, self.discount * self.gae_lambda)
            path["returns"] = special.discount_cumsum(path["rewards"], self.discount)
            baseline_seqs.append(current_values)
            return_seqs.append(path["returns"])

        ev = special.explained_variance_1d(
            np.concatenate(baseline_seqs),
            np.concatenate(return_seqs)
        )

        def gather(key):
            # Concatenate one array-valued field over all paths.
            return tensor_utils.concat_tensor_list([p[key] for p in paths])

        def gather_dicts(key):
            # Concatenate one dict-valued field over all paths.
            return tensor_utils.concat_tensor_dict_list([p[key] for p in paths])

        observations = gather("observations")
        actions = gather("actions")
        rewards = gather("rewards")
        returns = gather("returns")
        advantages = gather("advantages")
        env_infos = gather_dicts("env_infos")
        agent_infos = gather_dicts("agent_infos")

        # Optional post-processing of the advantages, applied in this order.
        if self.center_adv:
            advantages = util.center_advantages(advantages)
        if self.positive_adv:
            advantages = util.shift_advantages_to_positive(advantages)

        average_discounted_return = np.mean([p["returns"][0] for p in paths])
        undiscounted_returns = [sum(p["rewards"]) for p in paths]
        ent = np.mean(self.policy.distribution.entropy(agent_infos))

        samples_data = {
            "observations": observations,
            "actions": actions,
            "rewards": rewards,
            "returns": returns,
            "advantages": advantages,
            "env_infos": env_infos,
            "agent_infos": agent_infos,
            "paths": paths,
        }

        logger.log("fitting baseline...")
        if hasattr(self.baseline, 'fit_with_samples'):
            self.baseline.fit_with_samples(paths, samples_data)
        else:
            self.baseline.fit(paths)
        logger.log("fitted")

        with logger.tabular_prefix('Low_'):
            stats = (
                ('Iteration', itr),
                ('AverageDiscountedReturn', average_discounted_return),
                ('AverageReturn', np.mean(undiscounted_returns)),
                ('ExplainedVariance', ev),
                ('NumTrajs', len(paths)),
                ('Entropy', ent),
                ('Perplexity', np.exp(ent)),
                ('StdReturn', np.std(undiscounted_returns)),
                ('MaxReturn', np.max(undiscounted_returns)),
                ('MinReturn', np.min(undiscounted_returns)),
            )
            for stat_name, stat_value in stats:
                logger.record_tabular(stat_name, stat_value)

        return samples_data
Esempio n. 28
0
def run_task(v):
    """Train a goal-reaching Ant-maze policy with an online start curriculum.

    Every outer iteration:
      1. asks the ``Online_TCSL`` generator for a distribution over a fixed
         list of start states and installs it in the environment,
      2. trains TRPO for ``v['inner_iters']`` iterations,
      3. labels the starts from the TRPO rollouts and feeds the mean rewards
         back into the generator,
      4. evaluates the policy on uniformly sampled feasible starts and on the
         fixed start list, plotting to the HTML report and logging scalars.

    Args:
        v: Variant mapping of hyper-parameters. Keys read here: 'seed',
            'ultimate_goal', 'start_size', 'terminal_eps', 'distance_metric',
            'extend_dist_rew', 'inner_weight', 'goal_weight', 'learn_std',
            'adaptive_std', 'output_gain', 'policy_init_std', 'baseline',
            'coll_eps', 'outer_iters', 'pg_batch_size', 'horizon',
            'inner_iters', 'discount', 'n_traj', 'goal_range', 'goal_center',
            'maze_id', 'seed_with', 'num_new_starts', 'brownian_variance'.

    Raises:
        ValueError: If ``v['seed_with']`` is neither 'only_goods' nor
            'all_previous'.
    """
    random.seed(v['seed'])
    np.random.seed(v['seed'])

    logger.log("Initializing report...")
    log_dir = logger.get_snapshot_dir()  # problem with logger module here!!
    if log_dir is None:
        # NOTE(review): machine-specific fallback kept for compatibility.
        log_dir = "/home/michael/"
    report = HTMLReport(osp.join(log_dir, 'report.html'), images_per_row=3)

    report.add_header("{}".format(EXPERIMENT_TYPE))
    report.add_text(format_dict(v))

    inner_env = normalize(AntMazeEnv())

    # Both generators are built from the ultimate goal; the start generator is
    # replaced inside the outer loop before any training happens.
    fixed_goal_generator = FixedStateGenerator(state=v['ultimate_goal'])
    fixed_start_generator = FixedStateGenerator(state=v['ultimate_goal'])

    env = GoalStartExplorationEnv(
        env=inner_env,
        start_generator=fixed_start_generator,
        obs2start_transform=lambda x: x[:v['start_size']],
        goal_generator=fixed_goal_generator,
        obs2goal_transform=lambda x: x[-3:-1],
        terminal_eps=v['terminal_eps'],
        distance_metric=v['distance_metric'],
        extend_dist_rew=v['extend_dist_rew'],
        inner_weight=v['inner_weight'],
        goal_weight=v['goal_weight'],
        terminate_env=True,
    )

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(64, 64),
        # Fix the variance since different goals will require different variances, making this parameter hard to learn.
        learn_std=v['learn_std'],
        adaptive_std=v['adaptive_std'],
        std_hidden_sizes=(16, 16),  # this is only used if adaptive_std is true!
        output_gain=v['output_gain'],
        init_std=v['policy_init_std'],
    )

    if v["baseline"] == "MLP":
        baseline = GaussianMLPBaseline(env_spec=env.spec)
    else:
        baseline = LinearFeatureBaseline(env_spec=env.spec)

    # Replay buffer of "good" starts; states are compared on their first two
    # coordinates only.
    all_starts = StateCollection(distance_threshold=v['coll_eps'], states_transform=lambda x: x[:, :2])

    # can also filter these starts optionally

    load_dir = 'sandbox/young_clgan/experiments/starts/maze/maze_ant/'
    feasible_starts_path = osp.join(config.PROJECT_PATH, load_dir, 'good_all_feasible_starts.pkl')
    # NOTE(review): pickle.load can execute arbitrary code; this file must
    # come from a trusted source.
    # The context manager closes the handle deterministically.
    with open(feasible_starts_path, 'rb') as starts_file:
        all_feasible_starts = pickle.load(starts_file)
    logger.log("We have %d feasible starts" % all_feasible_starts.size)

    min_reward = 0.1
    max_reward = 0.9
    improvement_threshold = 0
    old_rewards = None

    # hardest to easiest
    init_pos = [[0, 0],
                [1, 0],
                [2, 0],
                [3, 0],
                [4, 0],
                [4, 1],
                [4, 2],
                [4, 3],
                [4, 4],
                [3, 4],
                [2, 4],
                [1, 4]
                ][::-1]
    # Pad every (x, y) with 13 fixed values -- presumably the remaining Ant
    # start-state coordinates; verify against AntMazeEnv.
    for pos in init_pos:
        pos.extend([0.55, 1, 0, 0, 0, 0, 1, 0, -1, 0, -1, 0, 1, ])
    array_init_pos = np.array(init_pos)
    init_pos = [tuple(pos) for pos in init_pos]
    online_start_generator = Online_TCSL(init_pos)

    for outer_iter in range(1, v['outer_iters']):

        logger.log("Outer itr # %i" % outer_iter)
        logger.log("Sampling starts")

        report.save()

        # Sampling distribution over init_pos for this iteration.
        dist = online_start_generator.get_distribution()
        logger.log(np.array_str(online_start_generator.get_q()))
        # how to log Q values?
        # with logger.tabular_prefix("General: "):
        #     logger.record_tabular("Q values:", online_start_generator.get_q())
        logger.log(np.array_str(dist))

        # Following code should be indented
        with ExperimentLogger(log_dir, outer_iter // 50, snapshot_mode='last', hold_outter_log=True):
            logger.log("Updating the environment start generator")
            # TODO: might be faster to sample if we just create a roughly representative UniformListStateGenerator?
            env.update_start_generator(
                ListStateGenerator(
                    init_pos, dist
                )
            )

            logger.log("Training the algorithm")
            algo = TRPO(
                env=env,
                policy=policy,
                baseline=baseline,
                batch_size=v['pg_batch_size'],
                max_path_length=v['horizon'],
                n_itr=v['inner_iters'],
                step_size=0.01,
                discount=v['discount'],
                plot=False,
            )

            trpo_paths = algo.train()

        logger.log("Labeling the starts")
        [starts, labels, mean_rewards, updated] = label_states_from_paths(
            trpo_paths, n_traj=v['n_traj'], key='goal_reached',  # using the min n_traj
            as_goal=False, env=env, return_mean_rewards=True, order_of_states=init_pos)

        start_classes, text_labels = convert_label(labels)
        plot_labeled_states(starts, labels, report=report, itr=outer_iter, limit=v['goal_range'],
                            center=v['goal_center'], maze_id=v['maze_id'])

        # Feed the per-start mean rewards back into the curriculum generator.
        online_start_generator.update_q(np.array(mean_rewards), np.array(updated))
        # Collapse the two label columns into one 0/1 flag per start.
        labels = np.logical_and(labels[:, 0], labels[:, 1]).astype(int).reshape((-1, 1))

        # append new states to list of all starts (replay buffer): Not the low reward ones!!
        filtered_raw_starts = [start for start, label in zip(starts, labels) if label[0] == 1]

        if v['seed_with'] == 'only_goods':
            if len(filtered_raw_starts) > 0:  # add a ton of noise if all the states I had ended up being high_reward!
                logger.log("We have {} good starts!".format(len(filtered_raw_starts)))
                seed_starts = filtered_raw_starts
            elif np.sum(start_classes == 0) > np.sum(start_classes == 1):  # if more low reward than high reward
                logger.log("More bad starts than good starts, sampling seeds from replay buffer")
                seed_starts = all_starts.sample(300)  # sample them from the replay
            else:
                logger.log("More good starts than bad starts, resampling")
                seed_starts = generate_starts(env, starts=starts, horizon=v['horizon'] * 2, subsample=v['num_new_starts'], size=10000,
                                              variance=v['brownian_variance'] * 10)
        elif v['seed_with'] == 'all_previous':
            seed_starts = starts
        else:
            raise ValueError("Unknown seed_with option: {!r}".format(v['seed_with']))

        all_starts.append(filtered_raw_starts)

        # need to put this last! otherwise labels variable gets confused
        logger.log("Labeling on uniform starts")
        with logger.tabular_prefix("Uniform_"):
            unif_starts = all_feasible_starts.sample(100)
            mean_reward, paths = evaluate_states(unif_starts, env, policy, v['horizon'], n_traj=v['n_traj'], key='goal_reached',
                                                 as_goals=False, full_path=True)
            env.log_diagnostics(paths)
            mean_rewards = mean_reward.reshape(-1, 1)
            labels = compute_labels(mean_rewards, old_rewards=old_rewards, min_reward=min_reward, max_reward=max_reward,
                                    improvement_threshold=improvement_threshold)
            logger.log("Starts labelled")
            plot_labeled_states(unif_starts, labels, report=report, itr=outer_iter, limit=v['goal_range'],
                                center=v['goal_center'], maze_id=v['maze_id'],
                                summary_string_base='initial starts labels:\n')
            # report.add_text("Success: " + str(np.mean(mean_reward)))

        with logger.tabular_prefix("Fixed_"):
            mean_reward, paths = evaluate_states(array_init_pos, env, policy, v['horizon'], n_traj=5, key='goal_reached',
                                                 as_goals=False, full_path=True)
            env.log_diagnostics(paths)
            mean_rewards = mean_reward.reshape(-1, 1)
            labels = compute_labels(mean_rewards, old_rewards=old_rewards, min_reward=min_reward, max_reward=max_reward,
                                    improvement_threshold=improvement_threshold)
            logger.log("Starts labelled")
            plot_labeled_states(array_init_pos, labels, report=report, itr=outer_iter, limit=v['goal_range'],
                                center=v['goal_center'], maze_id=v['maze_id'],
                                summary_string_base='initial starts labels:\n')
            report.add_text("Fixed Success: " + str(np.mean(mean_reward)))

        report.new_row()
        report.save()
        # mean_reward here is the one from the "Fixed_" evaluation above.
        logger.record_tabular("Fixed test set_success: ", np.mean(mean_reward))
        logger.dump_tabular()
Esempio n. 29
0
 def log_diagnostics(self, paths, log_prefix='Gather', *args, **kwargs):
     """Record the penalty summed over all paths as a tabular entry.

     Each path's ``env_infos['penalty']`` is summed with ``.sum()`` and the
     totals are added together; the result is logged under the key
     "Penalty" inside the ``log_prefix`` tabular prefix. Extra positional
     and keyword arguments are accepted and ignored.
     """
     total_penalty = 0
     for single_path in paths:
         total_penalty += single_path['env_infos']['penalty'].sum()

     with logger.tabular_prefix(log_prefix):
         logger.record_tabular("Penalty", total_penalty)
Esempio n. 30
0
def run_task(v):
    """Train a point-maze policy with a start curriculum grown from the goal.

    Every outer iteration:
      1. expands the current seed starts with ``generate_starts`` (using
         ``v['brownian_horizon']`` / ``v['brownian_variance']``) and optionally
         mixes in starts sampled from the replay buffer,
      2. trains TRPO with the environment resetting uniformly over those
         starts,
      3. labels the starts (from the TRPO rollouts or from fresh rollouts),
         stores the good ones in the replay buffer and picks the next seeds,
      4. evaluates the policy, appends per-state returns/successes to CSV
         files in the log directory and saves the HTML report,
      5. periodically mirrors the log directory to ``v['scratch_dir']`` with
         rsync when that key is set.

    Args:
        v: Variant mapping of hyper-parameters. Keys read here: 'seed',
            'maze_id', 'ultimate_goal', 'start_size', 'start_range',
            'start_center', 'goal_size', 'terminal_eps', 'distance_metric',
            'extend_dist_rew', 'only_feasible', 'learn_std', 'adaptive_std',
            'output_gain', 'policy_init_std', 'baseline', 'coll_eps',
            'num_new_starts', 'outer_iters', 'goal_range', 'goal_center',
            'brownian_horizon', 'brownian_variance', 'replay_buffer',
            'num_old_starts', 'horizon', 'n_traj', 'persistence',
            'with_replacement', 'pg_batch_size', 'inner_iters', 'discount',
            'use_trpo_paths', 'seed_with', 'max_reward'. Optional keys:
            'sampling_res' (default 2) and 'scratch_dir'.

    Raises:
        subprocess.CalledProcessError: If an rsync invocation exits non-zero.
    """
    random.seed(v['seed'])
    np.random.seed(v['seed'])
    sampling_res = v.get('sampling_res', 2)

    logger.log("Initializing report and plot_policy_reward...")
    log_dir = logger.get_snapshot_dir()
    report = HTMLReport(osp.join(log_dir, 'report.html'), images_per_row=1000)

    report.add_header("{}".format(EXPERIMENT_TYPE))
    report.add_text(format_dict(v))

    inner_env = normalize(PointMazeEnv(maze_id=v['maze_id']))

    fixed_goal_generator = FixedStateGenerator(state=v['ultimate_goal'])
    uniform_start_generator = UniformStateGenerator(state_size=v['start_size'], bounds=v['start_range'],
                                                    center=v['start_center'])

    env = GoalStartExplorationEnv(
        env=inner_env,
        start_generator=uniform_start_generator,
        obs2start_transform=lambda x: x[:v['start_size']],
        goal_generator=fixed_goal_generator,
        obs2goal_transform=lambda x: x[:v['goal_size']],
        terminal_eps=v['terminal_eps'],
        distance_metric=v['distance_metric'],
        extend_dist_rew=v['extend_dist_rew'],
        only_feasible=v['only_feasible'],
        terminate_env=True,
    )

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(64, 64),
        # Fix the variance since different goals will require different variances, making this parameter hard to learn.
        learn_std=v['learn_std'],
        adaptive_std=v['adaptive_std'],
        std_hidden_sizes=(16, 16),  # this is only used if adaptive_std is true!
        output_gain=v['output_gain'],
        init_std=v['policy_init_std'],
    )

    if v["baseline"] == "MLP":
        baseline = GaussianMLPBaseline(env_spec=env.spec)
    else:
        baseline = LinearFeatureBaseline(env_spec=env.spec)

    # initialize all logging arrays on itr0
    outer_iter = 0
    all_starts = StateCollection(distance_threshold=v['coll_eps'])

    # seed_starts: from which we will be performing brownian motion exploration
    seed_starts = generate_starts(env, starts=[v['ultimate_goal']], subsample=v['num_new_starts'])

    def plot_states(states, report, itr, summary_string, **kwargs):
        # Plot an unlabeled set of states into the report; an empty set is
        # replaced by a single point at the origin so plotting still works.
        states = np.array(states)
        if states.size == 0:
            states = np.zeros((1, 2))
        img = plot_labeled_samples(
            states, np.zeros(len(states), dtype='uint8'), markers={0: 'o'}, text_labels={0: "all"}, **kwargs)
        report.add_image(img, 'itr: {}\n{}'.format(itr, summary_string), width=500)

    def sync_to_scratch():
        # Mirror log_dir into v['scratch_dir'] with rsync; no-op when unset.
        scratch_dir = v.get('scratch_dir', False)
        if not scratch_dir:
            return
        # Trailing separators make rsync copy the directory *contents*.
        # Passing an argv list (not a split string) keeps paths containing
        # whitespace intact.
        argv = ['rsync', '-a', os.path.join(log_dir, ''), os.path.join(scratch_dir, '')]
        print("Running command:\n{}".format(' '.join(argv)))
        subprocess.run(argv, check=True)

    for outer_iter in range(1, v['outer_iters']):
        report.new_row()

        logger.log("Outer itr # %i" % outer_iter)
        logger.log("Sampling starts")

        plot_states(
            seed_starts, report=report, itr=outer_iter, limit=v['goal_range'], center=v['goal_center'],
            maze_id=v['maze_id'], summary_string="seed starts")

        starts = generate_starts(env, starts=seed_starts, subsample=v['num_new_starts'],
                                 horizon=v['brownian_horizon'], variance=v['brownian_variance'])

        plot_states(
            starts, report=report, itr=outer_iter, limit=v['goal_range'], center=v['goal_center'],
            maze_id=v['maze_id'], summary_string="brownian starts")

        # Optionally mix in previously good starts from the replay buffer.
        sampled_from_buffer = []
        if v['replay_buffer'] and outer_iter > 0 and all_starts.size > 0:
            sampled_from_buffer = all_starts.sample(v['num_old_starts'])
            starts = np.vstack([starts, sampled_from_buffer])

        plot_states(
            sampled_from_buffer, report=report, itr=outer_iter, limit=v['goal_range'],
            center=v['goal_center'], maze_id=v['maze_id'], summary_string="states sampled from buffer")

        labels = label_states(starts, env, policy, v['horizon'], as_goals=False, n_traj=v['n_traj'], key='goal_reached')
        plot_labeled_states(starts, labels, report=report, itr=outer_iter, limit=v['goal_range'],
                            center=v['goal_center'], maze_id=v['maze_id'],
                            summary_string_base='all starts before update\n')

        with ExperimentLogger(log_dir, 'last', snapshot_mode='last', hold_outter_log=True):
            logger.log("Updating the environment start generator")
            env.update_start_generator(
                UniformListStateGenerator(
                    starts.tolist(), persistence=v['persistence'], with_replacement=v['with_replacement'],
                )
            )

            logger.log("Training the algorithm")
            algo = TRPO(
                env=env,
                policy=policy,
                baseline=baseline,
                batch_size=v['pg_batch_size'],
                max_path_length=v['horizon'],
                n_itr=v['inner_iters'],
                step_size=0.01,
                discount=v['discount'],
                plot=False,
            )

            trpo_paths = algo.train()

        if v['use_trpo_paths']:
            logger.log("labeling starts with trpo rollouts")
            [starts, labels] = label_states_from_paths(
                trpo_paths, n_traj=2, key='goal_reached', as_goal=False, env=env)
            # Flatten the per-iteration path lists into one list of paths.
            paths = [path for itr_paths in trpo_paths for path in itr_paths]
        else:
            logger.log("labeling starts manually")
            labels, paths = label_states(
                starts, env, policy, v['horizon'], as_goals=False, n_traj=v['n_traj'], key='goal_reached', full_path=True)

        start_classes, text_labels = convert_label(labels)

        plot_labeled_states(starts, labels, report=report, itr=outer_iter, limit=v['goal_range'],
                            center=v['goal_center'], maze_id=v['maze_id'],
                            summary_string_base="all starts after update\n")

        with logger.tabular_prefix("OnStarts_"):
            env.log_diagnostics(paths)

        # Collapse the two label columns into one 0/1 flag per start.
        labels = np.logical_and(labels[:, 0], labels[:, 1]).astype(int).reshape((-1, 1))

        # append new states to list of all starts (replay buffer): Not the low reward ones!!
        filtered_raw_starts = [start for start, label in zip(starts, labels) if label[0] == 1]

        all_starts.append(filtered_raw_starts)

        # NOTE(review): an unrecognised v['seed_with'] silently keeps the
        # previous seed_starts; confirm whether that is intended.
        if v['seed_with'] == 'only_goods':
            if len(filtered_raw_starts) > 0:
                logger.log("Only goods A")
                seed_starts = filtered_raw_starts

            elif np.sum(start_classes == 0) > np.sum(start_classes == 1):  # if more low reward than high reward
                logger.log("Only goods B")
                seed_starts = all_starts.sample(300)  # sample them from the replay

            else:
                logger.log("Only goods C")
                # add a ton of noise if all the states I had ended up being high_reward
                seed_starts = generate_starts(
                    env, starts=starts, horizon=int(v['horizon'] * 10),
                    subsample=v['num_new_starts'], variance=v['brownian_variance'] * 10)

        elif v['seed_with'] == 'all_previous':
            seed_starts = starts

        elif v['seed_with'] == 'on_policy':
            seed_starts = generate_starts(env, policy, starts=starts, horizon=v['horizon'], subsample=v['num_new_starts'])

        logger.log('Generating Heatmap...')
        plot_policy_means(
            policy, env, sampling_res=sampling_res, report=report, limit=v['goal_range'], center=v['goal_center'])

        _, _, states, returns, successes = test_and_plot_policy2(
            policy, env, as_goals=False, max_reward=v['max_reward'], sampling_res=sampling_res, n_traj=v['n_traj'],
            itr=outer_iter, report=report, center=v['goal_center'], limit=v['goal_range'])

        # The evaluation states are written once, the first time through.
        eval_state_path = osp.join(log_dir, "eval_states.json")
        if not osp.exists(eval_state_path):
            with open(eval_state_path, 'w') as f:
                json.dump(np.array(states).tolist(), f)

        # One CSV row per outer iteration: [iteration, value per eval state...].
        with open(osp.join(log_dir, 'eval_pos_per_state_mean_return.csv'), 'a') as f:
            writer = csv.writer(f)
            row = [outer_iter] + list(returns)
            writer.writerow(row)

        with open(osp.join(log_dir, 'eval_pos_per_state_mean_success.csv'), 'a') as f:
            writer = csv.writer(f)
            row = [outer_iter] + list(successes)
            writer.writerow(row)

        logger.dump_tabular()

        report.save()

        # Sync after the first iteration and then every fifth one. The
        # scratch_dir check lives in the helper, so it applies to both cases
        # (the original `a or b and c` skipped it when outer_iter == 1).
        if outer_iter == 1 or outer_iter % 5 == 0:
            sync_to_scratch()

    # Final sync so the scratch copy reflects the finished run.
    sync_to_scratch()
Esempio n. 31
0
def run_task(v):
    """Train a point-maze policy (Bob) on starts proposed by a second policy (Alice).

    Setup builds the goal/start exploration environment, Bob's policy and
    baseline, an ``AliceEnv`` wrapping the same environment, and a TRPO
    instance for Alice. Each outer iteration then:
      1. obtains new starts from ``generate_starts_alice``,
      2. labels and plots them, optionally adding starts sampled from the
         replay buffer,
      3. trains Bob with TRPO on those starts,
      4. plots policy heatmaps, re-labels the starts and appends the ones
         whose collapsed label is 1 to the replay buffer.

    Args:
        v: Variant mapping of hyper-parameters, indexed by string keys such
            as 'seed', 'maze_id', 'ultimate_goal', 'outer_iters',
            'alice_horizon' and 'start_goal'; 'sampling_res' is optional
            (default 2).
    """
    random.seed(v['seed'])
    np.random.seed(v['seed'])
    sampling_res = 2 if 'sampling_res' not in v.keys() else v['sampling_res']
    samples_per_cell = 10  # for the oracle rejection sampling

    # Log performance of randomly initialized policy with FIXED goal [0.1, 0.1]
    logger.log("Initializing report and plot_policy_reward...")
    log_dir = logger.get_snapshot_dir()  # problem with logger module here!!
    if log_dir is None:
        # NOTE(review): machine-specific fallback path.
        log_dir = "/home/davheld/repos/rllab_goal_rl/data/local/debug"
    report = HTMLReport(osp.join(log_dir, 'report.html'), images_per_row=5)

    report.add_header("{}".format(EXPERIMENT_TYPE))
    report.add_text(format_dict(v))

    inner_env = normalize(PointMazeEnv(maze_id=v['maze_id']))

    fixed_goal_generator = FixedStateGenerator(state=v['ultimate_goal'])
    uniform_start_generator = UniformStateGenerator(state_size=v['start_size'],
                                                    bounds=v['start_range'],
                                                    center=v['start_center'])

    env = GoalStartExplorationEnv(
        env=inner_env,
        start_generator=uniform_start_generator,
        obs2start_transform=lambda x: x[:v['start_size']],
        goal_generator=fixed_goal_generator,
        obs2goal_transform=lambda x: x[:v['goal_size']],
        terminal_eps=v['terminal_eps'],
        distance_metric=v['distance_metric'],
        extend_dist_rew=v['extend_dist_rew'],
        only_feasible=v['only_feasible'],
        terminate_env=True,
    )

    # Bob's policy: the one being trained to reach the goal.
    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(64, 64),
        # Fix the variance since different goals will require different variances, making this parameter hard to learn.
        learn_std=v['learn_std'],
        adaptive_std=v['adaptive_std'],
        std_hidden_sizes=(16,
                          16),  # this is only used if adaptive_std is true!
        output_gain=v['output_gain'],
        init_std=v['policy_init_std'],
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    # initialize all logging arrays on itr0
    outer_iter = 0

    # Plot the untrained policy (itr 0) before any training.
    logger.log('Generating the Initial Heatmap...')
    plot_policy_means(policy,
                      env,
                      sampling_res=sampling_res,
                      report=report,
                      limit=v['goal_range'],
                      center=v['goal_center'])
    test_and_plot_policy(policy,
                         env,
                         as_goals=False,
                         max_reward=v['max_reward'],
                         sampling_res=sampling_res,
                         n_traj=v['n_traj'],
                         itr=outer_iter,
                         report=report,
                         center=v['goal_center'],
                         limit=v['goal_range'])
    report.new_row()

    # Replay buffer of starts whose collapsed label is 1.
    all_starts = StateCollection(distance_threshold=v['coll_eps'])

    # Use asymmetric self-play to run Alice to generate starts for Bob.
    # Use a double horizon because the horizon is shared between Alice and Bob.
    env_alice = AliceEnv(env_alice=env,
                         env_bob=env,
                         policy_bob=policy,
                         max_path_length=v['alice_horizon'],
                         alice_factor=v['alice_factor'],
                         alice_bonus=v['alice_bonus'],
                         gamma=1,
                         stop_threshold=v['stop_threshold'])

    policy_alice = GaussianMLPPolicy(
        env_spec=env_alice.spec,
        hidden_sizes=(64, 64),
        # Fix the variance since different goals will require different variances, making this parameter hard to learn.
        learn_std=v['learn_std'],
        adaptive_std=v['adaptive_std'],
        std_hidden_sizes=(16,
                          16),  # this is only used if adaptive_std is true!
        output_gain=v['output_gain_alice'],
        init_std=v['policy_init_std_alice'],
    )
    baseline_alice = LinearFeatureBaseline(env_spec=env_alice.spec)

    # Alice's TRPO instance is created once and reused across outer iterations.
    algo_alice = TRPO(
        env=env_alice,
        policy=policy_alice,
        baseline=baseline_alice,
        batch_size=v['pg_batch_size_alice'],
        max_path_length=v['alice_horizon'],
        n_itr=v['inner_iters_alice'],
        step_size=0.01,
        discount=v['discount_alice'],
        plot=False,
    )

    for outer_iter in range(1, v['outer_iters']):

        logger.log("Outer itr # %i" % outer_iter)
        logger.log("Sampling starts")

        # t_alices is averaged and logged as 'Outer_t_alices' further down.
        starts, t_alices = generate_starts_alice(
            env_alice=env_alice,
            algo_alice=algo_alice,
            start_states=[v['start_goal']],
            num_new_starts=v['num_new_starts'],
            log_dir=log_dir)

        labels = label_states(starts,
                              env,
                              policy,
                              v['horizon'],
                              as_goals=False,
                              n_traj=v['n_traj'],
                              key='goal_reached')
        plot_labeled_states(starts,
                            labels,
                            report=report,
                            itr=outer_iter,
                            limit=v['goal_range'],
                            center=v['goal_center'],
                            maze_id=v['maze_id'],
                            summary_string_base='initial starts labels:\n')
        report.save()

        # Optionally mix in starts sampled from the replay buffer.
        if v['replay_buffer'] and outer_iter > 0 and all_starts.size > 0:
            old_starts = all_starts.sample(v['num_old_starts'])
            starts = np.vstack([starts, old_starts])

        with ExperimentLogger(log_dir,
                              'last',
                              snapshot_mode='last',
                              hold_outter_log=True):
            logger.log("Updating the environment start generator")
            env.update_start_generator(
                UniformListStateGenerator(
                    starts.tolist(),
                    persistence=v['persistence'],
                    with_replacement=v['with_replacement'],
                ))

            logger.log("Training the algorithm")
            algo = TRPO(
                env=env,
                policy=policy,
                baseline=baseline,
                batch_size=v['pg_batch_size'],
                max_path_length=v['horizon'],
                n_itr=v['inner_iters'],
                step_size=v['step_size'],
                discount=v['discount'],
                plot=False,
            )

            # We don't use these labels anyway, so we might as well take them from training.
            #trpo_paths = algo.train()
            algo.train()

        # logger.log("labeling starts with trpo rollouts")
        # [starts, labels] = label_states_from_paths(trpo_paths, n_traj=2, key='goal_reached',  # using the min n_traj
        #                                            as_goal=False, env=env)
        # paths = [path for paths in trpo_paths for path in paths]

        with logger.tabular_prefix('Outer_'):
            logger.record_tabular('t_alices', np.mean(t_alices))

        logger.log('Generating the Heatmap...')
        plot_policy_means(policy,
                          env,
                          sampling_res=sampling_res,
                          report=report,
                          limit=v['goal_range'],
                          center=v['goal_center'])
        test_and_plot_policy(policy,
                             env,
                             as_goals=False,
                             max_reward=v['max_reward'],
                             sampling_res=sampling_res,
                             n_traj=v['n_traj'],
                             itr=outer_iter,
                             report=report,
                             center=v['goal_center'],
                             limit=v['goal_range'])

        # Re-label the same starts now that the policy has been updated.
        logger.log("Labeling the starts")
        labels = label_states(starts,
                              env,
                              policy,
                              v['horizon'],
                              as_goals=False,
                              n_traj=v['n_traj'],
                              key='goal_reached')

        plot_labeled_states(starts,
                            labels,
                            report=report,
                            itr=outer_iter,
                            limit=v['goal_range'],
                            center=v['goal_center'],
                            maze_id=v['maze_id'])

        # ###### extra for deterministic:
        # logger.log("Labeling the goals deterministic")
        # with policy.set_std_to_0():
        #     labels_det = label_states(goals, env, policy, v['horizon'], n_traj=v['n_traj'], n_processes=1)
        # plot_labeled_states(goals, labels_det, report=report, itr=outer_iter, limit=v['goal_range'], center=v['goal_center'])

        # Collapse the two label columns into one 0/1 flag per start.
        labels = np.logical_and(labels[:, 0],
                                labels[:, 1]).astype(int).reshape((-1, 1))

        logger.dump_tabular(with_prefix=False)
        report.new_row()

        # append new states to list of all starts (replay buffer): Not the low reward ones!!
        filtered_raw_starts = [
            start for start, label in zip(starts, labels) if label[0] == 1
        ]

        if len(
                filtered_raw_starts
        ) == 0:  # add a ton of noise if all the states I had ended up being high_reward!
            logger.log("Bad Alice!  All goals are high reward!")

        #     seed_starts = filtered_raw_starts
        # else:
        #     seed_starts = generate_starts(env, starts=starts, horizon=v['horizon'] * 2, subsample=v['num_new_starts'],
        #                                   variance=v['brownian_variance'] * 10)
        all_starts.append(filtered_raw_starts)