Example #1
    def train(self, dataset, test_dataset=None, max_epochs=10000, save_step=1000, print_step=1, plot_step=1,
              record_stats=False):

        for epoch in range(1, max_epochs + 1):
            # train for one epoch, then evaluate on the held-out set without gradient updates
            # (assumes test_dataset is provided)
            stats = self.train_epoch(dataset, epoch)
            test = self.train_epoch(test_dataset, epoch, train=False)
            for k, v in test.items():
                stats['V ' + k] = v
            stats['Test RL'] = self.test_pd(test_dataset)

            if epoch % print_step == 0:
                with logger.prefix('itr #%d | ' % epoch):
                    self.print_diagnostics(stats)

            if epoch % plot_step == 0:
                self.plot_compare(dataset, epoch)
                self.plot_interp(dataset, epoch)
                self.plot_compare(test_dataset, epoch, save_dir='test')
                self.plot_random(dataset, epoch)

            if epoch % save_step == 0 and logger.get_snapshot_dir() is not None:
                self.save(logger.get_snapshot_dir() + '/snapshots/', epoch)

            if record_stats:
                with logger.prefix('itr #%d | ' % epoch):
                    self.log_diagnostics(stats)
                    logger.dump_tabular()

        return stats
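A sketch of how a trainer exposing this method might be driven (the factory functions below are hypothetical placeholders, not part of the original project):

trainer = make_trainer()                 # hypothetical factory returning an object with the train() method above
train_data, val_data = make_datasets()   # hypothetical dataset factories
final_stats = trainer.train(train_data, test_dataset=val_data,
                            max_epochs=500, save_step=100,
                            print_step=10, plot_step=50, record_stats=True)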
Example #2
 def __init__(self, env_name, n_envs, envs=None, random_action_p=0, **kwargs):
     super().__init__(**kwargs)
     self.n_envs = n_envs
     if envs is None:
         envs = SubprocVecEnv([make_env(env_name, 1, i, logger.get_snapshot_dir()) for i in range(n_envs)])
     self.envs = envs
     self.random_action_p = random_action_p
Example #3
    def train(
            self,
            dataset,  # main training dataset
            test_dataset,  # main validation dataset
            dummy_dataset,  # dataset containing only data from the current iteration
            joint_training,  # whether training should happen on one dataset at a time or jointly
            max_itr=1000,
            save_step=10,
            train_vae_after_add=10,  # how many times to train the VAE after exploring
            # unused
            plot_step=0,
            record_stats=True,
            print_step=True,
            start_itr=0,
            add_size=0,
            add_interval=0):

        for itr in range(1, max_itr + 1):
            if itr % save_step == 0 and logger.get_snapshot_dir() is not None:
                self.save(logger.get_snapshot_dir() + '/snapshots', itr)
                np.save(logger.get_snapshot_dir() + '/snapshots/traindata',
                        self.dataset.train_data)

            # run mpc + explorer and collect data + stats
            stats = self.train_explorer(dataset, test_dataset, dummy_dataset,
                                        itr)
            with logger.prefix('itr #%d | ' % (itr)):
                self.vae.print_diagnostics(stats)
            record_tabular(stats, 'ex_stats.csv')

            # fit the VAE on newly collected data and replay buffer
            for vae_itr in range(train_vae_after_add):
                if joint_training:
                    vae_stats = self.train_vae_joint(dataset, dummy_dataset,
                                                     test_dataset, itr,
                                                     vae_itr)
                else:
                    # vae_stats = self.train_vae(dummy_dataset, None, itr, vae_itr)
                    # with logger.prefix('itr #%d vae newdata itr #%d | ' % (itr, vae_itr)):
                    #     self.vae.print_diagnostics(vae_stats)
                    # record_tabular(vae_stats, 'new_vae_stats.csv')

                    vae_stats = self.train_vae(dataset, test_dataset, itr,
                                               vae_itr)
                with logger.prefix('itr #%d vae itr #%d | ' % (itr, vae_itr)):
                    self.vae.print_diagnostics(vae_stats)
                record_tabular(vae_stats, 'vae_stats.csv')
Example #4
def record_tabular(stats, csv_file):
    # Append one row of stats to a CSV file in the snapshot directory,
    # writing the header row only when the file is first created.
    file = os.path.join(logger.get_snapshot_dir(), csv_file)
    if not os.path.exists(file):
        with open(file, 'w') as f:
            f.write(','.join(stats.keys()) + '\n')
    with open(file, 'ab') as f:
        np.savetxt(f,
                   np.expand_dims(np.array(list(stats.values())), 0),
                   delimiter=',')
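A minimal usage sketch of the helper above (the stat names and values are hypothetical; it assumes the rllab-style `logger` already has a snapshot directory set):

stats = {'loss': 0.42, 'kl': 0.013}        # hypothetical metrics for one iteration
record_tabular(stats, 'vae_stats.csv')     # appends one row to <snapshot_dir>/vae_stats.csv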
Example #5
def setup_rllab_logging(vv):
    log_dir = trajlogger.get_snapshot_dir()
    tabular_log_file = osp.join(log_dir, 'rllprogress.csv')
    text_log_file = osp.join(log_dir, 'rlldebug.log')
    rllablogger.add_text_output(text_log_file)
    rllablogger.add_tabular_output(tabular_log_file)
    rllablogger.set_snapshot_dir(log_dir)
    rllablogger.set_snapshot_mode("gap")
    rllablogger.set_snapshot_gap(10)
    rllablogger.set_log_tabular_only(False)
    rllablogger.push_prefix("[%s] " % vv['exp_dir'])
    return log_dir
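A sketch of a call site for this setup helper (only the 'exp_dir' key is taken from the example; its value is hypothetical):

vv = {'exp_dir': 'exp_001'}           # hypothetical variant dict
log_dir = setup_rllab_logging(vv)     # points rllab's text/tabular outputs at the traj logger's snapshot dir
print('rllab logging under', log_dir)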
Example #6
 def __init__(self, env_name, n_envs, envs=None, random_action_p=0, ego=False, egoidx=None, **kwargs):
     super().__init__(**kwargs)
     self.n_envs = n_envs
     if envs is None:
         if env_name is None:
             envs = [kwargs['env']() for i in range(n_envs)]
         else:
             envs = [make_env(env_name, int(np.random.randint(0, 10000, 1)), i, logger.get_snapshot_dir())() for i in range(n_envs)]
     self.envs = envs
     self.random_action_p = random_action_p
     self.ego = ego
     self.egoidx = egoidx
Example #7
 def train(self):
     start_time = time.time()
     for itr in range(self.start_itr, self.n_itr):
         itr_start_time = time.time()
         with logger.prefix('itr #%d | ' % itr):
             logger.log("Obtaining samples...")
             sd = self.obtain_samples(itr)
             if self.alter_sd_fn is not None:
                 self.alter_sd_fn(sd, *self.alter_sd_args)
             logger.log("Processing samples...")
             self.process_samples(itr, sd)
             logger.log("Logging diagnostics...")
             self.log_diagnostics(sd['stats'])
             logger.log("Optimizing policy...")
             self.optimize_policy(itr, sd)
             logger.record_tabular('Time', time.time() - start_time)
             logger.record_tabular('ItrTime', time.time() - itr_start_time)
             logger.dump_tabular(with_prefix=False)
         if itr % self.plot_every == 0 and self.plot and itr > self.plot_itr_threshold:
             rollout(self.policy, self.env_obj, self.max_path_length, plot=True)
         if itr % self.save_step == 0 and logger.get_snapshot_dir() is not None:
             self.save(logger.get_snapshot_dir() + '/snapshots', itr)
Example #8
def run_experiment(argv):
    default_log_dir = config.LOG_DIR
    now = datetime.datetime.now(dateutil.tz.tzlocal())

    # avoid name clashes when running distributed jobs
    rand_id = str(uuid.uuid4())[:5]
    timestamp = now.strftime('%Y_%m_%d_%H_%M_%S_%f_%Z')

    default_exp_name = 'experiment_%s_%s' % (timestamp, rand_id)
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--n_parallel',
        type=int,
        default=1,
        help=
        'Number of parallel workers to perform rollouts. 0 => don\'t start any workers'
    )
    parser.add_argument('--exp_name',
                        type=str,
                        default=default_exp_name,
                        help='Name of the experiment.')
    parser.add_argument('--log_dir',
                        type=str,
                        default=None,
                        help='Path to save the log and iteration snapshot.')
    parser.add_argument('--snapshot_mode',
                        type=str,
                        default='all',
                        help='Mode to save the snapshot. Can be either "all" '
                        '(all iterations will be saved), "last" (only '
                        'the last iteration will be saved), or "none" '
                        '(do not save snapshots)')
    parser.add_argument('--snapshot_gap',
                        type=int,
                        default=1,
                        help='Gap between snapshot iterations.')
    parser.add_argument('--tabular_log_file',
                        type=str,
                        default='progress.csv',
                        help='Name of the tabular log file (in csv).')
    parser.add_argument('--text_log_file',
                        type=str,
                        default='debug.log',
                        help='Name of the text log file (in pure text).')
    parser.add_argument('--params_log_file',
                        type=str,
                        default='params.json',
                        help='Name of the parameter log file (in json).')
    parser.add_argument('--variant_log_file',
                        type=str,
                        default='variant.json',
                        help='Name of the variant log file (in json).')
    parser.add_argument(
        '--resume_from',
        type=str,
        default=None,
        help='Name of the pickle file to resume experiment from.')
    parser.add_argument('--plot',
                        type=ast.literal_eval,
                        default=False,
                        help='Whether to plot the iteration results')
    parser.add_argument(
        '--log_tabular_only',
        type=ast.literal_eval,
        default=False,
        help=
        'Whether to only print the tabular log information (in a horizontal format)'
    )
    parser.add_argument('--seed', type=int, help='Random seed for numpy')
    parser.add_argument('--args_data',
                        type=str,
                        help='Pickled data for stub objects')
    parser.add_argument('--variant_data',
                        type=str,
                        help='Pickled data for variant configuration')
    parser.add_argument('--use_cloudpickle',
                        type=ast.literal_eval,
                        default=False)

    args = parser.parse_args(argv[1:])

    if args.seed is not None:
        set_seed(args.seed)

    # if args.n_parallel > 0:
    #     from rllab.sampler import parallel_sampler
    #     parallel_sampler.initialize(n_parallel=args.n_parallel)
    #     if args.seed is not None:
    #         parallel_sampler.set_seed(args.seed)
    #
    # if args.plot:
    #     from rllab.plotter import plotter
    #     plotter.init_worker()

    if args.log_dir is None:
        log_dir = osp.join(default_log_dir, args.exp_name)
    else:
        log_dir = args.log_dir
    tabular_log_file = osp.join(log_dir, args.tabular_log_file)
    text_log_file = osp.join(log_dir, args.text_log_file)
    params_log_file = osp.join(log_dir, args.params_log_file)

    if args.variant_data is not None:
        variant_data = pickle.loads(base64.b64decode(args.variant_data))
        variant_log_file = osp.join(log_dir, args.variant_log_file)
        logger.log_variant(variant_log_file, variant_data)
    else:
        variant_data = None

    if not args.use_cloudpickle:
        logger.log_parameters_lite(params_log_file, args)

    logger.add_text_output(text_log_file)
    logger.add_tabular_output(tabular_log_file)
    prev_snapshot_dir = logger.get_snapshot_dir()
    prev_mode = logger.get_snapshot_mode()
    logger.set_snapshot_dir(log_dir)
    logger.set_snapshot_mode(args.snapshot_mode)
    logger.set_snapshot_gap(args.snapshot_gap)
    logger.set_log_tabular_only(args.log_tabular_only)
    logger.push_prefix("[%s] " % args.exp_name)

    if args.resume_from is not None:
        data = joblib.load(args.resume_from)
        assert 'algo' in data
        algo = data['algo']
        algo.train()
    else:
        # read from stdin
        if args.use_cloudpickle:
            import cloudpickle
            method_call = cloudpickle.loads(base64.b64decode(args.args_data))
            method_call(variant_data)
        else:
            data = pickle.loads(base64.b64decode(args.args_data))
            maybe_iter = concretize(data)
            if is_iterable(maybe_iter):
                for _ in maybe_iter:
                    pass

    logger.set_snapshot_mode(prev_mode)
    logger.set_snapshot_dir(prev_snapshot_dir)
    logger.remove_tabular_output(tabular_log_file)
    logger.remove_text_output(text_log_file)
    logger.pop_prefix()
Example #9
def get_snapshot_dir():
    return logger.get_snapshot_dir() or _snapshot_dir or None
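A minimal sketch of using this fallback wrapper (the module-level `_snapshot_dir` is assumed to be defined elsewhere in the original file; the artifact written here is hypothetical):

import os

out_dir = get_snapshot_dir()
if out_dir is None:
    print('no snapshot directory configured; skipping artifact dump')
else:
    with open(os.path.join(out_dir, 'note.txt'), 'w') as f:   # hypothetical artifact
        f.write('experiment artifacts go here\n')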
Example #10
def run_experiment_here(
        experiment_function,
        exp_prefix="default",
        variant=None,
        exp_id=0,
        seed=0,
        use_gpu=True,
        snapshot_mode='last',
        snapshot_gap=1,
        code_diff=None,
        commit_hash=None,
        script_name=None,
        n_parallel=0,
        base_log_dir=None,
        log_dir=None,
        exp_name=None,
):
    """
    Run an experiment locally without any serialization.

    :param experiment_function: Function. `variant` will be passed in as its
    only argument.
    :param exp_prefix: Experiment prefix for the save file.
    :param variant: Dictionary passed in to `experiment_function`.
    :param exp_id: Experiment ID. Should be unique across all
    experiments. Note that one experiment may correspond to multiple seeds.
    :param seed: Seed used for this experiment.
    :param use_gpu: Run with GPU. Defaults to True.
    :param script_name: Name of the running script
    :param log_dir: If set, set the log directory to this. Otherwise,
    the directory will be auto-generated based on the exp_prefix.
    :return:
    """
    if variant is None:
        variant = {}
    if seed is None and 'seed' not in variant:
        seed = random.randint(0, 100000)
        variant['seed'] = str(seed)
    if n_parallel > 0:
        from rllab.sampler import parallel_sampler
        parallel_sampler.initialize(n_parallel=n_parallel)
        parallel_sampler.set_seed(seed)
    variant['exp_id'] = str(exp_id)
    reset_execution_environment()
    set_seed(seed)
    setup_logger(
        exp_prefix=exp_prefix,
        variant=variant,
        exp_id=exp_id,
        seed=seed,
        snapshot_mode=snapshot_mode,
        snapshot_gap=snapshot_gap,
        base_log_dir=base_log_dir,
        log_dir=log_dir,
        exp_name=exp_name,
    )
    log_dir = logger.get_snapshot_dir()
    if code_diff is not None:
        with open(osp.join(log_dir, "code.diff"), "w") as f:
            f.write(code_diff)
    if commit_hash is not None:
        with open(osp.join(log_dir, "commit_hash.txt"), "w") as f:
            f.write(commit_hash)
    if script_name is not None:
        with open(osp.join(log_dir, "script_name.txt"), "w") as f:
            f.write(script_name)
    set_gpu_mode(use_gpu)

    print('variant', variant)
    return experiment_function(variant)
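A minimal invocation sketch (the experiment body and variant keys are hypothetical; the call assumes the rllab/rlkit-style helpers used by `run_experiment_here` are importable):

def my_experiment(variant):
    # hypothetical experiment body: just report the variant it received
    print('running with variant:', variant)

run_experiment_here(
    my_experiment,
    exp_prefix='demo',
    variant={'lr': 1e-3},     # hypothetical hyperparameters
    seed=1,
    use_gpu=False,
    snapshot_mode='last',
)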
Example #11
    def train_explorer(self, dataset, test_dataset, dummy_dataset, itr):
        bs = self.batch_size

        # load fixed initial state and goals from config
        init_state = self.block_config[0]
        goals = np.array(self.block_config[1])

        # functions for computing the reward and initializing the reward state (rstate)
        # rstate is used to keep track of things such as which goal you are currently on
        reward_fn, init_rstate = self.reward_fn

        # total actual reward collected by MPC agent so far
        total_mpc_rew = np.zeros(self.mpc_batch)

        # keep track of states visited by MPC to initialize the explorer from
        all_inits = []

        # current state of the MPC batch
        cur_state = np.array([init_state] * self.mpc_batch)

        # initialize the reward state for the mpc batch
        rstate = init_rstate(self.mpc_batch)

        # for visualization purposes
        mpc_preds = []
        mpc_actual = []
        mpc_span = []
        rstates = []

        # Perform MPC over max_horizon
        for T in range(self.max_horizon):
            print(T)

            # for goal visualization
            rstates.append(rstate)

            # rollout imaginary trajectories using state decoder
            rollouts = self.mpc(cur_state,
                                min(self.plan_horizon,
                                    self.max_horizon - T), self.mpc_explore,
                                self.mpc_explore_batch, reward_fn, rstate)

            # get first latent of best trajectory for each batch
            np_latents = rollouts[2][:, 0]

            # rollout the first latent in simulator
            mpc_traj = self.sampler_mpc.obtain_samples(self.mpc_batch *
                                                       self.max_path_length,
                                                       self.max_path_length,
                                                       np_to_var(np_latents),
                                                       reset_args=cur_state)

            # update reward and reward state based on trajectory from simulator
            mpc_rew, rstate = self.eval_rewards(mpc_traj['obs'], reward_fn,
                                                rstate)

            # for logging and visualization purposes
            futures = rollouts[0] + total_mpc_rew
            total_mpc_rew += mpc_rew
            mpc_preds.append(rollouts[1][0])
            mpc_span.append(rollouts[3])
            mpc_stats = {
                'mean futures': np.mean(futures),
                'std futures': np.std(futures),
                'mean actual': np.mean(total_mpc_rew),
                'std actual': np.std(total_mpc_rew),
            }
            mpc_actual.append(mpc_traj['obs'][0])
            with logger.prefix('itr #%d mpc step #%d | ' % (itr, T)):
                self.vae.print_diagnostics(mpc_stats)
            record_tabular(mpc_stats, 'mpc_stats.csv')

            # add current state to list of states explorer can initialize from
            all_inits.append(cur_state)

            # update current state to current state of simulator
            cur_state = mpc_traj['obs'][:, -1]

        # for visualization
        for idx, (actual, pred, rs, span) in enumerate(
                zip(mpc_actual, mpc_preds, rstates, mpc_span)):
            dataset.plot_pd_compare(
                [actual, pred, span[:100], span[:100, :dataset.path_len]],
                ['actual', 'pred', 'imagined', 'singlestep'],
                itr,
                save_dir='mpc_match',
                name='Pred' + str(idx),
                goals=goals,
                goalidx=rs[0])

        # compute reward at final state, for some tasks that care about final state reward
        final_reward, _ = reward_fn(cur_state, rstate)
        print(total_mpc_rew)
        print(final_reward)

        # randomly select states for explorer to explore
        start_states = np.concatenate(all_inits, axis=0)
        start_states = start_states[np.random.choice(
            start_states.shape[0],
            self.rand_per_mpc_step,
            replace=self.rand_per_mpc_step > start_states.shape[0])]

        # run the explorer from those states
        explore_len = ((self.max_path_length + 1) * self.mpc_explore_len) - 1
        self.policy_ex_algo.max_path_length = explore_len
        ex_trajs = self.sampler_ex.obtain_samples(start_states.shape[0] *
                                                  explore_len,
                                                  explore_len,
                                                  None,
                                                  reset_args=start_states)

        # Now concat actions taken by explorer with observations for adding to the dataset
        trajs = ex_trajs['obs']
        obs = trajs[:, -1]
        if hasattr(self.action_space,
                   'shape') and len(self.action_space.shape) > 0:
            acts = get_numpy(ex_trajs['actions'])
        else:
            # convert discrete actions into onehot
            act_idx = get_numpy(ex_trajs['actions'])
            acts = np.zeros(
                (trajs.shape[0], trajs.shape[1] - 1, dataset.action_dim))
            acts_reshape = acts.reshape((-1, dataset.action_dim))
            acts_reshape[range(acts_reshape.shape[0]),
                         act_idx.reshape(-1)] = 1.0

        # concat actions with obs
        acts = np.concatenate((acts, acts[:, -1:, :]), 1)
        trajacts = np.concatenate((ex_trajs['obs'], acts), axis=-1)
        trajacts = trajacts.reshape(
            (-1, self.max_path_length + 1, trajacts.shape[-1]))

        # compute train/val split
        ntrain = min(int(0.9 * trajacts.shape[0]),
                     dataset.buffer_size // self.add_frac)
        if dataset.n < dataset.batch_size and ntrain < dataset.batch_size:
            ntrain = dataset.batch_size
        nvalid = min(trajacts.shape[0] - ntrain,
                     test_dataset.buffer_size // self.add_frac)
        if test_dataset.n < test_dataset.batch_size and nvalid < test_dataset.batch_size:
            nvalid = test_dataset.batch_size

        print("Adding ", ntrain, ", Valid: ", nvalid)

        dataset.add_samples(trajacts[:ntrain].reshape((ntrain, -1)))
        test_dataset.add_samples(trajacts[-nvalid:].reshape((nvalid, -1)))

        # dummy dataset stores only data from this iteration
        dummy_dataset.clear()
        dummy_dataset.add_samples(trajacts[:-nvalid].reshape(
            (trajacts.shape[0] - nvalid, -1)))

        # compute negative ELBO on trajectories of explorer
        neg_elbos = []
        cur_batch = from_numpy(trajacts).float()
        for i in range(0, trajacts.shape[0], self.batch_size):
            mse, neg_ll, kl, bcloss, z_dist = self.vae.forward_batch(
                cur_batch[i:i + self.batch_size])
            neg_elbo = (get_numpy(neg_ll) + get_numpy(kl))
            neg_elbos.append(neg_elbo)

        # reward the explorer
        rewards = np.zeros_like(ex_trajs['rewards'])
        neg_elbos = np.concatenate(neg_elbos, axis=0)
        neg_elbos = neg_elbos.reshape((rewards.shape[0], -1))
        # skip the first iteration, since the VAE hasn't been fitted yet
        if itr != 1:
            rewidx = list(
                range(self.max_path_length, explore_len,
                      self.max_path_length + 1)) + [explore_len - 1]
            for i in range(rewards.shape[0]):
                rewards[i, rewidx] = neg_elbos[i]

            # add in true reward to explorer if desired
            if self.true_reward_scale != 0:
                rstate = init_rstate(rewards.shape[0])
                for oidx in range(rewards.shape[1]):
                    r, rstate = reward_fn(ex_trajs['obs'][:, oidx], rstate)
                    rewards[:, oidx] += r * self.true_reward_scale

        ex_trajs['rewards'] = rewards

        # train explorer using PPO with neg elbo
        self.policy_ex_algo.process_samples(
            0, ex_trajs)  #, augment_obs=get_numpy(z))
        if itr != 1:
            self.policy_ex_algo.optimize_policy(0, ex_trajs)
        ex_trajs['stats']['MPC Actual'] = np.mean(total_mpc_rew)
        ex_trajs['stats']['Final Reward'] = np.mean(final_reward)

        # reset explorer if necessary
        if ex_trajs['stats']['Entropy'] < self.reset_ent:
            if hasattr(self.policy_ex, "prob_network"):
                self.policy_ex.prob_network.apply(xavier_init)
            else:
                self.policy_ex.apply(xavier_init)
                self.policy_ex.log_var_network.params_var.data = self.policy_ex.log_var_network.param_init

        # for visualization purposes
        colors = ['purple', 'magenta', 'green', 'black', 'yellow', 'black']
        fig, ax = plt.subplots(3, 2, figsize=(10, 10))
        for i in range(6):
            if i * 2 + 1 < obs.shape[1]:
                axx = ax[i // 2][i % 2]
                if i == 5:
                    axx.scatter(obs[:, -3], obs[:, -2], color=colors[i], s=10)
                else:
                    axx.scatter(obs[:, i * 2],
                                obs[:, i * 2 + 1],
                                color=colors[i],
                                s=10)
                axx.set_xlim(-3, 3)
                axx.set_ylim(-3, 3)
        path = logger.get_snapshot_dir() + '/final_dist'
        if not os.path.exists(path):
            os.makedirs(path)
        plt.savefig('%s/%d.png' % (path, itr))
        np.save(path + "/" + str(itr), obs)

        return ex_trajs['stats']