Example #1
    def train_step(self, N,
                   env=None,
                   sample_mode='trajectories',
                   horizon=1e6,
                   gamma=0.995,
                   gae_lambda=0.97,
                   num_cpu='max',
                   env_kwargs=None,
                   ):

        # Clean up input arguments
        env = self.env.env_id if env is None else env
        if sample_mode != 'trajectories' and sample_mode != 'samples':
            print("sample_mode in NPG must be either 'trajectories' or 'samples'")
            quit()

        ts = timer.time()

        if sample_mode == 'trajectories':
            input_dict = dict(num_traj=N, env=env, policy=self.policy, horizon=horizon,
                              base_seed=self.seed, num_cpu=num_cpu, env_kwargs=env_kwargs)
            paths = trajectory_sampler.sample_paths(**input_dict)
        elif sample_mode == 'samples':
            input_dict = dict(num_samples=N, env=env, policy=self.policy, horizon=horizon,
                              base_seed=self.seed, num_cpu=num_cpu, env_kwargs=env_kwargs)
            paths = trajectory_sampler.sample_data_batch(**input_dict)

        if self.save_logs:
            self.logger.log_kv('time_sampling', timer.time() - ts)

        self.seed = self.seed + N if self.seed is not None else self.seed

        # compute returns
        process_samples.compute_returns(paths, gamma)
        # compute advantages
        process_samples.compute_advantages(paths, self.baseline, gamma, gae_lambda)
        # train from paths
        eval_statistics = self.train_from_paths(paths)
        eval_statistics.append(N)
        # log number of samples
        if self.save_logs:
            num_samples = np.sum([p["rewards"].shape[0] for p in paths])
            self.logger.log_kv('num_samples', num_samples)
        # fit baseline
        if self.save_logs:
            ts = timer.time()
            error_before, error_after = self.baseline.fit(paths, return_errors=True)
            self.logger.log_kv('time_VF', timer.time()-ts)
            self.logger.log_kv('VF_error_before', error_before)
            self.logger.log_kv('VF_error_after', error_after)
        else:
            self.baseline.fit(paths)

        return eval_statistics
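
A minimal usage sketch for the train_step above: the constructors and module paths follow mjrl's typical layout and are assumptions, as are the environment id and hyperparameters; treat it as illustrative rather than the repository's exact setup.

# Hypothetical driver for the train_step above (module paths assumed from mjrl's usual layout).
from mjrl.utils.gym_env import GymEnv
from mjrl.policies.gaussian_mlp import MLP
from mjrl.baselines.mlp_baseline import MLPBaseline
from mjrl.algos.npg_cg import NPG

e = GymEnv('Swimmer-v2')  # illustrative environment id
policy = MLP(e.spec, hidden_sizes=(32, 32), seed=123)
baseline = MLPBaseline(e.spec)
agent = NPG(e, policy, baseline, normalized_step_size=0.1, seed=123, save_logs=True)

# One policy update from 10 sampled trajectories.
stats = agent.train_step(N=10, sample_mode='trajectories',
                         gamma=0.995, gae_lambda=0.97, num_cpu=1)
print("mean sampled return:", stats[0])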
Example #2
def train_agent(
        job_name,
        agent,
        seed=0,
        niter=101,
        gamma=0.995,
        gae_lambda=None,
        num_cpu=1,
        sample_mode='trajectories',
        num_traj=50,
        num_samples=50000,  # has precedence, used with sample_mode = 'samples'
        save_freq=10,
        evaluation_rollouts=None,
        plot_keys=None,
        irl_kwargs=None,
        env_kwargs=None,
        temperature_decay=0.95,
        temperature_min=0,
        temperature_max=0,
        training_folder='Runs',
        should_fresh_start=False,
        run_no=None,
        fixed_evaluation_init_states=False):

    np.random.seed(seed)
    print("Job name:", job_name)
    training_path = os.path.join(training_folder, job_name)
    if plot_keys is None:
        plot_keys = ['stoc_pol_mean']
    if run_no is not None:
        training_path = check_run_folders(training_path, run_no)
    if not os.path.isdir(training_path):
        os.makedirs(training_path)
    previous_dir = os.getcwd()
    os.chdir(training_path)  # important! we are now in the directory to save data
    if not os.path.isdir('iterations'):
        os.mkdir('iterations')
    if not os.path.isdir('logs') and agent.save_logs:
        os.mkdir('logs')
    best_policy = copy.deepcopy(agent.policy)
    mean_evaluation_pol_performance = 0.0
    if isinstance(env_kwargs, dict):
        e = GymEnv(agent.env.env_id, **env_kwargs)
    else:
        e = GymEnv(agent.env.env_id)

    i_start = _load_latest_policy_and_logs(
        agent,
        policy_dir='iterations',
        logs_dir='logs',
        should_fresh_start=should_fresh_start)
    train_curve = agent.global_status['best_perf'] * np.ones(niter)

    def save_progress():
        if agent.save_logs:
            agent.logger.save_log('logs/')
            make_train_plots(log=agent.logger.log,
                             keys=plot_keys,
                             save_loc='logs/')
        checkpoint_file = 'checkpoint_%i.pickle' % i
        pickle.dump(agent.checkpoint,
                    open('iterations/' + checkpoint_file, 'wb'))
        # check if agent has custom save_checkpoint function defined, if so use it
        save_checkpoint_funct = getattr(agent, "save_checkpoint", None)
        if save_checkpoint_funct:
            save_checkpoint_funct(path='iterations/', iteration=i)
        pickle.dump(best_policy, open('iterations/best_policy.pickle', 'wb'))

    if i_start:
        print("Resuming from an existing job folder ...")

    for i in range(i_start, niter):
        print(
            "......................................................................................"
        )
        if run_no is not None:
            print("ITERATION : %i, RUN : %i " % (i, run_no))
        else:
            print("ITERATION : %i " % i)

        new_temperature = (temperature_max - temperature_min) * (
            temperature_decay**i) + temperature_min
        if new_temperature < 0 or temperature_max == 0:
            new_temperature = 0
        agent.policy.set_temperature(new_temperature)
        if agent.save_logs:
            agent.logger.log_kv('temperature', new_temperature)
        if train_curve[i - 1] > agent.global_status['best_perf']:
            best_policy = copy.deepcopy(agent.policy)
            agent.global_status['best_perf'] = train_curve[i - 1]

        N = num_traj if sample_mode == 'trajectories' else num_samples
        args = dict(N=N,
                    itr=i,
                    sample_mode=sample_mode,
                    gamma=gamma,
                    gae_lambda=gae_lambda,
                    num_cpu=num_cpu,
                    env_kwargs=env_kwargs)
        # calculate no. of policy updates (used for IRL)
        policy_updates_count = calculate_policy_update_count(i, irl_kwargs)
        if irl_kwargs is not None:
            args['return_paths'] = True
        sampler_paths = []
        # do policy update
        for j in range(policy_updates_count):
            output = agent.train_step(**args)
            if isinstance(output, tuple):
                sampler_paths.extend(output[1])
                stats = output[0]
            else:
                stats = output
            if j == 0:
                train_curve[i] = stats[0]
            else:
                train_curve[i] = train_curve[i] + (1 / (1 + j) *
                                                   (stats[0] - train_curve[i]))

        if agent.save_logs:
            agent.logger.log_kv('iteration', i)

        # IRL discriminator update
        if irl_kwargs is not None:
            agent.fit_irl(sampler_paths,
                          main_loop_step=i,
                          main_loop_percentage=i / niter,
                          num_cpu=num_cpu,
                          policy_updates_count=policy_updates_count)

        if evaluation_rollouts is not None and evaluation_rollouts > 0:
            print("Performing evaluation rollouts ........")
            eval_paths = sample_paths(
                num_traj=evaluation_rollouts,
                policy=agent.policy,
                num_cpu=num_cpu,
                env=e.env_id,
                eval_mode=True,
                base_seed=seed,
                env_kwargs=env_kwargs,
                fixed_init_states=fixed_evaluation_init_states)
            if hasattr(agent, "irl_model"):
                eval_paths = agent.eval_irl(eval_paths,
                                            training_paths_from_policy=False)
            mean_evaluation_pol_performance = np.mean(
                [np.sum(path['rewards']) for path in eval_paths])
            if agent.save_logs:
                agent.logger.log_kv('eval_score',
                                    mean_evaluation_pol_performance)
                eval_success_rate = e.env.evaluate_success(eval_paths)
                agent.logger.log_kv('eval_success_rate', eval_success_rate)

        if agent.save_logs:
            agent.logger.align_rows()

        if i % save_freq == 0 and i > 0:
            save_progress()

        # print results to console
        if i == 0:
            result_file = open('results.txt', 'w')
            print("Iter | Stoc Pol | Mean Pol | Best (Stoc) \n")
            result_file.write(
                "Iter | Sampling Pol | Evaluation Pol | Best (Sampled) \n")
            result_file.close()
        print("[ %s ] %4i %5.2f %5.2f %5.2f " %
              (timer.asctime(timer.localtime(timer.time())), i, train_curve[i],
               mean_evaluation_pol_performance,
               agent.global_status['best_perf']))
        result_file = open('results.txt', 'a')
        result_file.write("%4i %5.2f %5.2f %5.2f \n" %
                          (i, train_curve[i], mean_evaluation_pol_performance,
                           agent.global_status['best_perf']))
        result_file.close()
        if agent.save_logs:
            print_data = sorted(
                filter(lambda v: np.asarray(v[1]).size == 1,
                       agent.logger.get_current_log().items()))
            print(tabulate(print_data))

    # final save
    if i_start < niter:
        save_progress()
    else:
        print(
            "Requested iteration number equal to the found checkpoint iteration count. All done, exiting."
        )

    os.chdir(previous_dir)
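
A hedged sketch of calling this train_agent variant: the agent is assumed to be constructed as in the sketch after Example #1, and the job name, folder, and keyword values are illustrative, chosen only to exercise the parameters in the signature above.

# Hypothetical call; `agent` built as in the sketch after Example #1.
train_agent(job_name='swimmer_npg_exp',
            agent=agent,
            seed=123,
            niter=101,
            gamma=0.995,
            gae_lambda=0.97,
            num_cpu=1,
            sample_mode='trajectories',
            num_traj=50,
            save_freq=10,
            evaluation_rollouts=5,
            plot_keys=['stoc_pol_mean', 'eval_score'],
            training_folder='Runs')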
Example #3
    seed=SEED,
    # hvp_sample_frac=job_data['hvp_frac'],
    normalized_step_size=job_data['step_size'],
    save_logs=True)
paths = []

for outer_iter in range(job_data['num_iter']):

    ts = timer.time()
    print("================> ITERATION : %i " % outer_iter)
    print("Getting interaction data from real dynamics ...")

    if outer_iter == 0:
        iter_paths = trajectory_sampler.sample_paths(job_data['n_init_paths'],
                                                     agent.env,
                                                     agent.policy,
                                                     eval_mode=False,
                                                     base_seed=SEED)
    else:
        iter_paths = sample_paths(num_traj=job_data['paths_per_iter'],
                                  env=agent.env,
                                  policy=agent.policy,
                                  eval_mode=False,
                                  base_seed=SEED + outer_iter)

    # reset the environment (good for hardware)
    e.reset()

    for p in iter_paths:
        paths.append(p)
Example #4
def train_agent(
    job_name,
    agent,
    seed=0,
    niter=101,
    gamma=0.995,
    gae_lambda=None,
    num_cpu=1,
    sample_mode='trajectories',
    num_traj=50,
    num_samples=50000,  # has precedence, used with sample_mode = 'samples'
    save_freq=10,
    evaluation_rollouts=None,
    plot_keys=['stoc_pol_mean'],
):

    np.random.seed(seed)
    if not os.path.isdir(job_name):
        os.mkdir(job_name)
    previous_dir = os.getcwd()
    os.chdir(job_name)  # important! we are now in the directory to save data
    if not os.path.isdir('iterations'):
        os.mkdir('iterations')
    if not os.path.isdir('logs') and agent.save_logs:
        os.mkdir('logs')
    best_policy = copy.deepcopy(agent.policy)
    best_perf = -1e8
    train_curve = best_perf * np.ones(niter)
    mean_pol_perf = 0.0
    e = GymEnv(agent.env.env_id)

    for i in range(niter):
        print(
            "......................................................................................"
        )
        print("ITERATION : %i " % i)

        if train_curve[i - 1] > best_perf:
            best_policy = copy.deepcopy(agent.policy)
            best_perf = train_curve[i - 1]

        N = num_traj if sample_mode == 'trajectories' else num_samples
        args = dict(N=N,
                    sample_mode=sample_mode,
                    gamma=gamma,
                    gae_lambda=gae_lambda,
                    num_cpu=num_cpu)
        stats = agent.train_step(**args)
        train_curve[i] = stats[0]

        if evaluation_rollouts is not None and evaluation_rollouts > 0:
            print("Performing evaluation rollouts ........")
            eval_paths = sample_paths(num_traj=evaluation_rollouts,
                                      policy=agent.policy,
                                      num_cpu=num_cpu,
                                      env=e.env_id,
                                      eval_mode=True,
                                      base_seed=seed)
            mean_pol_perf = np.mean(
                [np.sum(path['rewards']) for path in eval_paths])
            if agent.save_logs:
                agent.logger.log_kv('eval_score', mean_pol_perf)

        if i % save_freq == 0 and i > 0:
            if agent.save_logs:
                agent.logger.save_log('logs/')
                make_train_plots(log=agent.logger.log,
                                 keys=plot_keys,
                                 save_loc='logs/')
            policy_file = 'policy_%i.pickle' % i
            baseline_file = 'baseline_%i.pickle' % i
            pickle.dump(agent.policy, open('iterations/' + policy_file, 'wb'))
            pickle.dump(agent.baseline,
                        open('iterations/' + baseline_file, 'wb'))
            pickle.dump(best_policy, open('iterations/best_policy.pickle',
                                          'wb'))

        # print results to console
        if i == 0:
            result_file = open('results.txt', 'w')
            print("Iter | Stoc Pol | Mean Pol | Best (Stoc) \n")
            result_file.write(
                "Iter | Sampling Pol | Evaluation Pol | Best (Sampled) \n")
            result_file.close()
        print("[ %s ] %4i %5.2f %5.2f %5.2f " %
              (timer.asctime(timer.localtime(
                  timer.time())), i, train_curve[i], mean_pol_perf, best_perf))
        result_file = open('results.txt', 'a')
        result_file.write("%4i %5.2f %5.2f %5.2f \n" %
                          (i, train_curve[i], mean_pol_perf, best_perf))
        result_file.close()
        if agent.save_logs:
            print_data = sorted(
                filter(lambda v: np.asarray(v[1]).size == 1,
                       agent.logger.get_current_log().items()))
            print(tabulate(print_data))

    # final save
    pickle.dump(best_policy, open('iterations/best_policy.pickle', 'wb'))
    if agent.save_logs:
        agent.logger.save_log('logs/')
        make_train_plots(log=agent.logger.log,
                         keys=plot_keys,
                         save_loc='logs/')
    os.chdir(previous_dir)
Example #5
            save_freq=5,
            evaluation_rollouts=None)
print("========================================")
print("Expert policy training complete !!!")
print("========================================")
print("time taken = %f" % (timer.time() - ts))
print("========================================")

# ------------------------------
# Get demonstrations
print("========================================")
print("Collecting expert demonstrations")
print("========================================")
expert_pol = pickle.load(
    open('swimmer_exp1/iterations/best_policy.pickle', 'rb'))
demo_paths = sample_paths(num_traj=5, policy=expert_pol, env=e.env_id)

# ------------------------------
# Train BC
policy = MLP(e.spec, hidden_sizes=(32, 32), seed=SEED)
bc_agent = BC(demo_paths, policy=policy, epochs=20, batch_size=64,
              lr=1e-3)  # will use Adam by default
ts = timer.time()
print("========================================")
print("Running BC with expert demonstrations")
print("========================================")
bc_agent.train()
print("========================================")
print("BC training complete !!!")
print("time taken = %f" % (timer.time() - ts))
print("========================================")
Example #6
    def train_step(
        self,
        N,
        itr,
        env=None,
        sample_mode='trajectories',
        horizon=1e6,
        gamma=0.995,
        gae_lambda=0.97,
        num_cpu='max',
        env_kwargs=None,
        return_paths=False,
    ):

        # Clean up input arguments
        env = self.env.env_id if env is None else env
        if sample_mode != 'trajectories' and sample_mode != 'samples':
            print(
                "sample_mode in NPG must be either 'trajectories' or 'samples'"
            )
            quit()

        ts = timer.time()

        if sample_mode == 'trajectories':
            input_dict = dict(num_traj=N,
                              env=env,
                              policy=self.policy,
                              horizon=horizon,
                              base_seed=self.seed,
                              num_cpu=num_cpu,
                              env_kwargs=env_kwargs)
            paths = trajectory_sampler.sample_paths(**input_dict)
            if self.augmentation is not None:
                paths = self.augmentation.augment_paths(
                    paths,
                    num_cpu=num_cpu,
                    augment_times=self.direct_learning_augment_samples_count)
        elif sample_mode == 'samples':
            input_dict = dict(num_samples=N,
                              env=env,
                              policy=self.policy,
                              horizon=horizon,
                              base_seed=self.seed,
                              num_cpu=num_cpu,
                              env_kwargs=env_kwargs)
            paths = trajectory_sampler.sample_data_batch(**input_dict)
        else:
            raise ValueError(
                "sample_mode has to be either trajectories or samples, given:",
                sample_mode)

        if self.save_logs:
            self.logger.log_kv('time_sampling', timer.time() - ts)

        self.seed = self.seed + N if self.seed is not None else self.seed

        if return_paths:
            original_paths = paths.copy()
        if self.dump_paths:
            self.fusion.save_itr_paths(itr=itr, paths=paths)
        if hasattr(self, "irl_model"):
            paths = self.eval_irl(paths)
            if hasattr(self, "demo_paths") and self.demo_paths is not None:
                self.demo_paths = self.eval_irl(
                    self.demo_paths, training_paths_from_policy=False)
        # compute returns
        process_samples.compute_returns(paths, gamma)
        # compute advantages
        process_samples.compute_advantages(paths, self.baseline, gamma,
                                           gae_lambda)
        # train from paths
        eval_statistics = self.train_from_paths(paths)
        eval_statistics.append(N)
        # log number of samples
        if self.save_logs:
            num_samples = np.sum([p["rewards"].shape[0] for p in paths])
            self.logger.log_kv('num_samples', num_samples)
        # fit baseline
        if self.save_logs:
            ts = timer.time()
            error_before, error_after = self.baseline.fit(paths,
                                                          return_errors=True)
            self.logger.log_kv('time_VF', timer.time() - ts)
            self.logger.log_kv('VF_error_before', error_before)
            self.logger.log_kv('VF_error_after', error_after)
        else:
            self.baseline.fit(paths)

        if return_paths:
            return eval_statistics, original_paths
        else:
            return eval_statistics
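
A brief hedged sketch of how return_paths=True is meant to be consumed by this variant, mirroring the policy-update loop in Example #2; the agent construction and hyperparameter values are assumed, not taken from the source.

# Hypothetical call against the train_step variant above.
output = agent.train_step(N=20, itr=0, sample_mode='trajectories',
                          gamma=0.995, gae_lambda=0.97,
                          num_cpu=1, return_paths=True)
stats, raw_paths = output  # raw_paths are the unprocessed rollouts (pre-IRL reward replacement)
print("mean sampled return:", stats[0])
# raw_paths can then feed a discriminator update, e.g. agent.fit_irl(...) as in Example #2.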