Example 1
def save_progress():
    # Relies on names defined in the enclosing training scope:
    # agent, i, plot_keys, best_policy, pickle, and make_train_plots.
    if agent.save_logs:
        agent.logger.save_log('logs/')
        make_train_plots(log=agent.logger.log,
                         keys=plot_keys,
                         save_loc='logs/')
    checkpoint_file = 'checkpoint_%i.pickle' % i
    pickle.dump(agent.checkpoint,
                open('iterations/' + checkpoint_file, 'wb'))
    # If the agent defines a custom save_checkpoint function, use it.
    save_checkpoint_funct = getattr(agent, "save_checkpoint", None)
    if save_checkpoint_funct:
        save_checkpoint_funct(path='iterations/', iteration=i)
    pickle.dump(best_policy, open('iterations/best_policy.pickle', 'wb'))
Example 2
def train_agent(
    job_name,
    agent,
    seed=0,
    niter=101,
    gamma=0.995,
    gae_lambda=None,
    num_cpu=1,
    sample_mode='trajectories',
    num_traj=50,
    num_samples=50000,  # used when sample_mode == 'samples'
    save_freq=10,
    evaluation_rollouts=None,
    plot_keys=['stoc_pol_mean'],
):

    np.random.seed(seed)
    if not os.path.isdir(job_name):
        os.mkdir(job_name)
    previous_dir = os.getcwd()
    os.chdir(job_name)  # important! we are now in the directory to save data
    if not os.path.isdir('iterations'):
        os.mkdir('iterations')
    if agent.save_logs and not os.path.isdir('logs'):
        os.mkdir('logs')
    best_policy = copy.deepcopy(agent.policy)
    best_perf = -1e8
    train_curve = best_perf * np.ones(niter)
    mean_pol_perf = 0.0
    e = GymEnv(agent.env.env_id)

    for i in range(niter):
        print(
            "......................................................................................"
        )
        print("ITERATION : %i " % i)
        if train_curve[i - 1] > best_perf:
            best_policy = copy.deepcopy(agent.policy)
            best_perf = train_curve[i - 1]
        N = num_traj if sample_mode == 'trajectories' else num_samples
        args = dict(N=N,
                    sample_mode=sample_mode,
                    gamma=gamma,
                    gae_lambda=gae_lambda,
                    num_cpu=num_cpu)
        stats = agent.train_step(**args)
        train_curve[i] = stats[0]
        if evaluation_rollouts is not None and evaluation_rollouts > 0:
            print("Performing evaluation rollouts ........")
            eval_paths = sample_paths_parallel(N=evaluation_rollouts,
                                               policy=agent.policy,
                                               num_cpu=num_cpu,
                                               env_name=e.env_id,
                                               mode='evaluation',
                                               pegasus_seed=seed)
            mean_pol_perf = np.mean(
                [np.sum(path['rewards']) for path in eval_paths])
            if agent.save_logs:
                agent.logger.log_kv('eval_score', mean_pol_perf)
        if i % save_freq == 0 and i > 0:
            if agent.save_logs:
                agent.logger.save_log('logs/')
                make_train_plots(log=agent.logger.log,
                                 keys=plot_keys,
                                 save_loc='logs/')
            policy_file = 'policy_%i.pickle' % i
            baseline_file = 'baseline_%i.pickle' % i
            pickle.dump(agent.policy, open('iterations/' + policy_file, 'wb'))
            pickle.dump(agent.baseline,
                        open('iterations/' + baseline_file, 'wb'))
            pickle.dump(best_policy, open('iterations/best_policy.pickle',
                                          'wb'))
        # print results to console
        if i == 0:
            result_file = open('results.txt', 'w')
            print("Iter | Stoc Pol | Mean Pol | Best (Stoc) \n")
            result_file.write(
                "Iter | Sampling Pol | Evaluation Pol | Best (Sampled) \n")
            result_file.close()
        print("[ %s ] %4i %5.2f %5.2f %5.2f " %
              (timer.asctime(timer.localtime(
                  timer.time())), i, train_curve[i], mean_pol_perf, best_perf))
        result_file = open('results.txt', 'a')
        result_file.write("%4i %5.2f %5.2f %5.2f \n" %
                          (i, train_curve[i], mean_pol_perf, best_perf))
        result_file.close()
        if agent.save_logs:
            print_data = sorted(
                filter(lambda v: np.asarray(v[1]).size == 1,
                       agent.logger.get_current_log().items()))
            print(tabulate(print_data))

    # final save
    pickle.dump(best_policy, open('iterations/best_policy.pickle', 'wb'))
    if agent.save_logs:
        agent.logger.save_log('logs/')
        make_train_plots(log=agent.logger.log,
                         keys=plot_keys,
                         save_loc='logs/')
    os.chdir(previous_dir)
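
A minimal invocation sketch for the train_agent above. The make_agent() helper is hypothetical and stands in for constructing an mjrl agent (policy, baseline, and algorithm) with save_logs enabled; every keyword argument shown is a parameter of train_agent itself.

agent = make_agent()  # hypothetical helper, not part of the example above
train_agent(job_name='example_job',
            agent=agent,
            seed=0,
            niter=101,
            sample_mode='trajectories',
            num_traj=50,
            save_freq=10,
            evaluation_rollouts=5,
            plot_keys=['stoc_pol_mean'])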
Example 3
    if outer_iter > 0 and outer_iter % job_data['save_freq'] == 0:
        # convert to CPU before pickling
        agent.to('cpu')
        pickle.dump(
            agent, open(OUT_DIR + '/agent_' + str(outer_iter) + '.pickle',
                        'wb'))
        pickle.dump(
            policy,
            open(OUT_DIR + '/policy_' + str(outer_iter) + '.pickle', 'wb'))
        agent.to(job_data['device'])
        if job_data['device_path'] is not None:
            pickle.dump(
                exp_data,
                open(OUT_DIR + '/data_' + str(outer_iter) + '.pickle', 'wb'))

    tf = timer.time()
    logger.log_kv('iter_time', tf - ts)
    print_data = sorted(
        filter(lambda v: np.asarray(v[1]).size == 1,
               logger.get_current_log().items()))
    print(tabulate(print_data))
    logger.save_log(OUT_DIR + '/')
    make_train_plots(
        log=logger.log,
        keys=['rollout_score', 'eval_score', 'rollout_metric', 'eval_metric'],
        save_loc=OUT_DIR + '/')

# final save
pickle.dump(agent, open(OUT_DIR + '/agent_final.pickle', 'wb'))
pickle.dump(policy, open(OUT_DIR + '/policy_final.pickle', 'wb'))
Example 4
def train_agent(job_name, agent,
                seed = 0,
                niter = 101,
                gamma = 0.995,
                gae_lambda = None,
                num_cpu = 1,
                sample_mode = 'trajectories',
                num_traj = 50,
                num_samples = 50000, # used when sample_mode == 'samples'
                save_freq = 10,
                evaluation_rollouts = None,
                plot_keys = ['stoc_pol_mean'],
                ):

    np.random.seed(seed)
    if not os.path.isdir(job_name):
        os.mkdir(job_name)
    previous_dir = os.getcwd()
    os.chdir(job_name) # important! we are now in the directory to save data
    if not os.path.isdir('iterations'):
        os.mkdir('iterations')
    if agent.save_logs and not os.path.isdir('logs'):
        os.mkdir('logs')
    best_policy = copy.deepcopy(agent.policy)
    best_perf = -1e8
    train_curve = best_perf*np.ones(niter)
    mean_pol_perf = 0.0
    e = GymEnv(agent.env.env_id)

    for i in range(niter):
        print("......................................................................................")
        print("ITERATION : %i " % i)
        if train_curve[i-1] > best_perf:
            best_policy = copy.deepcopy(agent.policy)
            best_perf = train_curve[i-1]
        N = num_traj if sample_mode == 'trajectories' else num_samples
        args = dict(N=N, sample_mode=sample_mode, gamma=gamma, gae_lambda=gae_lambda, num_cpu=num_cpu)
        stats = agent.train_step(**args)
        train_curve[i] = stats[0]
        if evaluation_rollouts is not None and evaluation_rollouts > 0:
            print("Performing evaluation rollouts ........")
            eval_paths = sample_paths_parallel(N=evaluation_rollouts, policy=agent.policy, num_cpu=num_cpu,
                                               env_name=e.env_id, mode='evaluation', pegasus_seed=seed)
            mean_pol_perf = np.mean([np.sum(path['rewards']) for path in eval_paths])
            if agent.save_logs:
                agent.logger.log_kv('eval_score', mean_pol_perf)
        if i % save_freq == 0 and i > 0:
            if agent.save_logs:
                agent.logger.save_log('logs/')
                make_train_plots(log=agent.logger.log, keys=plot_keys, save_loc='logs/')
            policy_file = 'policy_%i.pickle' % i
            baseline_file = 'baseline_%i.pickle' % i
            pickle.dump(agent.policy, open('iterations/' + policy_file, 'wb'))
            pickle.dump(agent.baseline, open('iterations/' + baseline_file, 'wb'))
            pickle.dump(best_policy, open('iterations/best_policy.pickle', 'wb'))
        # print results to console
        if i == 0:
            result_file = open('results.txt', 'w')
            print("Iter | Stoc Pol | Mean Pol | Best (Stoc) \n")
            result_file.write("Iter | Sampling Pol | Evaluation Pol | Best (Sampled) \n")
            result_file.close()
        print("[ %s ] %4i %5.2f %5.2f %5.2f " % (timer.asctime(timer.localtime(timer.time())),
                                                 i, train_curve[i], mean_pol_perf, best_perf))
        result_file = open('results.txt', 'a')
        result_file.write("%4i %5.2f %5.2f %5.2f \n" % (i, train_curve[i], mean_pol_perf, best_perf))
        result_file.close()
        if agent.save_logs:
            print_data = sorted(filter(lambda v: np.asarray(v[1]).size == 1,
                                       agent.logger.get_current_log().items()))
            print(tabulate(print_data))

    # final save
    pickle.dump(best_policy, open('iterations/best_policy.pickle', 'wb'))
    if agent.save_logs:
        agent.logger.save_log('logs/')
        make_train_plots(log=agent.logger.log, keys=plot_keys, save_loc='logs/')
    os.chdir(previous_dir)
Example 5
def train_loop(job_name,
               agent,
               save_dir,
               seed=0,
               niter=101,
               gamma=0.995,
               gae_lambda=None,
               num_cpu=1,
               sample_mode='trajectories',
               num_samples=None,
               save_freq=10,
               evaluation_rollouts=None,
               plot_keys=['stoc_pol_mean']):
    """Trains the given agent and saves the resultant policies.

    Args:
        job_name: The title of the job.
        agent: The MJRL agent to train.
        save_dir: The directory to save the trained policies and logs to.
        seed: The seed for np.random.
        niter: The number of iterations.
        gamma: Discount factor.
        gae_lambda: The GAE lambda parameter; it multiplies the discount factor
            when computing advantages.
        num_cpu: Number of CPUs used for the train step and sampling.
        sample_mode: One of 'trajectories' or 'samples'.
        num_samples: Sampling budget per iteration: the number of trajectories
            in 'trajectories' mode, or the number of environment samples in
            'samples' mode.
        save_freq: How often (in iterations) the policies and logs are saved.
        evaluation_rollouts: The number of evaluation rollouts to perform per iteration.
        plot_keys: The keys plotted on the training plot.
    """
    # Validate parameters.
    if not os.path.isdir(save_dir):
        raise ValueError('Save directory {} does not exist'.format(save_dir))
    if sample_mode not in ['trajectories', 'samples']:
        raise ValueError('Invalid sample mode: {}'.format(sample_mode))

    # Choose a default for num_samples if not specified.
    if num_samples is None:
        num_samples = 50 if sample_mode == 'trajectories' else 50000

    # Initialize the folders in the save directory.
    iterations_dir = os.path.join(save_dir, 'iterations')
    if not os.path.isdir(iterations_dir):
        os.mkdir(iterations_dir)
    logs_dir = os.path.join(save_dir, 'logs')
    if agent.save_logs and not os.path.isdir(logs_dir):
        os.mkdir(logs_dir)

    # Initialize results log file.
    results_path = os.path.join(save_dir, 'results.txt')
    open(results_path, 'w').close()

    # Initialize training variables.
    np.random.seed(seed)
    best_policy = copy.deepcopy(agent.policy)
    best_perf = -1e8
    train_curve = best_perf * np.ones(niter)
    mean_pol_perf = 0.0

    # Tensorboard logging is disabled here; an empty placeholder is passed to
    # _log_performance instead.
    # tb_logger = tensorboard.get_prefixed(job_name)
    tb_logger = []
    # print('Starting training for job: {}'.format(job_name))

    for i in range(niter):
        print('.' * 80 + '\nITERATION : {}'.format(i))

        if train_curve[i - 1] > best_perf:
            best_policy = copy.deepcopy(agent.policy)
            best_perf = train_curve[i - 1]

        stats = agent.train_step(
            N=num_samples,
            sample_mode=sample_mode,
            gamma=gamma,
            gae_lambda=gae_lambda,
            num_cpu=num_cpu,
        )
        train_curve[i] = stats[0]

        if evaluation_rollouts is not None and evaluation_rollouts > 0:
            print('Performing evaluation rollouts ........')
            mean_pol_perf = _evaluation_rollout(agent, evaluation_rollouts,
                                                num_cpu)
            if agent.save_logs:
                agent.logger.log_kv('eval_score', mean_pol_perf)

        if i % save_freq == 0 and i > 0:
            _save_policy(agent.policy, 'policy_{}'.format(i), iterations_dir)
            _save_policy(agent.baseline, 'baseline_{}'.format(i),
                         iterations_dir)
            _save_policy(best_policy, 'best_policy', iterations_dir)
            if agent.save_logs:
                agent.logger.save_log(logs_dir)
                make_train_plots(log=agent.logger.log,
                                 keys=plot_keys,
                                 save_loc=logs_dir)

        _log_performance(i, train_curve[i], mean_pol_perf, best_perf,
                         results_path, tb_logger)
        if agent.save_logs:
            print_data = sorted(
                filter(lambda v: np.asarray(v[1]).size == 1,
                       agent.logger.get_current_log().items()))
            print(tabulate(print_data))

    # Save the final best policy.
    _save_policy(best_policy, 'best_policy', iterations_dir)
    if agent.save_logs:
        agent.logger.save_log(logs_dir)
        make_train_plots(log=agent.logger.log,
                         keys=plot_keys,
                         save_loc=logs_dir)
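
A possible calling sketch for the train_loop above; make_agent() is again a hypothetical stand-in, and train_loop requires that save_dir already exists before the call.

save_dir = 'runs/example_job'  # illustrative path
os.makedirs(save_dir, exist_ok=True)
train_loop(job_name='example_job',
           agent=make_agent(),  # hypothetical helper
           save_dir=save_dir,
           sample_mode='samples',
           num_samples=50000,
           evaluation_rollouts=5)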
Example 6
        pickle.dump(
            agent,
            open(OUT_DIR + '/iterations/agent_' + str(outer_iter) + '.pickle',
                 'wb'))
        pickle.dump(
            policy,
            open(OUT_DIR + '/iterations/policy_' + str(outer_iter) + '.pickle',
                 'wb'))
        pickle.dump(best_policy,
                    open(OUT_DIR + '/iterations/best_policy.pickle', 'wb'))
        agent.to(job_data['device'])

    tf = timer.time()
    logger.log_kv('eval_log_time', tf - t3)
    logger.log_kv('iter_time', tf - ts)
    print_data = sorted(
        filter(lambda v: np.asarray(v[1]).size == 1,
               logger.get_current_log().items()))
    print(tabulate(print_data))
    logger.save_log(OUT_DIR + '/logs')
    make_train_plots(
        log=logger.log,
        keys=['rollout_score', 'eval_score', 'rollout_metric', 'eval_metric'],
        x_scale=float(job_data['act_repeat']),
        y_scale=1.0,
        save_loc=OUT_DIR + '/logs/')

# final save
pickle.dump(agent, open(OUT_DIR + '/iterations/agent_final.pickle', 'wb'))
pickle.dump(policy, open(OUT_DIR + '/iterations/policy_final.pickle', 'wb'))
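
For completeness, a hedged sketch of reloading one of the pickled artifacts written above; OUT_DIR is whatever directory the surrounding script defined.

with open(OUT_DIR + '/iterations/policy_final.pickle', 'rb') as f:
    policy = pickle.load(f)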