Example #1
def setup_logger(logdir, locals_):
    # Configure output directory for logging
    logz.configure_output_dir(logdir)
    # Log experimental parameters
    args = inspect.getargspec(train)[0]
    params = {k: locals_[k] if k in locals_ else None for k in args}
    logz.save_hyperparams(params)
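A usage sketch (hypothetical, not from the source): since setup_logger inspects the signature of a function literally named train and relies on the local logz module from this codebase, it would typically be called at the top of that function:

def train(exp_name, env_name, n_iter, seed, logdir):
    # Capture locals() right away so every call argument is recorded as a hyperparameter
    setup_logger(logdir, locals())
    # ... the training loop would follow here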
Example #2
def get_env(env_name, exp_name, seed):
    env = gym.make(env_name)

    set_global_seeds(seed)
    env.seed(seed)

    # Set Up Logger
    logdir = 'dqn_' + exp_name + '_' + env_name + '_' + time.strftime("%d-%m-%Y_%H-%M-%S")
    logdir = osp.join('data', logdir)
    logdir = osp.join(logdir, '%d'%seed)
    logz.configure_output_dir(logdir)
    hyperparams = {'exp_name': exp_name, 'env_name': env_name}
    logz.save_hyperparams(hyperparams)

    expt_dir = '/tmp/hw3_vid_dir/'
    env = wrappers.Monitor(env, osp.join(expt_dir, "gym"), force=True, video_callable=False)

    return env
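A minimal call sketch (the environment id and experiment name are illustrative, not from the source; it assumes the homework's local modules such as logz and set_global_seeds are importable, and the older gym API used above):

env = get_env('PongNoFrameskip-v4', 'test_run', seed=0)
obs = env.reset()
print(env.observation_space, env.action_space)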
Example #3
def get_env(env_name, exp_name, seed):
    env = gym.make(env_name)

    set_global_seeds(seed)
    env.seed(seed)

    # Set Up Logger
    logdir = 'dqn_' + exp_name + '_' + env_name + '_' + time.strftime(
        "%d-%m-%Y_%H-%M-%S")
    logdir = osp.join('data', logdir)
    logdir = osp.join(logdir, '%d' % seed)
    logz.configure_output_dir(logdir)
    hyperparams = {'exp_name': exp_name, 'env_name': env_name}
    logz.save_hyperparams(hyperparams)

    expt_dir = '/tmp/hw3_vid_dir2/'
    env = wrappers.Monitor(env, osp.join(expt_dir, "gym"), force=True)
    env = wrap_deepmind(env)
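    # (Assumption: wrap_deepmind applies the usual DeepMind-style Atari
    #  preprocessing from the course utilities, e.g. reward clipping and
    #  84x84 grayscale frames; check its definition in the homework code.)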
    # observation = env.reset()
    # print('observation shape', observation.shape)

    return env
def train_PG(exp_name, env_name, n_iter, gamma, min_timesteps_per_batch,
             max_path_length, learning_rate, reward_to_go, animate, logdir,
             normalize_advantages, nn_baseline, seed, n_layers, size):
    start = time.time()
    # ==================
    # SETUP LOGGER
    # ==================
    locals_ = locals()
    # Configure output directory for logging
    logz.configure_output_dir(logdir)
    # Log experimental parameters
    args = inspect.getargspec(train_PG)[0]
    hyperparams = {k: locals_[k] if k in locals_ else None for k in args}
    logz.save_hyperparams(hyperparams)

    # ==================
    # SETUP ENV
    # ==================
    # Make gym env
    env = gym.make(env_name)
    # Set random seeds (torch, numpy, and the environment)
    torch.manual_seed(seed)
    np.random.seed(seed)
    env.seed(seed)
    # Maximum length for episodes
    max_path_length = max_path_length or env.spec.max_episode_steps
    # Find out if the env is continuous or discrete
    discrete = isinstance(env.action_space, gym.spaces.Discrete)
    # Observation and action sizes
    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.n if discrete else env.action_space.shape[0]
    # ==================
    # INITIALIZE AGENT
    # ==================
    neural_network_args = {
        'n_layers': n_layers,
        'ob_dim': ob_dim,
        'ac_dim': ac_dim,
        'discrete': discrete,
        'size': size,
        'learning_rate': learning_rate,
    }
    sample_trajectory_args = {
        'animate': animate,
        'max_path_length': max_path_length,
        'min_timesteps_per_batch': min_timesteps_per_batch,
    }
    estimate_return_args = {
        'gamma': gamma,
        'reward_to_go': reward_to_go,
        'nn_baseline': nn_baseline,
        'normalize_advantages': normalize_advantages,
    }
    agent = Agent(neural_network_args, sample_trajectory_args,
                  estimate_return_args)

    # ==================
    # TRAINING LOOP
    # ==================
    total_timesteps = 0
    for itr in range(n_iter):
        print("********** Iteration %i ************" % itr)
        with torch.no_grad():
            # Step 1: Sample trajectories from the current policy (neural network)
            paths, timesteps_this_batch = agent.sample_trajectories(itr, env)
        total_timesteps += timesteps_this_batch
        # Step 2: Compute the returns (Q values and advantages) for this batch; batch_size = total timesteps across all paths
        ob_no = np.concatenate([path["observation"]
                                for path in paths])  # shape: (batch_size, ob_dim)
        ac_na = np.concatenate([path["action"]
                                for path in paths])  # shape: (batch_size, ac_dim)
        re_n = [
            path["reward"] for path in paths
        ]  # length num_paths; each entry is a numpy array of that path's rewards
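        # (Assumed behavior of agent.estimate_return, based on its arguments:
        #  with reward_to_go=True it uses Q_t = sum_{t'>=t} gamma^(t'-t) * r_{t'},
        #  otherwise the full discounted trajectory return; adv_n subtracts the
        #  nn_baseline prediction if enabled and is normalized to zero mean and
        #  unit std when normalize_advantages is set.)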
        with torch.no_grad():
            q_n, adv_n = agent.estimate_return(ob_no, re_n)
        # Step 3: Update the policy parameters using the policy gradient
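        # (The update follows the standard policy gradient estimator,
        #  grad J(theta) ~ (1/N) * sum_i sum_t grad log pi_theta(a_it | s_it) * A_it,
        #  where A_it is the advantage estimate computed above.)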
        agent.update_parameters(ob_no, ac_na, q_n, adv_n)

        # Log diagnostics
        returns = [path["reward"].sum() for path in paths]
        ep_lengths = [len(path["reward"]) for path in paths]
        logz.log_tabular("Time", time.time() - start)
        logz.log_tabular("Iteration", itr)
        logz.log_tabular("AverageReturn", np.mean(returns))
        logz.log_tabular("StdReturn", np.std(returns))
        logz.log_tabular("MaxReturn", np.max(returns))
        logz.log_tabular("MinReturn", np.min(returns))
        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
        logz.log_tabular("EpLenStd", np.std(ep_lengths))
        logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
        logz.log_tabular("TimestepsSoFar", total_timesteps)
        logz.dump_tabular()
        logz.save_pytorch_model(agent)
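A hedged invocation sketch (all hyperparameter values below are illustrative, not taken from the source; it assumes the CartPole-v0 gym environment and the logz and Agent modules from this codebase):

if __name__ == '__main__':
    train_PG(exp_name='vpg_cartpole',
             env_name='CartPole-v0',
             n_iter=100,
             gamma=0.99,
             min_timesteps_per_batch=1000,
             max_path_length=None,
             learning_rate=5e-3,
             reward_to_go=True,
             animate=False,
             logdir='data/vpg_cartpole/1',
             normalize_advantages=True,
             nn_baseline=False,
             seed=1,
             n_layers=2,
             size=64)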