Example #1
def learn(network, env, seed, total_timesteps=int(40e6), gamma=0.99, log_interval=1, nprocs=32, nsteps=20,
                 ent_coef=0.01, vf_coef=0.5, vf_fisher_coef=1.0, lr=0.25, max_grad_norm=0.5,
                 kfac_clip=0.001, save_interval=None, lrschedule='linear', load_path=None, **network_kwargs):
    set_global_seeds(seed)


    if network == 'cnn':
        network_kwargs['one_dim_bias'] = True

    policy = build_policy(env, network, **network_kwargs)

    nenvs = env.num_envs
    ob_space = env.observation_space
    ac_space = env.action_space
    make_model = lambda: Model(policy, ob_space, ac_space, nenvs, total_timesteps, nprocs=nprocs,
                               nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef,
                               vf_fisher_coef=vf_fisher_coef, lr=lr, max_grad_norm=max_grad_norm,
                               kfac_clip=kfac_clip, lrschedule=lrschedule)
    if save_interval and logger.get_dir():
        import cloudpickle
        with open(osp.join(logger.get_dir(), 'make_model.pkl'), 'wb') as fh:
            fh.write(cloudpickle.dumps(make_model))
    model = make_model()

    if load_path is not None:
        model.load(load_path)

    runner = Runner(env, model, nsteps=nsteps, gamma=gamma)
    nbatch = nenvs*nsteps
    tstart = time.time()
    coord = tf.train.Coordinator()
    enqueue_threads = model.q_runner.create_threads(model.sess, coord=coord, start=True)
    for update in range(1, total_timesteps//nbatch+1):
        obs, states, rewards, masks, actions, values = runner.run()
        policy_loss, value_loss, policy_entropy = model.train(obs, states, rewards, masks, actions, values)
        model.old_obs = obs
        nseconds = time.time()-tstart
        fps = int((update*nbatch)/nseconds)
        if update % log_interval == 0 or update == 1:
            ev = explained_variance(values, rewards)
            logger.record_tabular("nupdates", update)
            logger.record_tabular("total_timesteps", update*nbatch)
            logger.record_tabular("fps", fps)
            logger.record_tabular("policy_entropy", float(policy_entropy))
            logger.record_tabular("policy_loss", float(policy_loss))
            logger.record_tabular("value_loss", float(value_loss))
            logger.record_tabular("explained_variance", float(ev))
            logger.record_tabular("eprewmean", np.nan if len(rewards) == 0 else np.mean(rewards))
            logger.dump_tabular()

        if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir():
            savepath = osp.join(logger.get_dir(), 'checkpoint%.5i'%update)
            print('Saving to', savepath)
            model.save(savepath)
    coord.request_stop()
    coord.join(enqueue_threads)
    return model
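Example #1 serializes the make_model closure with cloudpickle so the model graph can be rebuilt outside the training run. A minimal sketch of reading it back, assuming logdir points at the directory logger.get_dir() used during training (the path and checkpoint name below are placeholders):

import os.path as osp
import cloudpickle

logdir = '/tmp/openai-logs'  # placeholder: use the training run's logger directory
with open(osp.join(logdir, 'make_model.pkl'), 'rb') as fh:
    make_model = cloudpickle.loads(fh.read())
model = make_model()                               # rebuilds the graph with the pickled hyperparameters
model.load(osp.join(logdir, 'checkpoint00001'))    # then restore weights from a saved checkpoint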
Example #2
def learn(
    network,
    env,
    save_path,
    seed=None,
    nsteps=10,
    total_timesteps=int(80e6),
    vf_coef=0.5,
    ent_coef=0.01,
    max_grad_norm=0.5,
    lr=7e-4,
    lrschedule='linear',
    epsilon=1e-5,
    alpha=0.99,
    gamma=0.99,
    # log_interval=100,
    log_interval=10,
    load_path=None,
    **network_kwargs):


    if network == 'cnn':
        network_kwargs['one_dim_bias'] = True
    set_global_seeds(seed)
    assert save_path is not None
    # Get the number of environments
    nenvs = env.num_envs
    policy = build_policy(env, network, **network_kwargs)

    # Instantiate the model object (that creates step_model and train_model)
    model = Model(policy=policy, env=env, nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef,
        max_grad_norm=max_grad_norm, lr=lr, alpha=alpha, epsilon=epsilon, total_timesteps=total_timesteps, lrschedule=lrschedule)
    if load_path is not None:
        model.load(load_path)

    # Instantiate the runner object
    runner = Runner(env, model, nsteps=nsteps, gamma=gamma)

    # Calculate the batch size (nsteps is set to 1 here)
    nbatch = nenvs*nsteps

    observation = []
    action = []
    for update in range(1, total_timesteps//nbatch+1):
        # Get mini batch of experiences
        obs, states, rewards, masks, actions, values, output = runner.run()
        observation.append(obs)
        print('times', update)
    obs = np.concatenate(observation)

    # Compute the Fisher matrix
    FM = model.compute_fisher(obs, plot_diffs=True, disp_freq=10)
    # FM = model.compute_exact_fisher(obs,plot_diffs=True,disp_freq=10)

    joblib.dump(FM, save_path)
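Example #2 estimates a Fisher matrix over the collected observations and serializes it with joblib. A minimal sketch of reading it back for later use, assuming fisher_path matches the save_path passed to learn (the path below is a placeholder):

import joblib

fisher_path = 'fisher_matrix/my_agent'  # placeholder: same path as the save_path argument above
FM = joblib.load(fisher_path)           # same structure that model.compute_fisher returned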
Example #3
def learn(
    network,
    env,
    seed=None,
    nsteps=5,
    total_timesteps=int(80e6),
    vf_coef=0.5,
    ent_coef=0.01,
    max_grad_norm=0.5,
    lr=7e-4,
    lrschedule='linear',
    epsilon=1e-5,
    alpha=0.99,
    gamma=0.99,
    log_interval=100,
    load_path=None,
    **network_kwargs):

    '''
    Main entrypoint for A2C algorithm. Train a policy with given network architecture on a given environment using a2c algorithm.

    Parameters:
    -----------

    network:            policy network architecture. Either string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py for full list)
                        specifying the standard network architecture, or a function that takes tensorflow tensor as input and returns
                        tuple (output_tensor, extra_feed) where output tensor is the last network layer output, extra_feed is None for feed-forward
                        neural nets, and extra_feed is a dictionary describing how to feed state into the network for recurrent neural nets.
                        See baselines.common/policies.py/lstm for more details on using recurrent nets in policies


    env:                RL environment. Should implement interface similar to VecEnv (baselines.common/vec_env) or be wrapped with DummyVecEnv (baselines.common/vec_env/dummy_vec_env.py)


    seed:               seed to make the random number sequence in the algorithm reproducible. By default it is None, which means the seed comes from the system noise generator (not reproducible)

    nsteps:             int, number of steps of the vectorized environment per update (i.e. batch size is nsteps * nenv where
                        nenv is number of environment copies simulated in parallel)

    total_timesteps:    int, total number of timesteps to train on (default: 80M)

    vf_coef:            float, coefficient in front of value function loss in the total loss function (default: 0.5)

    ent_coef:           float, coefficient in front of the policy entropy in the total loss function (default: 0.01)

    max_grad_norm:      float, gradient is clipped to have a global L2 norm no greater than this value (default: 0.5)

    lr:                 float, learning rate for RMSProp (current implementation has RMSProp hardcoded in) (default: 7e-4)

    lrschedule:         schedule of learning rate. Can be 'linear', 'constant', or a function [0..1] -> [0..1] that takes fraction of the training progress as input and
                        returns fraction of the learning rate (specified as lr) as output

    epsilon:            float, RMSProp epsilon (stabilizes square root computation in denominator of RMSProp update) (default: 1e-5)

    alpha:              float, RMSProp decay parameter (default: 0.99)

    gamma:              float, reward discounting parameter (default: 0.99)

    log_interval:       int, specifies how frequently the logs are printed out (default: 100)

    **network_kwargs:   keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network
                        For instance, 'mlp' network architecture has arguments num_hidden and num_layers.

    '''



    set_global_seeds(seed)

    # Get the number of environments
    nenvs = env.num_envs
    policy = build_policy(env, network, **network_kwargs)

    # Instantiate the model object (that creates step_model and train_model)
    model = Model(policy=policy, env=env, nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef,
        max_grad_norm=max_grad_norm, lr=lr, alpha=alpha, epsilon=epsilon, total_timesteps=total_timesteps, lrschedule=lrschedule)
    if load_path is not None:
        model.load(load_path)

    # Instantiate the runner object
    runner = Runner(env, model, nsteps=nsteps, gamma=gamma)
    epinfobuf = deque(maxlen=100)

    # Calculate the batch_size
    nbatch = nenvs*nsteps

    # Start total timer
    tstart = time.time()

    for update in range(1, total_timesteps//nbatch+1):
        # Get mini batch of experiences
        obs, states, rewards, masks, actions, values, epinfos = runner.run()
        epinfobuf.extend(epinfos)

        policy_loss, value_loss, policy_entropy = model.train(obs, states, rewards, masks, actions, values)
        nseconds = time.time()-tstart

        # Calculate the fps (frame per second)
        fps = int((update*nbatch)/nseconds)
        if update % log_interval == 0 or update == 1:
            # Check whether the value function is a good predictor of the returns (ev close to 1)
            # or worse than predicting nothing (ev <= 0)
            ev = explained_variance(values, rewards)
            logger.record_tabular("nupdates", update)
            logger.record_tabular("total_timesteps", update*nbatch)
            logger.record_tabular("fps", fps)
            logger.record_tabular("policy_entropy", float(policy_entropy))
            logger.record_tabular("value_loss", float(value_loss))
            logger.record_tabular("explained_variance", float(ev))
            logger.record_tabular("eprewmean", safemean([epinfo['r'] for epinfo in epinfobuf]))
            logger.record_tabular("eplenmean", safemean([epinfo['l'] for epinfo in epinfobuf]))
            logger.dump_tabular()
    return model
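A minimal usage sketch for the entrypoint above, assuming baselines and gym are installed; the environment id, seed, and timestep budget are placeholders rather than values taken from the example:

import gym
from baselines.common.vec_env.dummy_vec_env import DummyVecEnv

env = DummyVecEnv([lambda: gym.make('CartPole-v1')])  # single-process vectorized env
model = learn(network='mlp', env=env, seed=0, total_timesteps=20000)
env.close()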
Example #4
def learn(network,
          env,
          seed,
          total_timesteps=int(40e6),
          gamma=0.99,
          log_interval=1,
          nprocs=32,
          nsteps=20,
          ent_coef=0.01,
          vf_coef=0.5,
          vf_fisher_coef=1.0,
          lr=0.25,
          max_grad_norm=0.5,
          kfac_clip=0.001,
          save_interval=None,
          lrschedule='linear',
          load_path=None,
          is_async=True,
          **network_kwargs):
    set_global_seeds(seed)

    nenvs = env.num_envs
    ob_space = env.observation_space
    ac_space = env.action_space

    if isinstance(network, str):
        network_type = network
        policy_network_fn = get_network_builder(network_type)(**network_kwargs)
        policy = policy_network_fn(ob_space.shape)

    model = Model(policy,
                  ob_space,
                  ac_space,
                  nenvs,
                  total_timesteps,
                  nprocs=nprocs,
                  nsteps=nsteps,
                  ent_coef=ent_coef,
                  vf_coef=vf_coef,
                  vf_fisher_coef=vf_fisher_coef,
                  lr=lr,
                  max_grad_norm=max_grad_norm,
                  kfac_clip=kfac_clip,
                  lrschedule=lrschedule,
                  is_async=is_async)

    if load_path is not None:
        model.load(load_path)

    runner = Runner(env, model, nsteps=nsteps, gamma=gamma)
    epinfobuf = deque(maxlen=100)
    nbatch = nenvs * nsteps
    tstart = time.time()
    coord = tf.train.Coordinator()
    if is_async:
        # TODO: q_runner
        enqueue_threads = model.q_runner.create_threads(model.sess,
                                                        coord=coord,
                                                        start=True)
    else:
        enqueue_threads = []

    for update in range(1, total_timesteps // nbatch + 1):
        obs, states, rewards, masks, actions, values, epinfos = runner.run()
        epinfobuf.extend(epinfos)

        obs = tf.constant(obs)
        if states is not None:
            states = tf.constant(states)
        rewards = tf.constant(rewards)
        masks = tf.constant(masks)
        actions = tf.constant(actions)
        values = tf.constant(values)

        policy_loss, value_loss, policy_entropy = model.train(
            obs, states, rewards, masks, actions, values)
        model.old_obs = obs
        nseconds = time.time() - tstart
        fps = int((update * nbatch) / nseconds)
        if update % log_interval == 0 or update == 1:
            ev = explained_variance(values, rewards)
            logger.record_tabular("nupdates", update)
            logger.record_tabular("total_timesteps", update * nbatch)
            logger.record_tabular("fps", fps)
            logger.record_tabular("policy_entropy", float(policy_entropy))
            logger.record_tabular("policy_loss", float(policy_loss))
            logger.record_tabular("value_loss", float(value_loss))
            logger.record_tabular("explained_variance", float(ev))
            logger.record_tabular(
                "eprewmean", safemean([epinfo['r'] for epinfo in epinfobuf]))
            logger.record_tabular(
                "eplenmean", safemean([epinfo['l'] for epinfo in epinfobuf]))
            logger.dump_tabular()

        if save_interval and (update % save_interval == 0
                              or update == 1) and logger.get_dir():
            savepath = osp.join(logger.get_dir(), 'checkpoint%.5i' % update)
            print('Saving to', savepath)
            model.save(savepath)
    coord.request_stop()
    coord.join(enqueue_threads)
    return model
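explained_variance above reports how well the value predictions track the empirical returns; in baselines it is essentially 1 - Var(returns - values) / Var(returns). A minimal re-implementation for illustration (not the library function itself):

import numpy as np

def explained_variance_sketch(ypred, y):
    # 1 means perfect prediction; values <= 0 mean no better than predicting a constant
    assert y.ndim == 1 and ypred.ndim == 1
    vary = np.var(y)
    return np.nan if vary == 0 else 1 - np.var(y - ypred) / vary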
Example #5
def learn(network,
          env,
          save_path,
          seed=None,
          total_timesteps=int(40e6),
          gamma=0.99,
          log_interval=1,
          nprocs=32,
          nsteps=20,
          ent_coef=0.01,
          vf_coef=0.5,
          vf_fisher_coef=1.0,
          lr=0.25,
          max_grad_norm=0.5,
          kfac_clip=0.001,
          save_interval=None,
          lrschedule='linear',
          load_path=None,
          is_async=False,
          **network_kwargs):
    set_global_seeds(seed)

    if network == 'cnn':
        network_kwargs['one_dim_bias'] = True

    policy = build_policy(env, network, **network_kwargs)

    nenvs = env.num_envs

    model = Model(policy,
                  nenvs,
                  total_timesteps,
                  nprocs=nprocs,
                  nsteps=nsteps,
                  ent_coef=ent_coef,
                  vf_coef=vf_coef,
                  vf_fisher_coef=vf_fisher_coef,
                  lr=lr,
                  max_grad_norm=max_grad_norm,
                  kfac_clip=kfac_clip,
                  lrschedule=lrschedule,
                  is_async=is_async)

    if load_path is not None:
        model.load(load_path)

    # Instantiate the runner object
    runner = Runner(env, model, nsteps=nsteps, gamma=gamma)

    # Calculate the batch size (nsteps is set to 1 here)
    nbatch = nenvs * nsteps
    print(nbatch)
    tstart = time.time()
    F = []

    for update in range(1, total_timesteps // nbatch + 1):
        obs, states, rewards, masks, actions, values, output = runner.run()
        fisher = model.compute_fisher(obs)
        # f = []
        # l = len(obs)
        # efficient = 0.6
        # weight = np.logspace(l,1,l,base=efficient)
        #
        # for index in range(l):
        #     observation = obs[index]
        #     fisher = model.compute_fisher(observation)
        #     for i,j in enumerate(fisher):
        #         if index == 0:
        #             f.append(weight[index] * fisher[j])
        #         else:
        #             f[i]+=weight[index] * fisher[j]
        # for i in range(len(f)):
        #     f[i] = f[i]/np.sum(weight)
        #
        model.old_obs = obs
        nseconds = time.time() - tstart
        print(update)
        #
        # if update == 1:
        #     for x in f:
        #         F.append(x)
        # else:
        #     for x in range(len(f)):
        #         F[x] += f[x]

        if update == 1:
            for i in fisher:
                F.append(fisher[i])
        else:
            for i, j in enumerate(fisher):
                F[i] += fisher[j]
    for i in range(len(F)):
        F[i] /= total_timesteps
    joblib.dump(F, 'fisher_matrix/simple_agent_random_4000')

    return model
Example #6
def learn(network, env, seed, total_timesteps=int(40e6), gamma=0.99, log_interval=1, nprocs=32, nsteps=20,
                 ent_coef=0.01, vf_coef=0.5, vf_fisher_coef=1.0, lr=0.25, max_grad_norm=0.5,
                 kfac_clip=0.001, save_interval=None, lrschedule='linear', load_path=None, is_async=True, **network_kwargs):
    set_global_seeds(seed)


    if network == 'cnn':
        network_kwargs['one_dim_bias'] = True

    policy = build_policy(env, network, **network_kwargs)

    nenvs = env.num_envs
    ob_space = env.observation_space
    ac_space = env.action_space
    make_model = lambda: Model(policy, ob_space, ac_space, nenvs, total_timesteps, nprocs=nprocs,
                               nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef,
                               vf_fisher_coef=vf_fisher_coef, lr=lr, max_grad_norm=max_grad_norm,
                               kfac_clip=kfac_clip, lrschedule=lrschedule, is_async=is_async)
    if save_interval and logger.get_dir():
        import cloudpickle
        with open(osp.join(logger.get_dir(), 'make_model.pkl'), 'wb') as fh:
            fh.write(cloudpickle.dumps(make_model))
    model = make_model()

    if load_path is not None:
        model.load(load_path)

    runner = Runner(env, model, nsteps=nsteps, gamma=gamma)
    epinfobuf = deque(maxlen=100)
    nbatch = nenvs*nsteps
    tstart = time.time()
    coord = tf.train.Coordinator()
    if is_async:
        enqueue_threads = model.q_runner.create_threads(model.sess, coord=coord, start=True)
    else:
        enqueue_threads = []

    for update in range(1, total_timesteps//nbatch+1):
        obs, states, rewards, masks, actions, values, epinfos = runner.run()
        epinfobuf.extend(epinfos)
        policy_loss, value_loss, policy_entropy = model.train(obs, states, rewards, masks, actions, values)
        model.old_obs = obs
        nseconds = time.time()-tstart
        fps = int((update*nbatch)/nseconds)
        if update % log_interval == 0 or update == 1:
            ev = explained_variance(values, rewards)
            logger.record_tabular("nupdates", update)
            logger.record_tabular("total_timesteps", update*nbatch)
            logger.record_tabular("fps", fps)
            logger.record_tabular("policy_entropy", float(policy_entropy))
            logger.record_tabular("policy_loss", float(policy_loss))
            logger.record_tabular("value_loss", float(value_loss))
            logger.record_tabular("explained_variance", float(ev))
            logger.record_tabular("eprewmean", safemean([epinfo['r'] for epinfo in epinfobuf]))
            logger.record_tabular("eplenmean", safemean([epinfo['l'] for epinfo in epinfobuf]))
            logger.dump_tabular()

        if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir():
            savepath = osp.join(logger.get_dir(), 'checkpoint%.5i'%update)
            print('Saving to', savepath)
            model.save(savepath)
    coord.request_stop()
    coord.join(enqueue_threads)
    return model
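safemean guards the episode-info averages against an empty buffer during the first updates; in baselines it is a small helper that returns NaN for an empty sequence. An equivalent sketch:

import numpy as np

def safemean_sketch(xs):
    # avoid the RuntimeWarning np.mean([]) would emit before any episode has finished
    return np.nan if len(xs) == 0 else np.mean(xs)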
Example #7
            else:
                actions, rew, _, _ = model.step(obs)

            obs, rew, done, _ = env.step(actions)
            episode_rew += rew[0] if isinstance(env, VecEnv) else rew
            episode_rew2 += rew[1] if isinstance(env, VecEnv) else rew
            env.render()
            done = done.any() if isinstance(done, np.ndarray) else done
            if done:
                print('episode_rew={}, \t{}'.format(episode_rew, episode_rew2))
                episode_rew = 0
                episode_rew2 = 0
                obs = env.reset()

    # Instantiate the runner object
    runner = Runner(env, model, nsteps=nsteps, gamma=gamma)
    epinfobuf = deque(maxlen=100)

    # Calculate the batch_size
    nbatch = nenvs * nsteps

    # Start total timer
    tstart = time.time()
    last_rewards = []

    graph_names = ('policy_entropy', 'value_loss', 'policy_loss',
                   'explained_variance', 'rewards_mean',
                   'rewards_min', 'rewards_max', 'rewards_median',
                   'rewards_std', 'values_mean', 'values_min', 'values_max',
                   'values_median', 'values_std')
    graph_data = {k: [] for k in graph_names}
Example #8
def learn(network,
          env,
          seed=None,
          nsteps=5,
          noptions=64,
          top_n_options=8,
          replay_buffer_size=1000,
          total_timesteps=int(80e6),
          start_op_at=0.8,
          options_update_iter=10,
          vf_coef=0.5,
          ent_coef=0.01,
          max_grad_norm=0.5,
          lr=7e-4,
          lrschedule='linear',
          epsilon=1e-5,
          diverse_r_coef=0.1,
          alpha=0.99,
          gamma=0.99,
          log_interval=100,
          load_path=None,
          **network_kwargs):
    '''
    Main entrypoint for VFO algorithm. Train a policy with given network architecture on a given environment using vfo algorithm.

    Parameters:
    -----------

    network:            policy network architecture. Either string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py for full list)
                        specifying the standard network architecture, or a function that takes tensorflow tensor as input and returns
                        tuple (output_tensor, extra_feed) where output tensor is the last network layer output, extra_feed is None for feed-forward
                        neural nets, and extra_feed is a dictionary describing how to feed state into the network for recurrent neural nets.
                        See baselines.common/policies.py/lstm for more details on using recurrent nets in policies


    env:                RL environment. Should implement interface similar to VecEnv (baselines.common/vec_env) or be wrapped with DummyVecEnv (baselines.common/vec_env/dummy_vec_env.py)


    seed:               seed to make the random number sequence in the algorithm reproducible. By default it is None, which means the seed comes from the system noise generator (not reproducible)

    nsteps:             int, number of steps of the vectorized environment per update (i.e. batch size is nsteps * nenv where
                        nenv is number of environment copies simulated in parallel)

    noptions:           int, number of options for VFO, i.e. channels of last Conv layer

    top_n_options:      int, number of top candidate options for the selective option step

    replay_buffer_size: int, size of the replay buffer used to train options

    total_timesteps:    int, total number of timesteps to train on (default: 80M)

    start_op_at:        float, after training the model-free policy for `start_op_at * total_timesteps` steps, begin to train the options policy

    options_update_iter: int, number of calls to train_options per sample

    vf_coef:            float, coefficient in front of value function loss in the total loss function (default: 0.5)

    ent_coef:           float, coefficient in front of the policy entropy in the total loss function (default: 0.01)

    max_grad_norm:      float, gradient is clipped to have a global L2 norm no greater than this value (default: 0.5)

    lr:                 float, learning rate for RMSProp (current implementation has RMSProp hardcoded in) (default: 7e-4)

    lrschedule:         schedule of learning rate. Can be 'linear', 'constant', or a function [0..1] -> [0..1] that takes fraction of the training progress as input and
                        returns fraction of the learning rate (specified as lr) as output

    epsilon:            float, RMSProp epsilon (stabilizes square root computation in denominator of RMSProp update) (default: 1e-5)

    diverse_r_coef:     float, scaling factor for diversity reward when training option policy

    alpha:              float, RMSProp decay parameter (default: 0.99)

    gamma:              float, reward discounting parameter (default: 0.99)

    log_interval:       int, specifies how frequently the logs are printed out (default: 100)

    **network_kwargs:   keyword arguments to the policy / network builder. See baselines.vfo/policies.py/build_policy and arguments to a particular type of network
    '''
    set_global_seeds(seed)

    nenvs = env.num_envs
    policy = build_policy(env, network, noptions, **network_kwargs)
    assert replay_buffer_size > 100, 'Replay buffer is too small'
    replay_buffer = Buffer(env, nsteps, size=replay_buffer_size)

    model = Model(policy=policy,
                  env=env,
                  nsteps=nsteps,
                  ent_coef=ent_coef,
                  vf_coef=vf_coef,
                  max_grad_norm=max_grad_norm,
                  lr=lr,
                  alpha=alpha,
                  epsilon=epsilon,
                  diverse_r_coef=diverse_r_coef,
                  gamma=gamma,
                  total_timesteps=total_timesteps,
                  lrschedule=lrschedule)
    if load_path is not None:
        model.load(load_path)
    runner = Runner(env, model, nsteps=nsteps, gamma=gamma)
    options_runner = OptionsRunner(env,
                                   model,
                                   noptions,
                                   nsteps=nsteps,
                                   gamma=gamma,
                                   use_selective_option=True,
                                   top_n_options=top_n_options)

    nbatch = nenvs * nsteps
    tstart = time.time()
    to_train_options, init_replay_buffer_done = False, False
    total_updates = total_timesteps // nbatch + 1
    for update in range(1, total_updates):
        if update % 300 == 0:
            model.save(os.path.join(logger.get_dir(), "snapshot"))
        if not to_train_options:
            obs, states, rewards, masks, actions, values = runner.run()
            policy_loss, value_loss, policy_entropy = model.train(
                obs, states, rewards, masks, actions, values)
            nseconds = time.time() - tstart
            fps = int((update * nbatch) / nseconds)
            if update % log_interval == 0 or update == 1:
                ev = explained_variance(values, rewards)
                logger.record_tabular("nupdates", update)
                logger.record_tabular("total_timesteps", update * nbatch)
                logger.record_tabular("fps", fps)
                logger.record_tabular("policy_entropy", float(policy_entropy))
                logger.record_tabular("value_loss", float(value_loss))
                logger.record_tabular("policy_loss", float(policy_loss))
                logger.record_tabular("explained_variance", float(ev))
                logger.dump_tabular()

            if update > total_updates * start_op_at:
                to_train_options = True
        else:
            obs, next_obs, states, next_states, masks, next_masks, actions, \
                actions_full, rewards, values, dones, options_z = \
                options_runner.run()
            replay_buffer.put(obs, next_obs, states, next_states, masks,
                              next_masks, actions, actions_full, dones,
                              options_z)

            options_runner.sample_option_z(prior=model.prior_op_z)

            if replay_buffer.num_in_buffer > 100:
                init_replay_buffer_done = True

            if not init_replay_buffer_done:
                logger.info('Sample data using option policy...')
                continue

            policy_loss, value_loss, policy_entropy = model.train(
                obs, states, rewards, masks, actions, values)

            for _ in range(options_update_iter):
                obs, next_obs, states, next_states, masks, next_masks, \
                    actions, actions_full, dones, options_z = \
                    replay_buffer.get()
                distillation_loss_value = model.distill_mf_to_options(
                    obs, states, masks)
                record_loss_values = model.train_options(
                    obs, next_obs, states, next_states, masks, next_masks,
                    actions, actions_full, dones, options_z)
                record_loss_values.append(
                    ('distillation_loss', distillation_loss_value))

            nseconds = time.time() - tstart
            fps = int((update * nbatch) / nseconds)
            if update % log_interval == 0 or update == 1:
                ev = explained_variance(values, rewards)
                logger.record_tabular("nupdates", update)
                logger.record_tabular("total_timesteps", update * nbatch)
                logger.record_tabular("fps", fps)
                logger.record_tabular("policy_entropy", float(policy_entropy))
                logger.record_tabular("value_loss", float(value_loss))
                logger.record_tabular("policy_loss", float(policy_loss))
                logger.record_tabular("explained_variance", float(ev))
                for loss_name, loss_value in record_loss_values:
                    logger.record_tabular(loss_name, loss_value)
                logger.dump_tabular()

    env.close()
    return model
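The start_op_at parameter above splits training into a model-free phase followed by an options phase. A minimal sketch of where that switch happens, restating the arithmetic from the example in a standalone helper:

def options_switch_update(total_timesteps, nenvs, nsteps, start_op_at):
    # number of the update after which Example #8 flips to_train_options to True
    nbatch = nenvs * nsteps
    total_updates = total_timesteps // nbatch + 1
    return int(total_updates * start_op_at)

# e.g. options_switch_update(int(80e6), 16, 5, 0.8) -> 800000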
Example #9
def learn(network,
          env,
          seed=None,
          nsteps=5,
          total_timesteps=int(80e6),
          vf_coef=0.5,
          ent_coef=0.01,
          max_grad_norm=0.5,
          lr=7e-4,
          lrschedule='linear',
          epsilon=1e-5,
          alpha=0.99,
          gamma=0.99,
          log_interval=100,
          load_path=None,
          **network_kwargs):
    '''
    Main entrypoint for the A2C algorithm. Train a policy with a given network architecture on a given environment using the a2c algorithm.

    Parameters:
    -----------

    network:            policy network architecture. Either a string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py for the full list)
                        specifying a standard network architecture, or a function that takes a tensorflow tensor as input and returns a
                        tuple (output_tensor, extra_feed) where output_tensor is the last network layer output; extra_feed is None for feed-forward
                        neural nets, and for recurrent neural nets it is a dictionary describing how to feed state into the network.
                        See baselines.common/policies.py/lstm for more details on using recurrent nets in policies.

    env:                RL environment. Should implement an interface similar to VecEnv (baselines.common/vec_env) or be wrapped with DummyVecEnv (baselines.common/vec_env/dummy_vec_env.py)

    seed:               seed to make the random number sequence in the algorithm reproducible. By default it is None, which means the seed comes from the system noise generator (not reproducible)

    nsteps:             int, number of steps of the vectorized environment per update (i.e. batch size is nsteps * nenv where
                        nenv is the number of environment copies simulated in parallel)

    total_timesteps:    int, total number of timesteps to train on (default: 80M)

    vf_coef:            float, coefficient in front of the value function loss in the total loss function (default: 0.5)

    ent_coef:           float, coefficient in front of the policy entropy in the total loss function (default: 0.01)

    max_grad_norm:      float, gradient is clipped to have a global L2 norm no greater than this value (default: 0.5)

    lr:                 float, learning rate for RMSProp (the current implementation has RMSProp hardcoded in) (default: 7e-4)

    lrschedule:         schedule of the learning rate. Can be 'linear', 'constant', or a function [0..1] -> [0..1] that takes the fraction of training progress as input and
                        returns the fraction of the learning rate (specified as lr) as output

    epsilon:            float, RMSProp epsilon (stabilizes the square root computation in the denominator of the RMSProp update) (default: 1e-5)

    alpha:              float, RMSProp decay parameter (default: 0.99)

    gamma:              float, reward discounting parameter (default: 0.99)

    log_interval:       int, specifies how frequently the logs are printed out (default: 100)

    **network_kwargs:   keyword arguments for the policy / network builder. See baselines.common/policies.py/build_policy and the arguments to a particular type of network.
                        For instance, the 'mlp' network architecture has arguments num_hidden and num_layers.

    '''

    set_global_seeds(seed)

    # Get the number of environments
    nenvs = env.num_envs
    policy = build_policy(env, network, **network_kwargs)

    # Instantiate the model object (that creates step_model and train_model)
    model = Model(policy=policy,
                  env=env,
                  nsteps=nsteps,
                  ent_coef=ent_coef,
                  vf_coef=vf_coef,
                  max_grad_norm=max_grad_norm,
                  lr=lr,
                  alpha=alpha,
                  epsilon=epsilon,
                  total_timesteps=total_timesteps,
                  lrschedule=lrschedule)

    if load_path is not None:
        model.load(load_path)

    # Instantiate the runner object
    runner = Runner(env, model, nsteps=nsteps, gamma=gamma)
    epinfobuf = deque(maxlen=100)

    # Calculate the batch_size
    nbatch = nenvs * nsteps

    # Start total timer
    tstart = time.time()

    for update in range(1, total_timesteps // nbatch + 1):
        # Get mini batch of experiences
        obs, states, rewards, masks, actions, values, epinfos = runner.run()
        epinfobuf.extend(epinfos)

        policy_loss, value_loss, policy_entropy = model.train(
            obs, states, rewards, masks, actions, values)
        nseconds = time.time() - tstart

        # Calculate the fps (frames per second)
        fps = int((update * nbatch) / nseconds)

        if update % log_interval == 0 or update == 1:
            # Check whether the value function is a good predictor of the returns (ev close to 1)
            # or worse than predicting nothing (ev <= 0)
            ev = explained_variance(values, rewards)
            logger.record_tabular("nupdates", update)
            logger.record_tabular("total_timesteps", update * nbatch)
            logger.record_tabular("fps", fps)
            logger.record_tabular("policy_entropy", float(policy_entropy))
            logger.record_tabular("value_loss", float(value_loss))
            logger.record_tabular("explained_variance", float(ev))
            logger.record_tabular(
                "eprewmean", safemean([epinfo['r'] for epinfo in epinfobuf]))
            logger.record_tabular(
                "eplenmean", safemean([epinfo['l'] for epinfo in epinfobuf]))
            logger.dump_tabular()

    return model
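The docstring above notes that lrschedule may also be a callable mapping [0..1] to [0..1]. A minimal sketch of a custom schedule, assuming the baselines Scheduler convention in which the callable receives the fraction of training completed (the built-in 'linear' schedule returns 1 - progress):

def quadratic_lrschedule(progress):
    # hypothetical custom schedule: decays the learning-rate fraction faster than 'linear'
    return (1.0 - progress) ** 2

# model = learn(network='mlp', env=env, lrschedule=quadratic_lrschedule)  # usage sketch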
Example #10
def learn(network,
          env,
          seed,
          total_timesteps=int(40e6),
          gamma=0.99,
          log_interval=100,
          nprocs=32,
          nsteps=20,
          ent_coef=0.01,
          vf_coef=0.5,
          vf_fisher_coef=1.0,
          lr=0.25,
          max_grad_norm=0.5,
          kfac_clip=0.001,
          eval_env=None,
          save_interval=None,
          lrschedule='linear',
          load_path=None,
          is_async=True,
          augment=False,
          **network_kwargs):
    set_global_seeds(seed)

    if network == 'cnn':
        network_kwargs['one_dim_bias'] = True

    policy = build_policy(env, network, **network_kwargs)

    nenvs = env.num_envs
    ob_space = env.observation_space
    ac_space = env.action_space
    make_model = lambda: Model(policy,
                               ob_space,
                               ac_space,
                               nenvs,
                               total_timesteps,
                               nprocs=nprocs,
                               nsteps=nsteps,
                               ent_coef=ent_coef,
                               vf_coef=vf_coef,
                               vf_fisher_coef=vf_fisher_coef,
                               lr=lr,
                               max_grad_norm=max_grad_norm,
                               kfac_clip=kfac_clip,
                               lrschedule=lrschedule,
                               is_async=is_async)
    if save_interval and logger.get_dir():
        import cloudpickle
        with open(osp.join(logger.get_dir(), 'make_model.pkl'), 'wb') as fh:
            fh.write(cloudpickle.dumps(make_model))
    model = make_model()

    if load_path is not None:
        model.load(load_path)

    runner = Runner(env, model, nsteps=nsteps, gamma=gamma, augment=augment)
    epinfobuf = deque(maxlen=100)

    if eval_env is not None:
        eval_runner = Runner(env=eval_env,
                             model=model,
                             nsteps=nsteps,
                             gamma=gamma)
        eval_epinfobuf = deque(maxlen=100)

    nbatch = nenvs * nsteps
    tstart = time.time()
    coord = tf.train.Coordinator()
    if is_async:
        enqueue_threads = model.q_runner.create_threads(model.sess,
                                                        coord=coord,
                                                        start=True)
    else:
        enqueue_threads = []

    best_rew = float('-inf')
    for update in range(1, total_timesteps // nbatch + 1):
        obs, states, rewards, masks, actions, values, epinfos = runner.run()
        epinfobuf.extend(epinfos)
        if eval_env is not None:
            eval_obs, eval_states, eval_returns, eval_masks, eval_actions, eval_values, eval_epinfos = eval_runner.run()  # pylint: disable=E0632
            eval_epinfobuf.extend(eval_epinfos)

        policy_loss, value_loss, policy_entropy = model.train(
            obs, states, rewards, masks, actions, values)
        model.old_obs = obs
        nseconds = time.time() - tstart
        fps = int((update * nbatch) / nseconds)
        if update % log_interval == 0 or update == 1:
            ev = explained_variance(values, rewards)
            logger.record_tabular("nupdates", update)
            logger.record_tabular("total_timesteps", update * nbatch)
            logger.record_tabular("fps", fps)
            logger.record_tabular("policy_entropy", float(policy_entropy))
            logger.record_tabular("policy_loss", float(policy_loss))
            logger.record_tabular("value_loss", float(value_loss))
            logger.record_tabular("explained_variance", float(ev))
            logger.record_tabular(
                "eprewmean", safemean([epinfo['r'] for epinfo in epinfobuf]))
            logger.record_tabular(
                "eplenmean", safemean([epinfo['l'] for epinfo in epinfobuf]))
            if eval_env is not None:
                logger.record_tabular(
                    'eval_eprewmean',
                    safemean([epinfo['r'] for epinfo in eval_epinfobuf]))
                logger.record_tabular(
                    'eval_eplenmean',
                    safemean([epinfo['l'] for epinfo in eval_epinfobuf]))
            logger.dump_tabular()
            eprewmean = safemean([epinfo['r'] for epinfo in epinfobuf])
            if eprewmean > best_rew and logger.get_dir():
                best_rew = eprewmean
                checkdir = osp.join(logger.get_dir(), 'checkpoints')
                os.makedirs(checkdir, exist_ok=True)
                savepath = osp.join(checkdir, 'best.ckpt')
                print(f"Best model w/ rew {best_rew}. Saving to", savepath)
                model.save(savepath)
    coord.request_stop()
    coord.join(enqueue_threads)
    return model
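Example #10 keeps the best-scoring weights in checkpoints/best.ckpt under the logger directory; a later run can restore them through the load_path argument. A minimal sketch with a placeholder log directory:

import os.path as osp

logdir = '/tmp/openai-logs'  # placeholder: the logger.get_dir() of the training run
best_ckpt = osp.join(logdir, 'checkpoints', 'best.ckpt')
# model = learn(network='cnn', env=env, seed=0, load_path=best_ckpt)  # resume from the best weights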
Example #11
def learn(network,
          env,
          seed,
          env_id=None,
          total_timesteps=int(40e6),
          gamma=0.99,
          log_interval=100,
          nprocs=32,
          nsteps=20,
          ent_coef=0.01,
          vf_coef=0.5,
          vf_fisher_coef=1.0,
          lr=0.25,
          max_grad_norm=0.5,
          kfac_clip=0.001,
          save_interval=None,
          save_path=None,
          lrschedule='linear',
          load_path=None,
          is_async=True,
          **network_kwargs):

    info_env = gym.make(env_id)
    algo = 'acktr'
    # wandb.init(project="floorplan_generator", name=algo)
    # wandb.config.algo = algo
    # # wandb.config.action_space = info_env.action_type
    # wandb.config.step_size = info_env.step_size
    #wandb.config.active_rewards = info_env.active_rewards
    #print("\n \n \n \n \n HI21 \n \n \n \n \n")
    if network == 'cnn':
        network_kwargs['one_dim_bias'] = True

    policy = build_policy(env, network, **network_kwargs)
    #print("\n \n \n \n \n HI22 \n \n \n \n \n")

    nenvs = env.num_envs
    ob_space = env.observation_space
    ac_space = env.action_space
    make_model = lambda: Model(policy,
                               ob_space,
                               ac_space,
                               nenvs,
                               total_timesteps,
                               nprocs=nprocs,
                               nsteps=nsteps,
                               ent_coef=ent_coef,
                               vf_coef=vf_coef,
                               vf_fisher_coef=vf_fisher_coef,
                               lr=lr,
                               max_grad_norm=max_grad_norm,
                               kfac_clip=kfac_clip,
                               lrschedule=lrschedule,
                               is_async=is_async)
    # if save_interval and logger.get_dir():
    #     import cloudpickle
    #     print(osp.join(logger.get_dir(), 'make_model.pkl'))
    #     with open(osp.join(logger.get_dir(), 'make_model.pkl'), 'wb+') as fh:
    #         print(make_model)
    #         fh.write(cloudpickle.dumps(make_model))
    model = make_model()
    #print("\n \n \n \n \n HI23 \n \n \n \n \n")

    if load_path is not None:
        model.load(load_path)

    runner = Runner(env, model, nsteps=nsteps, gamma=gamma)
    epinfobuf = deque(maxlen=100)
    nbatch = nenvs * nsteps
    tstart = time.time()
    coord = tf.train.Coordinator()
    if is_async:
        enqueue_threads = model.q_runner.create_threads(model.sess,
                                                        coord=coord,
                                                        start=True)
    else:
        enqueue_threads = []
    #print("\n \n \n \n \n HI24 \n \n \n \n \n")

    for update in range(1, total_timesteps // nbatch + 1):
        #print("step1")
        obs, states, rewards, masks, actions, values, epinfos = runner.run()
        #print("step2")
        epinfobuf.extend(epinfos)
        #print("step3")
        policy_loss, value_loss, policy_entropy = model.train(
            obs, states, rewards, masks, actions, values)
        #print("step4")
        model.old_obs = obs
        #print("step5")
        nseconds = time.time() - tstart
        #print("step6")
        fps = int((update * nbatch) / nseconds)

        if update % log_interval == 0 or update == 1:
            # images = env.get_images()
            # image = images[0]
            # writer.add_image('imresult', image, update, dataformats='HWC')
            ev = explained_variance(values, rewards)
            logger.record_tabular("nupdates", update)
            logger.record_tabular("total_timesteps", update * nbatch)
            logger.record_tabular("fps", fps)
            logger.record_tabular("policy_entropy", float(policy_entropy))
            logger.record_tabular("policy_loss", float(policy_loss))
            logger.record_tabular("value_loss", float(value_loss))
            logger.record_tabular("explained_variance", float(ev))
            logger.record_tabular(
                "eprewmean", safemean([epinfo['r'] for epinfo in epinfobuf]))
            logger.record_tabular(
                "eplenmean", safemean([epinfo['l'] for epinfo in epinfobuf]))
            logger.dump_tabular()

            # wandb.log({'eprewmean': safemean([epinfo['r'] for epinfo in epinfobuf]),
            #         'eplenmean': safemean([epinfo['l'] for epinfo in epinfobuf])})

        if save_interval and (update % save_interval == 0
                              or update == 1) and logger.get_dir():
            # Prefer the user-supplied save_path; otherwise fall back to a numbered checkpoint
            savepath = save_path if save_path is not None else osp.join(logger.get_dir(), 'checkpoint%.5i' % update)
            print('Saving to', savepath)
            model.save(savepath)

    coord.request_stop()
    coord.join(enqueue_threads)
    return model
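Example #11 leaves its wandb experiment-tracking calls commented out. A minimal sketch of enabling them, reusing the project name and metrics from the commented code (assumes wandb is installed and logged in):

import wandb

wandb.init(project="floorplan_generator", name=algo)
# inside the logging branch of the training loop:
# wandb.log({'eprewmean': safemean([epinfo['r'] for epinfo in epinfobuf]),
#            'eplenmean': safemean([epinfo['l'] for epinfo in epinfobuf])})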