Example 1
def learn(policy, env, seed, nsteps=20, nstack=4, total_timesteps=int(80e6), q_coef=0.5, ent_coef=0.01,
          max_grad_norm=10, lr=7e-4, lrschedule='linear', rprop_epsilon=1e-5, rprop_alpha=0.99, gamma=0.99,
          log_interval=100, buffer_size=50000, replay_ratio=4, replay_start=10000, c=10.0,
          trust_region=True, alpha=0.99, delta=1):
    print("Running Acer Simple")
    print(locals())
    tf.reset_default_graph()
    set_global_seeds(seed)

    nenvs = env.num_envs
    ob_space = env.observation_space
    ac_space = env.action_space
    num_procs = len(env.remotes) # HACK
    model = Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nenvs=nenvs, nsteps=nsteps, nstack=nstack,
                  num_procs=num_procs, ent_coef=ent_coef, q_coef=q_coef, gamma=gamma,
                  max_grad_norm=max_grad_norm, lr=lr, rprop_alpha=rprop_alpha, rprop_epsilon=rprop_epsilon,
                  total_timesteps=total_timesteps, lrschedule=lrschedule, c=c,
                  trust_region=trust_region, alpha=alpha, delta=delta)

    runner = Runner(env=env, model=model, nsteps=nsteps, nstack=nstack)
    if replay_ratio > 0:
        buffer = Buffer(env=env, nsteps=nsteps, nstack=nstack, size=buffer_size)
    else:
        buffer = None
    nbatch = nenvs*nsteps
    acer = Acer(runner, model, buffer, log_interval)
    acer.tstart = time.time()
    for acer.steps in range(0, total_timesteps, nbatch): #nbatch samples, 1 on_policy call and multiple off-policy calls
        acer.call(on_policy=True)
        if replay_ratio > 0 and buffer.has_atleast(replay_start):
            n = np.random.poisson(replay_ratio)
            for _ in range(n):
                acer.call(on_policy=False)  # no simulation steps in this

    env.close()
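For orientation, here is a minimal sketch of how a learn() with this signature might be driven. The helper imports (make_atari, wrap_deepmind, SubprocVecEnv, AcerCnnPolicy), the environment id, and the argument values are assumptions about the surrounding baselines code, not part of the example above.

# Hypothetical driver for the learn() above; helper names and module paths are assumptions.
from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv
from baselines.common.atari_wrappers import make_atari, wrap_deepmind
from baselines.acer.policies import AcerCnnPolicy

def make_env(rank, seed=0):
    def _thunk():
        env = wrap_deepmind(make_atari('BreakoutNoFrameskip-v4'))
        env.seed(seed + rank)
        return env
    return _thunk

if __name__ == '__main__':
    venv = SubprocVecEnv([make_env(i) for i in range(16)])  # learn() above reads venv.remotes
    learn(AcerCnnPolicy, venv, seed=0, total_timesteps=int(1e6))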
Example 2
def learn(network, env, seed=None, nsteps=20, total_timesteps=int(80e6), q_coef=0.5, ent_coef=0.01,
          max_grad_norm=10, lr=7e-4, lrschedule='linear', rprop_epsilon=1e-5, rprop_alpha=0.99, gamma=0.99,
          log_interval=100, buffer_size=50000, replay_ratio=4, replay_start=10000, c=10.0,
          trust_region=True, alpha=0.99, delta=1, load_path=None, **network_kwargs):

    '''
    Main entrypoint for ACER (Actor-Critic with Experience Replay) algorithm (https://arxiv.org/pdf/1611.01224.pdf)
    Train an agent with given network architecture on a given environment using ACER.

    Parameters:
    ----------

    network:            policy network architecture. Either a string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py for the full list)
                        naming a standard architecture, or a function that takes a tensorflow tensor as input and returns a
                        tuple (output_tensor, extra_feed), where output_tensor is the last network layer's output and extra_feed is None for feed-forward
                        neural nets or a dictionary describing how to feed state into the network for recurrent neural nets.
                        See baselines.common/policies.py/lstm for more details on using recurrent nets in policies

    env:                environment. Needs to be vectorized for parallel environment simulation.
                        The environments produced by gym.make can be wrapped using baselines.common.vec_env.DummyVecEnv class.

    nsteps:             int, number of steps of the vectorized environment per update (i.e. batch size is nsteps * nenv where
                        nenv is number of environment copies simulated in parallel) (default: 20)

    nstack:             int, size of the frame stack, i.e. number of frames passed to the step model. Frames are stacked along the channel
                        (last image) dimension. In this variant it is not an argument; it is read from the env's VecFrameStack wrapper (a stack of 1 is applied if the env is not already wrapped)

    total_timesteps:    int, number of timesteps (i.e. number of actions taken in the environment) (default: 80M)

    q_coef:             float, value function loss coefficient in the optimization objective (analog of vf_coef for other actor-critic methods)

    ent_coef:           float, policy entropy coefficient in the optimization objective (default: 0.01)

    max_grad_norm:      float, gradient norm clipping coefficient. If set to None, no clipping. (default: 10)

    lr:                 float, learning rate for RMSProp (current implementation has RMSProp hardcoded in) (default: 7e-4)

    lrschedule:         schedule of learning rate. Can be 'linear', 'constant', or a function [0..1] -> [0..1] that takes fraction of the training progress as input and
                        returns fraction of the learning rate (specified as lr) as output

    rprop_epsilon:      float, RMSProp epsilon (stabilizes square root computation in denominator of RMSProp update) (default: 1e-5)

    rprop_alpha:        float, RMSProp decay parameter (default: 0.99)

    gamma:              float, reward discounting factor (default: 0.99)

    log_interval:       int, number of updates between logging events (default: 100)

    buffer_size:        int, size of the replay buffer (default: 50k)

    replay_ratio:       int, how many (on average) batches of data to sample from the replay buffer after each batch from the environment (default: 4)

    replay_start:       int, sampling from the replay buffer does not start until the buffer holds at least this many samples (default: 10k)

    c:                  float, importance weight clipping factor (default: 10)

    trust_region:       bool, whether or not the algorithm uses a trust region, i.e. estimates the gradient of the KL divergence between the old (averaged) and updated policy and uses it to limit the step size (default: True)

    delta:              float, max KL divergence between the old policy and updated policy (default: 1)

    alpha:              float, momentum factor in the Polyak (exponential moving average) averaging of the model parameters (default: 0.99)

    load_path:          str, path to load the model from (default: None)

    **network_kwargs:               keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network
                                    For instance, 'mlp' network architecture has arguments num_hidden and num_layers.

    '''

    print("Running Acer Simple")
    print(locals())
    set_global_seeds(seed)
    if not isinstance(env, VecFrameStack):
        env = VecFrameStack(env, 1)

    policy = build_policy(env, network, estimate_q=True, **network_kwargs)
    nenvs = env.num_envs
    ob_space = env.observation_space
    ac_space = env.action_space

    nstack = env.nstack
    model = Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nenvs=nenvs, nsteps=nsteps,
                  ent_coef=ent_coef, q_coef=q_coef, gamma=gamma,
                  max_grad_norm=max_grad_norm, lr=lr, rprop_alpha=rprop_alpha, rprop_epsilon=rprop_epsilon,
                  total_timesteps=total_timesteps, lrschedule=lrschedule, c=c,
                  trust_region=trust_region, alpha=alpha, delta=delta)

    runner = Runner(env=env, model=model, nsteps=nsteps)
    if replay_ratio > 0:
        buffer = Buffer(env=env, nsteps=nsteps, size=buffer_size)
    else:
        buffer = None
    nbatch = nenvs*nsteps
    acer = Acer(runner, model, buffer, log_interval)
    acer.tstart = time.time()

    for acer.steps in range(0, total_timesteps, nbatch): #nbatch samples, 1 on_policy call and multiple off-policy calls
        acer.call(on_policy=True)
        if replay_ratio > 0 and buffer.has_atleast(replay_start):
            n = np.random.poisson(replay_ratio)
            for _ in range(n):
                acer.call(on_policy=False)  # no simulation steps in this

    return model
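The docstring above describes lrschedule as a map from the fraction of training completed to the fraction of lr to apply. A purely illustrative sketch of that contract (the helper below is hypothetical and simply mirrors what 'linear' means):

# Illustration of the lrschedule contract: progress fraction in [0, 1] -> learning-rate fraction.
def linear_schedule(progress_fraction):
    # 'linear' decays the learning rate to zero over the course of training
    return 1.0 - progress_fraction

lr = 7e-4
total = int(80e6)
for step in (0, 40_000_000, 79_000_000):
    effective_lr = lr * linear_schedule(step / total)
    print(f"step={step:>10,d}  lr={effective_lr:.2e}")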
Example 3
def learn(policy, env, flags):
    """

    :param policy:
    :param baselines.common.vec_env.VecEnv env:
    :param baselines.acer.flags.AcerFlags flags:
    """
    print("Running Acer Simple")
    print(flags)

    flags.total_timesteps = int(flags.total_timesteps)

    # disable gpu before creating any tensor
    if not flags.use_gpu:
        tf_util.disable_gpu()

    tf.reset_default_graph()
    set_global_seeds(flags.seed)

    nenvs = env.num_envs
    ob_space = env.observation_space
    ac_space = env.action_space
    model = Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nenvs=nenvs, num_procs=nenvs, flags=flags)

    runner = Runner(env=env, model=model, nsteps=flags.nsteps, nstack=flags.nstack)
    if flags.replay_ratio > 0:
        buffer = Buffer(env=env, nsteps=flags.nsteps, nstack=flags.nstack, size=flags.buffer_size)
    else:
        buffer = None
    nbatch = nenvs*flags.nsteps
    acer = Acer(runner, model, buffer, flags.log_interval, flags.stats_interval)

    saver = tf.train.Saver(max_to_keep=3, keep_checkpoint_every_n_hours=flags.permanent_save_hours)
    checkpoint_dir = os.path.join(flags.save_dir, 'checkpoints')
    checkpoint_path = os.path.join(checkpoint_dir, 'model')
    os.makedirs(checkpoint_dir, exist_ok=True)

    # load checkpoint
    latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
    if latest_checkpoint:
        print("Loading model checkpoint: {}".format(latest_checkpoint))
        saver.restore(model.sess, latest_checkpoint)
        start_steps = model.GSwrapper.get(model.sess)

        if hasattr(env, 'restore_state'):
            env.restore_state(checkpoint_dir, start_steps)
    else:
        start_steps = 0

    coordinator = tf.train.Coordinator()

    def signal_handler(signum, frame):
        if not coordinator.should_stop():
            coordinator.request_stop()
            print("Stopping training...")
        else:
            print("Stop already requested, please wait...")

    signal.signal(signal.SIGINT, signal_handler)
    print("Press CTRL+C to stop")

    acer.tstart = time.time()
    for acer.steps in range(start_steps, flags.total_timesteps, nbatch):

        # on policy training
        acer.call(on_policy=True)

        # off policy training
        if flags.replay_ratio > 0 and buffer.has_atleast(flags.replay_start):
            n = np.random.poisson(flags.replay_ratio)
            for _ in range(n):
                acer.call(on_policy=False)  # no simulation steps in this

        # saving
        do_save = (((acer.steps//nbatch) + 1) % flags.save_interval == 0) or coordinator.should_stop()
        if do_save:
            save_steps = acer.steps+nbatch

            print("Saving at t=%s" % save_steps)

            model.GSwrapper.set(model.sess, save_steps)
            saver.save(model.sess, save_path=checkpoint_path, global_step=save_steps)

            if hasattr(env, 'save_state'):
                env.save_state(checkpoint_dir, save_steps)

        if coordinator.should_stop():
            break

    env.close()
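The interrupt handling above pairs a SIGINT handler with tf.train.Coordinator so that training stops at a clean batch boundary and a final checkpoint is written. Below is a standalone sketch of the same control flow, with threading.Event standing in for the coordinator and fake work standing in for the ACER updates (both substitutions are mine, not the original code).

# Minimal sketch of the graceful-interrupt pattern used above.
import signal
import threading
import time

stop_requested = threading.Event()  # stand-in for tf.train.Coordinator

def signal_handler(signum, frame):
    if not stop_requested.is_set():
        stop_requested.set()
        print("Stopping training...")
    else:
        print("Stop already requested, please wait...")

signal.signal(signal.SIGINT, signal_handler)
print("Press CTRL+C to stop")

for update in range(1, 1001):
    time.sleep(0.01)                                      # stand-in for one training update
    if update % 100 == 0 or stop_requested.is_set():
        print("saving checkpoint at update", update)      # stand-in for saver.save(...)
    if stop_requested.is_set():
        break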
Example 4
def learn(policy,
          env,
          seed,
          env_id,
          learn_time,
          expert_buffer_size,
          perform=False,
          use_expert=False,
          save_networks=False,
          network_saving_dir=None,
          total_timesteps=int(80e6),
          nsteps=20,
          nstack=4,
          q_coef=0.5,
          ent_coef=0.01,
          max_grad_norm=10,
          lr=7e-4,
          lrschedule='linear',
          rprop_epsilon=1e-5,
          rprop_alpha=0.99,
          gamma=0.99,
          log_interval=10,
          buffer_size=50000,
          replay_ratio=4,
          replay_start=10000,
          c=10.0,
          trust_region=True,
          alpha=0.99,
          delta=1):

    print(locals())
    tf.reset_default_graph()
    set_global_seeds(seed)

    nenvs = env.num_envs
    ob_space = env.observation_space  #Box(84,84,1)
    ac_space = env.action_space  #Discrete(4)

    if use_expert:
        expert = Expert(
            env=env, nsteps=nsteps, nstack=nstack,
            size=expert_buffer_size)  #Exp1:50000; Exp2:25000 ; Exp3:10000
        expert_dir = os.path.join('./expert', 'expert.pkl')
        file_dir = '/home/zhangxiaoqin/Projects/conda/atari_v1/'
        #expert.load_file_human(file_dir)
        expert.load_file(expert_dir)

    else:
        expert = None

    num_procs = len(env.remotes)  # HACK
    model = Model(policy=policy,
                  ob_space=ob_space,
                  ac_space=ac_space,
                  nenvs=nenvs,
                  nsteps=nsteps,
                  nstack=nstack,
                  num_procs=num_procs,
                  ent_coef=ent_coef,
                  q_coef=q_coef,
                  gamma=gamma,
                  max_grad_norm=max_grad_norm,
                  lr=lr,
                  rprop_alpha=rprop_alpha,
                  rprop_epsilon=rprop_epsilon,
                  total_timesteps=total_timesteps,
                  lrschedule=lrschedule,
                  c=c,
                  trust_region=trust_region,
                  alpha=alpha,
                  delta=delta,
                  network_saving_dir=network_saving_dir,
                  use_expert=use_expert,
                  expert=expert)

    runner = Runner(env=env,
                    model=model,
                    nsteps=nsteps,
                    nstack=nstack,
                    env_id=env_id)

    if replay_ratio > 0:
        buffer = Buffer(env=env,
                        nsteps=nsteps,
                        nstack=nstack,
                        size=buffer_size)
    else:
        buffer = None

    if perform:
        model.load()

    nbatch = nenvs * nsteps
    acer = Acer(runner, model, buffer, log_interval)
    acer.tstart = time.time()

    for acer.steps in range(
            0, total_timesteps, nbatch
    ):  #nbatch samples, 1 on_policy call and multiple off-policy calls

        if acer.steps > learn_time and use_expert:
            print('-------------------------')
            print('Reuse the normal networks')
            print('-------------------------')
            use_expert = False
            expert = None

        acer.call(perform, save_networks, use_expert, expert, on_policy=True)

        if replay_ratio > 0 and buffer.has_atleast(
                replay_start) and not perform:
            n = np.random.poisson(replay_ratio)
            for _ in range(n):
                acer.call(perform,
                          save_networks,
                          use_expert,
                          expert,
                          on_policy=False)  # no simulation steps in this

    #dir = os.path.join('./models/', 'test.m')
    #model.save('./models/test_2.pkl')

    env.close()
Example 5
def learn(network,
          env,
          seed=None,
          nsteps=20,
          total_timesteps=int(80e6),
          q_coef=0.5,
          ent_coef=0.01,
          max_grad_norm=10,
          lr=7e-4,
          lrschedule='linear',
          rprop_epsilon=1e-5,
          rprop_alpha=0.99,
          gamma=0.99,
          log_interval=100,
          buffer_size=50000,
          replay_ratio=4,
          replay_start=10000,
          c=10.0,
          trust_region=True,
          alpha=0.99,
          delta=1,
          load_path=None,
          **network_kwargs):
    '''
    Main entrypoint for ACER (Actor-Critic with Experience Replay) algorithm (https://arxiv.org/pdf/1611.01224.pdf)
    Train an agent with given network architecture on a given environment using ACER.

    Parameters:
    ----------

    network:            policy network architecture. Either a string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py for the full list)
                        naming a standard architecture, or a function that takes a tensorflow tensor as input and returns a
                        tuple (output_tensor, extra_feed), where output_tensor is the last network layer's output and extra_feed is None for feed-forward
                        neural nets or a dictionary describing how to feed state into the network for recurrent neural nets.
                        See baselines.common/policies.py/lstm for more details on using recurrent nets in policies

    env:                environment. Needs to be vectorized for parallel environment simulation.
                        The environments produced by gym.make can be wrapped using baselines.common.vec_env.DummyVecEnv class.

    nsteps:             int, number of steps of the vectorized environment per update (i.e. batch size is nsteps * nenv where
                        nenv is number of environment copies simulated in parallel) (default: 20)

    nstack:             int, size of the frame stack, i.e. number of frames passed to the step model. Frames are stacked along the channel
                        (last image) dimension. In this variant it is not an argument; it is determined by the model-creation code

    total_timesteps:    int, number of timesteps (i.e. number of actions taken in the environment) (default: 80M)

    q_coef:             float, value function loss coefficient in the optimization objective (analog of vf_coef for other actor-critic methods)

    ent_coef:           float, policy entropy coefficient in the optimization objective (default: 0.01)

    max_grad_norm:      float, gradient norm clipping coefficient. If set to None, no clipping. (default: 10)

    lr:                 float, learning rate for RMSProp (current implementation has RMSProp hardcoded in) (default: 7e-4)

    lrschedule:         schedule of learning rate. Can be 'linear', 'constant', or a function [0..1] -> [0..1] that takes fraction of the training progress as input and
                        returns fraction of the learning rate (specified as lr) as output

    rprop_epsilon:      float, RMSProp epsilon (stabilizes square root computation in denominator of RMSProp update) (default: 1e-5)

    rprop_alpha:        float, RMSProp decay parameter (default: 0.99)

    gamma:              float, reward discounting factor (default: 0.99)

    log_interval:       int, number of updates between logging events (default: 100)

    buffer_size:        int, size of the replay buffer (default: 50k)

    replay_ratio:       int, how many (on average) batches of data to sample from the replay buffer after each batch from the environment (default: 4)

    replay_start:       int, sampling from the replay buffer does not start until the buffer holds at least this many samples (default: 10k)

    c:                  float, importance weight clipping factor (default: 10)

    trust_region:       bool, whether or not the algorithm uses a trust region, i.e. estimates the gradient of the KL divergence between the old (averaged) and updated policy and uses it to limit the step size (default: True)

    delta:              float, max KL divergence between the old policy and updated policy (default: 1)

    alpha:              float, momentum factor in the Polyak (exponential moving average) averaging of the model parameters (default: 0.99)

    load_path:          str, path to load the model from (default: None)

    **network_kwargs:               keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network
                                    For instance, 'mlp' network architecture has arguments num_hidden and num_layers.

    '''
    print("Running Acer Simple")

    learn_params = {
        "network": network,
        "seed": seed,
        "nsteps": nsteps,
        "total_timesteps": total_timesteps,
        "q_coef": q_coef,
        "ent_coef": ent_coef,
        "max_grad_norm": max_grad_norm,
        "lr": lr,
        "lrschedule": lrschedule,
        "rprop_epsilon": rprop_epsilon,
        "rprop_alpha": rprop_alpha,
        "gamma": gamma,
        "log_interval": log_interval,
        "buffer_size": buffer_size,
        "replay_ratio": replay_ratio,
        "replay_start": replay_start,
        "c": c,
        "trust_region": trust_region,
        "alpha": alpha,
        "delta": delta,
        "load_path": load_path,
        **network_kwargs
    }

    with open("params.json") as f:
        params = json.load(f)

    params["replay_start"] = min(params["replay_start"], params["buffer_size"])
    params["buffer_size"] = min(params["buffer_size"],
                                params["disk_buffer_size"])

    nsteps = params['nsteps']
    buffer_size = params['buffer_size']
    disk_buffer_size = params['disk_buffer_size']

    for k, v in params.items():
        if k in learn_params:
            learn_params[k] = v

    # print(locals())

    with open("model_params.pkl", "wb") as f:
        pkl.dump(learn_params, f)

    env, policy, nenvs, ob_space, ac_space, nstack, model = create_model(
        **learn_params)
    # *** loads previously trained variables; comment this out to start from scratch
    load_variables("actor.ckpt")
    # ***

    # runner = HaliteRunner(model=model, env=env, gamma=gamma, nsteps=nsteps)
    runner = HaliteRunner(model)  # reads the params json now
    if replay_ratio > 0:
        buffer = Buffer(env=env,
                        nsteps=nsteps,
                        size=buffer_size,
                        disk_size=disk_buffer_size)
    else:
        buffer = None

    nbatch = nenvs * nsteps
    acer = Acer(runner, model, buffer, log_interval, nsteps)
    acer.tstart = time.time()

    for acer.steps in range(
            0, total_timesteps, nbatch
    ):  #nbatch samples, 1 on_policy call and multiple off-policy calls
        acer.call(on_policy=True)
        if replay_ratio > 0 and buffer.has_atleast(replay_start):
            n = replay_ratio  #np.random.poisson(replay_ratio)
            for _ in range(n):
                acer.call(on_policy=False)  # no simulation steps in this

    return model
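The snippet above expects a params.json next to the script. A hypothetical minimal file containing only the keys this code actually reads (the values are placeholders, not recommendations; any additional key matching a learn() argument would override that default) could be produced like this:

# Writes a hypothetical minimal params.json for the snippet above.
import json

params = {
    "nsteps": 20,
    "buffer_size": 50000,
    "disk_buffer_size": 100000,  # buffer_size is capped at this value by the code above
    "replay_start": 10000,       # capped at buffer_size by the code above
}
with open("params.json", "w") as f:
    json.dump(params, f, indent=2)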
Example 6
def learn(policy,
          env,
          seed,
          n_steps=20,
          n_stack=4,
          total_timesteps=int(80e6),
          q_coef=0.5,
          ent_coef=0.01,
          max_grad_norm=10,
          learning_rate=7e-4,
          lr_schedule='linear',
          rprop_epsilon=1e-5,
          rprop_alpha=0.99,
          gamma=0.99,
          log_interval=100,
          buffer_size=50000,
          replay_ratio=4,
          replay_start=10000,
          correction_term=10.0,
          trust_region=True,
          alpha=0.99,
          delta=1):
    """
    Train an ACER model.

    :param policy: (ACERPolicy) The policy model to use (MLP, CNN, LSTM, ...)
    :param env: (Gym environment) The environment to learn from
    :param seed: (int) The initial seed for training
    :param n_steps: (int) The number of steps to run for each environment
    :param n_stack: (int) The number of stacked frames
    :param total_timesteps: (int) The total number of samples
    :param q_coef: (float) Q function coefficient for the loss calculation
    :param ent_coef: (float) Entropy coefficient for the loss calculation
    :param max_grad_norm: (float) The maximum value for the gradient clipping
    :param learning_rate: (float) The learning rate
    :param lr_schedule: (str) The type of scheduler for the learning rate update ('linear', 'constant',
                                 'double_linear_con', 'middle_drop' or 'double_middle_drop')
    :param rprop_epsilon: (float) RMS prop optimizer epsilon
    :param rprop_alpha: (float) RMS prop optimizer decay
    :param gamma: (float) Discount factor
    :param log_interval: (int) The number of timesteps before logging.
    :param buffer_size: (int) The buffer size in number of steps
    :param replay_ratio: (float) The average number of replay (off-policy) updates per on-policy update,
                                 drawn from a Poisson distribution
    :param replay_start: (int) The minimum number of steps in the buffer, before learning replay
    :param correction_term: (float) The correction term for the weights
    :param trust_region: (bool) Enable Trust region policy optimization loss
    :param alpha: (float) The decay rate for the Exponential moving average of the parameters
    :param delta: (float) trust region delta value
    """
    print("Running Acer Simple")
    print(locals())
    set_global_seeds(seed)

    n_envs = env.num_envs
    ob_space = env.observation_space
    ac_space = env.action_space
    num_procs = len(env.remotes)  # HACK
    model = Model(policy=policy,
                  ob_space=ob_space,
                  ac_space=ac_space,
                  n_envs=n_envs,
                  n_steps=n_steps,
                  n_stack=n_stack,
                  num_procs=num_procs,
                  ent_coef=ent_coef,
                  q_coef=q_coef,
                  gamma=gamma,
                  max_grad_norm=max_grad_norm,
                  learning_rate=learning_rate,
                  rprop_alpha=rprop_alpha,
                  rprop_epsilon=rprop_epsilon,
                  total_timesteps=total_timesteps,
                  lr_schedule=lr_schedule,
                  correction_term=correction_term,
                  trust_region=trust_region,
                  alpha=alpha,
                  delta=delta)

    runner = Runner(env=env, model=model, n_steps=n_steps, n_stack=n_stack)
    if replay_ratio > 0:
        buffer = Buffer(env=env,
                        n_steps=n_steps,
                        n_stack=n_stack,
                        size=buffer_size)
    else:
        buffer = None
    n_batch = n_envs * n_steps
    acer = Acer(runner, model, buffer, log_interval)
    acer.t_start = time.time()
    for acer.steps in range(
            0, total_timesteps, n_batch
    ):  # n_batch samples, 1 on_policy call and multiple off-policy calls
        acer.call(on_policy=True)
        if replay_ratio > 0 and buffer.has_atleast(replay_start):
            samples_number = np.random.poisson(replay_ratio)
            for _ in range(samples_number):
                acer.call(on_policy=False)  # no simulation steps in this

    env.close()
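As the docstring notes, the number of off-policy (replay) updates after each on-policy batch is drawn from a Poisson distribution with mean replay_ratio, so replays happen replay_ratio times per on-policy call on average. A quick sanity check of that average:

# Empirically verify the average number of off-policy calls per on-policy call.
import numpy as np

replay_ratio = 4
rng = np.random.default_rng(0)
draws = rng.poisson(replay_ratio, size=100_000)
print("mean off-policy calls per on-policy call:", draws.mean())  # ~4.0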