import tensorflow as tf
from baselines.common.tf_util import function, initialize, single_threaded_session


def test_function():
    with tf.Graph().as_default():
        x = tf.placeholder(tf.int32, (), name="x")
        y = tf.placeholder(tf.int32, (), name="y")
        z = 3 * x + 2 * y
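        # givens={y: 0} provides a default feed for y, used whenever the caller omits it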
        lin = function([x, y], z, givens={y: 0})

        with single_threaded_session():
            initialize()

            assert lin(2) == 6
            assert lin(2, 2) == 10


def test_multikwargs():
    with tf.Graph().as_default():
        x = tf.placeholder(tf.int32, (), name="x")
        with tf.variable_scope("other"):
            x2 = tf.placeholder(tf.int32, (), name="x")
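            # this placeholder is also named "x", but variable_scope("other") keeps it distinct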
        z = 3 * x + 2 * x2

        lin = function([x, x2], z, givens={x2: 0})
        with single_threaded_session():
            initialize()
            assert lin(2) == 6
            assert lin(2, 2) == 10
Example #3
def train(num_episodes, seed, space, evaluator, num_episodes_per_batch,
          reward_rule):

    rank = MPI.COMM_WORLD.Get_rank()

    sess = U.single_threaded_session()
    sess.__enter__()
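    # only rank 0 gets the full logger configuration; other workers produce no log output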
    if rank == 0:
        logger.configure()
    else:
        logger.configure(format_strs=[])
    workerseed = seed + 10000 * rank if seed is not None else None
    set_global_seeds(workerseed)

    # build the NAS environment from the search space
    cs_kwargs = space['create_structure'].get('kwargs')
    if cs_kwargs is None:
        structure = space['create_structure']['func']()
    else:
        structure = space['create_structure']['func'](**cs_kwargs)

    num_nodes = structure.num_nodes
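    # assuming one timestep per node of the structure, a batch spans
    # num_nodes * num_episodes_per_batch timesteps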
    timesteps_per_actorbatch = num_nodes * num_episodes_per_batch
    num_timesteps = timesteps_per_actorbatch * num_episodes

    env = NasEnv(space, evaluator, structure)

    def policy_fn(name, ob_space, ac_space):  #pylint: disable=W0613
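        # factory passed to the PPO learner, which is expected to call it with the
        # env's observation and action spaces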
        return lstm_ph.LstmPolicy(name=name,
                                  ob_space=ob_space,
                                  ac_space=ac_space,
                                  num_units=32)

    pposgd_sync.learn(env,
                      policy_fn,
                      max_timesteps=num_timesteps,
                      timesteps_per_actorbatch=timesteps_per_actorbatch,
                      clip_param=0.2,
                      entcoeff=0.01,
                      optim_epochs=4,
                      optim_stepsize=1e-3,
                      optim_batchsize=15,
                      gamma=0.99,
                      lam=0.95,
                      schedule='linear',
                      reward_rule=reward_rule)
    env.close()
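
# Hypothetical sketch of how the NAS variant of train() above might be invoked; the
# `space` layout is inferred from the code (space['create_structure']['func'], called
# with the optional 'kwargs', must return an object exposing .num_nodes), and every
# name below is a placeholder rather than part of the original project:
#
#   space = {'create_structure': {'func': my_structure_factory,
#                                 'kwargs': {'num_layers': 4}}}
#   train(num_episodes=100, seed=42, space=space, evaluator=my_evaluator,
#         num_episodes_per_batch=8, reward_rule=my_reward_rule)
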
def train(num_iter, seed, evaluator, num_episodes_per_iter):

    rank = MPI.COMM_WORLD.Get_rank()
    sess = U.single_threaded_session()
    sess.__enter__()
    if rank == 0:
        logger.configure()
    else:
        logger.configure(format_strs=[])
    workerseed = seed + 10000 * rank if seed is not None else None
    set_global_seeds(workerseed)

    # make the math environment; the batch arithmetic below assumes 10 timesteps per episode
    timesteps_per_episode = 10
    timesteps_per_actorbatch = timesteps_per_episode * num_episodes_per_iter
    num_timesteps = timesteps_per_actorbatch * num_iter

    env = MathEnv(evaluator)

    def policy_fn(name, ob_space, ac_space):  #pylint: disable=W0613
        return lstm_ph.LstmPolicy(name=name,
                                  ob_space=ob_space,
                                  ac_space=ac_space,
                                  num_units=64)

    pposgd_sync_ph.learn(
        env,
        policy_fn,
        max_timesteps=int(num_timesteps),
        timesteps_per_actorbatch=timesteps_per_actorbatch,
        clip_param=0.2,
        entcoeff=0.01,
        optim_epochs=4,
        optim_stepsize=1e-3,
        optim_batchsize=10,
        gamma=0.99,
        lam=0.95,
        schedule='linear',
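        # reward_for_final_timestep presumably credits each episode's reward to its last step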
        reward_rule=reward_for_final_timestep)
    env.close()