Example #1
0
def main():
    """Restore a saved DQN agent and run the wang2015 evaluation protocol."""
    set_global_seeds(1)
    args = parse_args()
    with U.make_session(4) as sess:  # noqa
        _, env = make_env(args.env)
        # Pick the network architecture requested on the command line.
        q_model = dueling_model if args.dueling else model
        act = deepq.build_act(
            make_obs_ph=lambda name: U.Uint8Input(env.observation_space.shape, name=name),
            q_func=q_model,
            num_actions=env.action_space.n)
        # Load the trained weights, then evaluate.
        U.load_state(os.path.join(args.model_dir, "saved"))
        wang2015_eval(args.env, act, stochastic=args.stochastic)
Example #2
0
def run(env_id, seed, noise_type, num_cpu, layer_norm, logdir, gym_monitor, evaluation, bind_to_core, **kwargs):
    """Launch a (possibly MPI-forked) DDPG training run.

    Forks ``num_cpu`` MPI workers, builds the training (and optionally
    evaluation) environments, parses the noise specification, constructs
    the replay memory / critic / actor, seeds everything per-rank, and
    delegates to ``training.train``.

    :param env_id: gym environment id passed to ``gym.make``
    :param seed: base seed; each worker uses ``seed + 1000000 * rank``
    :param noise_type: comma-separated specs such as ``'ou_0.2'`` or
        ``'adaptive-param_0.2,normal_0.1'``; ``'none'`` disables noise
    :param num_cpu: number of MPI workers to fork
    :param layer_norm: forwarded to the ``Critic``/``Actor`` constructors
    :param logdir: log directory; also injected into ``kwargs['logdir']``
    :param gym_monitor: when truthy (and ``logdir`` is set), wrap the envs
        in ``gym.wrappers.Monitor``
    :param evaluation: build a separate evaluation environment
    :param bind_to_core: forwarded to ``mpi_fork``
    :param kwargs: forwarded verbatim to ``training.train``
    """
    kwargs['logdir'] = logdir
    whoami = mpi_fork(num_cpu, bind_to_core=bind_to_core)
    if whoami == 'parent':
        # The parent process only forks the workers; it does no training.
        sys.exit(0)

    # Configure things.
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        # Redirect all non-master workers' log output to a throwaway temp
        # directory and silence the logger.  (The unused `actual_dir = None`
        # assignment that used to live here was dead code and was removed.)
        Logger.CURRENT.close()
        Logger.CURRENT = Logger(dir=mkdtemp(), output_formats=[])
        logger.set_level(logger.DISABLED)

    # Create envs.  Only rank 0 gets the monitoring wrappers.
    if rank == 0:
        env = gym.make(env_id)
        if gym_monitor and logdir:
            env = gym.wrappers.Monitor(env, os.path.join(logdir, 'gym_train'), force=True)
        env = SimpleMonitor(env)

        if evaluation:
            eval_env = gym.make(env_id)
            if gym_monitor and logdir:
                eval_env = gym.wrappers.Monitor(eval_env, os.path.join(logdir, 'gym_eval'), force=True)
            eval_env = SimpleMonitor(eval_env)
        else:
            eval_env = None
    else:
        env = gym.make(env_id)
        if evaluation:
            eval_env = gym.make(env_id)
        else:
            eval_env = None

    # Parse noise_type, e.g. 'ou_0.2' -> kind 'ou' with stddev 0.2.
    action_noise = None
    param_noise = None
    nb_actions = env.action_space.shape[-1]
    for current_noise_type in noise_type.split(','):
        current_noise_type = current_noise_type.strip()
        if current_noise_type == 'none':
            pass
        elif 'adaptive-param' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(stddev), desired_action_stddev=float(stddev))
        elif 'normal' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = NormalActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions))
        elif 'ou' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions))
        else:
            raise RuntimeError('unknown noise type "{}"'.format(current_noise_type))

    # Configure components.
    memory = Memory(limit=int(1e6), action_shape=env.action_space.shape, observation_shape=env.observation_space.shape)
    critic = Critic(layer_norm=layer_norm)
    actor = Actor(nb_actions, layer_norm=layer_norm)

    # Seed everything to make things reproducible; the rank offset keeps
    # workers from drawing identical random streams.
    seed = seed + 1000000 * rank
    logger.info('rank {}: seed={}, logdir={}'.format(rank, seed, logger.get_dir()))
    tf.reset_default_graph()
    set_global_seeds(seed)
    env.seed(seed)
    if eval_env is not None:
        eval_env.seed(seed)

    # Train.  Only rank 0 times the run and reports the total runtime.
    if rank == 0:
        start_time = time.time()
    training.train(env=env, eval_env=eval_env, param_noise=param_noise,
        action_noise=action_noise, actor=actor, critic=critic, memory=memory, **kwargs)
    env.close()
    if eval_env is not None:
        eval_env.close()
    Logger.CURRENT.close()
    if rank == 0:
        logger.info('total runtime: {}s'.format(time.time() - start_time))
Example #3
0
        savedir = os.getenv('OPENAI_LOGDIR', None)
    if args.save_azure_container is not None:
        account_name, account_key, container_name = args.save_azure_container.split(":")
        container = Container(account_name=account_name,
                              account_key=account_key,
                              container_name=container_name,
                              maybe_create=True)
        if savedir is None:
            # Careful! This will not get cleaned up. Docker spoils the developers.
            savedir = tempfile.TemporaryDirectory().name
    else:
        container = None
    # Create and seed the env.
    env, monitored_env = make_env(args.env)
    if args.seed > 0:
        set_global_seeds(args.seed)
        env.unwrapped.seed(args.seed)

    if args.gym_monitor and savedir:
        env = gym.wrappers.Monitor(env, os.path.join(savedir, 'gym_monitor'), force=True)

    if savedir:
        with open(os.path.join(savedir, 'args.json'), 'w') as f:
            json.dump(vars(args), f)

    with U.make_session(4) as sess:
        # Create training graph and replay buffer
        def model_wrapper(img_in, num_actions, scope, **kwargs):
            actual_model = dueling_model if args.dueling else model
            return actual_model(img_in, num_actions, scope, layer_norm=args.layer_norm, **kwargs)
        act, train, update_target, debug = deepq.build_train(
Example #4
0
def run(env_id, seed, noise_type, layer_norm, evaluation, share_top_layer,
        **kwargs):
    """Set up and run DDPG training for one MPI worker.

    Builds monitored gym environments, parses the noise specification,
    constructs the replay memory / critic / actor, seeds everything
    per-rank, and delegates to ``training.train``.

    :param env_id: gym environment id passed to ``gym.make``
    :param seed: base seed; each worker uses ``seed + 1000000 * rank``
    :param noise_type: comma-separated specs such as ``'ou_0.2'`` or
        ``'adaptive-param_0.2,normal_0.1'``; ``'none'`` disables noise
    :param layer_norm: forwarded to the ``Critic``/``Actor`` constructors
    :param evaluation: when True (and rank 0), build a separate eval env
    :param share_top_layer: forwarded to ``Critic``/``Actor``; also selects
        the reward CSV filename passed to ``training.train``
    :param kwargs: forwarded verbatim to ``training.train``
    """
    # Configure things.
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        # Silence all non-master workers.
        logger.set_level(logger.DISABLED)

    # Create envs.  The Monitor path is None when no log dir is configured.
    env = gym.make(env_id)
    env = bench.Monitor(
        env,
        logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))

    if evaluation and rank == 0:
        eval_env = gym.make(env_id)
        eval_env = bench.Monitor(eval_env,
                                 os.path.join(logger.get_dir(), 'gym_eval'))
        # NOTE(review): env is wrapped in bench.Monitor a second time here
        # (first wrap above), with no output file — confirm this double
        # wrapping is intentional.
        env = bench.Monitor(env, None)
    else:
        eval_env = None

    # Parse noise_type, e.g. 'ou_0.2' -> kind 'ou' with stddev 0.2.
    # Kind matching is by substring, not exact equality.
    action_noise = None
    param_noise = None
    nb_actions = env.action_space.shape[-1]
    for current_noise_type in noise_type.split(','):
        current_noise_type = current_noise_type.strip()
        if current_noise_type == 'none':
            pass
        elif 'adaptive-param' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            param_noise = AdaptiveParamNoiseSpec(
                initial_stddev=float(stddev),
                desired_action_stddev=float(stddev))
        elif 'normal' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = NormalActionNoise(mu=np.zeros(nb_actions),
                                             sigma=float(stddev) *
                                             np.ones(nb_actions))
        elif 'ou' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = OrnsteinUhlenbeckActionNoise(
                mu=np.zeros(nb_actions),
                sigma=float(stddev) * np.ones(nb_actions))
        else:
            raise RuntimeError(
                'unknown noise type "{}"'.format(current_noise_type))

    # Configure components.
    memory = Memory(limit=int(1e6),
                    action_shape=env.action_space.shape,
                    observation_shape=env.observation_space.shape)
    critic = Critic(layer_norm=layer_norm, share_top_layer=share_top_layer)
    actor = Actor(nb_actions,
                  layer_norm=layer_norm,
                  share_top_layer=share_top_layer)

    # Seed everything to make things reproducible; the rank offset keeps
    # workers from drawing identical random streams.
    seed = seed + 1000000 * rank
    logger.info('rank {}: seed={}, logdir={}'.format(rank, seed,
                                                     logger.get_dir()))
    tf.reset_default_graph()
    set_global_seeds(seed)
    env.seed(seed)
    if eval_env is not None:
        eval_env.seed(seed)

    # Train.  Only rank 0 times the run and reports the total runtime.
    if rank == 0:
        start_time = time.time()
    # The CSV filename records which variant (shared top layer or not)
    # produced the episode rewards.
    if share_top_layer:
        reward_filename = 'episode_reward_share_top_layer.csv'
    else:
        reward_filename = 'episode_reward_normal.csv'
    training.train(env=env,
                   eval_env=eval_env,
                   param_noise=param_noise,
                   action_noise=action_noise,
                   actor=actor,
                   critic=critic,
                   memory=memory,
                   reward_filename=reward_filename,
                   **kwargs)
    env.close()
    if eval_env is not None:
        eval_env.close()
    if rank == 0:
        logger.info('total runtime: {}s'.format(time.time() - start_time))
        action_noise = NormalActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions))
    elif 'ou' in current_noise_type:
        _, stddev = current_noise_type.split('_')
        action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions))
    else:
        raise RuntimeError('unknown noise type "{}"'.format(current_noise_type))

# Configure components of DDPG
memory = Memory(limit=int(1e6), action_shape=env.action_space.shape, observation_shape=env.observation_space.shape)
critic = Critic(layer_norm=args.layer_norm)
actor = Actor(nb_actions, layer_norm=args.layer_norm)
# Seed everything to make things reproducible.
seed = args.seed + 1000000 * rank
logger.info('rank {}: seed={}, logdir={}'.format(rank, seed, logger.get_dir()))
# tf.reset_default_graph()
set_global_seeds(seed)
env.seed(seed)
if eval_env is not None:
    eval_env.seed(seed)

# Disable logging for rank != 0 to avoid noise.
if rank == 0:
    start_time = time.time()

# Derive the different numbers for the training process
num_timesteps = args.num_timesteps
nb_rollout_steps = args.nb_rollout_steps
nb_epoch_cycles = args.nb_epoch_cycles
nb_epochs = num_timesteps//(nb_rollout_steps*nb_epoch_cycles)

# Just train
Example #6
0
def run(env_id, seed, noise_type, num_cpu, layer_norm, logdir, gym_monitor,
        evaluation, bind_to_core, hidden_size, nb_layers, portnum, **kwargs):
    """Launch a (possibly MPI-forked) DDPG training run with a configurable
    network size and a per-rank port assignment.

    Forks ``num_cpu`` MPI workers, builds the training (and optionally
    evaluation) environments, parses the noise specification, constructs
    the replay memory / critic / actor, seeds everything per-rank, and
    delegates to ``training.train``.

    :param env_id: gym environment id passed to ``gym.make``
    :param seed: base seed; each worker uses ``seed + 1000000 * rank``
    :param noise_type: comma-separated specs such as ``'ou_0.2'``;
        ``'none'`` disables noise
    :param num_cpu: number of MPI workers to fork
    :param layer_norm: forwarded to the ``Critic``/``Actor`` constructors
    :param logdir: log directory; also injected into ``kwargs['logdir']``
    :param gym_monitor: when truthy (and ``logdir`` is set), wrap the envs
        in ``gym.wrappers.Monitor``
    :param evaluation: build a separate evaluation environment
    :param bind_to_core: forwarded to ``mpi_fork``
    :param hidden_size: layer width forwarded to ``Critic``/``Actor``
    :param nb_layers: layer count forwarded to ``Critic``/``Actor``
    :param portnum: base port; each worker uses ``portnum + rank``
    :param kwargs: forwarded verbatim to ``training.train``
    """
    kwargs['logdir'] = logdir
    whoami = mpi_fork(num_cpu, bind_to_core=bind_to_core)
    if whoami == 'parent':
        # The parent process only forks the workers; it does no training.
        sys.exit(0)

    # Configure things.
    rank = MPI.COMM_WORLD.Get_rank()
    # Give every worker its own port.
    utils.portnum = portnum + rank
    if rank != 0:
        # Redirect all non-master workers' log output to a throwaway temp
        # directory and silence the logger.  (The unused `actual_dir = None`
        # assignment that used to live here was dead code and was removed.)
        Logger.CURRENT.close()
        Logger.CURRENT = Logger(dir=mkdtemp(), output_formats=[])
        logger.set_level(logger.DISABLED)

    # Create envs.  Only rank 0 gets the monitoring wrappers.
    if rank == 0:
        env = gym.make(env_id)
        if gym_monitor and logdir:
            env = gym.wrappers.Monitor(env,
                                       os.path.join(logdir, 'gym_train'),
                                       force=True)
        env = SimpleMonitor(env)

        if evaluation:
            eval_env = gym.make(env_id)
            if gym_monitor and logdir:
                eval_env = gym.wrappers.Monitor(eval_env,
                                                os.path.join(
                                                    logdir, 'gym_eval'),
                                                force=True)
            eval_env = SimpleMonitor(eval_env)
        else:
            eval_env = None
    else:
        env = gym.make(env_id)
        if evaluation:
            eval_env = gym.make(env_id)
        else:
            eval_env = None

    # Parse noise_type, e.g. 'ou_0.2' -> kind 'ou' with stddev 0.2.
    action_noise = None
    param_noise = None
    nb_actions = env.action_space.shape[-1]
    for current_noise_type in noise_type.split(','):
        current_noise_type = current_noise_type.strip()
        if current_noise_type == 'none':
            pass
        elif 'adaptive-param' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            param_noise = AdaptiveParamNoiseSpec(
                initial_stddev=float(stddev),
                desired_action_stddev=float(stddev))
        elif 'normal' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = NormalActionNoise(mu=np.zeros(nb_actions),
                                             sigma=float(stddev) *
                                             np.ones(nb_actions))
        elif 'ou' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = OrnsteinUhlenbeckActionNoise(
                mu=np.zeros(nb_actions),
                sigma=float(stddev) * np.ones(nb_actions))
        else:
            raise RuntimeError(
                'unknown noise type "{}"'.format(current_noise_type))

    # Configure components.
    memory = Memory(limit=int(1e6),
                    action_shape=env.action_space.shape,
                    observation_shape=env.observation_space.shape)
    critic = Critic(layer_size=hidden_size,
                    nb_layers=nb_layers,
                    layer_norm=layer_norm)
    actor = Actor(nb_actions,
                  layer_size=hidden_size,
                  nb_layers=nb_layers,
                  layer_norm=layer_norm)

    # Seed everything to make things reproducible; the rank offset keeps
    # workers from drawing identical random streams.
    seed = seed + 1000000 * rank
    logger.info('rank {}: seed={}, logdir={}'.format(rank, seed,
                                                     logger.get_dir()))
    tf.reset_default_graph()
    set_global_seeds(seed)
    env.seed(seed)
    if eval_env is not None:
        eval_env.seed(seed)

    # Train.  Only rank 0 times the run and reports the total runtime.
    if rank == 0:
        start_time = time.time()
    training.train(env=env,
                   eval_env=eval_env,
                   param_noise=param_noise,
                   action_noise=action_noise,
                   actor=actor,
                   critic=critic,
                   memory=memory,
                   **kwargs)
    env.close()
    if eval_env is not None:
        eval_env.close()
    Logger.CURRENT.close()
    if rank == 0:
        logger.info('total runtime: {}s'.format(time.time() - start_time))
Example #7
0
        )
    from gym.envs.registration import register

    register(
        id='MonsterKong-v0',
        entry_point='baselines.ple.gym_env.monsterkong:MonsterKongEnv',
        kwargs={'map_config': map_config},
    )

    env = gym.make('MonsterKong-v0')
    env = ProcessFrame(env)
else:
    env = create_atari_environment(args.env, sticky_actions=False)

if args.seed > 0:
    set_global_seeds(args.seed)
    env.unwrapped.seed(args.seed)
print("obs shape", env.observation_space.shape)
env = GIFRecorder(video_path=args.video_path + "/{}/".format(args.comment),
                  record_video=True,
                  env=env)
subdir = (
    datetime.datetime.now()).strftime("%m-%d-%Y-%H:%M:%S") + " " + args.comment
# tf_writer = tf.summary.FileWriter(os.path.join(args.log_dir, subdir), tf.get_default_graph())
value_summary = tf.Summary()
qec_summary = tf.Summary()
value_summary.value.add(tag='discount_reward_mean')
value_summary.value.add(tag='non_discount_reward_mean')
# value_summary.value.add(tag='episode')

qec_summary.value.add(tag='qec_mean')
Example #8
0
def run(env_id, seed, noise_type, layer_norm, evaluation, **kwargs):
    """Set up DDPG training of a rear-wheel path-tracking controller.

    Builds the custom path-tracking environment plus a rear-wheel feedback
    controller, parses the noise specification, constructs the replay
    memory / critic / actor, seeds everything per-rank, and delegates to
    ``training_controller.train``.

    :param env_id: accepted but not used — the custom
        ``Rear_Wheel_Path_Tracking_Feedback`` env is built directly
    :param seed: base seed; each worker uses ``seed + 1000000 * rank``
    :param noise_type: comma-separated specs such as ``'ou_0.2'``;
        ``'none'`` disables noise
    :param layer_norm: forwarded to the ``Critic``/``Actor`` constructors
    :param evaluation: when True (and rank 0), build a separate eval env
    :param kwargs: forwarded verbatim to ``training_controller.train``
    """
    # Configure things.
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        # Silence all non-master workers.
        logger.set_level(logger.DISABLED)

    # Create envs.  The Monitor path is None when no log dir is configured.
    env = Rear_Wheel_Path_Tracking_Feedback()
    env = bench.Monitor(
        env,
        logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
    #video_train = gym.wrappers.Monitor(env, '/home/jiameng/baselines/baselines/ddpg/controller_')
    # Classical feedback controller whose behavior the agent tunes; passed
    # through to training_controller.train below.
    controller = rear_wheel_feedback_control()

    if evaluation and rank == 0:
        eval_env = Rear_Wheel_Path_Tracking_Feedback()
        # Flag the eval env so it can behave differently from training.
        eval_env.eval_flag = True
        eval_env = bench.Monitor(eval_env,
                                 os.path.join(logger.get_dir(), 'gym_eval'))
        # NOTE(review): env is wrapped in bench.Monitor a second time here
        # (first wrap above), with no output file — confirm this is intended.
        env = bench.Monitor(env, None)
        #video_train = gym.wrappers.Monitor(env, '/home/jiameng/baselines/baselines/ddpg/controller')
        #video_eval = gym.wrappers.Monitor(eval_env, '/home/jiameng/baselines/baselines/ddpg/controller/eval')
    else:
        eval_env = None

    # Parse noise_type, e.g. 'ou_0.2' -> kind 'ou' with stddev 0.2.
    action_noise = None
    param_noise = None
    nb_actions = env.action_space.shape[-1]
    #nb_actions = controller.params_space().shape[-1]
    for current_noise_type in noise_type.split(','):
        current_noise_type = current_noise_type.strip()
        if current_noise_type == 'none':
            pass
        elif 'adaptive-param' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            param_noise = AdaptiveParamNoiseSpec(
                initial_stddev=float(stddev),
                desired_action_stddev=float(stddev))
        elif 'normal' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = NormalActionNoise(mu=np.zeros(nb_actions),
                                             sigma=float(stddev) *
                                             np.ones(nb_actions))
        elif 'ou' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = OrnsteinUhlenbeckActionNoise(
                mu=np.zeros(nb_actions),
                sigma=float(stddev) * np.ones(nb_actions))
        else:
            raise RuntimeError(
                'unknown noise type "{}"'.format(current_noise_type))

    # Configure components.
    #memory = Memory(limit=int(1e6), action_shape=env.action_space.shape, observation_shape=env.observation_space.shape)
    memory = Memory(limit=int(1e6),
                    action_shape=env.action_space.shape,
                    observation_shape=env.observation_space.shape)
    critic = Critic(layer_norm=layer_norm)
    actor = Actor(nb_actions, layer_norm=layer_norm)

    # Seed everything to make things reproducible; the rank offset keeps
    # workers from drawing identical random streams.
    seed = seed + 1000000 * rank
    logger.info('rank {}: seed={}, logdir={}'.format(rank, seed,
                                                     logger.get_dir()))
    tf.reset_default_graph()
    set_global_seeds(seed)
    env.seed(seed)
    if eval_env is not None:
        eval_env.seed(seed)

    # Train.  Only rank 0 times the run and reports the total runtime.
    if rank == 0:
        start_time = time.time()
    training_controller.train(env=env,
                              eval_env=eval_env,
                              controller=controller,
                              param_noise=param_noise,
                              action_noise=action_noise,
                              actor=actor,
                              critic=critic,
                              memory=memory,
                              **kwargs)
    env.close()
    if eval_env is not None:
        eval_env.close()
    if rank == 0:
        logger.info('total runtime: {}s'.format(time.time() - start_time))
Example #9
0
def run(env_id, seed, noise_type, layer_norm, evaluation, custom_log_dir,
        **kwargs):
    """Set up DDPG training with trace recording of every episode.

    Builds gym environments wrapped in ``TraceRecordingWrapper`` (writing
    to timestamped directories under ``custom_log_dir``), parses the noise
    specification, constructs the replay memory / critic / actor, seeds
    everything per-rank, and delegates to ``training.train``.

    :param env_id: gym environment id passed to ``gym.make``
    :param seed: base seed; each worker uses ``seed + 1000000 * rank``
    :param noise_type: comma-separated specs such as ``'ou_0.2'``;
        ``'none'`` disables noise
    :param layer_norm: forwarded to the ``Critic``/``Actor`` constructors
    :param evaluation: when True (and rank 0), build a separate eval env
    :param custom_log_dir: root directory for the recorded traces
    :param kwargs: forwarded verbatim to ``training.train``
    """
    # Configure things.
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        # Silence all non-master workers.
        logger.set_level(logger.DISABLED)

    # NOTE(review): os.makedirs raises if the timestamped directory already
    # exists (two runs started within the same second) — confirm acceptable.
    train_recording_path = os.path.join(
        custom_log_dir, env_id, 'train',
        datetime.now().strftime("%Y-%m-%d-%H-%M-%S"))
    os.makedirs(train_recording_path)

    # Create envs.
    env = gym.make(env_id)
    env = TraceRecordingWrapper(env,
                                directory=train_recording_path,
                                buffer_batch_size=10)
    logger.info('TraceRecordingWrapper dir: {}'.format(env.directory))
    # env = bench.Monitor(env, os.path.join(train_recording_path, 'log'))

    if evaluation and rank == 0:
        eval_recording_path = os.path.join(
            custom_log_dir, env_id, 'eval',
            datetime.now().strftime("%Y-%m-%d-%H-%M-%S"))
        os.makedirs(eval_recording_path)

        eval_env = gym.make(env_id)
        eval_env = TraceRecordingWrapper(eval_env,
                                         directory=eval_recording_path,
                                         buffer_batch_size=10)
        logger.info('TraceRecordingWrapper eval dir: {}'.format(
            eval_env.directory))
        # eval_env = bench.Monitor(eval_env, os.path.join(logger.get_dir(), 'gym_eval'))
        # env = bench.Monitor(env, None)
    else:
        eval_env = None

    # Parse noise_type, e.g. 'ou_0.2' -> kind 'ou' with stddev 0.2.
    action_noise = None
    param_noise = None
    nb_actions = env.action_space.shape[-1]
    for current_noise_type in noise_type.split(','):
        current_noise_type = current_noise_type.strip()
        if current_noise_type == 'none':
            pass
        elif 'adaptive-param' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            param_noise = AdaptiveParamNoiseSpec(
                initial_stddev=float(stddev),
                desired_action_stddev=float(stddev))
        elif 'normal' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = NormalActionNoise(mu=np.zeros(nb_actions),
                                             sigma=float(stddev) *
                                             np.ones(nb_actions))
        elif 'ou' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = OrnsteinUhlenbeckActionNoise(
                mu=np.zeros(nb_actions),
                sigma=float(stddev) * np.ones(nb_actions))
        else:
            raise RuntimeError(
                'unknown noise type "{}"'.format(current_noise_type))

    # Configure components.
    memory = Memory(limit=int(1e6),
                    action_shape=env.action_space.shape,
                    observation_shape=env.observation_space.shape)
    critic = Critic(layer_norm=layer_norm)
    actor = Actor(nb_actions, layer_norm=layer_norm)

    # Seed everything to make things reproducible; the rank offset keeps
    # workers from drawing identical random streams.
    seed = seed + 1000000 * rank
    logger.info('DDPG: rank {}: seed={}, logdir={}'.format(
        rank, seed, logger.get_dir()))
    tf.reset_default_graph()
    set_global_seeds(seed)
    env.seed(seed)
    if eval_env is not None:
        eval_env.seed(seed)

    # Train.  Only rank 0 times the run and reports the total runtime.
    if rank == 0:
        start_time = time.time()
    training.train(env=env,
                   eval_env=eval_env,
                   param_noise=param_noise,
                   action_noise=action_noise,
                   actor=actor,
                   critic=critic,
                   memory=memory,
                   **kwargs)
    env.close()
    if eval_env is not None:
        eval_env.close()
    if rank == 0:
        logger.info('total runtime: {}s'.format(time.time() - start_time))
Example #10
0
def run(env_id, seed, noise_type, layer_norm, evaluation, **kwargs):
    """Set up DDPG training on a stock-portfolio environment.

    Loads stock price history from an HDF5 file, splits it into a training
    window and a test window, builds three ``PortfolioEnv`` instances
    (training rollouts, inference on the training window, inference on the
    test window), parses the noise specification, constructs the replay
    memory / critic / actor, seeds everything per-rank, and delegates to
    ``training.train``.

    :param env_id: accepted but not used — the portfolio envs are built
        directly from the loaded history
    :param seed: base seed; each worker uses ``seed + 1000000 * rank``
    :param noise_type: comma-separated specs such as ``'ou_0.2'``;
        ``'none'`` disables noise
    :param layer_norm: forwarded to the ``Critic``/``Actor`` constructors
    :param evaluation: accepted but not used in this variant
    :param kwargs: must contain 'window_length' and 'nb_rollout_steps';
        also receives 'nb_eval_steps'/'nb_eval_test_steps' below and is
        forwarded to ``training.train``
    """
    # Configure things.
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        # Silence all non-master workers.
        logger.set_level(logger.DISABLED)

    ######################################### DEFAULT DATA #######################################
    # history: (num_stocks, num_days, features); only the first 4 feature
    # columns are kept — presumably open/high/low/close, TODO confirm.
    history, abbreviation = read_stock_history(filepath='utils/datasets/stocks_history_target.h5')
    history = history[:, :, :4]
    history[:, 1:, 0] = history[:, 0:-1, 3] # correct opens
    target_stocks = abbreviation
    # First num_training_time days are the training window; the rest is test.
    num_training_time = 1095

    # get target history
    target_history = np.empty(shape=(len(target_stocks), num_training_time, history.shape[2]))
    for i, stock in enumerate(target_stocks):
        target_history[i] = history[abbreviation.index(stock), :num_training_time, :]
    print("target:", target_history.shape)

    testing_stocks = abbreviation
    test_history = np.empty(shape=(len(testing_stocks), history.shape[1] - num_training_time,
                                   history.shape[2]))
    for i, stock in enumerate(testing_stocks):
        test_history[i] = history[abbreviation.index(stock), num_training_time:, :]
    print("test:", test_history.shape)

    window_length = kwargs['window_length']
    max_rollout_steps = kwargs['nb_rollout_steps']

    ###############################################################################################

    # Training env caps episode length at the rollout budget; the two
    # inference envs run over their full respective windows.
    train_env = PortfolioEnv(target_history, 
                             target_stocks, 
                             steps=min(max_rollout_steps, target_history.shape[1]-window_length-2), 
                             window_length=window_length)
    infer_train_env = PortfolioEnv(target_history, 
                                   target_stocks, 
                                   steps=target_history.shape[1]-window_length-2,
                                   window_length=window_length)
    infer_test_env = PortfolioEnv(test_history, 
                                  testing_stocks, 
                                  steps=test_history.shape[1]-window_length-2, 
                                  window_length=window_length)
    # Evaluation lengths are derived from the envs just built.
    kwargs['nb_eval_steps'] = infer_train_env.steps    
    kwargs['nb_eval_test_steps'] = infer_test_env.steps

    print("SPACE:", train_env.observation_space.shape)

    # Parse noise_type, e.g. 'ou_0.2' -> kind 'ou' with stddev 0.2.
    action_noise = None
    param_noise = None
    nb_actions = train_env.action_space.shape[-1]
    for current_noise_type in noise_type.split(','):
        current_noise_type = current_noise_type.strip()
        if current_noise_type == 'none':
            pass
        elif 'adaptive-param' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(stddev), desired_action_stddev=float(stddev))
        elif 'normal' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = NormalActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions))
        elif 'ou' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions))
        else:
            raise RuntimeError('unknown noise type "{}"'.format(current_noise_type))

    # Configure components.
    memory = Memory(limit=int(1e6), action_shape=train_env.action_space.shape, observation_shape=train_env.observation_space.shape)
    critic = Critic(nb_actions, layer_norm=layer_norm, asset_features_shape=train_env.asset_features_shape)
    actor = Actor(nb_actions, layer_norm=layer_norm, asset_features_shape=train_env.asset_features_shape)

    # Seed everything to make things reproducible; the rank offset keeps
    # workers from drawing identical random streams.
    seed = seed + 1000000 * rank
    logger.info('rank {}: seed={}, logdir={}'.format(rank, seed, logger.get_dir()))
    tf.reset_default_graph()
    set_global_seeds(seed)
    train_env.seed(seed)
    infer_train_env.seed(seed)
    infer_test_env.seed(seed)

    # Train.  Only rank 0 times the run and reports the total runtime.
    if rank == 0:
        start_time = time.time()
    training.train(env=train_env, train_eval_env=infer_train_env, test_eval_env=infer_test_env,
                   param_noise=param_noise, action_noise=action_noise, actor=actor, critic=critic, memory=memory, **kwargs)
    train_env.close()
    infer_train_env.close()
    infer_test_env.close()
    if rank == 0:
        logger.info('total runtime: {}s'.format(time.time() - start_time))
Example #11
0
def run(env_id, seed, noise_type, layer_norm, evaluation, **kwargs):
    """
    run the training of DDPG

    :param env_id: (str) the environment ID
    :param seed: (int) the initial random seed
    :param noise_type: (str) the wanted noises ('adaptive-param', 'normal' or 'ou'), can use multiple noise type by
        seperating them with commas
    :param layer_norm: (bool) use layer normalization
    :param evaluation: (bool) enable evaluation of DDPG training
    :param kwargs: (dict) extra keywords for the training.train function
    """

    # Only the master worker (rank 0) keeps its logger active.
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        logger.set_level(logger.DISABLED)

    # Build the (monitored) training environment.
    env = gym.make(env_id)
    env = bench.Monitor(
        env,
        logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))

    # The master additionally gets an evaluation environment when requested.
    eval_env = None
    if evaluation and rank == 0:
        eval_env = bench.Monitor(gym.make(env_id),
                                 os.path.join(logger.get_dir(), 'gym_eval'))
        env = bench.Monitor(env, None)

    # Translate the noise specification (e.g. 'ou_0.2') into noise objects.
    action_noise = None
    param_noise = None
    nb_actions = env.action_space.shape[-1]
    for spec in noise_type.split(','):
        spec = spec.strip()
        if spec == 'none':
            continue
        _, _, stddev_part = spec.rpartition('_')
        if 'adaptive-param' in spec:
            stddev = float(stddev_part)
            param_noise = AdaptiveParamNoiseSpec(initial_stddev=stddev,
                                                 desired_action_stddev=stddev)
        elif 'normal' in spec:
            sigma = float(stddev_part) * np.ones(nb_actions)
            action_noise = NormalActionNoise(mean=np.zeros(nb_actions),
                                             sigma=sigma)
        elif 'ou' in spec:
            sigma = float(stddev_part) * np.ones(nb_actions)
            action_noise = OrnsteinUhlenbeckActionNoise(
                mean=np.zeros(nb_actions), sigma=sigma)
        else:
            raise RuntimeError('unknown noise type "{}"'.format(spec))

    # Replay memory plus the actor/critic networks.
    memory = Memory(limit=int(1e6),
                    action_shape=env.action_space.shape,
                    observation_shape=env.observation_space.shape)
    critic = Critic(layer_norm=layer_norm)
    actor = Actor(nb_actions, layer_norm=layer_norm)

    # Per-rank seeding keeps workers' random streams distinct yet reproducible.
    seed = seed + 1000000 * rank
    logger.info('rank {}: seed={}, logdir={}'.format(rank, seed,
                                                     logger.get_dir()))
    tf.reset_default_graph()
    set_global_seeds(seed)
    env.seed(seed)
    if eval_env is not None:
        eval_env.seed(seed)

    # Run the training; only the master reports the total runtime.
    if rank == 0:
        start_time = time.time()
    training.train(env=env,
                   eval_env=eval_env,
                   param_noise=param_noise,
                   action_noise=action_noise,
                   actor=actor,
                   critic=critic,
                   memory=memory,
                   **kwargs)
    env.close()
    if eval_env is not None:
        eval_env.close()
    if rank == 0:
        logger.info('total runtime: {}s'.format(time.time() - start_time))
Example #12
0
def runExp(
    checkpoint_file,
    logdir,
    omega,
    random_init,
    max_steps,
    hidden_layer_size,
    n_trajectories,
    file_suffix,
    restore_variables,
    overwrite_log,
    env_id,
    seed,
    exact,
    **kwargs,
):
    """Prepare an environment/policy/model triple and launch REMPS training.

    Builds the requested benchmark (1 = cart-pole, 2 = TORCS, 3 = chain),
    derives experiment, log and checkpoint names from the run configuration,
    and hands everything to ``remps_runner.train``.  Extra keyword arguments
    (e.g. ``dual_reg``, ``policy_reg``, ``training_set_size``, ``epsilon``,
    ``initial_port``) are used here for naming and forwarded to the trainer.

    Raises:
        ValueError: if ``env_id`` is not one of {1, 2, 3}.
    """
    set_global_seeds(seed)

    # --- environment, policy and model-approximator selection --------------
    if env_id == 1:
        env = CartPole(max_steps=max_steps)
        env_name = "cartPole"
        policy = Discrete(env.observation_space.shape[0],
                          env.action_space_size, hidden_layer_size)
        # NOTE(review): an "action noise" object serves as the exact
        # cart-pole model here — confirm this is the intended class.
        model_approx = CartPoleActionNoise()
        if not exact:
            # learned model replaces the exact one
            model_approx = NNModel(env.observation_space_size, 1,
                                   name=env_name)
    elif env_id == 2:
        env = Torcs(visual=False, port=kwargs["initial_port"])
        env_name = "TORCS"
        policy = Gaussian(env.observation_space_size,
                          env.action_space_size,
                          hidden_layer_size=hidden_layer_size)
        model_approx = TorcsModel(env.observation_space_size,
                                  env.action_space_size,
                                  name=env_name + "2_actions")
    elif env_id == 3:
        env = NChainEnv(max_steps=max_steps)
        env_name = "chain"
        # single-parameter policy: fixed or randomly initialized theta
        init_theta = np.random.rand() if random_init else 0.2
        policy = OneParameterPolicy(init_theta=init_theta)
        model_approx = ChainModel()
    else:
        raise ValueError("Wrong environment index")

    # --- environment parameters --------------------------------------------
    if random_init:
        bounds = env.get_params_bounds()
        omega = np.random.uniform(low=bounds[:, 0], high=bounds[:, 1])
    env.set_params(omega)

    # --- experiment / log / checkpoint naming ------------------------------
    algo_name = "REMPS"
    experiment_name = (
        f"{algo_name}/{env_name}-omega{omega}-traj{n_trajectories}"
        f"-DualReg{kwargs['dual_reg']}PolReg-{kwargs['policy_reg']}"
        f"TrainingSet{kwargs['training_set_size']}"
    )
    if exact:
        experiment_name += "exact"
    experiment_name += str(seed)
    if file_suffix is not None:
        experiment_name += f"-{file_suffix}"

    if logdir is None:
        logdir = (f"tf_logs/model_policy_logs/{experiment_name}"
                  f"eps-{kwargs['epsilon']}")

    now = datetime.now()

    # never overwrite an existing log directory unless explicitly allowed
    if os.path.isdir(logdir) and not overwrite_log:
        logdir = f"{logdir}-{now.strftime('%Y%m%d-%H%M%S')}/"

    if checkpoint_file is None:
        experiment_name = (f"model-policy/{experiment_name}"
                           f"eps-{kwargs['epsilon']}")
        checkpoint_file = f"tf_checkpoint/{experiment_name}/"

    if not restore_variables:
        # timestamp the directory instead of clobbering old checkpoints
        if os.path.isdir(checkpoint_file):
            checkpoint_file = (checkpoint_file[:-1] +
                               now.strftime("%Y%m%d-%H%M%S") + "/")
        else:
            os.makedirs(checkpoint_file)

    checkpoint_file += "model.ckpt"

    print(f"Logs will be saved into: {logdir}")
    print(f"Checkpoints will be saved into: {checkpoint_file}")

    remps_runner.train(
        env=env,
        policy=policy,
        model_approximator=model_approx,
        n_trajectories=n_trajectories,
        checkpoint_file=checkpoint_file,
        logdir=logdir,
        omega=omega,
        restore_variables=restore_variables,
        exact=exact,
        **kwargs,
    )
Example #13
0
def run(seed, parameter_noise, layer_norm, evaluation, flip_state,
        full, action_repeat, fail_reward, exclude_centering_frame,
        checkpoint_dir, log_dir, session_path, last_training_step,
        integrator_accuracy, experiment_name, **kwargs):
    """Configure and launch a DDPG training run.

    Builds the environment, noise processes, replay memory and actor/critic
    networks, wraps the run metadata in a ``LearningSession``, and hands off
    to ``training.train``.  Remaining keyword arguments are forwarded to the
    trainer after the launcher-only keys (``func``, ``num_timesteps``,
    ``noise_type``) are removed.

    Raises:
        AssertionError: if ``num_timesteps`` is supplied but disagrees with
            ``nb_epochs * nb_epoch_cycles * nb_rollout_steps``.
    """
    # We don't directly specify timesteps for this script, so if they are
    # supplied make sure they agree with the epoch/cycle/rollout breakdown.
    # An explicit raise (rather than `assert`) keeps this validation active
    # under `python -O`, where assert statements are stripped.
    if kwargs['num_timesteps'] is not None:
        expected_timesteps = (kwargs['nb_epochs'] *
                              kwargs['nb_epoch_cycles'] *
                              kwargs['nb_rollout_steps'])
        if kwargs['num_timesteps'] != expected_timesteps:
            raise AssertionError(
                'num_timesteps ({}) does not match nb_epochs * '
                'nb_epoch_cycles * nb_rollout_steps ({})'.format(
                    kwargs['num_timesteps'], expected_timesteps))

    tmp_log, tmp_chkpt = get_log_and_checkpoint_dirs(experiment_name)

    # Fall back to generated locations when none were given.
    if log_dir is None:
        log_dir = tmp_log
    if checkpoint_dir is None:
        checkpoint_dir = tmp_chkpt

    # Configure things: only the MPI master logs.
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        logger.set_level(logger.DISABLED)

    # Main env (no separate evaluation env is created by this launcher).
    env = create_environment(False, full, action_repeat,
                             fail_reward, exclude_centering_frame, integrator_accuracy)
    env.reset()
    eval_env = None

    # Noise: optional adaptive parameter noise plus OU action noise.
    nb_actions = env.action_space.shape[-1]
    if parameter_noise:
        param_noise = AdaptiveParamNoiseSpec(
            initial_stddev=0.2, desired_action_stddev=0.2)
    else:
        param_noise = None
    action_noise = OrnsteinUhlenbeckActionNoise(
        mu=np.zeros(nb_actions), sigma=0.2, theta=0.1)

    # Configure components.
    memory = ReplayBufferFlip(int(5e6),
                              flip_state,
                              env.get_observation_names(),
                              env.action_space.shape,
                              env.observation_space.shape)
    actor = Actor(nb_actions, layer_norm=layer_norm)
    critic = Critic(layer_norm=layer_norm)

    # Seed everything to make things reproducible; each MPI worker gets a
    # distinct seed derived from its rank.
    seed = seed + 1000000 * rank
    logger.info('rank {}: seed={}, logdir={}'.format(
        rank, seed, logger.get_dir()))
    tf.reset_default_graph()
    set_global_seeds(seed)
    env.seed(seed)
    if eval_env is not None:
        eval_env.seed(seed)

    # Only rank 0 times the run (logging is disabled elsewhere anyway).
    if rank == 0:
        start_time = time.time()

    # Create the LearningSession from the launcher parameters.
    del kwargs['func']  # CLI dispatch entry, not a trainer argument
    sess_args = pack_run_params(seed, parameter_noise, layer_norm, evaluation, flip_state,
                                full, action_repeat, fail_reward, exclude_centering_frame, **kwargs)
    learning_session = LearningSession(
        session_path, checkpoint_dir, log_dir, last_training_step, **sess_args)

    # Strip launcher-only keys before forwarding to the trainer.
    del kwargs['num_timesteps']
    del kwargs['noise_type']
    training.train(env=env, action_noise=action_noise, param_noise=param_noise,
                   actor=actor, critic=critic, memory=memory,
                   visualize=False, full=full, action_repeat=action_repeat,
                   fail_reward=fail_reward, exclude_centering_frame=exclude_centering_frame,
                   learning_session=learning_session, integrator_accuracy=integrator_accuracy,
                   **kwargs)

    env.close()
    if eval_env is not None:
        eval_env.close()
    if rank == 0:
        logger.info('total runtime: {}s'.format(time.time() - start_time))
Example #14
0
def run(env_id, seed, noise_type, layer_norm, evaluation, **kwargs):
    """Build envs, noise, replay memory and networks for DDPG, then train.

    ``noise_type`` is a comma-separated list of specs; each spec is
    ``none``, ``adaptive-param_<stddev>``, ``normal_<stddev>`` or
    ``ou_<stddev>``.  Remaining keyword arguments are forwarded to
    ``training.train``.
    """
    # Configure things: only the MPI master (rank 0) logs.
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        logger.set_level(logger.DISABLED)

    # Create envs.
    env = gym.make(env_id)
    env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))

    if evaluation and rank==0:
        eval_env = gym.make(env_id)
        eval_env = bench.Monitor(eval_env, os.path.join(logger.get_dir(), 'gym_eval'))
        # NOTE(review): env was already wrapped in bench.Monitor above, so
        # this wraps it a second time (now without a log file) — confirm
        # the double wrapping is intended.
        env = bench.Monitor(env, None)
    else:
        eval_env = None

    # Parse noise_type: each entry is "<name>" or "<name>_<stddev>".
    action_noise = None
    param_noise = None
    nb_actions = env.action_space.shape[-1]
    for current_noise_type in noise_type.split(','):
        current_noise_type = current_noise_type.strip()
        if current_noise_type == 'none':
            pass
        elif 'adaptive-param' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(stddev), desired_action_stddev=float(stddev))
        elif 'normal' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = NormalActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions))
        elif 'ou' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions))
        else:
            raise RuntimeError('unknown noise type "{}"'.format(current_noise_type))

    # Configure components.
    memory = Memory(limit=int(1e6), action_shape=env.action_space.shape, observation_shape=env.observation_space.shape)
    critic = Critic(layer_norm=layer_norm)
    actor = Actor(nb_actions, layer_norm=layer_norm)

    # Seed everything to make things reproducible; each MPI worker gets a
    # distinct seed derived from its rank.
    seed = seed + 1000000 * rank
    logger.info('rank {}: seed={}, logdir={}'.format(rank, seed, logger.get_dir()))
    tf.reset_default_graph()
    set_global_seeds(seed)
    env.seed(seed)
    if eval_env is not None:
        eval_env.seed(seed)

    # Disable logging for rank != 0 to avoid noise; only rank 0 times the run.
    if rank == 0:
        start_time = time.time()
    training.train(env=env, eval_env=eval_env, param_noise=param_noise,
        action_noise=action_noise, actor=actor, critic=critic, memory=memory, **kwargs)
    env.close()
    if eval_env is not None:
        eval_env.close()
    if rank == 0:
        logger.info('total runtime: {}s'.format(time.time() - start_time))