def run_baselines(env, seed, log_dir):
    '''
    Create a baselines model and run training.

    Replace TRPO and its training call with the algorithm you want to run.

    :param env: Environment of the task.
    :param seed: Random seed for the trial.
    :param log_dir: Log dir path.
    :return: Path to the output progress.csv file.
    '''
    with tf.compat.v1.Session().as_default():
        baselines_logger.configure(log_dir)

        def policy_fn(name, ob_space, ac_space):
            return MlpPolicy(name=name,
                             ob_space=ob_space,
                             ac_space=ac_space,
                             hid_size=32,
                             num_hid_layers=2)

        trpo_mpi.learn(env,
                       policy_fn,
                       timesteps_per_batch=1024,
                       max_kl=0.01,
                       cg_iters=10,
                       cg_damping=0.1,
                       max_timesteps=int(1e6),
                       gamma=0.99,
                       lam=0.98,
                       vf_iters=5,
                       vf_stepsize=1e-3)
        env.close()

    return osp.join(log_dir, 'progress.csv')
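A minimal way to call this helper might look like the sketch below; the environment id and log directory are placeholders, and gym plus the imports used above are assumed to be available.

# Hypothetical usage sketch for run_baselines; env id and log_dir are placeholders.
import gym

env = gym.make('Pendulum-v0')
csv_path = run_baselines(env, seed=1, log_dir='/tmp/trpo_trial_1')
print('Progress log written to', csv_path)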
Example #2
def train(env_id, num_frames, seed):
    from baselines.trpo_mpi.nosharing_cnn_policy import CnnPolicy
    from baselines.trpo_mpi import trpo_mpi
    import baselines.common.tf_util as U
    rank = MPI.COMM_WORLD.Get_rank()
    sess = U.single_threaded_session()
    sess.__enter__()
    if rank != 0:
        logger.set_level(logger.DISABLED)


    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)
    env = gym.make(env_id)
    def policy_fn(name, ob_space, ac_space): #pylint: disable=W0613
        return CnnPolicy(name=name, ob_space=env.observation_space, ac_space=env.action_space)
    env = bench.Monitor(env, logger.get_dir() and
                        osp.join(logger.get_dir(), "%i.monitor.json" % rank))
    env.seed(workerseed)
    gym.logger.setLevel(logging.WARN)

    env = wrap_train(env)
    # Convert frames to agent steps (Atari frame skip of 4) with a 10% margin
    num_timesteps = int(num_frames / 4 * 1.1)
    env.seed(workerseed)

    trpo_mpi.learn(env, policy_fn, timesteps_per_batch=512, max_kl=0.001, cg_iters=10, cg_damping=1e-3,
        max_timesteps=num_timesteps, gamma=0.98, lam=1.0, vf_iters=3, vf_stepsize=1e-4, entcoeff=0.00)
    env.close()
Example #3
def train(env_id, num_timesteps, seed):
    from baselines.trpo_mpi.nosharing_cnn_policy import CnnPolicy
    from baselines.trpo_mpi import trpo_mpi
    import baselines.common.tf_util as U
    rank = MPI.COMM_WORLD.Get_rank()
    sess = U.single_threaded_session()
    sess.__enter__()
    if rank == 0:
        logger.configure()
    else:
        logger.configure(format_strs=[])

    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)
    env = make_atari(env_id)
    def policy_fn(name, ob_space, ac_space): #pylint: disable=W0613
        return CnnPolicy(name=name, ob_space=env.observation_space, ac_space=env.action_space)
    env = bench.Monitor(env, logger.get_dir() and osp.join(logger.get_dir(), str(rank)))
    env.seed(workerseed)

    env = wrap_deepmind(env)
    env.seed(workerseed)

    trpo_mpi.learn(env, policy_fn, timesteps_per_batch=512, max_kl=0.001, cg_iters=10, cg_damping=1e-3,
        max_timesteps=int(num_timesteps * 1.1), gamma=0.98, lam=1.0, vf_iters=3, vf_stepsize=1e-4, entcoeff=0.00)
    env.close()
Example #4
def train(env_id, num_timesteps, seed):
    import baselines.common.tf_util as U
    sess = U.single_threaded_session()
    sess.__enter__()

    rank = MPI.COMM_WORLD.Get_rank()
    if rank == 0:
        logger.configure()
    else:
        logger.configure(format_strs=[])
        logger.set_level(logger.DISABLED)
    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()

    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name,
                         ob_space=ob_space,
                         ac_space=ac_space,
                         hid_size=32,
                         num_hid_layers=2)

    env = make_mujoco_env(env_id, workerseed)
    trpo_mpi.learn(env,
                   policy_fn,
                   timesteps_per_batch=1024,
                   max_kl=0.01,
                   cg_iters=10,
                   cg_damping=0.1,
                   max_timesteps=num_timesteps,
                   gamma=0.99,
                   lam=0.98,
                   vf_iters=5,
                   vf_stepsize=1e-3)
    env.close()
Example #5
def train(env,
          seed,
          policy_entcoeff,
          num_timesteps,
          num_iters,
          checkpoint_dir,
          gamma,
          task_name=None):

    from baselines.trpo_mpi import trpo_mpi
    # Set up for MPI seed
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        logger.set_level(logger.DISABLED)
    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)
    env.seed(workerseed)
    trpo_mpi.learn(network=args.network,
                   env=env,
                   total_timesteps=num_timesteps,
                   ent_coef=policy_entcoeff,
                   max_iters=num_iters,
                   ckpt_dir=checkpoint_dir,
                   timesteps_per_batch=args.batchsize,
                   max_kl=args.max_kl,
                   cg_iters=args.cg_iters,
                   cg_damping=args.cg_damping,
                   gamma=gamma,
                   lam=0.97,
                   vf_iters=args.vf_iters,
                   vf_stepsize=args.vf_stepsize,
                   task_name=task_name,
                   num_layers=args.policy_hidden_layer,
                   num_hidden=args.policy_hidden_size)
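This snippet reads its remaining settings from a module-level args object that is not shown. Judging from the attributes it accesses, a sketch of the corresponding argument parser could look like the following; the flag names mirror the attributes used above, but the defaults are assumptions.

# Hypothetical argparse setup matching the args.* attributes referenced above.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--network', type=str, default='mlp')
parser.add_argument('--batchsize', type=int, default=1024)
parser.add_argument('--max_kl', type=float, default=0.01)
parser.add_argument('--cg_iters', type=int, default=10)
parser.add_argument('--cg_damping', type=float, default=0.1)
parser.add_argument('--vf_iters', type=int, default=5)
parser.add_argument('--vf_stepsize', type=float, default=1e-3)
parser.add_argument('--policy_hidden_layer', type=int, default=2)
parser.add_argument('--policy_hidden_size', type=int, default=64)
args = parser.parse_args()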
Example #6
def train(num_timesteps):

    env = GRID(grid_size=36, square_size=4, stochastic=True)
    import baselines.common.tf_util as U
    sess = U.single_threaded_session()
    sess.__enter__()

    rank = MPI.COMM_WORLD.Get_rank()
    if rank == 0:
        logger.configure()
    else:
        logger.configure(format_strs=[])
        logger.set_level(logger.DISABLED)

    def policy_fn(name, ob_space, ac_space):
        return CnnPolicy(name=name,
                         ob_space=env.observation_space,
                         ac_space=env.action_space)

    trpo_mpi.learn(env,
                   policy_fn,
                   timesteps_per_batch=1024,
                   max_kl=0.01,
                   cg_iters=10,
                   cg_damping=0.1,
                   max_timesteps=num_timesteps,
                   gamma=0.99,
                   lam=0.98,
                   vf_iters=5,
                   vf_stepsize=1e-3)
    env.close()
def train(env_id, num_timesteps, seed, dropout_on_V, dropout_tau_v, lengthscale_V, V_keep_prob, mc_samples, override_reg, optim_stepsize, vf_hid_size, activation_vf, sample_dropout):
    from baselines.ppo1 import mlp_policy
    from baselines.trpo_mpi import trpo_mpi
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(seed)
    env = gym.make(env_id)

    # dropout_on_V decides whether dropout is applied to the value function.
    pol_tau = 1.

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    ## MAIN CHANGES
                                    hid_size_V=vf_hid_size,
                                    hid_size_actor=64, num_hid_layers=2,
                                    V_keep_prob=V_keep_prob, mc_samples=mc_samples,
                                    layer_norm=False, activation_critic=activation_vf,
                                    activation_actor=tf.nn.relu, dropout_on_V=dropout_on_V,
                                    sample_dropout=sample_dropout)

    env = bench.Monitor(env, logger.get_dir() and
        osp.join(logger.get_dir(), "monitor.json"))
    env.seed(seed)
    gym.logger.setLevel(logging.WARN)
    trpo_mpi.learn(env, policy_fn, timesteps_per_batch=1024, max_kl=0.01, cg_iters=10, cg_damping=0.1,
        max_timesteps=num_timesteps, gamma=0.99, lam=0.98, vf_iters=5, vf_stepsize=optim_stepsize,
        ## MAIN CHANGES
        dropout_on_V=dropout_on_V,
        dropout_tau_V=dropout_tau_v,
        override_reg=override_reg)
    env.close()
def train_trpo(env_id, num_timesteps, seed):
    import baselines.common.tf_util as U
    sess = U.single_threaded_session()
    sess.__enter__()

    rank = MPI.COMM_WORLD.Get_rank()
    if rank == 0:
        logger.configure()
    else:
        logger.configure(format_strs=[])
        logger.set_level(logger.DISABLED)
    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
            hid_size=32, num_hid_layers=2)

    set_global_seeds(workerseed)
    env = gym.make(env_id)
    env.seed(workerseed)

    # timesteps_per_batch = 1024
    timesteps_per_batch = 2048

    # trpo_mpi.learn(network='mlp', env=env, total_timesteps=num_timesteps,
    #                timesteps_per_batch=timesteps_per_batch, max_kl=0.01, cg_iters=10,
    #                cg_damping=0.1, gamma=0.99, lam=0.98, vf_iters=5, vf_stepsize=1e-3,
    #                seed=workerseed, num_layers=2, num_hidden=32)

    trpo_mpi.learn(network='mlp', env=env, seed=workerseed, total_timesteps=num_timesteps)
    env.close()
def train(env_id, num_timesteps, seed, flight_log_dir, ckpt_dir, model_ckpt_path):
    import baselines.common.tf_util as U
    sess = U.single_threaded_session()
    sess.__enter__()

    rank = MPI.COMM_WORLD.Get_rank()
    if rank == 0:
        logger.configure()
    else:
        logger.configure(format_strs=[])
        logger.set_level(logger.DISABLED)
    workerseed = seed + 1000000 * rank
    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
            hid_size=32, num_hid_layers=2)
    flight_log = FlightLog(flight_log_dir)
    env = gym.make(env_id)
    env.seed(workerseed)
    set_global_seeds(workerseed)
    trpo_mpi.learn(env, policy_fn, timesteps_per_batch=1024, max_kl=0.01, cg_iters=10, cg_damping=0.1,
                   max_timesteps=num_timesteps, gamma=0.99, lam=0.98, vf_iters=5,
                   vf_stepsize=1e-3,
                   flight_log=flight_log,
                   ckpt_dir=ckpt_dir,
                   model_ckpt_path=model_ckpt_path)
    env.close()
Example #10
def train(env_id, rank, environment_args, stacked_obs, num_hidden_units,
          max_iters, checkpoint_dir, log_dir, timesteps_per_batch, render,
          seed):

    sess = U.single_threaded_session()
    sess.__enter__()

    if rank == 0:
        logger.configure()
    else:
        logger.set_level(logger.DISABLED)
    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)
    env = gym.make(env_id)
    if environment_args is not None:
        try:
            env.unwrapped.set_environment_config(environment_args)
        except Exception:
            print("Can't set the configuration to the environment!")

    if rank == 0:
        with open(osp.join(checkpoint_dir, "args.txt"), "a") as f:
            f.write("\nEnvironment argument:\n")
            for k, v in env.unwrapped._config.items():
                f.write("{}: {}\n".format(k, v))

    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name,
                         ob_space=env.observation_space,
                         ac_space=env.action_space,
                         hid_size=num_hidden_units,
                         num_hid_layers=2)

    env = bench.Monitor(env,
                        logger.get_dir()
                        and osp.join(logger.get_dir(), str(rank)),
                        allow_early_resets=True)

    # Support stacked observation frames
    env = FrameStack_Mujoco(env, stacked_obs)
    env.seed(workerseed)
    gym.logger.setLevel(logging.WARN)

    trpo_mpi.learn(env,
                   policy_fn,
                   checkpoint_dir,
                   log_dir,
                   render=render,
                   timesteps_per_batch=timesteps_per_batch,
                   max_kl=0.01,
                   cg_iters=10,
                   cg_damping=0.1,
                   max_iters=max_iters,
                   gamma=0.99,
                   lam=0.98,
                   vf_iters=5,
                   vf_stepsize=1e-3)
    env.close()
Example #11
def run_baselines(env, seed, log_dir):
    """Create Baseline model and training.

    Args:
        env (dict): Environment of the task.
        seed (int): Random positive integer for the trial.
        log_dir (str): Log dir path.

    Returns:
        str: Path to output csv file

    """
    ncpu = max(multiprocessing.cpu_count() // 2, 1)
    config = tf.compat.v1.ConfigProto(allow_soft_placement=True,
                                      intra_op_parallelism_threads=ncpu,
                                      inter_op_parallelism_threads=ncpu)
    tf.compat.v1.Session(config=config).__enter__()

    # Set up logger for baselines
    configure(dir=log_dir, format_strs=['stdout', 'log', 'csv', 'tensorboard'])
    baselines_logger.info('rank {}: seed={}, logdir={}'.format(
        0, seed, baselines_logger.get_dir()))

    set_global_seeds(seed)

    def policy_fn(name, ob_space, ac_space):
        """Create policy for baselines.

        Args:
            name (str): Policy name.
            ob_space (gym.spaces.Box) : Observation space.
            ac_space (gym.spaces.Box) : Action space.

        Returns:
            baselines.ppo1.mlp_policy: MLP policy for baselines.

        """
        return MlpPolicy(name=name,
                         ob_space=ob_space,
                         ac_space=ac_space,
                         hid_size=hyper_parameters['hidden_sizes'][0],
                         num_hid_layers=len(hyper_parameters['hidden_sizes']))

    trpo_mpi.learn(env,
                   policy_fn,
                   timesteps_per_batch=hyper_parameters['batch_size'],
                   max_kl=hyper_parameters['max_kl'],
                   cg_iters=10,
                   cg_damping=0.1,
                   max_timesteps=(hyper_parameters['batch_size'] *
                                  hyper_parameters['n_epochs']),
                   gamma=hyper_parameters['discount'],
                   lam=hyper_parameters['gae_lambda'],
                   vf_iters=5,
                   vf_stepsize=1e-3)

    return osp.join(log_dir, 'progress.csv')
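The snippet assumes a module-level hyper_parameters dict. Based on the keys it reads, a plausible definition is sketched below; the values are illustrative rather than the original benchmark settings.

# Hypothetical hyper_parameters dict; keys match those read above, values are illustrative.
hyper_parameters = {
    'hidden_sizes': [32, 32],
    'batch_size': 1024,
    'n_epochs': 500,
    'max_kl': 0.01,
    'discount': 0.99,
    'gae_lambda': 0.98,
}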
Example #12
def main():
    # Create the Create2 mover environment
    env = Create2MoverEnv(90, port='/dev/ttyUSB0', obs_history=1, dt=0.15)
    env = NormalizedEnv(env)

    # Start environment processes
    env.start()

    # Create baselines trpo policy function
    sess = U.single_threaded_session()
    sess.__enter__()

    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name,
                         ob_space=ob_space,
                         ac_space=ac_space,
                         hid_size=32,
                         num_hid_layers=2)

    # Create and start plotting process
    plot_running = Value('i', 1)
    shared_returns = Manager().dict({
        "write_lock": False,
        "episodic_returns": [],
        "episodic_lengths": [],
    })
    # Spawn plotting process
    pp = Process(target=plot_create2_mover,
                 args=(env, 2048, shared_returns, plot_running))
    pp.start()

    # Create callback function for logging data from baselines TRPO learn
    kindred_callback = create_callback(shared_returns)

    # Train baselines TRPO
    learn(env,
          policy_fn,
          max_timesteps=40000,
          timesteps_per_batch=2048,
          max_kl=0.05,
          cg_iters=10,
          cg_damping=0.1,
          vf_iters=5,
          vf_stepsize=0.001,
          gamma=0.995,
          lam=0.995,
          callback=kindred_callback)

    # Safely terminate plotter process
    plot_running.value = 0  # shut down plotting process
    time.sleep(2)
    pp.join()

    env.close()
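create_callback is not shown in this example. The old-style trpo_mpi.learn invokes callback(locals(), globals()) once per iteration, so a rough sketch of such a factory could look like the following; the locals keys and locking scheme are assumptions, not the actual SenseAct implementation.

# Hypothetical sketch of a callback factory; the real create_callback may differ.
def create_callback(shared_returns):
    def kindred_callback(locals_, globals_):
        seg = locals_.get('seg')  # rollout segment from the previous iteration
        if seg is None:           # not yet available on the very first call
            return
        shared_returns['write_lock'] = True
        shared_returns['episodic_returns'] = (shared_returns['episodic_returns'] +
                                              list(seg['ep_rets']))
        shared_returns['episodic_lengths'] = (shared_returns['episodic_lengths'] +
                                              list(seg['ep_lens']))
        shared_returns['write_lock'] = False
    return kindred_callback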
Example #13
def run_baselines(env, seed, log_dir):
    """Create Baseline model and training.

    Args:
        env (dict): Environment of the task.
        seed (int): Random positive integer for the trial.
        log_dir (str): Log dir path.

    Returns:
        str: Path to output csv file

    """
    ncpu = max(multiprocessing.cpu_count() // 2, 1)
    config = tf.compat.v1.ConfigProto(allow_soft_placement=True,
                                      intra_op_parallelism_threads=ncpu,
                                      inter_op_parallelism_threads=ncpu)
    tf.compat.v1.Session(config=config).__enter__()

    # Set up logger for baselines
    configure(dir=log_dir, format_strs=['stdout', 'log', 'csv', 'tensorboard'])
    baselines_logger.info('rank {}: seed={}, logdir={}'.format(
        0, seed, baselines_logger.get_dir()))

    set_global_seeds(seed)

    policy_network = 'mlp'
    trpo_mpi.learn(network=policy_network,
                   env=env,
                   total_timesteps=hyper_parameters['batch_size'] *
                   hyper_parameters['n_epochs'],
                   timesteps_per_batch=hyper_parameters['batch_size'],
                   gamma=hyper_parameters['discount'],
                   lam=hyper_parameters['gae_lambda'],
                   max_kl=hyper_parameters['max_kl'],
                   cg_iters=10,
                   cg_damping=0.1,
                   vf_iters=5,
                   vf_stepsize=1e-3)

    log_file_path = osp.join(log_dir, 'progress.csv')

    with open(log_file_path, 'r') as rf:
        reader = csv.reader(rf)
        columns = [[
            'Evaluation/AverageReturn' if c == 'EpRewMean' else c
            for c in next(reader)
        ] + ['Evaluation/Iteration']]
        new_lines = columns + [line + [i] for i, line in enumerate(reader)]

    with open(log_file_path, 'w') as wf:
        writer = csv.writer(wf, lineterminator='\n')
        writer.writerows(new_lines)

    return log_file_path
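In other words, the header cell EpRewMean is renamed to Evaluation/AverageReturn, an Evaluation/Iteration column is appended, and each data row gains its zero-based iteration index as the final value.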
Example #14
def train(env_id, num_timesteps, seed):
    import baselines.common.tf_util as U
    sess = U.single_threaded_session()
    sess.__enter__()
    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()

    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name,
                         ob_space=ob_space,
                         ac_space=ac_space,
                         hid_size=32,
                         num_hid_layers=2)

    # Create a new base directory like /tmp/openai-2018-05-21-12-27-22-552435
    log_dir = os.path.join(
        energyplus_logbase_dir(),
        datetime.datetime.now().strftime("openai-%Y-%m-%d-%H-%M-%S-%f"))
    if not os.path.exists(log_dir + '/output'):
        os.makedirs(log_dir + '/output')
    os.environ["ENERGYPLUS_LOG"] = log_dir
    model = os.getenv('ENERGYPLUS_MODEL')
    if model is None:
        print('Environment variable ENERGYPLUS_MODEL is not defined')
        raise SystemExit(1)
    weather = os.getenv('ENERGYPLUS_WEATHER')
    if weather is None:
        print('Environment variable ENERGYPLUS_WEATHER is not defined')
        raise SystemExit(1)

    rank = MPI.COMM_WORLD.Get_rank()
    if rank == 0:
        print('train: init logger with dir={}'.format(log_dir))  #XXX
        logger.configure(log_dir)
    else:
        logger.configure(format_strs=[])
        logger.set_level(logger.DISABLED)

    env = make_energyplus_env(env_id, workerseed)

    trpo_mpi.learn(
        env,
        policy_fn,
        max_timesteps=num_timesteps,
        #timesteps_per_batch=1*1024, max_kl=0.01, cg_iters=10, cg_damping=0.1,
        timesteps_per_batch=16 * 1024,
        max_kl=0.01,
        cg_iters=10,
        cg_damping=0.1,
        gamma=0.99,
        lam=0.98,
        vf_iters=5,
        vf_stepsize=1e-3)
    env.close()
Example #15
def run(cfg, num_timesteps, seed, hid_size, **kwargs):
    dir_path = os.path.dirname(os.path.realpath(__file__))
    logger.configure(dir_path, ['stdout'])

    sess = U.single_threaded_session()
    sess.__enter__()

    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        logger.set_level(logger.DISABLED)

    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)
    env = GRLEnv(cfg)
    env.set_test(False)

    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name,
                         ob_space=env.observation_space,
                         ac_space=env.action_space,
                         hid_size=hid_size,
                         num_hid_layers=2)

    env = MyMonitor(env,
                    osp.join(logger.get_dir(), kwargs['output']),
                    report='learn')
    env.seed(workerseed)
    gym.logger.setLevel(logging.WARN)

    if kwargs['evaluation']:
        trpo_mpi.play(sess,
                      env,
                      policy_fn,
                      timesteps_per_batch=1024,
                      load_file=kwargs['load_file'])
    else:
        trpo_mpi.learn(sess,
                       env,
                       policy_fn,
                       timesteps_per_batch=1024,
                       max_kl=0.01,
                       cg_iters=10,
                       cg_damping=0.1,
                       max_timesteps=num_timesteps,
                       gamma=0.99,
                       lam=0.98,
                       vf_iters=5,
                       vf_stepsize=1e-3,
                       **kwargs)

    env.close()
Example #16
def train(env_id, seed):
    from baselines.trpo_mpi.nosharing_cnn_policy import CnnPolicy
    from baselines.trpo_mpi import trpo_mpi
    import baselines.common.tf_util as U
    rank = MPI.COMM_WORLD.Get_rank()
    sess = U.single_threaded_session()
    sess.__enter__()
    if rank != 0:
        logger.set_level(logger.DISABLED)

    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)
    env = gym.make(env_id)

    def policy_fn(name, ob_space, ac_space):  #pylint: disable=W0613
        return CnnPolicy(name=name,
                         ob_space=env.observation_space,
                         ac_space=env.action_space)

    env = bench.Monitor(
        env,
        logger.get_dir()
        and osp.join(logger.get_dir(), "%i.monitor.json" % rank))
    env.seed(workerseed)
    gym.logger.setLevel(logging.WARN)

    env = wrap_train(env)
    env.seed(workerseed)

    task_name = "trpo." + args.env.split("-")[0] + "." + ("%.2f" %
                                                          args.entcoeff)
    args.checkpoint_dir = osp.join(args.checkpoint_dir, task_name)
    trpo_mpi.learn(env,
                   policy_fn,
                   timesteps_per_batch=512,
                   max_kl=0.001,
                   cg_iters=10,
                   cg_damping=1e-3,
                   max_timesteps=args.num_timesteps,
                   gamma=0.98,
                   lam=1.0,
                   vf_iters=3,
                   vf_stepsize=1e-4,
                   entcoeff=args.entcoeff,
                   sample_stochastic=args.sample_stochastic,
                   task_name=task_name,
                   save_per_iter=args.save_per_iter,
                   ckpt_dir=args.checkpoint_dir,
                   load_model_path=args.load_model_path,
                   task=args.task)
    env.close()
Example #17
def train(env_id, num_timesteps, seed):
    import baselines.common.tf_util as U
    # sess = U.single_threaded_session()
    # sess.__enter__()
    gpu_options = tf.GPUOptions(allow_growth=False,
                                per_process_gpu_memory_fraction=0.2)
    tf_config = tf.ConfigProto(gpu_options=gpu_options)
    sess = tf.Session(config=tf_config)
    sess.__enter__()

    rank = MPI.COMM_WORLD.Get_rank()
    if rank == 0:
        logger.configure()
    else:
        logger.configure(format_strs=[])
        logger.set_level(logger.DISABLED)
    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()

    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name,
                         ob_space=ob_space,
                         ac_space=ac_space,
                         hid_size=32,
                         num_hid_layers=2)

    # env = make_mujoco_env(env_id, workerseed)
    env = normalize(InvertedDoublePendulumEnv(), normalize_obs=False)
    env_t = normalize(InvertedDoublePendulumEnv(), normalize_obs=False)
    # Create the tester before training, since it is passed into learn() below.
    time_step_holder = TimeStepHolder(0, 0)
    tester = Tester(episodes=100,
                    period=10,
                    env=env_t,
                    time_step_holder=time_step_holder,
                    file='./results',
                    session=sess)
    trpo_mpi.learn(env,
                   policy_fn,
                   timesteps_per_batch=1024,
                   max_kl=0.01,
                   cg_iters=10,
                   cg_damping=0.1,
                   max_timesteps=num_timesteps,
                   gamma=0.99,
                   lam=0.98,
                   vf_iters=5,
                   vf_stepsize=1e-3,
                   tester=tester)

    env.close()
Example #18
def train_trpo(env_id, num_timesteps, seed):
    import baselines.common.tf_util as U
    sess = U.single_threaded_session()
    sess.__enter__()

    rank = MPI.COMM_WORLD.Get_rank()
    if rank == 0:
        logger.configure()
    else:
        logger.configure(format_strs=[])
        logger.set_level(logger.DISABLED)
    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()

    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name,
                         ob_space=ob_space,
                         ac_space=ac_space,
                         hid_size=32,
                         num_hid_layers=2)

    set_global_seeds(workerseed)
    env = HistoryEnv(env_id,
                     hist_len=1,
                     history_type='fully_observable',
                     kwargs={
                         'board_size': 5,
                         'num_rocks': 7
                     })
    #env = HistoryEnv(env_id, hist_len=15)
    print("ob_space: " + str(env.observation_space))
    print("ac_space: " + str(env.action_space))
    env.seed(workerseed)

    #timesteps_per_batch=1024
    #timesteps_per_batch=2048
    timesteps_per_batch = 5000

    trpo_mpi.learn(env,
                   policy_fn,
                   timesteps_per_batch=timesteps_per_batch,
                   max_kl=0.01,
                   cg_iters=10,
                   cg_damping=0.1,
                   max_timesteps=num_timesteps,
                   gamma=0.99,
                   lam=0.98,
                   vf_iters=5,
                   vf_stepsize=1e-3)
    env.close()
Example #19
def train(env_id, num_timesteps, hidden_size, num_hidden_layers, seed, rank):
    with U.make_session(3) as sess:
        worker_seed = seed + 10000 * rank
        set_global_seeds(worker_seed)

        # env = bench.Monitor(env, logger.get_dir() and
        # osp.join(logger.get_dir(), str(rank)))

        try:
            env = gym.make(env_id)
            env.seed(worker_seed)

            # Rendering and saving callback
            episode = 0

            def episode_callback(locals, globals):
                nonlocal episode
                episode += 1
                print("----- Episode {} -----".format(episode))
                env.render()
                if episode % 20 == 0:
                    save(sess)

            # Policy function
            policy_fn = lambda name, ob_space, ac_space: MlpPolicy(
                name=name,
                ob_space=env.observation_space,
                ac_space=env.action_space,
                hid_size=hidden_size,
                num_hid_layers=num_hidden_layers
            )

            # Learning
            trpo_mpi.learn(
                env,
                policy_fn,
                timesteps_per_batch=1024,
                max_kl=0.01,
                cg_iters=10,
                cg_damping=0.1,
                max_timesteps=num_timesteps,
                gamma=0.99,
                lam=0.98,
                vf_iters=5,
                vf_stepsize=1e-3,
                callback=episode_callback)
        finally:
            env.close()
Example #20
def main():
    # unpause Simulation so that robot receives data on all topics
    gazebo_connection.GazeboConnection().unpauseSim()
    # create node
    rospy.init_node('pickbot_gym', anonymous=True, log_level=rospy.FATAL)

    env = make_vec_env(env_id,
                       env_type,
                       num_env,
                       seed,
                       wrapper_kwargs=Monitor,
                       start_index=0,
                       reward_scale=1.0,
                       flatten_dict_observations=True,
                       gamestate=None)

    act = trpo_mpi.learn(env=env,
                         network='mlp',
                         total_timesteps=0,
                         load_path=modelsdir + "model")

    obs, done = env.reset(), False
    episode_rew = 0

    while True:
        obs, rew, done, _ = env.step(act.step(obs)[0])
        episode_rew += rew[0] if isinstance(env, VecEnv) else rew
        done = done.any() if isinstance(done, np.ndarray) else done
        if done:
            print('episode_rew={}'.format(episode_rew))
            episode_rew = 0
            obs = env.reset()
Example #21
def main():

    args = parse_args()

    format_strs = ['log', 'csv', 'stdout']

    if args.tensorboard:
        format_strs.append('tensorboard')

    config = parse_config(args.config)

    outdir = os.path.join(args.outdir,
                          os.path.splitext(os.path.basename(args.config))[0])
    logger.configure(dir=outdir, format_strs=format_strs)

    env_type, env_id = get_env_type(GAME_ENVIRONMENT)
    env = make_vec_env(env_id, env_type, 1, args.seed)

    model = trpo_mpi.learn(env=env,
                           network=NETWORK_ARCHITECTURE,
                           total_timesteps=args.total_timesteps,
                           **config)

    env.close()

    if args.save:
        model.save(os.path.join(outdir, 'model'))
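parse_config is not shown here; whatever dict it returns is splatted into trpo_mpi.learn, so its keys must be valid keyword arguments of the new-style learn API. A plausible config is sketched below with illustrative values.

# Hypothetical config dict; keys are standard trpo_mpi.learn keyword arguments.
config = {
    'timesteps_per_batch': 1024,
    'max_kl': 0.01,
    'cg_iters': 10,
    'cg_damping': 0.1,
    'gamma': 0.99,
    'lam': 0.98,
    'vf_iters': 5,
    'vf_stepsize': 1e-3,
}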
Example #22
def train(env_id, num_timesteps, seed):
    """
    Train TRPO model for the atari environment, for testing purposes

    :param env_id: (str) Environment ID
    :param num_timesteps: (int) The total number of samples
    :param seed: (int) The initial seed for training
    """
    rank = MPI.COMM_WORLD.Get_rank()

    if rank == 0:
        logger.configure()
    else:
        logger.configure(format_strs=[])

    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)
    env = make_atari(env_id)

    def policy_fn(name, ob_space, ac_space, sess=None, placeholders=None):  # pylint: disable=W0613
        return CnnPolicy(name=name,
                         ob_space=ob_space,
                         ac_space=ac_space,
                         sess=sess,
                         placeholders=placeholders)

    env = bench.Monitor(
        env,
        logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
    env.seed(workerseed)

    env = wrap_deepmind(env)
    env.seed(workerseed)

    trpo_mpi.learn(env,
                   policy_fn,
                   timesteps_per_batch=512,
                   max_kl=0.001,
                   cg_iters=10,
                   cg_damping=1e-3,
                   max_timesteps=int(num_timesteps * 1.1),
                   gamma=0.98,
                   lam=1.0,
                   vf_iters=3,
                   vf_stepsize=1e-4,
                   entcoeff=0.00)
    env.close()
Example #23
def train(args):
    import baselines.common.tf_util as U
    sess = U.single_threaded_session()
    sess.__enter__()

    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        logger.set_level(logger.DISABLED)
    workerseed = args.seed + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)
    env = gym.make(args.env_id)

    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name,
                         ob_space=env.observation_space,
                         ac_space=env.action_space,
                         hid_size=32,
                         num_hid_layers=2)

    env = bench.Monitor(
        env,
        logger.get_dir()
        and osp.join(logger.get_dir(), "%i.monitor.json" % rank))
    env.seed(workerseed)
    gym.logger.setLevel(logging.WARN)

    task_name = "trpo." + args.env_id.split("-")[0] + "." + ("%.2f" %
                                                             args.entcoeff)
    args.checkpoint_dir = osp.join(args.checkpoint_dir, task_name)
    trpo_mpi.learn(env,
                   policy_fn,
                   timesteps_per_batch=1024,
                   max_kl=0.01,
                   cg_iters=10,
                   cg_damping=0.1,
                   max_timesteps=args.num_timesteps,
                   gamma=0.99,
                   lam=0.98,
                   vf_iters=5,
                   vf_stepsize=1e-3,
                   sample_stochastic=args.sample_stochastic,
                   task_name=task_name,
                   save_per_iter=args.save_per_iter,
                   ckpt_dir=args.checkpoint_dir,
                   load_model_path=args.load_model_path)
    env.close()
Example #24
def train(env_id, num_timesteps, task, seed):
    U.make_session(num_cpu=1).__enter__()

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name,
                                    ob_space=ob_space,
                                    ac_space=ac_space,
                                    hid_size=64,
                                    num_hid_layers=2)

    logger.configure()
    if task == "grid":
        env = bench.Monitor(gym.make("target-v0"), logger.get_dir())
    elif task == "dynamics":
        env = bench.Monitor(gym.make("target-dynamics-v0"), logger.get_dir())
    else:
        raise ValueError("task should be either grid or dynamics instead " +
                         task + " was given.")
    trpo_mpi.learn(
        env,
        policy_fn,
        max_timesteps=num_timesteps,
        gamma=1.0,
        timesteps_per_batch=5000,
        max_kl=0.01,
        entcoeff=0.02,
        vf_iters=5,
        vf_stepsize=1e-3,
        lam=0.95,
        cg_iters=10,
        cg_damping=0.1,
    )
    log_dir = logger.get_dir()
    with open("scripts/data/data_directories.txt", 'a') as f:
        f.write(log_dir + "\n")
    with open(log_dir + "/metadata.txt", 'w') as f:
        f.write(task + "," + str(num_timesteps))
    results_plotter_terminal.plot_results([log_dir], num_timesteps,
                                          results_plotter_terminal.X_TIMESTEPS,
                                          "Target Set with Dynamics")
    # Plotting needs at least 99 episodes; see line 20 (rolling_window) in
    # results_plotter.py from OpenAI Baselines. With too few samples that window
    # dimension becomes negative and numpy makes this crash.
    env.close()
Example #25
def train(env_id, num_timesteps, seed, outdir):
    import baselines.common.tf_util as U
    sess = U.single_threaded_session()
    sess.__enter__()
    logdir = os.path.join(outdir, env_id)
    os.makedirs(logdir, exist_ok=True)

    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        logger.set_level(logger.DISABLED)

    logger.configure(logdir)

    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)
    env = gym.make(env_id)

    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name,
                         ob_space=env.observation_space,
                         ac_space=env.action_space,
                         hid_size=32,
                         num_hid_layers=2)

    env = bench.Monitor(
        env,
        logger.get_dir()
        and osp.join(logger.get_dir(), "%i.monitor.json" % rank))
    env.seed(workerseed)
    gym.logger.setLevel(logging.WARN)

    trpo_mpi.learn(env,
                   policy_fn,
                   timesteps_per_batch=1024,
                   max_kl=0.01,
                   cg_iters=10,
                   cg_damping=0.1,
                   max_timesteps=num_timesteps,
                   gamma=0.99,
                   lam=0.98,
                   vf_iters=5,
                   vf_stepsize=1e-3)
    env.close()
Example #26
def train(env_id, num_timesteps, seed):
    import baselines.common.tf_util as U
    sess = U.single_threaded_session()
    sess.__enter__()

    rank = MPI.COMM_WORLD.Get_rank()
    if rank == 0:
        logger.configure()
    else:
        logger.configure(format_strs=[])
        logger.set_level(logger.DISABLED)
    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
            hid_size=32, num_hid_layers=2)
    env = make_mujoco_env(env_id, workerseed)
    trpo_mpi.learn(env, policy_fn, timesteps_per_batch=1024, max_kl=0.01, cg_iters=10, cg_damping=0.1,
        max_timesteps=num_timesteps, gamma=0.99, lam=0.98, vf_iters=5, vf_stepsize=1e-3)
    env.close()
def train(env_id, num_timesteps, seed):
    import baselines.common.tf_util as U
    sess = U.single_threaded_session()
    sess.__enter__()

    rank = MPI.COMM_WORLD.Get_rank()
    # For parallel execution, each MPI process has its own rank
    if rank == 0:
        logger.configure()
    else:
        logger.configure(format_strs=[])
        logger.set_level(logger.DISABLED)
    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()

    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name,
                         ob_space=ob_space,
                         ac_space=ac_space,
                         hid_size=32,
                         num_hid_layers=2)

    # Create envs.
    env = gym.make(env_id)
    env = bench.Monitor(
        env,
        logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
    gym.logger.setLevel(logging.WARN)

    set_global_seeds(workerseed)
    env.seed(workerseed)
    trpo_mpi.learn(env,
                   policy_fn,
                   timesteps_per_batch=1024,
                   max_kl=0.01,
                   cg_iters=10,
                   cg_damping=0.1,
                   max_timesteps=num_timesteps,
                   gamma=0.99,
                   lam=0.98,
                   vf_iters=5,
                   vf_stepsize=1e-3)
    env.close()
def train_trpo(env_id, num_timesteps, seed, hist_len, block_high, nsteps,
               hid_size, give_state):
    import baselines.common.tf_util as U
    sess = U.single_threaded_session()
    sess.__enter__()

    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()

    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name,
                         ob_space=ob_space,
                         ac_space=ac_space,
                         hid_size=hid_size,
                         num_hid_layers=2)

    set_global_seeds(workerseed)

    env = make_control_env(env_id,
                           workerseed,
                           hist_len=hist_len,
                           block_high=block_high,
                           not_guided=True,
                           give_state=False)
    env.seed(workerseed)

    timesteps_per_batch = nsteps

    trpo_mpi.learn(env,
                   policy_fn,
                   timesteps_per_batch=timesteps_per_batch,
                   max_kl=0.01,
                   cg_iters=10,
                   cg_damping=0.1,
                   max_timesteps=num_timesteps,
                   gamma=0.99,
                   lam=0.98,
                   vf_iters=5,
                   vf_stepsize=1e-3)
    env.close()
Example #29
def train(env_id, num_timesteps, seed):
    import baselines.common.tf_util as U
    sess = U.single_threaded_session()
    sess.__enter__()

    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        logger.set_level(logger.DISABLED)
    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)
    env = gym.make(env_id)
    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name, ob_space=env.observation_space, ac_space=env.action_space,
            hid_size=32, num_hid_layers=2)
    env = bench.Monitor(env, logger.get_dir() and
                        osp.join(logger.get_dir(), "%i.monitor.json" % rank))
    env.seed(workerseed)
    gym.logger.setLevel(logging.WARN)

    trpo_mpi.learn(env, policy_fn, timesteps_per_batch=1024, max_kl=0.01, cg_iters=10, cg_damping=0.1,
        max_timesteps=num_timesteps, gamma=0.99, lam=0.98, vf_iters=5, vf_stepsize=1e-3)
    env.close()
Example #30
def train(env_id, num_timesteps, seed):
    """
    Train TRPO model for the mujoco environment, for testing purposes

    :param env_id: (str) Environment ID
    :param num_timesteps: (int) The total number of samples
    :param seed: (int) The initial seed for training
    """
    with tf_util.single_threaded_session():
        rank = MPI.COMM_WORLD.Get_rank()
        if rank == 0:
            logger.configure()
        else:
            logger.configure(format_strs=[])
            logger.set_level(logger.DISABLED)
        workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()

        def policy_fn(name, ob_space, ac_space, sess=None, placeholders=None):
            return MlpPolicy(name=name,
                             ob_space=ob_space,
                             ac_space=ac_space,
                             hid_size=32,
                             num_hid_layers=2,
                             sess=sess,
                             placeholders=placeholders)

        env = make_mujoco_env(env_id, workerseed)
        trpo_mpi.learn(env,
                       policy_fn,
                       timesteps_per_batch=1024,
                       max_kl=0.01,
                       cg_iters=10,
                       cg_damping=0.1,
                       max_timesteps=num_timesteps,
                       gamma=0.99,
                       lam=0.98,
                       vf_iters=5,
                       vf_stepsize=1e-3)
        env.close()
def trpo_baselines(log_dir, env_id, seed):
    """Create Baseline model and training.

    Args:
        log_dir (str): Experiment log directory.
        env_id (str): Environment id of the task.
        seed (int): Random positive integer for the trial.

    """
    # Set up TF Session
    ncpu = max(multiprocessing.cpu_count() // 2, 1)
    config = tf.compat.v1.ConfigProto(allow_soft_placement=True,
                                      intra_op_parallelism_threads=ncpu,
                                      inter_op_parallelism_threads=ncpu)
    tf.compat.v1.Session(config=config).__enter__()

    # Set up logger for baselines
    configure(dir=log_dir,
              format_strs=['stdout', 'log', 'csv', 'tensorboard'])
    baselines_logger.info('rank {}: seed={}, logdir={}'.format(
        0, seed, baselines_logger.get_dir()))

    set_global_seeds(seed)

    env = AutoStopEnv(env_name=env_id, max_path_length=100)

    trpo_mpi.learn(network='mlp',
                   env=env,
                   total_timesteps=hyper_parameters['batch_size'] *
                   hyper_parameters['n_epochs'],
                   timesteps_per_batch=hyper_parameters['batch_size'],
                   gamma=hyper_parameters['discount'],
                   lam=hyper_parameters['gae_lambda'],
                   max_kl=hyper_parameters['max_kl'],
                   cg_iters=10,
                   cg_damping=0.1,
                   vf_iters=5,
                   vf_stepsize=1e-3)

def main():
    num_env = 1
    env_id = "CartPole-v0"
    env_type = "classic_control"
    seed = None

    env = make_vec_env(env_id,
                       env_type,
                       num_env,
                       seed,
                       wrapper_kwargs=None,
                       start_index=0,
                       reward_scale=1.0,
                       flatten_dict_observations=True,
                       gamestate=None)

    act = trpo_mpi.learn(env=env, network='mlp', total_timesteps=500000)
args = parser.parse_args()

sess = U.single_threaded_session()
sess.__enter__()

rank = MPI.COMM_WORLD.Get_rank()
if rank != 0:
    logger.set_level(logger.DISABLED)
workerseed = args.seed + 10000 * MPI.COMM_WORLD.Get_rank()
set_global_seeds(workerseed)

# create the environment
env = gym.make(str(args.environment))
# initial_observation = env.reset()

def policy_fn(name, ob_space, ac_space):
    return MlpPolicy(name=name, ob_space=env.observation_space, ac_space=env.action_space,
        hid_size=32, num_hid_layers=2)
# env = bench.Monitor(env, logger.get_dir() and
#     osp.join(logger.get_dir(), str(rank)))
env.seed(workerseed)
# gym.logger.setLevel(logging.WARN)
with tf.Session() as sess:
    trpo_mpi.learn(env, policy_fn,
        timesteps_per_batch=1024,
        max_kl=0.01, cg_iters=10, cg_damping=0.1,
        max_timesteps=args.num_timesteps,
        gamma=0.99, lam=0.98, vf_iters=5, vf_stepsize=1e-3,
        save_model_with_prefix="",
        outdir="/tmp/experiments/"+str(args.environment)+"/TRPO/")