Example 1
def learn(env_path, seed, max_steps, reward_range, base_port, unity_arguments,
          summary_writer):
    env = VecFrameStack(_make_a2c(env_path,
                                  num_env=8,
                                  seed=seed,
                                  reward_range=reward_range,
                                  base_port=base_port,
                                  unity_arguments=unity_arguments),
                        nstack=4)

    model = learn_a2c(
        policy=CnnPolicy,
        env=env,
        seed=seed,
        ent_coef=0.01,
        nsteps=5,
        total_timesteps=max_steps,
        callback=_create_summary_callback(summary_writer=summary_writer))

    try:
        env.close()
    except Exception as e:
        print("Failed to close environment: " + str(e))

    return model
Example 2
def train(env_id, num_timesteps, seed, policy, lrschedule, num_env, ckpt_path,
          hparams):
    if policy == 'cnn':
        policy_fn = CnnPolicy
    elif policy == 'lstm':
        policy_fn = LstmPolicy
    elif policy == 'lnlstm':
        policy_fn = LnLstmPolicy
    elif policy == 'cnn_attention':
        policy_fn = CnnAttentionPolicy

    video_log_dir = os.path.join(hparams['base_dir'], 'videos',
                                 hparams['experiment_name'])
    env = VecFrameStack(
        make_atari_env(env_id,
                       num_env,
                       seed,
                       video_log_dir=video_log_dir,
                       write_attention_video='attention' in policy,
                       hparams=hparams), 4)

    learn(policy_fn,
          env,
          seed,
          total_timesteps=int(num_timesteps * 1.1),
          lrschedule=lrschedule,
          ckpt_path=ckpt_path,
          hparams=hparams)
    env.close()
Example 3
def train(num_timesteps,
          env_name,
          seed,
          policy,
          lrschedule,
          num_env,
          entropy,
          lr,
          save_name=None):
    if policy == 'cnn':
        policy_fn = CnnPolicy
    elif policy == 'lstm':
        policy_fn = LstmPolicy
    elif policy == 'lnlstm':
        policy_fn = LnLstmPolicy
    elif policy == 'i2a':
        policy_fn = I2ANetwork
    env = VecFrameStack(make_doom_env(num_env, 0, env_name), 4)
    if save_name is None:
        save_name = env_name
    learn(policy_fn,
          env,
          seed,
          save_name=save_name,
          total_timesteps=int(num_timesteps * 1.1),
          lrschedule=lrschedule,
          log_interval=500,
          save_interval=1000,
          cont=True,
          ent_coef=entropy,
          lr=lr)
    env.close()
Example 4
def train(env_id, num_timesteps, seed, policy, lr_schedule, num_env):
    """
    Train A2C model for atari environment, for testing purposes

    :param env_id: (str) Environment ID
    :param num_timesteps: (int) The total number of samples
    :param seed: (int) The initial seed for training
    :param policy: (A2CPolicy) The policy model to use (MLP, CNN, LSTM, ...)
    :param lr_schedule: (str) The type of scheduler for the learning rate update ('linear', 'constant',
                                 'double_linear_con', 'middle_drop' or 'double_middle_drop')
    :param num_env: (int) The number of environments
    """
    policy_fn = None
    if policy == 'cnn':
        policy_fn = CnnPolicy
    elif policy == 'lstm':
        policy_fn = LstmPolicy
    elif policy == 'lnlstm':
        policy_fn = LnLstmPolicy
    if policy_fn is None:
        raise ValueError("Error: policy {} not implemented".format(policy))

    env = VecFrameStack(make_atari_env(env_id, num_env, seed), 4)
    learn(policy_fn, env, seed, total_timesteps=int(num_timesteps * 1.1), lr_schedule=lr_schedule)
    env.close()
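The docstring above spells out the argument types, so a minimal invocation is shown below; the environment ID and hyperparameter values are illustrative placeholders rather than values taken from these examples.

# Hypothetical call to the train() helper defined above.
train(env_id='BreakoutNoFrameskip-v4',
      num_timesteps=1_000_000,
      seed=0,
      policy='cnn',
      lr_schedule='constant',
      num_env=16)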
Example 5
def train(env_id, num_timesteps, seed, num_cpu):
    env = VecFrameStack(make_atari_env(env_id, num_cpu, seed), 4)
    policy_fn = CnnPolicy
    learn(policy_fn,
          env,
          seed,
          total_timesteps=int(num_timesteps * 1.1),
          nprocs=num_cpu)
    env.close()
Example 6
def main(args):
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(args.seed)
    #env = gym.make(args.env_id)

    def policy_fn(name, ob_space, ac_space, reuse=False):
        return mlp_policy.CNNPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    reuse=reuse, hid_size=args.policy_hidden_size, num_hid_layers=2)
    #env = bench.Monitor(env, logger.get_dir() and
    #                    osp.join(logger.get_dir(), "monitor.json"))

    env = make_vec_env(args.env_id, 'atari', 1, args.seed,
                       wrapper_kwargs={
                           'clip_rewards':False,
                           'episode_life':False,
                       })
    env = VecFrameStack(env, 4)

    #env.seed(args.seed)
    gym.logger.setLevel(logging.WARN)
    task_name = get_task_name(args)
    args.checkpoint_dir = osp.join(args.checkpoint_dir, task_name)
    args.log_dir = osp.join(args.log_dir, task_name)

    if args.task == 'train':
        dataset = LMDB_Dset(expert_path=args.expert_path, traj_limitation=args.traj_limitation)
        reward_giver = TransitionClassifier(env, args.adversary_hidden_size, entcoeff=args.adversary_entcoeff)
        train(env,
              args.seed,
              policy_fn,
              reward_giver,
              dataset,
              args.algo,
              args.g_step,
              args.d_step,
              args.policy_entcoeff,
              args.num_timesteps,
              args.save_per_iter,
              args.checkpoint_dir,
              args.log_dir,
              args.pretrained,
              args.BC_max_iter,
              task_name
              )
    elif args.task == 'evaluate':
        runner(env,
               policy_fn,
               args.load_model_path,
               timesteps_per_batch=1024,
               number_trajs=10,
               stochastic_policy=args.stochastic_policy,
               save=args.save_sample
               )
    else:
        raise NotImplementedError
    env.close()
Example 7
def train(env_id, num_timesteps, seed, policy, lrschedule, num_env):
    if policy == u'cnn':
        policy_fn = CnnPolicy
    elif policy == u'lstm':
        policy_fn = LstmPolicy
    elif policy == u'lnlstm':
        policy_fn = LnLstmPolicy
    env = VecFrameStack(make_atari_env(env_id, num_env, seed), 4)
    learn(policy_fn, env, seed, total_timesteps=int(num_timesteps * 1.1), lrschedule=lrschedule)
    env.close()
Example 8
def train(env_id, num_timesteps, seed, policy, lrschedule, num_env):
    if policy == 'cnn':
        policy_fn = CnnPolicy
    elif policy == 'lstm':
        policy_fn = LstmPolicy
    elif policy == 'lnlstm':
        policy_fn = LnLstmPolicy
    env = VecFrameStack(make_atari_env(env_id, num_env, seed), 4)
    learn(policy_fn, env, seed, total_timesteps=int(num_timesteps * 1.1), lrschedule=lrschedule)
    env.close()
Example 9
def main():
    numOfTests = 40
    env_args = {
        'episode_life': False,
        'clip_rewards': False,
        'crop': True,
        'rotate': True
    }
    env = VecFrameStack(
        make_vec_env("gvgai-zelda-lvl0-v0",
                     numOfTests,
                     43,
                     wrapper_kwargs=env_args), 4)
    policy = build_policy(env, "cnn")
    model = Model(policy=policy, env=env, nsteps=5)
    model.load('logs/test_4*5_r1_right/checkpoints/260000')
    nh, nw, nc = env.observation_space.shape
    result = dict()
    for j in range(201, 601):
        # obs = np.zeros((numOfTests, nh, nw, nc), dtype=np.uint8)
        done = np.array([False] * numOfTests)
        env.venv.set_level(
            "GVGAI_GYM/gym_gvgai/envs/games/zelda_v0/zelda_lvl{}.txt".format(
                j))
        obs = env.reset()
        infos = [False] * numOfTests
        # dones = [False] * numOfTests

        while not all(done):
            actions, values, state, _ = model.step(obs)
            obs, rewards, dones, info = env.step(actions)
            done[np.where(dones != False)] = True
            for i in np.where(dones != False)[0].tolist():
                if not infos[i]:
                    # print(info)
                    del info[i]["grid"]
                    del info[i]["ascii"]
                    infos[i] = info[i]
            # print(np.where(dones!=False)[0])
            # print(done)
            # print(infos)

        # print(dones)
        win = [1 if (i['winner'] == 'PLAYER_WINS') else 0 for i in infos]
        # score = [i['episode']['r'] for i in infos]
        # steps = [i['episode']['l'] for i in infos]
        # time = [i['episode']['t'] for i in infos]
        print("level {}".format(j), win)
        result[j] = infos

    env.close()

    with open("result_4*5_r1_right_200~600", "wb") as f:
        pickle.dump(result, f)
Example 10
def train(env_id, num_timesteps, seed, policy, lrschedule, num_env):
    if policy == 'cnn':
        policy_fn = CnnPolicy
    elif policy == 'lstm':
        policy_fn = LstmPolicy
    elif policy == 'lnlstm':
        policy_fn = LnLstmPolicy
    wrapper_kwargs = {'clip_rewards': False}
    env = VecFrameStack(make_atari_env(env_id, num_env, seed, wrapper_kwargs=wrapper_kwargs), 4)
    learn(policy_fn, env, seed, total_timesteps=int(num_timesteps * 1.1), lrschedule=lrschedule)
    env.close()
Example 11
def train(env_id, num_timesteps, seed, num_cpu, num_env):
    env = VecFrameStack(
        # make_atari_env(env_id, num_cpu, seed),
        make_distributed_env(env_id, num_env, seed),
        # make_old_dist_env(env_id, num_env, seed),
        4)
    policy_fn = partial(CnnPolicy, one_dim_bias=True)
    learn(policy_fn,
          env,
          seed,
          total_timesteps=int(num_timesteps * 1.1),
          nprocs=num_cpu)
    env.close()
Example 12
def train(env_id, num_timesteps, seed, num_cpu):
    """
    train an ACKTR model on atari

    :param env_id: (str) Environment ID
    :param num_timesteps: (int) The total number of samples
    :param seed: (int) The initial seed for training
    :param num_cpu: (int) The number of CPUs to train on
    """
    env = VecFrameStack(make_atari_env(env_id, num_cpu, seed), 4)
    policy_fn = partial(CnnPolicy, one_dim_bias=True)
    learn(policy_fn, env, seed, total_timesteps=int(num_timesteps * 1.1), nprocs=num_cpu)
    env.close()
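For reference, a call consistent with the docstring above might look like the following; the game and CPU count are placeholders.

# Hypothetical ACKTR training run using the helper above.
train(env_id='PongNoFrameskip-v4', num_timesteps=int(1e6), seed=0, num_cpu=4)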
Example 13
def train(env_id, num_timesteps, seed, policy, lrschedule, num_env, save_path,
          load_path, wrapper_kwargs):
    env_args = {'episode_life': False, 'clip_rewards': False}
    env_args.update(wrapper_kwargs)
    env = VecFrameStack(
        make_vec_env(env_id, num_env, seed, wrapper_kwargs=env_args), 4)
    # env = make_vec_env(env_id, num_env, seed, wrapper_kwargs=env_args)
    model = learn(policy,
                  env,
                  seed,
                  total_timesteps=int(num_timesteps * 1.1),
                  lrschedule=lrschedule,
                  load_path=load_path)
    model.save(save_path)
    env.close()
Example 14
def main():
    env = VecFrameStack(make_sf2_env(), 1)
    obs = env.reset()
    n_steps = 128  # 5 * FPS
    options = {
        'network': 'mlp',  # 'impala_cnn'
        'env': env,
        'total_timesteps': 40000000,
        'nsteps': n_steps,  # 5 * FPS,  # TODO: Do we still need to pass nsteps here?
        'q_coef': 1.0,
        'ent_coef': 0.001,
        'max_grad_norm': 10,
        'lr': 7e-4,
        'lrschedule': 'linear',
        'rprop_epsilon': 1e-5,
        'rprop_alpha': 0.99,
        'gamma': 0.99,
        'log_interval': 1000,
        'buffer_size': 50000,
        'replay_ratio': 4,
        'replay_start': 10000,
        'c': 10.0,
        'trust_region': True,
        'delta': 1,
        'alpha': 0.99,
        # 'load_path': MODEL_PATH,
        'save_interval': 1000,
        # neuronal network parameters
        'activation': tf.nn.relu,
        'num_layers': 2,  # 4, 2
        'num_hidden': 48,  # 64, 64
        'layer_norm': False,
    }
    models = (
        Acer(**options),
        Acer(**options)
    )
    runner = Runner(env, models, n_steps)
    while True:
        runner.run()
        # obs, rew, done, info = env.step((
        #     env.action_space.sample(),
        #     env.action_space.sample()
        # ))
        # env.render()
        # if done:
        #     obs = env.reset()
    env.close()
Example 15
def train(env_id, num_timesteps, seed, policy, lrschedule, num_env):
    if policy == 'cnn':
        policy_fn = CnnPolicy
    elif policy == 'lstm':
        policy_fn = LstmPolicy
    elif policy == 'lnlstm':
        policy_fn = LnLstmPolicy
    #env = VecFrameStack(make_atari_env(env_id, num_env, seed), 4)
    env = VecFrameStack(make_custom_env('gridworld-v0', num_env, seed), 1)
    act = learn(policy_fn,
                env,
                seed,
                total_timesteps=int(num_timesteps * 1.1),
                lrschedule=lrschedule)
    act.save('a2c_bopen.pkl')
    env.close()
Example 16
def train(env_id, num_timesteps, seed, policy, lrschedule, num_env,
          replay_lambda=1, replay_loss=None, ss_rate=1, thetas=None):
    if policy == 'cnn':
        policy_fn = CnnPolicy
    elif policy == 'lstm':
        policy_fn = LstmPolicy
    elif policy == 'lnlstm':
        policy_fn = LnLstmPolicy
    env = VecFrameStack(make_atari_env(env_id, num_env, seed), 4)
    if replay_loss is not None:
        learn_staged(policy_fn, env, seed,
                     total_timesteps=int(num_timesteps * 1.1),
                     lrschedule=lrschedule, replay_lambda=replay_lambda,
                     ss_rate=ss_rate, replay_loss=replay_loss, thetas=thetas)
    else:
        learn(policy_fn, env, seed, total_timesteps=int(num_timesteps * 1.1), lrschedule=lrschedule)
    env.close()
Example 17
def train(env_id, num_timesteps, seed, policy, lrschedule, num_env,
          v_ex_coef, r_ex_coef, r_in_coef, lr_alpha, lr_beta, no_ex, no_in):
    if policy == 'cnn':
        policy_fn = CnnPolicy
    elif policy == 'lstm':
        policy_fn = LstmPolicy
    elif policy == 'lnlstm':
        policy_fn = LnLstmPolicy
    elif policy == 'cnn_int':
        policy_fn = CnnPolicyIntrinsicReward
    else:
        raise NotImplementedError
    env = VecFrameStack(make_atari_env(env_id, num_env, seed), 4)
    learn(policy_fn, env, seed, total_timesteps=int(num_timesteps * 1.01), lrschedule=lrschedule,
          v_ex_coef=v_ex_coef, r_ex_coef=r_ex_coef, r_in_coef=r_in_coef,
          lr_alpha=lr_alpha, lr_beta=lr_beta, no_ex=no_ex, no_in=no_in)
    env.close()
Example 18
def train(env_id, num_timesteps, seed, policy, lrschedule, num_env, load_path,
          algo='use_svib_uniform', ib_alpha=1e-3):
    # Only CnnPolicySVIB is supported here, regardless of the 'policy' argument.
    policy_fn = CnnPolicySVIB
    if 'NoFrameskip' in env_id:
        env = VecFrameStack(make_atari_env(env_id, num_env, seed), 4)
        test_env = VecFrameStack(make_atari_env(env_id, num_env, seed+1), 4)
    else:
        env = VecFrameStack(make_atari_env_low_dim(env_id, num_env, seed), 4)
        test_env = VecFrameStack(make_atari_env_low_dim(env_id, num_env, seed+1), 4)
        # train_mine_env = VecFrameStack(make_atari_env_low_dim(env_id, num_env, seed), 4)
    reward_list, value_list = learn(policy_fn, env, test_env, seed, total_timesteps=int(num_timesteps),
                        lrschedule=lrschedule, load_path=load_path, algo=algo, ib_alpha=ib_alpha)
    env.close()
    return reward_list, value_list
Example 19
def train(env_id, num_timesteps, seed, policy, lrschedule, num_env):
    print('train() called')
    if policy == 'cnn':
        policy_fn = CnnPolicy
    elif policy == 'lstm':
        policy_fn = LstmPolicy
    elif policy == 'lnlstm':
        policy_fn = LnLstmPolicy
    env = VecFrameStack(make_atari_env(env_id, num_env, seed),
                        4)  # Make "num_env" environments

    learn(policy_fn,
          env,
          seed,
          total_timesteps=int(num_timesteps * 1.1),
          lrschedule=lrschedule)  # Learn
    env.close()
Example 20
def train(env_id, num_timesteps, seed, policy, lrschedule, num_env, args):
    if policy == 'i2a':
        policy_fn = I2A
    elif policy == 'cnn':
        policy_fn = CnnPolicy
    elif policy == 'lstm':
        policy_fn = LstmPolicy
    elif policy == 'lnlstm':
        policy_fn = LnLstmPolicy
    env = VecFrameStack(
        make_atari_env('MsPacmanNoFrameskip-v0', num_env, seed), 4)
    learn(policy_fn,
          env,
          seed,
          total_timesteps=int(num_timesteps * 1.1),
          lrschedule=lrschedule,
          args=args)
    env.close()
Example 21
def train(env_id, num_timesteps, seed, policy, lrschedule, num_env, sil_update,
          sil_beta):
    if policy == 'cnn':
        policy_fn = CnnPolicy
    elif policy == 'lstm':
        policy_fn = LstmPolicy
    elif policy == 'lnlstm':
        policy_fn = LnLstmPolicy
    env_args = {'episode_life': False, 'clip_rewards': False}
    env = VecFrameStack(
        make_atari_env(env_id, num_env, seed, wrapper_kwargs=env_args), 4)
    learn(policy_fn,
          env,
          seed,
          total_timesteps=int(num_timesteps * 1.1),
          lrschedule=lrschedule,
          sil_update=sil_update,
          sil_beta=sil_beta)
    env.close()
Example 22
def train(env_id, num_timesteps, seed, policy, lrschedule, num_env):
    nstates = None
    if policy == 'cnn':
        policy_fn = CnnPolicy
    elif policy == 'lstm':
        policy_fn = LstmPolicy
    elif policy == 'lnlstm':
        policy_fn = LnLstmPolicy
        nstates = 512
    elif policy == 'caps':
        policy_fn = CapsulePolicy
        
    # TODO
    # DEBUG:
    # Changed ent_coef to zero
    # To undo, simply omit ent_coef from arguments (use default)
    env = VecFrameStack(make_atari_env(env_id, num_env, seed), 4)
    learn(policy_fn, env, seed, nsteps=5, nstates=nstates, total_timesteps=int(num_timesteps * 1.1), sc_coef=None, lrschedule=lrschedule)
    env.close()
Example 23
def train(env_id, num_timesteps, seed, policy, lrschedule, num_env):
    if policy == 'cnn':
        policy_fn = CnnPolicy
    elif policy == 'lstm':
        policy_fn = LstmPolicy
    elif policy == 'lnlstm':
        policy_fn = LnLstmPolicy
    # VecFrameStack wraps the vectorized env to stack the last 4 frames.
    # make_atari_env() launches 'num_env' subprocesses, each running 'env_id' seeded with seed + rank.
    env = VecFrameStack(make_atari_env(env_id, num_env, seed), 4)
    print("~~~~~~~~~~~~~ run_atari: len(env): " + str(env.nstack))
    print("~~~~~~~~~~~~~ run_atari: str(env): " + str(env))
    # above prints : run_atari: str(env): <baselines.common.vec_env.vec_frame_stack.VecFrameStack object at 0x1c22ee06d8>
    print("_____________________________________________ policy: " +
          str(policy))
    learn(policy_fn,
          env,
          seed,
          total_timesteps=int(num_timesteps * 1.1),
          lrschedule=lrschedule)
    env.close()
Example 24
def train(env_id, N_itr, seed, policy, lr, lrschedule, num_env, log_path,
          save_interval, alg):
    if policy == 'cnn':
        policy_fn = CnnPolicy
    elif policy == 'lstm':
        policy_fn = LstmPolicy
    elif policy == 'lnlstm':
        policy_fn = LnLstmPolicy
    env = VecFrameStack(make_atari_env(env_id, num_env, seed), 4)
    if alg == 'a2c':
        learn_a2c(policy=policy_fn,
                  env=env,
                  seed=seed,
                  N_itr=int(N_itr),
                  lr=lr,
                  lrschedule=lrschedule,
                  nsteps=128,
                  save_interval=save_interval,
                  save_path=log_path)  #,load_path="./Data/a2cTest/a2c_1.pkl")
    elif alg == 'ppo2':
        learn_ppo2(
            policy=policy_fn,
            env=env,
            seed=seed,
            nsteps=128,
            nminibatches=4,
            lam=0.95,
            gamma=0.99,
            noptepochs=4,
            log_interval=1,
            ent_coef=.01,
            lr=lambda f: f * lr,
            cliprange=lambda f: f * 0.1,
            N_itr=int(N_itr),
            save_interval=save_interval,
            save_path=log_path
        )  # defaults taken from the original openai/baselines repository
    env.close()
Example 25
def train(env_id, num_timesteps, seed, policy, lrschedule, num_env, param):
    if policy == 'cnn':
        policy_fn = CnnPolicy
    elif policy == 'lstm':
        policy_fn = LstmPolicy
    elif policy == 'lnlstm':
        policy_fn = LnLstmPolicy
    ncpu = multiprocessing.cpu_count()
    if sys.platform == 'darwin': ncpu //= 2
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=ncpu,
                            inter_op_parallelism_threads=ncpu)
    config.gpu_options.allow_growth = True  # pylint: disable=E1101
    tf.Session(config=config).__enter__()
    # change parameter of env to start multi envs
    env = VecFrameStack(make_atari_env(env_id, num_env, seed), 4)
    learn(policy_fn,
          env,
          seed,
          total_timesteps=int(num_timesteps * 1.1),
          lrschedule=lrschedule,
          param=param,
          nsteps=16)
    env.close()
Example 26
def train(params):
    policy_fn = CnnPolicy

    dataflow_config = {
        'future_rewards': True,             # Should return future discounted rewards?
        'exclude_zero_actions': False,      # Should exclude zero actions
        'remap_actions': False,             # Should remap to smaller action set?
        'clip_rewards': True,               # Clip rewards to [-1, 1]
        'monte-specific-blackout': True,    # Cover up score and lives indicators
        'pong-specific-blackout': False,    # Cover up scores in pong
        'gamma': params.gamma,              # reward discount factor
        'frame_history': 4,                 # What is minimum number of expert frames since beginning of episode?
        'frameskip': 4,                     # frameskip
        'preload_images': True,             # Preload images from hard drive, or keep reloading?
        'gdrive_data_id': cnst.MONTE_DATA_GDRIVE_ID,
        'data_dir': cnst.DATA_DIR,
        'img_dir': cnst.MIKE_IMG_DIR,
        'traj_dir': cnst.MIKE_TRAJECTORIES_DIR,
        'stat_dir': cnst.MIKE_STATES_DIR,
        'batch_size': params.expert_nbatch,
        'max_score_cutoff': params.exp_max_score,  # What is maximum expert score we can show? Used to cut expert data
        'min_score_cutoff': 20000,                 # What is minimum score to count trajectory as expert
        'process_lost_lifes': True,                # Should loss of life zero out the future discounted reward?
        'use_n_trajectories': params.use_n_trajectories if 'use_n_trajectories' in params else None
    }

    the_seed = np.random.randint(10000)
    print(80 * "SEED")
    print("Today's lucky seed is {}".format(the_seed))
    print(80 * "SEED")

    env = VecFrameStack(
        make_atari_env(
            env_id=params.env,
            num_env=params.num_env,
            seed=the_seed,
            limit_len=params.limit_len,
            limit_penalty=params.limit_penalty,
            death_penalty=params.death_penalty,
            step_penalty=params.step_penalty,
            random_state_reset=params.random_state_reset,
            dataflow_config=dataflow_config
        ),
        params.frame_stack
    )

    learn(
        policy=policy_fn,
        env=env,
        seed=the_seed,
        params=params,
        dataflow_config=dataflow_config,
        expert_nbatch=params.expert_nbatch,
        exp_adv_est=params.exp_adv_est,
        load_model=params.load_model,
        gamma=params.gamma,
        nprocs=params.num_env,
        nsteps=params.nsteps,
        ent_coef=params.ent_coef,
        expert_coeff=params.exp_coeff,
        lr=params.lr,
        lrschedule=params.lrschedule,
    )

    env.close()
Example 27
from baselines.common.cmd_util import make_atari_env
from baselines.common.vec_env.vec_frame_stack import VecFrameStack
from arguments import achieve_arguments
from a2c_agent import a2c_agent
from baselines import logger

if __name__ == '__main__':
    args = achieve_arguments()
    logger.configure(dir=args.log_dir)
    # create environments
    env_args = {'episode_life': False, 'clip_rewards': False}
    envs = VecFrameStack(
        make_atari_env(args.env_name,
                       args.num_processes,
                       args.seed,
                       wrapper_kwargs=env_args), 4)
    trainer = a2c_agent(envs, args)
    trainer.learn()
    envs.close()
Example 28
def main(env_name,
         mode,
         episodes,
         random_sample,
         save_path,
         concrete=False,
         expert_first=False,
         save_model=True,
         dropout=0.05,
         lr=0.001,
         ls=5e-7,
         train_epochs=10,
         density=0.0,
         hetero_loss=False,
         budget=1):
    """
    env_name - gym environment [LunarLander-v2, CartPole-v1]
    mode - learning type [pool, stream, classic]
    save_path - where the model and tf loggin data should be saved to
    """
    seed = random.randint(0, int(1e6))
    isSpace = env_name[:5] == 'Space'
    if isSpace:
        wrapper_kwargs = {'episode_life': False}
        env = VecFrameStack(
            make_atari_env(env_name, 1, 0, wrapper_kwargs=wrapper_kwargs), 4)
    else:
        env = gym.make(env_name)
        env.seed(seed)

    isFetch = env_name[:5] == 'Fetch'
    if isFetch:  # That's so fetch
        from active_imitation.agents.mujoco_robot import DEFAULT_PARAMS
        action_size = env.action_space.shape[0]
        observation_size = env.observation_space.spaces['observation'].shape
        goal_size = env.observation_space.spaces['desired_goal'].shape[0]
        env_dims = {
            'observation': observation_size,
            'goal': goal_size,
            'action': action_size
        }
    elif isSpace:
        from active_imitation.agents.classic_gym import DEFAULT_PARAMS
        action_size = 1
        action_space = env.action_space.n
        observation_size = env.observation_space.shape
        env_dims = {
            'observation': observation_size,
            'action': action_size,
            'action_space': action_space
        }
    else:
        from active_imitation.agents.classic_gym import DEFAULT_PARAMS
        # Need the spaces dimensions to initialize the NN agent
        action_size = 1  # Single, discrete actions
        action_space = env.action_space.n
        observation_size = env.observation_space.shape
        env_dims = {
            'observation': observation_size,
            'action': action_size,
            'action_space': action_space
        }

    # Change the dimensions of the nn layers
    params = DEFAULT_PARAMS

    # params['layers'] = [64, 64, 64]
    params['dropout_rate'] = dropout  #[0.05, 0.1, 0.15, 0.2]
    params['filepath'] = save_path
    params['lr'] = lr
    params['hetero_loss'] = hetero_loss
    if isFetch or isSpace:
        params['layers'] = [256, 256, 256]  #[512, 512, 512] #
        params['concrete'] = concrete
        params['ls'] = ls
    else:
        params['layers'] = [16, 16, 16]
        params['concrete'] = concrete

    if expert_first:
        mixing = 0.0
        mixing_decay = 1.0
    else:
        mixing = 1.0
        mixing_decay = 1.0

    param_mods = {
        'random_sample': random_sample,
        'mixing': mixing,
        'density_weight': density,
        'budget': budget
    }

    if isFetch:
        agent = GymRobotAgent(env_dims, **params)
        expert = RoboticEnv_Expert(policy_files[env_name])
        continuous = True
    elif isSpace:
        expert = SpaceInvadersExpert({
            'observation': env.observation_space,
            'action': env.action_space
        })
        agent = AtariGymAgent(env_dims, **params)
        continuous = False
        param_mods['isSpace'] = True
    else:
        agent = GymAgent(env_dims, **params)
        expert = experts[env_name](env.unwrapped)
        continuous = False

    learning_mode = configure.configure_robot(env,
                                              env_dims,
                                              agent,
                                              expert,
                                              mode,
                                              continuous=continuous,
                                              concrete=concrete,
                                              param_mods=param_mods)

    ## Save the training parameters
    # learning rate, dropout, isconcrete, iscontinuous, env_name, mode,
    parameter_savefile = os.path.join(save_path, 'parameters.txt')
    with open(parameter_savefile, 'w') as f:
        f.write('Environment Name: {} \n'.format(env_name))
        f.write('Learning Mode: {} \n'.format(mode))
        f.write('# of Episodes: {} \n'.format(episodes))
        f.write('Learning Rate:{} \n'.format(lr))
        f.write('Concrete Length Scale: {} \n'.format(ls))
        f.write('Training Epochs: {}\n'.format(train_epochs))
        f.write('Continuous: {}\n'.format(continuous))
        f.write('Concrete: {}\n'.format(concrete))
        f.write('Random Sample: {}\n'.format(random_sample))
        f.write('Mixing: {}\n'.format(mixing))
        f.write('Mixing Decay: {}\n'.format(mixing_decay))
        f.write('Density Weighting: {}\n'.format(density))
        f.write('Budget: {}\n'.format(budget))
        for label, value in params.items():
            f.write('{}: {}\n'.format(label, value))
        f.write('Random Seed: {}\n'.format(seed))

    if isSpace:
        save_rate = 5000
        valid_runs = 1
    elif isFetch:
        save_rate = 100
        valid_runs = 5
    else:
        save_rate = 100
        valid_runs = 5
    rewards, stats = learning_mode.train(
        episodes=episodes,
        mixing_decay=mixing_decay,
        train_epochs=train_epochs,
        save_images=False,
        image_filepath=save_path + 'images/',
        save_rate=save_rate,
        valid_runs=valid_runs,
    )
    if save_model:
        agent.save_model()

    if isSpace:
        expert.close()

    agent.sess.close()
    env.close()
    tf.reset_default_graph()

    return rewards, stats
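A minimal invocation consistent with the docstring at the top of main() could look like this; the environment, mode, and save path are illustrative, and the save directory is assumed to exist.

# Hypothetical run: pool-based imitation learning on CartPole,
# writing logs and checkpoints under ./al_run/.
rewards, stats = main(env_name='CartPole-v1',
                      mode='pool',
                      episodes=100,
                      random_sample=False,
                      save_path='./al_run/')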
Example 29
def train(env_id, model, num_envs, num_timesteps, lrschedule, save_interval,
          seed):

    ncpu = multiprocessing.cpu_count()
    if sys.platform == 'darwin': ncpu //= 2
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=ncpu,
                            inter_op_parallelism_threads=ncpu)
    config.gpu_options.allow_growth = True  #pylint: disable=E1101
    sess = tf.Session(config=config).__enter__()

    print("Starting experiment")

    # Level selector
    #level_path = './results/' + experiment_name + '/levels/' + experiment_id + '/'
    #level_selector = LevelSelector.get_selector(args.selector, args.game, level_path)

    # Make gym environment
    #env = make_gvgai_env(env_id=env_id,
    #                     num_env=args.num_envs,
    #                     seed=args.seed,
    #                     level_selector=level_selector)
    env = VecFrameStack(make_gvgai_env(env_id, num_envs, seed), 4)

    # Select model
    policy = {
        'cnn': CnnPolicy,
        'lstm': LstmPolicy,
        'lnlstm': LnLstmPolicy,
        'mlp': MlpPolicy
    }[model]

    #Philip: how to resume?, lrschedule is not used yet
    ppo2.learn(policy=policy,
               env=env,
               nsteps=128,
               nminibatches=4,
               lam=0.95,
               gamma=0.99,
               noptepochs=4,
               log_interval=1,
               save_interval=save_interval,
               ent_coef=.01,
               lr=lambda f: f * 2.5e-4,
               cliprange=lambda f: f * 0.1,
               total_timesteps=int(num_timesteps * 1.1))

    #Verify there are no features here that I still want
    #learn(policy=policy,
    #      env=env,
    #      experiment_name=experiment_name,
    #      experiment_id=experiment_id,
    #      seed=args.seed,
    #      total_timesteps=args.num_timesteps,
    #      lrschedule=args.lrschedule,
    #      frame_skip=False,
    #      save_interval=args.save_interval,
    #      level_selector=level_selector,
    #      render=args.render)

    env.close()

    print("Experiment DONE")
Example 30
from baselines.common.cmd_util import make_atari_env
from baselines.common.vec_env.vec_frame_stack import VecFrameStack
from models import CNN_Net
# assumed local module that provides the command-line parser used below
from arguments import get_args
import numpy as np
import torch
import os


# get the tensors
def get_tensors(obs):
    return torch.tensor(np.transpose(obs, (0, 3, 1, 2)), dtype=torch.float32)


if __name__ == '__main__':
    args = get_args()
    # create the environment
    env = VecFrameStack(make_atari_env(args.env_name, 1, args.seed), 4)
    # start to create the model
    model_path = args.save_dir + args.env_name + '/model.pt'
    network = CNN_Net(env.action_space.n)
    network.load_state_dict(
        torch.load(model_path, map_location=lambda storage, loc: storage))
    # start to do the test
    obs = env.reset()
    for _ in range(10000):
        env.render()
        obs_tensor = get_tensors(obs)
        with torch.no_grad():
            _, pi = network(obs_tensor)
        actions = torch.argmax(pi, dim=1).item()
        obs, reward, done, _ = env.step([actions])
    env.close()
Example 31
def train(env_id, num_timesteps, seed, num_cpu):
    env = VecFrameStack(make_atari_env(env_id, num_cpu, seed), 4)
    policy_fn = CnnPolicy
    learn(policy_fn, env, seed, total_timesteps=int(num_timesteps * 1.1), nprocs=num_cpu)
    env.close()