Example #1
def make_vec_envs(env_name,
                  seed,
                  num_processes,
                  gamma,
                  log_dir,
                  device,
                  allow_early_resets,
                  num_frame_stack=None,
                  coin_run_level=0,
                  coin_run_seed=-1,
                  difficulty=False):
    # coinrun environments need to be treated differently.
    coinrun_envs = {
        'CoinRun': 'standard',
        'CoinRun-Platforms': 'platform',
        'Random-Mazes': 'maze'
    }
    if env_name in coinrun_envs:
        coin_run_args = setup_utils.setup_and_load(use_cmd_line_args=False)
        Coinrun_Config.GAME_TYPE = coinrun_envs[env_name]
        Coinrun_Config.NUM_LEVELS = coin_run_level
        Coinrun_Config.SET_SEED = coin_run_seed
        # If SET_SEED = -1, this seed is not used and level seeds will be drawn from the
        # range [0, NUM_LEVELS). Use SET_SEED = -1 and NUM_LEVELS = 500 to train on the same levels as in the paper.
        Coinrun_Config.NUM_ENVS = num_processes
        Coinrun_Config.HIGH_DIFFICULTY = difficulty
        envs = coinrun_utils.make_general_env(num_processes)
        envs.spec = Coinrun_Config.GAME_TYPE
        envs = CoinRunVecPyTorch(envs, device)
        envs = add_final_pytorch_wrappers(envs)

    else:
        envs = [
            make_env(env_name, seed, i, log_dir, allow_early_resets)
            for i in range(num_processes)
        ]

        if len(envs) > 1:
            envs = ShmemVecEnv(envs, context='fork')
        else:
            envs = DummyVecEnv(envs)

        if len(envs.observation_space.shape) == 1:
            if gamma is None:
                envs = VecNormalize(envs, ret=False)
            else:
                envs = VecNormalize(envs, gamma=gamma)

        envs = VecPyTorch(envs, device)

        if num_frame_stack is not None:
            envs = VecPyTorchFrameStack(envs, num_frame_stack, device)
        elif len(envs.observation_space.shape) == 3:
            envs = VecPyTorchFrameStack(envs, 4, device)

    return envs
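A minimal usage sketch for make_vec_envs (the paths and hyperparameter values are illustrative assumptions; the CoinRun settings follow the comment above, SET_SEED = -1 with 500 levels):

import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Build 8 parallel CoinRun environments that return torch tensors on `device`.
envs = make_vec_envs('CoinRun',
                     seed=0,
                     num_processes=8,
                     gamma=0.99,
                     log_dir='/tmp/monitor_logs',   # hypothetical log directory
                     device=device,
                     allow_early_resets=False,
                     coin_run_level=500,
                     coin_run_seed=-1)
obs = envs.reset()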
Example #2
def main():
    args = setup_utils.setup_and_load()
    setup_utils.load_for_setup_if_necessary()

    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    size = comm.Get_size()
    print('size', size)

    # For wandb package to visualize results curves
    config = Config.get_args_dict()
    wandb.init(project="coinrun",
               notes=" baseline train",
               tags=["baseline", Config.RUN_ID.split('-')[0]],
               config=config)

    seed = int(time.time()) % 10000
    set_global_seeds(seed * 100 + rank)

    utils.setup_mpi_gpus()
    utils.mpi_print('Set up gpu')
    utils.mpi_print(args)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101

    # nenvs is how many envs run in parallel on a CPU
    # the VecEnv class allows parallel rollouts
    nenvs = Config.NUM_ENVS
    total_timesteps = int(256 * 10**6)

    env = utils.make_general_env(nenvs, seed=rank)
    utils.mpi_print('Set up env')

    with tf.Session(config=config):
        env = wrappers.add_final_wrappers(env)

        policy = policies_back.get_policy()
        #policy = policies.get_policy()
        utils.mpi_print('Set up policy')

        learn_func(policy=policy,
                   env=env,
                   log_interval=args.log_interval,
                   save_interval=args.save_interval,
                   nsteps=Config.NUM_STEPS,
                   nminibatches=Config.NUM_MINIBATCHES,
                   lam=Config.GAE_LAMBDA,
                   gamma=Config.GAMMA,
                   noptepochs=Config.PPO_EPOCHS,
                   ent_coef=Config.ENTROPY_COEFF,
                   vf_coef=Config.VF_COEFF,
                   max_grad_norm=Config.MAX_GRAD_NORM,
                   lr=lambda f: f * Config.LEARNING_RATE,
                   cliprange=lambda f: f * Config.CLIP_RANGE,
                   total_timesteps=total_timesteps)
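learn_func is not defined in this snippet. Judging from the identical keyword arguments used with ppo2.learn in the later examples, it is presumably bound to the PPO learner; a hedged guess at the missing binding:

# assumption: the surrounding module binds learn_func to the CoinRun PPO implementation
from coinrun import ppo2
learn_func = ppo2.learn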
Example #3
def main():
    args = setup_utils.setup_and_load(num_levels=250,
                                      starting_level=0,
                                      paint_vel_info=1,
                                      run_id='start0numlev250_256mts_dann_low',
                                      num_envs=32)

    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()

    seed = int(time.time()) % 10000
    set_global_seeds(seed * 100 + rank)

    utils.setup_mpi_gpus()

    #config = tf.ConfigProto()
    frac_gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.3)
    frac_gpu_config = tf.ConfigProto(gpu_options=frac_gpu_options)
    nogpu_config = tf.ConfigProto(device_count={'GPU': 0})
    #config.gpu_options.allow_growth = True # pylint: disable=E1101

    nenvs = Config.NUM_ENVS
    print("Num envs: " + str(Config.NUM_ENVS))
    total_timesteps = int(256e6)
    save_interval = args.save_interval

    env = utils.make_general_env(nenvs, seed=rank)

    with tf.Session(config=frac_gpu_config):
        #with tf.Session(config=nogpu_config):
        env = wrappers.add_final_wrappers(env)

        policy = policies.get_policy()

        ppo2.learn(policy=policy,
                   env=env,
                   save_interval=save_interval,
                   nsteps=Config.NUM_STEPS,
                   nminibatches=Config.NUM_MINIBATCHES,
                   lam=0.95,
                   gamma=Config.GAMMA,
                   noptepochs=Config.PPO_EPOCHS,
                   log_interval=1,
                   ent_coef=Config.ENTROPY_COEFF,
                   lr=lambda f: f * Config.LEARNING_RATE,
                   cliprange=lambda f: f * 0.2,
                   total_timesteps=total_timesteps)
Example #4
def main():
    args = setup_utils.setup_and_load()

    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    size = comm.Get_size()
    print('size', size)

    # For wandb package to visualize results curves
    config = Config.get_args_dict()
    wandb.init(project="coinrun",
               notes="network randomization",
               tags=["baseline"],
               config=config)

    seed = int(time.time()) % 10000
    set_global_seeds(seed * 100 + rank)

    utils.setup_mpi_gpus()

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101

    nenvs = Config.NUM_ENVS
    total_timesteps = int(256e6)

    env = utils.make_general_env(nenvs, seed=rank)

    with tf.Session(config=config):
        env = wrappers.add_final_wrappers(env)

        policy = nr_policies.get_policy()

        nr_ppo2.learn(policy=policy,
                      env=env,
                      save_interval=args.save_interval,
                      nsteps=Config.NUM_STEPS,
                      nminibatches=Config.NUM_MINIBATCHES,
                      lam=0.95,
                      gamma=Config.GAMMA,
                      noptepochs=Config.PPO_EPOCHS,
                      log_interval=1,
                      ent_coef=Config.ENTROPY_COEFF,
                      lr=lambda f: f * Config.LEARNING_RATE,
                      cliprange=lambda f: f * 0.2,
                      total_timesteps=total_timesteps)
Example #5
    def create_saliency(model_idx, sess):
        graph = tf.get_default_graph()
        env = utils.make_general_env(1)
        env = wrappers.add_final_wrappers(env)
        agent = create_act_model(sess, env, 1)
        action_selector = tf.placeholder(tf.int32)
        gradient_saliency = saliency.GradientSaliency(graph, sess, agent.pd.logits[0][action_selector], agent.X)
        sess.run(tf.compat.v1.global_variables_initializer())

        # setup_utils.restore_file(models[model_idx])
        try:
            loaded_params = utils.load_params_for_scope(sess, 'model')
            if not loaded_params:
                print('NO SAVED PARAMS LOADED')
        except AssertionError as e:
            models[model_idx] = None
            return [None]*3
        return agent, gradient_saliency, action_selector
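A rough sketch of how the returned handles might be used to compute a saliency map for a single observation (this assumes the TF1 saliency package API, where GradientSaliency.GetMask takes an unbatched input plus a feed_dict; the observation and action below are dummy placeholders, not values from a real rollout):

import numpy as np

agent, gradient_saliency, action_selector = create_saliency(0, sess)

# Dummy observation shaped like agent.X without the batch axis, and an action index to explain.
obs_frame = np.zeros(agent.X.shape.as_list()[1:], dtype=np.float32)
chosen_action = 0

# Vanilla gradient of the chosen action's logit w.r.t. the input pixels, plus a SmoothGrad variant.
mask = gradient_saliency.GetMask(obs_frame, feed_dict={action_selector: chosen_action})
smooth_mask = gradient_saliency.GetSmoothedMask(obs_frame, feed_dict={action_selector: chosen_action})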
Example #6
def main():
    args = setup_utils.setup_and_load()

    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()

    seed = int(time.time()) % 10000
    set_global_seeds(seed * 100 + rank)

    main_utils.setup_mpi_gpus()

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101
    env = main_utils.Scalarize(main_utils.make_general_env(1, seed=rank))
    print("load path:")
    print("{}/saved_models/{}.pkl".format(Config.SAVE_PATH, Config.RUN_ID))
    act = deepq.learn(
        env,
        network="conv_only",
        convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],
        hiddens=[256],
        total_timesteps=0,
        load_path="{}/saved_models/{}.pkl".format(Config.SAVE_PATH,
                                                  Config.RUN_ID)
        # load_path="{}/ckpts/{}/model".format(Config.SAVE_PATH, Config.RUN_ID)
    )

    num_episodes = 500
    # while True:
    episode_rew_ls = []
    for i in range(num_episodes):
        obs, done = env.reset(), False
        episode_rew = 0
        while not done:
            if Config.RENDER:
                env.render()
            obs, rew, done, _ = env.step(act(obs[None])[0])
            episode_rew += rew
        episode_rew_ls.append(episode_rew)
        print("Episode reward", episode_rew)
    print("Avg episode reward", np.mean(episode_rew_ls))
    print("Var episode reward", np.std(episode_rew_ls))
Example #7
def main():
    args = setup_utils.setup_and_load()

    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()

    seed = int(time.time()) % 10000
    set_global_seeds(seed * 100 + rank)

    utils.setup_mpi_gpus()

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True # pylint: disable=E1101

    nenvs = Config.NUM_ENVS
    total_timesteps = int(160e6)
    if Config.LONG_TRAINING:
        total_timesteps = int(200e6)
    elif Config.SHORT_TRAINING:
        total_timesteps = int(120e6)
    save_interval = args.save_interval

    env = utils.make_general_env(nenvs, seed=rank)

    with tf.compat.v1.Session(config=config):
        env = wrappers.add_final_wrappers(env)
        
        policy = policies.get_policy()

        ppo2.learn(policy=policy,
                    env=env,
                    save_interval=save_interval,
                    nsteps=Config.NUM_STEPS,
                    nminibatches=Config.NUM_MINIBATCHES,
                    lam=0.95,
                    gamma=Config.GAMMA,
                    noptepochs=Config.PPO_EPOCHS,
                    log_interval=1,
                    ent_coef=Config.ENTROPY_COEFF,
                    lr=lambda f : f * Config.LEARNING_RATE,
                    cliprange=lambda f : f * 0.2,
                    total_timesteps=total_timesteps)
Example #8
def enjoy_env_sess(sess):
    should_render = True
    should_eval = Config.TRAIN_EVAL or Config.TEST_EVAL
    rep_count = Config.REP

    if should_eval:
        env = utils.make_general_env(Config.NUM_EVAL)
        should_render = False
    else:
        env = utils.make_general_env(1)

    env = wrappers.add_final_wrappers(env)

    if should_render:
        from gym.envs.classic_control import rendering

    nenvs = env.num_envs

    agent = create_act_model(sess, env, nenvs)

    sess.run(tf.global_variables_initializer())
    loaded_params = utils.load_params_for_scope(sess, 'model')

    if not loaded_params:
        print('NO SAVED PARAMS LOADED')

    obs = env.reset()
    t_step = 0

    if should_render:
        viewer = rendering.SimpleImageViewer()

    should_render_obs = not Config.IS_HIGH_RES

    def maybe_render(info=None):
        if should_render and not should_render_obs:
            env.render()

    maybe_render()

    scores = np.array([0] * nenvs)
    score_counts = np.array([0] * nenvs)
    curr_rews = np.zeros((nenvs, 3))

    def should_continue():
        if should_eval:
            return np.sum(score_counts) < rep_count * nenvs

        return True

    state = agent.initial_state
    done = np.zeros(nenvs)

    while should_continue():
        action, values, state, _ = agent.step(obs, state, done)
        obs, rew, done, info = env.step(action)

        if should_render and should_render_obs:
            if np.shape(obs)[-1] % 3 == 0:
                ob_frame = obs[0, :, :, -3:]
            else:
                ob_frame = obs[0, :, :, -1]
                ob_frame = np.stack([ob_frame] * 3, axis=2)
            viewer.imshow(ob_frame)

        curr_rews[:, 0] += rew

        for i, d in enumerate(done):
            if d:
                if score_counts[i] < rep_count:
                    score_counts[i] += 1

                    if 'episode' in info[i]:
                        scores[i] += info[i].get('episode')['r']

        if t_step % 100 == 0:
            mpi_print('t', t_step, values[0], done[0], rew[0], curr_rews[0],
                      np.shape(obs))

        maybe_render(info[0])

        t_step += 1

        if should_render:
            time.sleep(.02)

        if done[0]:
            if should_render:
                mpi_print('ep_rew', curr_rews)

            curr_rews[:] = 0

    result = 0

    if should_eval:
        mean_score = np.mean(scores) / rep_count
        max_idx = np.argmax(scores)
        mpi_print('scores', scores / rep_count)
        print('mean_score', mean_score)
        mpi_print('max idx', max_idx)

        mpi_mean_score = utils.mpi_average([mean_score])
        mpi_print('mpi_mean', mpi_mean_score)

        result = mean_score

    return result
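A minimal sketch of how this evaluation routine might be driven (assuming the same Config/setup_utils helpers as the training scripts; the restore_file call mirrors the restore pattern used in Example #14):

def main():
    setup_utils.setup_and_load()
    # restore the weights saved under Config.RESTORE_ID before evaluating
    setup_utils.restore_file(Config.RESTORE_ID)
    with tf.Session() as sess:
        mean_score = enjoy_env_sess(sess)
        print('mean score:', mean_score)

if __name__ == '__main__':
    main()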
Example #9
def main():
    print("line 1...")
    args = setup_utils.setup_and_load()
    print("passed line 1...")
    comm = MPI.COMM_WORLD
    print("passed line 2...")
    rank = comm.Get_rank()  #the rank of the process in a communicator
    print("passed line 3...")
    seed = int(time.time()) % 10000
    set_global_seeds(seed * 100 + rank)
    print("passed line 4,5...")
    # utils.setup_mpi_gpus()
    print("passed line 6...")
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101

    # nenvs = Config.NUM_ENVS
    nenvs = 1  #set to 1 temporarily
    # frame_stack_size = Config.FRAME_STACK_SIZE
    frame_stack_size = 1
    total_timesteps = int(5e6)
    save_interval = args.save_interval

    env_id = "MsPacman-v0"

    #copy from https://github.com/openai/baselines/blob/52255beda5f5c8760b0ae1f676aa656bb1a61f80/baselines/run.py#L33
    _game_envs = defaultdict(set)
    for env in gym.envs.registry.all():
        # TODO: solve this with regexes
        env_type = env._entry_point.split(':')[0].split('.')[-1]
        _game_envs[env_type].add(env.id)

    # env = make_vec_env(env_id, env_type, nenvs, seed, gamestate=args.gamestate, reward_scale=args.reward_scale)
    """
    save_interval = args.save_interval
    
    env = utils.make_general_env(nenvs, seed=rank)

    with tf.Session(config=config):
        env = wrappers.add_final_wrappers(env)
        
        policy = policies.get_policy()
    """
    env = utils.make_general_env(env_id, env_type, nenvs, seed)
    # env = make_vec_env(env_id, env_type, nenvs, seed)
    # env = VecFrameStack(env, frame_stack_size)

    # env = utils.make_general_env(nenvs, seed=rank)

    with tf.Session(config=config):
        # env = wrappers.add_final_wrappers(env) #don't use wrappers anymore
        env = wrappers.add_final_wrappers(env)
        policy = policies.get_policy()

        # ppo2.learn(policy=policy,
        #             env=env,
        #             save_interval=save_interval,
        #             nsteps=Config.NUM_STEPS,
        #             nminibatches=Config.NUM_MINIBATCHES,
        #             lam=0.95,
        #             gamma=Config.GAMMA,
        #             noptepochs=Config.PPO_EPOCHS,
        #             log_interval=1,
        #             ent_coef=Config.ENTROPY_COEFF,
        #             lr=lambda f : f * Config.LEARNING_RATE,
        #             cliprange=lambda f : f * 0.2,
        #             total_timesteps=total_timesteps)
        ppo2.learn(policy=policy,
                   env=env,
                   save_interval=save_interval,
                   nsteps=int(1000),
                   nminibatches=100,
                   lam=0.95,
                   gamma=0.9,
                   noptepochs=16,
                   log_interval=1,
                   ent_coef=0.1,
                   lr=lambda f: f * 3e-4,
                   cliprange=lambda f: f * 0.2,
                   total_timesteps=total_timesteps)
Example #10
def main():
    # set seed
    args = setup_utils.setup_and_load()
    print("args are")
    print(args)

    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()

    seed = 0
    set_global_seeds(seed * 100 + rank)

    # Initialize env
    nenvs = 1
    env_init_size = nenvs
    env = utils.make_general_env(nenvs, seed=rank)

    # check for a GPU and use it if available, otherwise use the CPU
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("device is {}".format(device))

    # wrap env (not needed with Coinrun options)
    #env = dqn_utils.wrap_deepmind(env, clip_rewards=False, frame_stack=True, scale=False)
    action_size = env.action_space.n

    #env.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)

    # hyperparameters
    timesteps = 1000000  #2000000 #1000#2000000  # run env for this many time steps
    hidden_size = 512  # size of the hidden layer of the FFNN that connects the CNN to the outputs
    learning_rate = 0.0001  # learning rate of optimizer
    batch_size = 32  # size of batch trained on
    #start_training_after = 50 #10000  # start training NN after this many timesteps
    discount = 0.99  # discount future states by
    is_dueling = True
    is_impala_net = False

    frame_skip = 4  #hold action for this many frames

    # create DQN Agent
    dqn_agent = dqn_utils.DQNAgent(action_size, hidden_size, learning_rate,
                                   is_dueling, is_impala_net)

    train_time_id = 1587836568
    test_time_id = int(time.time())
    load_array = []
    for i in range(20000, 250000, 20000):
        load_array.append(i)
    load_array.append('FINAL')
    # load agent
    for i in load_array:
        model_number = i
        PATH = "saved_models/dqn_model_{}_{}.pt".format(
            train_time_id, model_number)
        dqn_agent.train_net.load_state_dict(torch.load(PATH))
        dqn_agent.train_net.eval()

        # training loop
        stats_rewards_list = []  # store stats for plotting in this
        stats_every = 1  # print stats every this many episodes
        total_reward = 0.
        episode = 1
        episode_length = 0
        stats_loss = 0.
        epsilon = 0.02
        # can't call env.reset() in coinrun, so start each episode with a no-op action
        state_list, _, _, _ = env.step(np.array([0], dtype=np.int32))

        for ts in range(timesteps):
            # select an action from the agent's policy
            action = dqn_agent.select_action(state_list[0].squeeze(axis=-1),
                                             epsilon, env, batch_size)

            # enter action into the env
            for _ in range(frame_skip):
                next_state_list, reward_list, done_list, _ = env.step(action)
                total_reward += reward_list[0]
                if done_list[0]:
                    break
            done = done_list[0]
            episode_length += 1

            if done:
                state_list = env.reset()

                stats_rewards_list.append(
                    (episode, total_reward, episode_length))
                episode += 1
                if episode >= 201:
                    break
                if episode % stats_every == 0:
                    print(
                        'Episode: {}'.format(episode),
                        'Timestep: {}'.format(ts),
                        'Episode reward {}'.format(total_reward),
                        'Episode len {}'.format(episode_length),
                        'Mean reward: {:.1f}'.format(
                            np.mean(stats_rewards_list,
                                    axis=0)[1]), 'Mean length: {:.1f}'.format(
                                        np.mean(stats_rewards_list,
                                                axis=0)[2]))
                    stats_loss = 0.

                total_reward = 0
                episode_length = 0
            else:
                state_list = next_state_list

        #save final stats
        stats_save_string = "saved_models/test_env_stats_{}_{}_{}.pickle".format(
            train_time_id, model_number, test_time_id)
        with open(stats_save_string, 'wb') as handle:
            pickle.dump(stats_rewards_list, handle)
Example #11
def enjoy_env_sess(sess, DIR_NAME):
    should_render = True
    should_eval = Config.TRAIN_EVAL or Config.TEST_EVAL
    rep_count = Config.REP

    file_name = '%s/%s.txt' % (DIR_NAME, Config.RESTORE_ID)
    f_io = open(file_name, 'a')

    if should_eval:
        if Config.TEST_NUM_EVAL > -1:
            env = utils.make_general_env(Config.TEST_NUM_EVAL)
        else:
            env = utils.make_general_env(Config.NUM_EVAL)
        should_render = False
    else:
        env = utils.make_general_env(1)

    env = wrappers.add_final_wrappers(env)

    if should_render:
        from gym.envs.classic_control import rendering

    nenvs = env.num_envs

    vae = ConvVAE(z_size=Config.VAE_Z_SIZE,
                  batch_size=nenvs,
                  is_training=False,
                  reuse=False,
                  gpu_mode=True,
                  use_coord_conv=True)
    agent = create_act_model(sess, env, nenvs, Config.VAE_Z_SIZE)
    num_actions = env.action_space.n

    init_rand = tf.variables_initializer(
        [v for v in tf.global_variables() if 'randcnn' in v.name])
    sess.run(tf.compat.v1.global_variables_initializer())

    soft_numpy = tf.placeholder(tf.float32, [nenvs, num_actions],
                                name='soft_numpy')
    dist = tfp.distributions.Categorical(probs=soft_numpy)
    sampled_action = dist.sample()

    loaded_params = utils.load_params_for_scope(sess, 'model')
    vae.load_json_full(Config.VAE_PATH)

    if not loaded_params:
        print('NO SAVED PARAMS LOADED')

    obs = env.reset()
    t_step = 0

    if should_render:
        viewer = rendering.SimpleImageViewer()

    should_render_obs = not Config.IS_HIGH_RES

    def maybe_render(info=None):
        if should_render and not should_render_obs:
            env.render()

    maybe_render()

    scores = np.array([0] * nenvs)
    score_counts = np.array([0] * nenvs)
    curr_rews = np.zeros((nenvs, 3))

    def should_continue():
        if should_eval:
            return np.sum(score_counts) < rep_count * nenvs

        return True

    state = agent.initial_state
    done = np.zeros(nenvs)

    actions = [env.action_space.sample() for _ in range(nenvs)]
    actions = np.array(actions)
    obs, _, _, _ = env.step(actions)

    sess.run(init_rand)
    while should_continue():

        #scipy.misc.imsave('raw_inputs.png', obs[0])
        encoder_in = obs.astype(np.float32) / 255.0
        batch_z = vae.encode(encoder_in)
        #reconstruct = vae.decode(batch_z)
        #scipy.misc.imsave('recon.png', reconstruct[0])

        action, values, state, _ = agent.step(batch_z, state, done)
        obs, rew, done, info = env.step(action)

        if should_render and should_render_obs:
            if np.shape(obs)[-1] % 3 == 0:
                ob_frame = obs[0, :, :, -3:]
            else:
                ob_frame = obs[0, :, :, -1]
                ob_frame = np.stack([ob_frame] * 3, axis=2)
            viewer.imshow(ob_frame)

        curr_rews[:, 0] += rew

        for i, d in enumerate(done):
            if d:
                if score_counts[i] < rep_count:
                    score_counts[i] += 1

                    if 'episode' in info[i]:
                        scores[i] += info[i].get('episode')['r']

        maybe_render(info[0])

        t_step += 1

        if should_render:
            time.sleep(.02)

        if done[0]:
            if should_render:
                mpi_print('ep_rew', curr_rews)

            curr_rews[:] = 0

    result = 0

    if should_eval:
        mean_score = np.mean(scores) / rep_count
        max_idx = np.argmax(scores)

        result = mean_score

        f_io.write("{}\n".format(result))
        f_io.close()

    return result
Example #12
def main(sess):

    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()

    seed = int(time.time()) % 10000

    if Config.EXTRACT_SEED != -1:
        seed = Config.EXTRACT_SEED
    if Config.EXTRACT_RANK != -1:
        rank = Config.EXTRACT_RANK

    set_global_seeds(seed * 100 + rank)

    utils.setup_mpi_gpus()

    config = tf.compat.v1.ConfigProto()
    config.gpu_options.allow_growth = True # pylint: disable=E1101

    use_policy = (Config.RESTORE_ID != '')

    nenvs = Config.NUM_ENVS
    total_timesteps = int(502e6)
    env = utils.make_general_env(nenvs, seed=rank)

    if use_policy:
        agent = create_act_model(sess, env, nenvs)
        sess.run(tf.compat.v1.global_variables_initializer())
        loaded_params = utils.load_params_for_scope(sess, 'model')
        if not loaded_params:
            print('NO SAVED PARAMS LOADED')

    # make directory
    DIR_NAME = './VAE/records/'
    if not os.path.exists(DIR_NAME):
        os.makedirs(DIR_NAME, exist_ok=True)
    
    # set file name
    filename = DIR_NAME+"/"+Config.get_save_file()+"_"+str(seed * 100 + rank)+".npz"
    
    with tf.compat.v1.Session(config=config):
        env = wrappers.add_final_wrappers(env)
        nenv = env.num_envs if hasattr(env, 'num_envs') else 1
        obs = np.zeros((nenv,) + env.observation_space.shape, dtype=env.observation_space.dtype.name)
        obs[:] = env.reset()
        dones = [False for _ in range(nenv)]
        
        # remove noisy inputs
        actions = [env.action_space.sample() for _ in range(nenv)]
        actions = np.array(actions)
        obs[:], rewards, dones, _ = env.step(actions)
        state = agent.initial_state if use_policy else None
        
        mb_obs, mb_rewards, mb_actions, mb_next_obs, mb_dones = [],[],[],[],[]
        # For n in range number of steps
        for _ in range(400):
            # Given observations, get actions (and values/neglogpacs) from the policy
            # obs is already populated because env.reset() was called above
            if use_policy:
                actions, _, _, _ = agent.step(obs, state, dones)
            else:
                actions = [env.action_space.sample() for _ in range(nenv)]
            actions = np.array(actions)
            mb_obs.append(obs.copy())
            mb_actions.append(actions)
            mb_dones.append(dones)
            
            # Take actions in the env and record the results
            obs[:], rewards, dones, _ = env.step(actions)
            mb_next_obs.append(obs.copy())
            mb_rewards.append(rewards)
        #batch of steps to batch of rollouts
        mb_obs = np.asarray(mb_obs, dtype=obs.dtype)
        mb_next_obs = np.asarray(mb_next_obs, dtype=obs.dtype)
        mb_rewards = np.asarray(mb_rewards, dtype=np.float32)
        mb_actions = np.asarray(mb_actions)
        mb_dones = np.asarray(mb_dones, dtype=np.bool)
        
        #np.savez_compressed(filename, obs=mb_obs, action=mb_actions, next_obs=mb_next_obs, reward=mb_rewards, dones=mb_dones)
        np.savez_compressed(filename, obs=mb_obs)
        return filename
Example #13
def make_env():
    env = utils.make_general_env(nenvs, seed=rank)
    env = wrappers.add_final_wrappers(env)
    return env
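This fragment is an environment factory that closes over nenvs and rank from an enclosing scope; a hedged sketch of how it would typically be consumed, mirroring the training scripts above:

with tf.Session(config=config):
    env = make_env()  # build the vectorized, wrapped CoinRun env
    ppo2.learn(policy=policies.get_policy(), env=env, nsteps=Config.NUM_STEPS,
               nminibatches=Config.NUM_MINIBATCHES, lam=0.95, gamma=Config.GAMMA,
               noptepochs=Config.PPO_EPOCHS, log_interval=1,
               ent_coef=Config.ENTROPY_COEFF, lr=lambda f: f * Config.LEARNING_RATE,
               cliprange=lambda f: f * 0.2, total_timesteps=int(256e6))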
Example #14
def enjoy_env_sess(sess, checkpoint, overlap):
    #base_name = str(8*checkpoint)  + 'M'
    #load_file = setup_utils.restore_file(Config.RESTORE_ID,base_name=base_name)
    should_eval = True
    mpi_print('test levels seed', Config.SET_SEED)
    mpi_print('test levels ', Config.NUM_LEVELS)
    rep_count = 50

    env = utils.make_general_env(20)
    env = wrappers.add_final_wrappers(env)
    nenvs = env.num_envs

    sess.run(tf.global_variables_initializer())
    args_now = Config.get_args_dict()
    #args_run = utils.load_args()
    agent = create_act_model(sess, env, nenvs)

    # the load name is specified by Config.RESTORE_ID; restore_file returns True/False
    if checkpoint == 0:
        mean_score = 0.0
        succ_rate = 0.0
        steps_elapsed = 0
        wandb.log({
            'Rew_mean': mean_score,
            'Succ_rate': succ_rate,
            'Step_elapsed': steps_elapsed
        })
        return mean_score, succ_rate
    elif checkpoint != 32:
        base_name = str(8 * checkpoint) + 'M'
    else:
        base_name = None

    sess.run(tf.global_variables_initializer())
    # env init here
    load_file = setup_utils.restore_file(Config.RESTORE_ID,
                                         overlap_config=overlap,
                                         base_name=base_name)

    is_loaded = utils.load_params_for_scope(sess, 'model')
    if not is_loaded:
        mpi_print('NO SAVED PARAMS LOADED')
        return 0.0, 0.0

    obs = env.reset()
    t_step = 0

    scores = np.zeros((nenvs, rep_count))
    eplens = np.zeros((nenvs, rep_count))
    #scores = np.array([0] * nenvs)
    score_counts = np.array([0] * nenvs)

    # curr_rews = np.zeros((nenvs, 3))

    def should_continue():
        if should_eval:
            return np.sum(score_counts) < rep_count * nenvs

        return True

    state = agent.initial_state
    done = np.zeros(nenvs)

    def rollout(obs, state, done):
        """rollout for rep * nenv times and return scores"""
        t = 0
        count = 0
        rews = np.zeros((nenvs, rep_count))
        while should_continue():
            action, values, state, _ = agent.step(obs, state, done)
            obs, rew, done, info = env.step(action)
            rews[:, count] += rew
            t += 1

            for i, d in enumerate(done):
                if d:
                    eplens[i][count] = t
                    if score_counts[i] < rep_count:
                        score_counts[i] += 1
                        count = score_counts[i] - 1
                        # aux score
                        if 'episode' in info[i]:
                            scores[i][count] = info[i].get('episode')['r']

        return scores, rews, eplens

    if is_loaded:
        mpi_print(load_file)
        scores, rews, eplens = rollout(obs, state, done)

    size = MPI.COMM_WORLD.Get_size()
    rank = MPI.COMM_WORLD.Get_rank()
    if size == 1:
        if rank == 0:
            testset_size = rep_count * nenvs
            utils.save_pickle(scores, Config.LOGDIR + 'scores')
            mean_score = np.sum(scores) / testset_size
            succ_rate = np.sum(scores == 10.0) / testset_size
            mpi_print('cpus ', size)
            mpi_print('testset size', testset_size)
            # NUM_LEVELS = 0 means an unbounded level set, so the test-set size is rep_count * nenvs
            # each episode gets a new level seed (possibly repeated)
            # mpi_print('score detail',scores.flatten())
            mpi_print('succ_rate', succ_rate)
            steps_elapsed = checkpoint * 8000000
            mpi_print('steps_elapsed:', steps_elapsed)
            mpi_print('mean score', mean_score)
            wandb.log({
                'Rew_mean': mean_score,
                'Succ_rate': succ_rate,
                'Step_elapsed': steps_elapsed
            })
            #mpi_print('mean score of each env',[np.mean(s) for s in scores])
    else:
        testset_size = rep_count * nenvs
        succ = np.sum(scores == 10.0) / testset_size
        succ_rate = utils.mpi_average([succ])
        mean_score_tmp = np.sum(scores) / testset_size
        mean_score = utils.mpi_average([mean_score_tmp])
        if rank == 0:
            mpi_print('testset size', rep_count * nenvs * size)
            mpi_print('load file name', load_file)
            mpi_print('testset size', testset_size)
            # NUM_LEVELS = 0 means an unbounded level set, so the test-set size is rep_count * nenvs
            # each episode gets a new level seed (possibly repeated)
            # mpi_print('score detail',scores.flatten())
            mpi_print('succ_rate', succ_rate)
            mpi_print('mean score', mean_score)
            wandb.log({'Rew_mean': mean_score, 'Succ_rate': succ_rate})

    return mean_score, succ_rate
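A hedged sketch of how this checkpoint evaluator might be invoked (wandb.init is required before the wandb.log calls inside; the overlap dict contents are hypothetical config overrides forwarded to setup_utils.restore_file):

wandb.init(project="coinrun", tags=["eval"], config=Config.get_args_dict())
overlap = {}  # hypothetical config overrides

with tf.Session() as sess:
    # evaluate the first 8M-step checkpoint; checkpoint=32 would load the final weights instead
    mean_score, succ_rate = enjoy_env_sess(sess, checkpoint=1, overlap=overlap)
    print('mean score', mean_score, 'success rate', succ_rate)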
Example #15
def test(sess, load_path, env, should_render=False, rep_count=Config.REP):
    rank = MPI.COMM_WORLD.Get_rank()
    size = MPI.COMM_WORLD.Get_size()

    should_eval = Config.TRAIN_EVAL or Config.TEST_EVAL
    if should_eval:
        #env = utils.make_general_env(Config.NUM_EVAL)
        should_render = False
    else:
        env = utils.make_general_env(1)

    env = wrappers.add_final_wrappers(env)

    if should_render:
        from gym.envs.classic_control import rendering

    nenvs = env.num_envs

    model = load_model(sess, load_path)

    agent = create_act_model(sess, env, nenvs)

    sess.run(tf.global_variables_initializer())
    loaded_params = utils.load_params_for_scope(sess, 'model')

    if not loaded_params:
        print('NO SAVED PARAMS LOADED')

    obs = env.reset()
    t_step = 0

    if should_render:
        viewer = rendering.SimpleImageViewer()

    should_render_obs = not Config.IS_HIGH_RES

    def maybe_render(info=None):
        if should_render and not should_render_obs:
            env.render()

    maybe_render()

    scores = np.array([0] * nenvs)
    score_counts = np.array([0] * nenvs)
    curr_rews = np.zeros((nenvs, 3))

    def should_continue():
        if should_eval:
            return np.sum(score_counts) < rep_count * nenvs

        return True

    state = agent.initial_state
    done = np.zeros(nenvs)

    while should_continue():
        action, values, state, _ = agent.step(obs, state, done)
        obs, rew, done, info = env.step(action)

        if should_render and should_render_obs:
            if np.shape(obs)[-1] % 3 == 0:
                ob_frame = obs[0, :, :, -3:]
            else:
                ob_frame = obs[0, :, :, -1]
                ob_frame = np.stack([ob_frame] * 3, axis=2)
            viewer.imshow(ob_frame)

        curr_rews[:, 0] += rew

        for i, d in enumerate(done):
            if d:
                if score_counts[i] < rep_count:
                    score_counts[i] += 1

                    if 'episode' in info[i]:
                        scores[i] += info[i].get('episode')['r']

        if t_step % 100 == 0:
            mpi_print('t', t_step, values[0], done[0], rew[0], curr_rews[0],
                      np.shape(obs))

        maybe_render(info[0])

        t_step += 1

        if should_render:
            time.sleep(.02)

        if done[0]:
            if should_render:
                mpi_print('ep_rew', curr_rews)

            curr_rews[:] = 0

    result = {
        'steps_elapsed': steps_elapsed,
    }

    if should_eval:
        testset_size = rep_count * nenvs
        mean_score = np.sum(scores) / testset_size
        succ_rate = np.sum(scores == 10.0) / testset_size
        max_idx = np.argmax(scores)
        mpi_print('max idx', max_idx)
        mpi_print('steps_elapsed', steps_elapsed)
        if size > 1:
            mean_score = utils.mpi_average([mean_score])
        mpi_print('mpi_mean', mean_score)
        wandb.log({'Test_Rew_mean': mean_score, 'Test_Succ_rate': succ_rate})
        result['scores'] = scores
        result['testset_size'] = testset_size
        result['test_rew_mean'] = mean_score
        result['test_succ_rate'] = succ_rate

    return result
Example #16
def main():
    # check for a GPU and use it if available, otherwise use the CPU
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("device is {}".format(device))

    # arguments
    args = setup_utils.setup_and_load()
    print("Arguments are")
    print(args)

    # set seed
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    time_int = int(time.time())
    seed = time_int % 10000
    set_global_seeds(seed * 100 + rank)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)

    if Config.NUM_ENVS > 1:
        print("To do: add multi env support")
    nenvs = 1  #Config.NUM_ENVS
    env = utils.make_general_env(nenvs, seed=rank)

    # wrap env (not needed with Coinrun options)
    #env = dqn_utils.wrap_deepmind(env, clip_rewards=False, frame_stack=True, scale=False)
    action_size = env.action_space.n

    # set up pysyft workers
    num_workers = 2
    hook = sy.TorchHook(torch)
    worker_1 = sy.VirtualWorker(hook, id='worker_1')
    worker_2 = sy.VirtualWorker(hook, id='worker_2')
    secure_worker = sy.VirtualWorker(hook, id='secure_worker')
    worker_list = []
    worker_list.append(worker_1)
    worker_list.append(worker_2)

    # Training hyperparameters
    timesteps = 250000  #2000000 #1000#2000000  # run env for this many time steps
    hidden_size = 512  # size of the hidden layer of the FFNN that connects the CNN to the outputs
    is_dueling = True
    is_impala_net = False
    learning_rate = 0.0001  # learning rate of optimizer
    batch_size = 32  # size of batch trained on
    start_training_after = 10000  # start training NN after this many timesteps
    discount = 0.99  # discount future states by

    epsilon_start = 1.0  # epsilon greedy start value
    epsilon_min = 0.02  # epsilon greedy end value
    epsilon_decay_steps = timesteps * .5  # decay epsilon over this many timesteps
    epsilon_step = (epsilon_start - epsilon_min) / (
        epsilon_decay_steps)  # decrement epsilon by this amount every timestep

    update_target_every = 1  # update target network every this steps
    tau = 0.001  # soft target updating amount

    frame_skip = 4  #hold action for this many frames
    save_every = 10000  #timesteps to save model after
    train_every = 1  # number of times to train

    # create replay buffer
    replay_size = 50000  # size of replay buffer
    replay_buffer_list = []
    for i in range(num_workers):
        replay_buffer = dqn_utils.ReplayBuffer(max_size=replay_size)
        replay_buffer_list.append(replay_buffer)

    # create DQN Agent
    dqn_agent = dqn_utils.DQNAgent(action_size, hidden_size, learning_rate,
                                   is_dueling, is_impala_net)

    # create states for every env
    stats_every = 10  #10  # print stats every this many episodes
    stats_list = []  # store stats for each env init here
    for i in range(num_workers):
        temp_dict = {}
        temp_dict['episode'] = 0
        temp_dict['mean_reward_total'] = 0.
        temp_dict['mean_ep_length_total'] = 0.
        temp_dict['mean_reward_recent'] = 0.
        temp_dict['mean_ep_length_recent'] = 0.
        temp_dict['episode_loss'] = 0.
        temp_dict['episode_reward'] = 0.
        temp_dict['episode_length'] = 0.
        stats_list.append(temp_dict)

    # training loop
    epsilon = epsilon_start
    # take no_action on first step to get state
    # use state to tell which level
    # env.reset() does not produce an observation in CoinRun until an action is taken
    no_action = np.zeros((nenvs, ), dtype=np.int32)
    state_list, _, _, _ = env.step(no_action)

    # assign each level to a worker
    # coinrun doesn't have a way to tell the current level so take mean of first screen of level and use dictionary to assign levels
    # worker_level is used to tell which replay buffer to put data into (ie which worker is training)
    level_worker_dict = {}
    levels_assigned = 0

    def get_worker_level(state, lw_dict, la, nw):
        temp_key = int(1000 * np.mean(state))
        if temp_key not in lw_dict:
            la += 1
            lw_dict[temp_key] = la % nw
            print("Adding new key to level_worker_dict. current size is: {}".
                  format(len(lw_dict)))
            print(lw_dict)
        return lw_dict[temp_key], lw_dict, la

    worker_level, level_worker_dict, levels_assigned = get_worker_level(
        state_list[0], level_worker_dict, levels_assigned, num_workers)

    for ts in range(timesteps):
        # decay epsilon
        epsilon -= epsilon_step
        if epsilon < epsilon_min:
            epsilon = epsilon_min

        # select an action from the agent's policy
        action = dqn_agent.select_action(state_list[0].squeeze(axis=-1),
                                         epsilon, env, batch_size)

        # enter action into the env
        reward_frame_skip = 0.
        for _ in range(frame_skip):
            next_state_list, reward_list, done_list, _ = env.step(action)
            stats_list[worker_level]['episode_reward'] += reward_list[0]
            reward_frame_skip += reward_list[0]
            if done_list[0]:
                break
        done = done_list[0]
        stats_list[worker_level]['episode_length'] += 1

        # add experience to replay buffer
        replay_buffer_list[worker_level].add(
            (state_list[0].squeeze(axis=-1),
             next_state_list[0].squeeze(axis=-1), action, reward_frame_skip,
             float(done)))

        if done:
            # env.reset doesn't reset the coinrun env but does produce an image of the first frame, which we can use to get the worker_level
            state_list = env.reset()
            worker_level, level_worker_dict, levels_assigned = get_worker_level(
                state_list[0], level_worker_dict, levels_assigned, num_workers)

            #update stats
            stats_list[worker_level]['episode'] += 1
            #overall averages
            stats_list[worker_level]['mean_reward_total'] = (stats_list[worker_level]['mean_reward_total'] * (
                        stats_list[worker_level]['episode'] - 1) + stats_list[worker_level]['episode_reward']) / \
                                                            stats_list[worker_level]['episode']
            stats_list[worker_level]['mean_ep_length_total'] = (stats_list[worker_level]['mean_ep_length_total'] * (
                        stats_list[worker_level]['episode'] - 1) + stats_list[worker_level]['episode_length']) / \
                                                               stats_list[worker_level]['episode']
            # keep running average of last stats_every episodes
            if stats_list[worker_level]['episode'] >= stats_every:
                temp_episodes_num = stats_every
            else:
                temp_episodes_num = stats_list[worker_level]['episode']
            stats_list[worker_level]['mean_reward_recent'] = (
                stats_list[worker_level]['mean_reward_recent'] *
                (temp_episodes_num - 1) +
                stats_list[worker_level]['episode_reward']) / temp_episodes_num
            stats_list[worker_level]['mean_ep_length_recent'] = (
                stats_list[worker_level]['mean_ep_length_recent'] *
                (temp_episodes_num - 1) +
                stats_list[worker_level]['episode_length']) / temp_episodes_num
            # reset episode stats
            stats_list[worker_level]['episode_reward'] = 0.
            stats_list[worker_level]['episode_length'] = 0

            # print stats
            if stats_list[worker_level]['episode'] % stats_every == 0:
                print(
                    'w: {}'.format(worker_level),
                    'epi: {}'.format(stats_list[worker_level]['episode']),
                    't: {}'.format(ts), 'r: {:.1f}'.format(
                        stats_list[worker_level]['mean_reward_total']),
                    'l: {:.1f}'.format(
                        stats_list[worker_level]['mean_ep_length_total']),
                    'r r: {:.1f}'.format(
                        stats_list[worker_level]['mean_reward_recent']),
                    'r l: {:.1f}'.format(
                        stats_list[worker_level]['mean_ep_length_recent']),
                    'eps: {:.2f}'.format(epsilon), 'loss: {:.1f}'.format(
                        stats_list[worker_level]['episode_loss']))

                stats_list[worker_level]['episode_loss'] = 0.
        else:
            state_list = next_state_list

        if ts > start_training_after:
            # train the agent
            # a typical DQN gathers experiences and trains once per iteration
            # train_every can modify that to 'train_every' many times every 'train_every'th iteration
            # example: if train_every=10 then train 10 times every 10th iteration
            if ts % train_every == 0:
                # pysyft federated learning training
                # copy model to each worker
                # each worker trains on its own data from its own replay buffer
                # updated models from each worker sent to a secure worker who updates the new model
                worker_dqn_list = []
                worker_dqn_target_list = []
                worker_opt_list = []
                for i in range(num_workers):
                    worker_dqn_list.append(dqn_agent.train_net.copy().send(
                        worker_list[i]))
                    worker_dqn_target_list.append(
                        dqn_agent.target_net.copy().send(worker_list[i]))
                    worker_opt_list.append(
                        optim.Adam(params=worker_dqn_list[i].parameters(),
                                   lr=learning_rate))

                for i in range(num_workers):
                    for _ in range(train_every):
                        # sample a batch from the replay buffer
                        x0, x1, a, r, d = replay_buffer_list[i].sample(
                            batch_size)
                        # turn batches into tensors and attach them to the GPU if available
                        state_batch = torch.FloatTensor(x0).to(device)
                        state_batch = torch.unsqueeze(state_batch, dim=1)
                        next_state_batch = torch.FloatTensor(x1).to(device)
                        next_state_batch = torch.unsqueeze(next_state_batch,
                                                           dim=1)
                        action_batch = torch.LongTensor(a).to(device)
                        reward_batch = torch.FloatTensor(r).to(device)
                        done_batch = torch.FloatTensor(1. - d).to(device)

                        # send data to worker
                        worker_state_batch = state_batch.send(worker_list[i])
                        worker_next_state_batch = next_state_batch.send(
                            worker_list[i])
                        worker_action_batch = action_batch.send(worker_list[i])
                        worker_reward_batch = reward_batch.send(worker_list[i])
                        worker_done_batch = done_batch.send(worker_list[i])

                        train_q = worker_dqn_list[i](
                            worker_state_batch).gather(1, worker_action_batch)

                        with torch.no_grad():
                            # Double DQN: get argmax values from train network, use argmax in target network
                            train_argmax = worker_dqn_list[i](
                                worker_next_state_batch).max(1)[1].view(
                                    batch_size, 1)
                            target_net_q = worker_reward_batch + worker_done_batch * discount * \
                                            worker_dqn_target_list[i](worker_next_state_batch).gather(1, train_argmax)

                        # get loss between train q values and target q values
                        # DQN implementations typically use MSE loss or Huber loss (smooth_l1_loss is similar to Huber)
                        # loss_fn = nn.MSELoss()
                        # loss = loss_fn(train_q, target_net_q)
                        loss = F.smooth_l1_loss(train_q, target_net_q)

                        # optimize the parameters with the loss
                        worker_opt_list[i].zero_grad()
                        loss.backward()
                        for param in worker_dqn_list[i].parameters():
                            param.grad.data.clamp_(-1, 1)
                        worker_opt_list[i].step()
                        # get loss stats
                        #print("loss is {}".format(loss))
                        temp_loss = loss.get()
                        #print("loss get is {}".format(temp_loss))
                        stats_list[i]['episode_loss'] += temp_loss.detach(
                        ).cpu().numpy()

                    # move the worker trained model to secure worker for updating the centralized DQN
                    worker_dqn_list[i].move(secure_worker)
                    with torch.no_grad():
                        # the first worker replaces the centralized DQN parameters, then keep a running average as each subsequent worker's params arrive
                        if i == 0:
                            dqn_agent.train_net.load_state_dict(
                                worker_dqn_list[i].get().state_dict())
                        else:
                            tau = 1. / (1 + i)
                            temp_net = worker_dqn_list[i].get()
                            for dqn_var, temp_var in zip(
                                    dqn_agent.train_net.parameters(),
                                    temp_net.parameters()):
                                dqn_var.data.copy_((1. - tau) * dqn_var.data +
                                                   (tau) * temp_var.data)

            # save the network
            if ts % save_every == 0:
                save_string = "saved_models/dqn_model_{}_{}.pt".format(
                    time_int, ts)
                torch.save(dqn_agent.train_net.state_dict(), save_string)
                stats_save_string = "saved_models/stats_{}_{}.pickle".format(
                    time_int, ts)
                with open(stats_save_string, 'wb') as handle:
                    pickle.dump(stats_list, handle)
            # update the target network
            dqn_agent.update_target_network_soft(ts, update_target_every, tau)

    print("save final model")
    save_string = "saved_models/dqn_model_{}_FINAL.pt".format(time_int)
    torch.save(dqn_agent.train_net.state_dict(), save_string)
    stats_save_string = "saved_models/stats_{}_FINAL.pickle".format(time_int)
    with open(stats_save_string, 'wb') as handle:
        pickle.dump(stats_list, handle)
Example #17
def train():
    args = setup_utils.setup_and_load()

    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()

    seed = int(time.time()) % 10000
    set_global_seeds(seed * 100 + rank)

    main_utils.setup_mpi_gpus()

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101
    env = main_utils.Scalarize(main_utils.make_general_env(1, seed=rank))
    print("==================================")
    print("Learning rate :{}, batch size: {}".format(Config.LR,
                                                     Config.BATCH_SIZE))

    act = deepq.learn(
        env,
        # network=Config.ARCHITECTURE,
        network="conv_only",
        convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],
        hiddens=[256],
        lr=Config.LR,
        batch_size=Config.BATCH_SIZE,
        gamma=0.99,
        total_timesteps=Config.TOTAL_TIMESTEPS,
        buffer_size=Config.BUFFER_SIZE,
        print_freq=10,
        checkpoint_freq=Config.CHECKPOINT_FREQ,
        checkpoint_path="{}/ckpts/{}".format(Config.SAVE_PATH, Config.RUN_ID),
        # render=Config.RENDER,
        callback=None,
        exploration_fraction=0.5,
        exploration_final_eps=0.1,
        prioritized_replay=True,
        train_freq=4,
        learning_starts=10000,
        target_network_update_freq=1000)
    # act = deepq.learn(
    #                 env,
    #                 # network=Config.ARCHITECTURE,
    #                 network="impala_cnn",
    #                 lr=Config.LR,
    #                 batch_size=Config.BATCH_SIZE,
    #                 gamma=0.99,
    #                 total_timesteps=Config.TOTAL_TIMESTEPS,
    #                 buffer_size=Config.BUFFER_SIZE,
    #                 print_freq=10,
    #                 checkpoint_freq=Config.CHECKPOINT_FREQ,
    #                 checkpoint_path="{}/ckpts/{}".format(Config.SAVE_PATH, Config.RUN_ID),
    #                 # render=Config.RENDER,
    #                 callback=None,
    #                 exploration_fraction=0.6,
    #                 exploration_final_eps=0.04,
    #                 prioritized_replay=True,
    #                 train_freq=4,
    #                 learning_starts=10000,
    #                 target_network_update_freq=1000
    #                 )
    print("Saving model to {}/saved_models".format(Config.SAVE_PATH))
    act.save("{}/saved_models/{}.pkl".format(Config.SAVE_PATH, Config.RUN_ID))
Example #18
def train(config):
    """Training process of DQN.

    """

    # Initialize the environment
    env = utils.Scalarize(coinrun_utils.make_general_env(1, seed=1))

    # Create log directory and save directory if it does not exist
    if not os.path.exists(config.log_dir):
        os.makedirs(config.log_dir)
    if not os.path.exists(config.save_dir):
        os.makedirs(config.save_dir)

    # Create summary writer
    start_time = time.time()
    st = datetime.datetime.fromtimestamp(start_time).strftime(
        '%Y-%m-%d %H:%M:%S')

    tr_writer = SummaryWriter(log_dir=os.path.join(
        config.log_dir, "DQN Traning {} c_rew {} numlvl {} seed {}".format(
            st, config.CUSTOM_REWARD_SHAPING, conrun_config.NUM_LEVELS,
            conrun_config.SET_SEED)))

    # Prepare checkpoint file and model file to save and load from
    checkpoint_file = os.path.join(config.save_dir, "checkpoint.pth")

    # Initialize training
    dqn = QLEARNING(config, env)

    # Make sure that the model is set for training
    dqn.Q_function.train()
    # the target network never learns; it only gets copied from Q_function
    dqn.target_Q_function.eval()

    # Check for existing training results. If a checkpoint exists and the
    # configuration is set to resume (`config.resume == True`), resume from the
    # previous run. If not, delete the existing checkpoint.
    if os.path.exists(checkpoint_file):
        if config.resume:
            print("Checkpoint found! Resuming")
            # Read checkpoint file.
            load_res = torch.load(checkpoint_file, map_location="cpu")
            dqn.Q_function.load_state_dict(load_res["model"])
            dqn.update_target_model()
            dqn.optimizer.load_state_dict(load_res["optimizer"])
        else:
            os.remove(checkpoint_file)
    max_avg_reward = 0.04
    # Training loop
    for i in range(config.num_episodes):
        state = env.reset()
        ep_reward = 0
        ep_length = 0
        terminated = False
        while True:
            if config.render_play:
                env.render()
            action = dqn.choose_action(state)

            next_state, reward, done, info = env.step(action)

            if config.CUSTOM_REWARD_SHAPING:
                reward, terminated = rewardShaping(action, terminated, done,
                                                   ep_length, reward, state,
                                                   next_state)
            ep_length += 1
            ep_reward += reward

            dqn.store_transition(state, action, reward, next_state, done)

            dqn.learn(tr_writer)

            if done:
                break
            state = copy.copy(next_state)
        print(
            "finished ep: {}, ep_rew {}, len {}, eps {}, time_passed {}, memory_size {}"
            .format(i, ep_reward, ep_length, dqn.epsilon,
                    time.time() - start_time, dqn.memory.size()))
        tr_writer.add_scalar("ep_length", ep_length, global_step=i)
        ep_reward_norm = ep_reward
        if config.CUSTOM_REWARD_SHAPING:
            ep_reward_norm = ep_reward / 100  # so it will be in the same range as the default reward in graph plotting
        avg_reward_norm = ep_reward_norm / ep_length

        tr_writer.add_scalar("ep_reward", ep_reward_norm, global_step=i)
        tr_writer.add_scalar("Avg. reward per step",
                             avg_reward_norm,
                             global_step=i)
        if config.test_while_train and avg_reward_norm > max_avg_reward and dqn.memory.size(
        ) > dqn.minimum_memory:
            if test(config, dqn.Q_function, 3):
                torch.save(
                    {
                        "current_ep": i,
                        "model": dqn.Q_function.state_dict(),
                        "optimizer": dqn.optimizer.state_dict(),
                    },
                    os.path.join(
                        config.save_dir,
                        "bestmodel_{}_cRew_{}_numlvl_{}_seed_{}.pth".format(
                            i, config.CUSTOM_REWARD_SHAPING,
                            conrun_config.NUM_LEVELS, conrun_config.SET_SEED)))
                max_avg_reward = avg_reward_norm
            dqn.Q_function.train()

        if (i % 50) == 0:  # hardcoded save checkpoint interval
            torch.save(
                {
                    "current_ep": i,
                    "model": dqn.Q_function.state_dict(),
                    "optimizer": dqn.optimizer.state_dict(),
                }, checkpoint_file)

    torch.save(
        {
            "current_ep": i,
            "model": dqn.Q_function.state_dict(),
            "optimizer": dqn.optimizer.state_dict(),
        },
        os.path.join(
            config.save_dir,
            "final_{}_rShaping_{}_numlvl_{}_seed_{}_time_{}.pth".format(
                i, config.CUSTOM_REWARD_SHAPING, conrun_config.NUM_LEVELS,
                conrun_config.SET_SEED, st)))
    env.close()
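A hedged sketch of a config object this train() accepts; the attribute names are taken from their uses above, the values are illustrative, and QLEARNING/rewardShaping presumably read additional fields not shown here:

from types import SimpleNamespace

config = SimpleNamespace(
    log_dir='./logs',                # TensorBoard summaries are written here
    save_dir='./saves',              # checkpoint.pth and best/final models are saved here
    resume=True,                     # resume from checkpoint.pth if one exists
    num_episodes=1000,
    render_play=False,
    CUSTOM_REWARD_SHAPING=False,
    test_while_train=False,
)
train(config)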
Example #19
def enjoy_env_sess(sess, DIR_NAME):
    should_render = True
    should_eval = Config.TRAIN_EVAL or Config.TEST_EVAL
    rep_count = Config.REP
    mpi_print = utils.mpi_print

    file_name = '%s/%s.txt' % (DIR_NAME, Config.RESTORE_ID)
    f_io = open(file_name, 'a')

    if should_eval:
        if Config.TEST_NUM_EVAL > -1:
            env = utils.make_general_env(Config.TEST_NUM_EVAL)
        else:
            env = utils.make_general_env(Config.NUM_EVAL)
        should_render = False
    else:
        env = utils.make_general_env(1)

    env = wrappers.add_final_wrappers(env)

    if should_render:
        from gym.envs.classic_control import rendering

    nenvs = env.num_envs

    agent = create_act_model(sess, env, nenvs)
    num_actions = env.action_space.n

    init_rand = tf.variables_initializer(
        [v for v in tf.global_variables() if 'randcnn' in v.name])
    sess.run(tf.compat.v1.global_variables_initializer())

    soft_numpy = tf.placeholder(tf.float32, [nenvs, num_actions],
                                name='soft_numpy')
    dist = tfp.distributions.Categorical(probs=soft_numpy)
    sampled_action = dist.sample()

    loaded_params = utils.load_params_for_scope(sess, 'model')

    if not loaded_params:
        print('NO SAVED PARAMS LOADED')

    obs = env.reset()
    t_step = 0

    if should_render:
        viewer = rendering.SimpleImageViewer()

    should_render_obs = not Config.IS_HIGH_RES

    def maybe_render(info=None):
        if should_render and not should_render_obs:
            env.render()

    maybe_render()

    scores = np.array([0] * nenvs)
    score_counts = np.array([0] * nenvs)
    curr_rews = np.zeros((nenvs, 3))

    def should_continue():
        if should_eval:
            return np.sum(score_counts) < rep_count * nenvs

        return True

    state = agent.initial_state
    done = np.zeros(nenvs)

    sess.run(init_rand)
    while should_continue():
        if Config.USE_LSTM == 8425 or Config.USE_LSTM == 1081:
            q_actions, values, state, _ = agent.step(obs, state, done)
            # e-greedy
            greedy_flag = np.random.rand(q_actions.shape[0])
            greedy_flag = greedy_flag < 0.1
            greedy_flag = greedy_flag.astype(np.int32)
            random_actions = np.random.randint(0,
                                               num_actions,
                                               size=q_actions.shape[0])
            action = random_actions * greedy_flag + (1 -
                                                     greedy_flag) * q_actions
        else:
            total_soft = agent.get_softmax(obs, state, done)
            action = sess.run([sampled_action], {soft_numpy: total_soft})
            action = action[0]
            #action, values, state, _ = agent.step(obs, state, done)

        obs, rew, done, info = env.step(action)
        #scipy.misc.imsave('raw_inputs.png', obs[0])
        #print(dd)

        if should_render and should_render_obs:
            if np.shape(obs)[-1] % 3 == 0:
                ob_frame = obs[0, :, :, -3:]
            else:
                ob_frame = obs[0, :, :, -1]
                ob_frame = np.stack([ob_frame] * 3, axis=2)
            viewer.imshow(ob_frame)

        curr_rews[:, 0] += rew

        for i, d in enumerate(done):
            if d:
                if score_counts[i] < rep_count:
                    score_counts[i] += 1

                    if 'episode' in info[i]:
                        scores[i] += info[i].get('episode')['r']

        maybe_render(info[0])

        t_step += 1

        if should_render:
            time.sleep(.02)

        if done[0]:
            if should_render:
                mpi_print('ep_rew', curr_rews)

            curr_rews[:] = 0

    result = 0

    if should_eval:
        mean_score = np.mean(scores) / rep_count
        max_idx = np.argmax(scores)

        result = mean_score

        f_io.write("{}\n".format(result))
        f_io.close()

    return result