from baselines import deepq
from baselines import logger
from baselines.common.atari_wrappers import make_atari


def main():
    logger.configure()
    env = make_atari('PongNoFrameskip-v4')
    env = deepq.wrap_atari_dqn(env)

    model = deepq.learn(
        env,
        "conv_only",
        convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],
        hiddens=[256],
        dueling=True,
        lr=1e-4,
        total_timesteps=int(1e7),
        buffer_size=10000,
        exploration_fraction=0.1,
        exploration_final_eps=0.01,
        train_freq=4,
        learning_starts=10000,
        target_network_update_freq=1000,
        gamma=0.99,
    )

    model.save('pong_model.pkl')
    env.close()
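# Optional: a minimal sketch of reloading the saved checkpoint (the play()
# helper is illustrative, not from the source), assuming the baselines deepq
# API where learn() accepts load_path and returns a callable act function;
# total_timesteps=0 builds the same network without running any training.
def play(model_path='pong_model.pkl'):
    env = deepq.wrap_atari_dqn(make_atari('PongNoFrameskip-v4'))
    act = deepq.learn(
        env,
        "conv_only",
        convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],
        hiddens=[256],
        dueling=True,
        total_timesteps=0,   # skip training, just build the graph
        load_path=model_path,
    )
    obs, done, episode_rew = env.reset(), False, 0.0
    while not done:
        obs, rew, done, _ = env.step(act(obs[None])[0])
        episode_rew += rew
    print('episode reward:', episode_rew)
    env.close()


if __name__ == '__main__':
    main()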
Example #2
def main(trial,
         use_optuna,
         env,
         seed,
         entropy_coeff,
         n_epochs,
         dynamic_coeff,
         clip_norm,
         normalize_obs,
         buffer_size,
         max_path_length,
         min_pool_size,
         batch_size,
         policy_mode,
         eval_model,
         eval_n_episodes,
         eval_n_frequency,
         exploitation_ratio,
         return_queue=None,
         scale_reward=1.):
    if use_optuna:
        logger.configure(logger.get_dir(),
                         log_suffix="_optune{}".format(trial.number),
                         enable_std_out=False)
        logger.set_level(logger.DISABLED)
    tf.set_random_seed(seed=seed)
    env.min_action = env.action_space.low[0]
    env.max_action = env.action_space.high[0]
    if hasattr(env, "seed"):
        env.seed(seed)
    else:
        env.env.seed(seed)

    # define the Q-function and V-function approximators
    layer_size = 100
    qf = NNQFunction(env_spec=env.spec,
                     hidden_layer_sizes=(layer_size, layer_size))
    vf = NNVFunction(env_spec=env.spec,
                     hidden_layer_sizes=(layer_size, layer_size))

    # select the policy class according to policy_mode
    if policy_mode == "GMMPolicy":
        # Gaussian mixture policy with K mixture components
        policy = GMMPolicy(env_spec=env.spec,
                           K=4,
                           hidden_layer_sizes=[layer_size, layer_size],
                           qf=qf,
                           reg=1e-3,
                           squash=True)
    elif policy_mode == "EExploitation":
        policy = EExploitationPolicy(
            env_spec=env.spec,
            K=4,
            hidden_layer_sizes=[layer_size, layer_size],
            qf=qf,
            reg=1e-3,
            squash=True,
            e=exploitation_ratio)

    else:
        if policy_mode == "Knack-exploration" or policy_mode == "kurtosis":
            metric = "kurtosis"
        elif policy_mode in [
                "signed_variance", "negative_signed_variance",
                "small_variance", "large_variance"
        ]:
            metric = policy_mode
        elif "kurtosis-" in policy_mode:
            metric = policy_mode
        else:
            raise AssertionError(
                "policy_mode should be GMMPolicy, EExploitation, Knack-exploration (kurtosis), "
                "kurtosis-*, signed_variance, negative_signed_variance, small_variance or large_variance"
            )

        policy = KnackBasedPolicy(
            env_spec=env.spec,
            K=4,
            hidden_layer_sizes=[layer_size, layer_size],
            qf=qf,
            vf=vf,
            reg=1e-3,
            squash=True,
            metric=metric,
            exploitation_ratio=exploitation_ratio,
            optuna_trial=trial,
        )

    # TODO
    base_kwargs = dict(
        epoch_length=1000,
        n_epochs=n_epochs,
        # scale_reward=1,
        n_train_repeat=1,
        eval_render=False,
        eval_n_episodes=eval_n_episodes,
        eval_deterministic=True,
        eval_n_frequency=eval_n_frequency)

    max_replay_buffer_size = buffer_size
    pool = SimpleReplayBuffer(env_spec=env.spec,
                              max_replay_buffer_size=max_replay_buffer_size)
    sampler_params = {
        'max_path_length': max_path_length,
        'min_pool_size': min_pool_size,
        'batch_size': batch_size
    }
    # normalize observations online if requested
    sampler = NormalizeSampler(
        **sampler_params) if normalize_obs else SimpleSampler(**sampler_params)

    base_kwargs = dict(base_kwargs, sampler=sampler)

    algorithm = SAC(base_kwargs=base_kwargs,
                    env=env,
                    policy=policy,
                    pool=pool,
                    qf=qf,
                    vf=vf,
                    lr=3e-4,
                    scale_reward=scale_reward,
                    discount=0.99,
                    tau=1e-2,
                    target_update_interval=1,
                    action_prior='uniform',
                    save_full_state=False,
                    dynamic_coeff=dynamic_coeff,
                    entropy_coeff=entropy_coeff,
                    clip_norm=clip_norm)

    algorithm._sess.run(tf.global_variables_initializer())
    if eval_model is None:
        avg_return = algorithm.train()
        if return_queue is not None:
            return_queue.put(avg_return)
        # release the TF graph and session so repeated runs/trials do not leak memory
        tf.reset_default_graph()
        algorithm._sess.close()
        del algorithm
        return avg_return

    else:
        return algorithm
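# A hypothetical driver for the function above (run_in_subprocess and the
# kwargs dict are illustrative, not from the source): the return_queue
# argument lets a parent process collect the average return while all
# TensorFlow state is released together with the child process.
def run_in_subprocess(kwargs):
    import multiprocessing as mp
    queue = mp.Queue()
    p = mp.Process(target=main, kwargs=dict(kwargs, return_queue=queue))
    p.start()
    avg_return = queue.get()  # main() puts the average return on the queue
    p.join()
    return avg_return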
Example #3
    env = GymEnv(env_id)

    # set log directory
    root_dir = args.pop('root_dir')
    opt_log_name = args.pop('opt_log_name')
    logger2 = mylogger.get_logger()
    if args['eval_model'] is None:
        env_id = env.env_id
        # set log
        current_log_dir = root_dir
        logger2.set_log_dir(current_log_dir, exist_ok=True)
        logger2.set_save_array_flag(args.pop("save_array_flag"))
        if args["use_optuna"]:
            logger.set_level(logger.DISABLED)
        else:
            logger.configure(dir=current_log_dir, enable_std_out=False)

        # save parts of hyperparameters
        with open(os.path.join(current_log_dir, "hyparam.yaml"), 'w') as f:
            yaml.dump(args, f, default_flow_style=False)

    args.update({'env': env})
    # optuna
    if args["use_optuna"]:
        study = optuna.create_study(study_name='karino_{}_threshold_{}'.format(
            args["policy_mode"], env_id),
                                    storage='mysql://[email protected]/optuna',
                                    direction="maximize",
                                    load_if_exists=True)
        # study.optimize(lambda trial: main(trial, **args), timeout=24 * 60 * 60)
        study.optimize(lambda trial: wrap(trial, args), timeout=24 * 60 * 60)
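# The wrap() helper used above is not included in this snippet; a plausible
# reconstruction (hypothetical) mirrors the commented-out objective and simply
# forwards the Optuna trial together with a copy of the fixed arguments.
def wrap(trial, args):
    per_trial_args = dict(args)  # copy so repeated trials do not mutate the shared dict
    return main(trial, **per_trial_args)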
def main():
    args = parse_args()
    logdir = args.pop('logdir')
    # logger.configure(dir=logdir, enable_std_out=True)
    logger.configure(dir=logdir, enable_std_out=False)
    with open(os.path.join(logger.get_dir(), "hyparam.yaml"), 'w') as f:
        yaml.dump(args, f, default_flow_style=False)

    policy_mode = args.pop('policy_mode')
    save_array_flag = args.pop('save_array_flag')
    use_my_env_wrapper = args.pop('use_my_env_wrapper')
    env_id = args.pop('env_id')

    if policy_mode == "large_variance":
        if (args["exploitation_ratio_on_bottleneck"] is None
                or args["bottleneck_threshold_ratio"] is None):
            raise AssertionError(
                "policy_mode 'large_variance' requires both "
                "exploitation_ratio_on_bottleneck and bottleneck_threshold_ratio")
        # both values are guaranteed to be set here, so enable the array logger
        array_logger = array_logger_getter.get_logger()
        array_logger.set_log_dir(logdir, exist_ok=True)
        array_logger.set_save_array_flag(save_array_flag)

    if use_my_env_wrapper:
        env = make_atari_nature(env_id)
    else:
        env = make_atari(env_id)
    env = deepq.wrap_atari_dqn(env)
    num_cpu = 1
    config = tf.ConfigProto(
        allow_soft_placement=True,
        inter_op_parallelism_threads=num_cpu,
        intra_op_parallelism_threads=num_cpu,
        gpu_options=tf.GPUOptions(visible_device_list=args.pop("gpu"),
                                  allow_growth=True),
    )
    # nature_set = {'network': 'cnn', 'prioritized_replay': False, 'buffer_size': int(1e5), 'total_time_steps': int(2e6)}
    model = deepq.learn(
        env,
        "conv_only",
        convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],
        hiddens=[256],
        dueling=False,
        lr=1e-4,
        # total_timesteps=int(1e7),
        # total_timesteps=int(2e3)+1,
        buffer_size=10000,
        # exploration_fraction=0.1,
        # exploration_final_eps=0.01,
        train_freq=4,
        # target_network_update_freq=100,
        # target_network_update_freq=1000,
        learning_starts=1000,
        target_network_update_freq=500,
        gamma=0.99,
        # prioritized_replay=False,
        batch_size=64,
        # print_freq=1,
        # print_freq=200,
        print_freq=1000,
        config=config,
        bottleneck_threshold_update_freq=1000,
        **args,
    )

    model.save(os.path.join(logger.get_dir(), 'Breakout_final_model.pkl'))
    env.close()
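# parse_args() is not shown in this example; a hypothetical argparse-based
# sketch covering the keys popped or read by main() above (the option names
# match the code, the default values are illustrative only).
import argparse


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--logdir', type=str, default='logs')
    parser.add_argument('--env_id', type=str, default='BreakoutNoFrameskip-v4')
    parser.add_argument('--policy_mode', type=str, default='large_variance')
    parser.add_argument('--save_array_flag', action='store_true')
    parser.add_argument('--use_my_env_wrapper', action='store_true')
    parser.add_argument('--gpu', type=str, default='0')
    parser.add_argument('--total_timesteps', type=int, default=int(1e7))
    parser.add_argument('--exploitation_ratio_on_bottleneck', type=float, default=None)
    parser.add_argument('--bottleneck_threshold_ratio', type=float, default=None)
    # remaining keys are forwarded to deepq.learn via **args in main()
    return vars(parser.parse_args())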
Example #5
def main(seed=1, alg_type="Bottleneck"):
    env = gym.make('CliffMazeDeterministic-v0')
    eval_env = gym.make('CliffMazeDeterministic-v0')
    map_size = (env.nrow, env.ncol)
    optimal_steps = env.nrow + env.ncol - 2
    # env = gym.make('FrozenLakeDeterministic-v0')
    logger.configure(
        dir='/mnt/ISINAS1/karino/SubgoalGeneration/CliffMaze/CliffMaze{}x{}/{}'
        .format(*map_size, alg_type),
        log_suffix='seed{}'.format(seed),
        enable_std_out=False)
    a_dim = env.action_space.n
    s_dim = env.observation_space.n
    print(a_dim, s_dim)
    total_timesteps = 200000
    metric = "large_variance"
    if alg_type == "Bottleneck":
        # alg = KnackBasedQlearnig(state_dim=s_dim, action_dim=a_dim, gamma=0.99, alpha=0.3, epsilon=0.3, metric=metric, exploitation_ratio=0.01, decay_rate=total_timesteps)
        alg = KnackBasedQlearnig(state_dim=s_dim,
                                 action_dim=a_dim,
                                 gamma=0.99,
                                 alpha=0.3,
                                 epsilon=0.3,
                                 metric=metric,
                                 exploitation_ratio=0.1)
    elif alg_type == "EpsGreedy":
        # alg = Qlearning(state_dim=s_dim, action_dim=a_dim, gamma=0.99, alpha=0.3, epsilon=0.3, decay_rate=total_timesteps)
        alg = Qlearning(state_dim=s_dim,
                        action_dim=a_dim,
                        gamma=0.99,
                        alpha=0.3,
                        epsilon=0.3)
    else:
        raise NotImplementedError

    env.seed(seed)
    eval_env.seed(seed)
    random.seed(seed)
    np.random.seed(seed)

    s0 = env.reset()
    env.render()
    traj = []  # transitions of the current episode
    count = 0  # number of episodes that reached the goal
    steps = 0
    eval_per_steps = 30
    update_flag = False  # set to True once the goal has been reached at least once
    for i in range(total_timesteps):
        a = alg.act(state=s0, exploration=True)
        s1, r, done, _ = env.step(action=a)
        steps += 1
        # env.render()

        traj.append([s0, a, r, s1])
        s0 = s1

        if done:
            # achieve goal
            # if s1 == 80:
            if r > 0:
                update_flag = True
                count += 1
            alg.update(trajectory=traj)

            traj = []
            s0 = env.reset()
            done = False
            steps = 0

        if i % eval_per_steps == 0:
            if update_flag:  # to reduce computation time
                goal_steps = evaluation(eval_env, alg)
            else:
                goal_steps = np.nan
            logger.record_tabular('total_steps', i)
            logger.record_tabular('eval_goal_steps', goal_steps)
            logger.dump_tabular()
            if goal_steps == optimal_steps:
                print(i, goal_steps)
                break

    # print(alg.q_table)
    print(count)  # number of episodes that reached the goal during training
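# The evaluation() helper called in the loop above is not included in this
# snippet; a hypothetical reconstruction: roll out the greedy policy once and
# return the number of steps needed to reach the goal (np.nan on failure).
def evaluation(eval_env, alg, max_steps=1000):
    s = eval_env.reset()
    for step in range(1, max_steps + 1):
        a = alg.act(state=s, exploration=False)
        s, r, done, _ = eval_env.step(action=a)
        if done:
            return step if r > 0 else np.nan
    return np.nan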