def main(env, num_timesteps):
    def stopping_criterion(env):
        return get_wrapper_by_name(
            env, "Monitor").get_total_steps() >= num_timesteps

    optimizer_spec = OptimizerSpec(
        constructor=optim.RMSprop,
        kwargs=dict(lr=LEARNING_RATE, alpha=ALPHA, eps=EPS),
    )

    exploration_schedule = LinearSchedule(2000000, 0.05)

    dqn_learing(
        env=env,
        q_func=DQN,
        optimizer_spec=optimizer_spec,
        exploration=exploration_schedule,
        stopping_criterion=stopping_criterion,
        replay_buffer_size=REPLAY_BUFFER_SIZE,
        batch_size=BATCH_SIZE,
        gamma=GAMMA,
        learning_starts=LEARNING_STARTS,
        learning_freq=LEARNING_FREQ,
        frame_history_len=FRAME_HISTORY_LEN,
        target_update_freq=TARGER_UPDATE_FREQ,
    )
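
These snippets share a few helpers that the excerpts do not define: OptimizerSpec and the schedule classes typically come from a Baselines-style utils/schedules module. A minimal sketch of the assumed shapes follows (not the exact source; note that Example 6's variant adds an lr_schedule field to OptimizerSpec):

from collections import namedtuple

# OptimizerSpec pairs an optimizer class with its keyword arguments,
# so dqn_learing can construct the optimizer internally.
OptimizerSpec = namedtuple("OptimizerSpec", ["constructor", "kwargs"])

class LinearSchedule(object):
    """Anneal from initial_p to final_p over schedule_timesteps, then hold."""
    def __init__(self, schedule_timesteps, final_p, initial_p=1.0):
        self.schedule_timesteps = schedule_timesteps
        self.final_p = final_p
        self.initial_p = initial_p

    def value(self, t):
        # fraction of the schedule completed so far, clipped to [0, 1]
        fraction = min(float(t) / self.schedule_timesteps, 1.0)
        return self.initial_p + fraction * (self.final_p - self.initial_p)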
Example 2
def main(env, num_timesteps, config):
    def stopping_criterion(env):
        # notice that here t is the number of steps of the wrapped env,
        # which is different from the number of steps in the underlying env
        return get_wrapper_by_name(
            env, "Monitor").get_total_steps() >= num_timesteps

    optimizer_spec = OptimizerSpec(
        constructor=optim.RMSprop,
        kwargs=dict(lr=LEARNING_RATE, alpha=ALPHA, eps=EPS),
    )

    exploration_schedule = LinearSchedule(1000000, 0.1)

    dqn_learing(
        config=config,
        env=env,
        q_func=VIN,
        optimizer_spec=optimizer_spec,
        exploration=exploration_schedule,
        stopping_criterion=stopping_criterion,
        replay_buffer_size=REPLAY_BUFFER_SIZE,
        batch_size=BATCH_SIZE,
        gamma=GAMMA,
        learning_starts=LEARNING_STARTS,
        learning_freq=LEARNING_FREQ,
        frame_history_len=FRAME_HISTORY_LEN,
        target_update_freq=TARGER_UPDATE_FREQ,
    )
Example 3
def main(env, num_timesteps):
    def stopping_criterion(env):
        # notice that here t is the number of steps of the wrapped env,
        # which is different from the number of steps in the underlying env
        return get_wrapper_by_name(env, "Monitor").get_total_steps() >= num_timesteps

    optimizer_spec = OptimizerSpec(
        constructor=optim.RMSprop,
        kwargs=dict(lr=LEARNING_RATE, alpha=ALPHA, eps=EPS),
    )

    run = runs.runs[RUN_INDEX]
    exploration_schedule = run.schedule

    print("Starting {}; max_timesteps = {}".format(run.run_name, task.max_timesteps))

    dqn_learing(
        env=env,
        q_func=DQN,
        optimizer_spec=optimizer_spec,
        exploration=exploration_schedule,
        stopping_criterion=stopping_criterion,
        replay_buffer_size=REPLAY_BUFFER_SIZE,
        batch_size=BATCH_SIZE,
        gamma=GAMMA,
        learning_starts=LEARNING_STARTS,
        learning_freq=LEARNING_FREQ,
        frame_history_len=FRAME_HISTORY_LEN,
        target_update_freq=TARGER_UPDATE_FREQ,
        statistics_file_name=run.statistics_file_name
    )
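
Example 3 above pulls its exploration schedule and output file from a run registry. The real runs module is not shown; the contents of a hypothetical runs.py matching only the fields main() reads might look like:

# contents of a hypothetical runs.py (illustrative only); the snippet
# indexes runs.runs[RUN_INDEX] and reads run_name, schedule, and
# statistics_file_name. LinearSchedule is the helper sketched earlier.
from collections import namedtuple

Run = namedtuple("Run", ["run_name", "schedule", "statistics_file_name"])

runs = [
    Run("linear-1M-eps0.1", LinearSchedule(1000000, 0.1), "stats_run0.pkl"),
    Run("linear-500k-eps0.05", LinearSchedule(500000, 0.05), "stats_run1.pkl"),
]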
Example 4
def main(env, num_timesteps, experiment_config, experiment_name):

    q_func = DQNLRelu if experiment_config['adv_model'] else DQN

    def stopping_criterion(env):
        # notice that here t is the number of steps of the wrapped env,
        # which is different from the number of steps in the underlying env
        return get_wrapper_by_name(
            env, "Monitor").get_total_steps() >= num_timesteps

    optimizer_spec = OptimizerSpec(
        constructor=optim.RMSprop,
        kwargs=dict(lr=experiment_config['lr'],
                    alpha=experiment_config['alpha'],
                    eps=experiment_config['eps']),
    )

    exploration_schedule = LinearSchedule(1000000,
                                          experiment_config['min_eps'])

    dqn_learing(experiment_name=experiment_name,
                env=env,
                q_func=q_func,
                optimizer_spec=optimizer_spec,
                exploration=exploration_schedule,
                stopping_criterion=stopping_criterion,
                replay_buffer_size=experiment_config['replay_size'],
                batch_size=experiment_config['batch'],
                gamma=experiment_config['gamma'],
                learning_starts=experiment_config['learning_start'],
                learning_freq=experiment_config['learning_freq'],
                frame_history_len=experiment_config['frame_hist'],
                target_update_freq=experiment_config['target_update_freq'],
                output_path=experiment_config['output'])
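
Example 4 drives everything from an experiment_config dict. A hypothetical config covering exactly the keys read above, with values that simply echo widely used DQN defaults rather than tuned results:

# hypothetical experiment_config; every key below is read by Example 4
experiment_config = {
    "adv_model": False,          # False selects DQN, True selects DQNLRelu
    "lr": 0.00025,
    "alpha": 0.95,
    "eps": 0.01,
    "min_eps": 0.1,
    "replay_size": 1000000,
    "batch": 32,
    "gamma": 0.99,
    "learning_start": 50000,
    "learning_freq": 4,
    "frame_hist": 4,
    "target_update_freq": 10000,
    "output": "./results",
}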
Example 5
def main(env):
    global args
    args = parser.parse_args()

    optimizer_spec = OptimizerSpec(
        constructor=optim.RMSprop,
        kwargs=dict(lr=LEARNING_RATE, alpha=ALPHA, eps=EPS),
    )

    exploration_schedule = LinearSchedule(1000000, 0.1)

    dqn_learing(
        env=env,
        q_func=DQN,
        checkpoint_path=args.checkpoint,
        optimizer_spec=optimizer_spec,
        exploration=exploration_schedule,
        stopping_criterion=None,
        replay_buffer_size=REPLAY_BUFFER_SIZE,
        batch_size=BATCH_SIZE,
        gamma=GAMMA,
        learning_starts=LEARNING_STARTS,
        learning_freq=LEARNING_FREQ,
        frame_history_len=FRAME_HISTORY_LEN,
        target_update_freq=TARGET_UPDATE_FREQ,
    )
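
Example 5 assumes a module-level argparse parser. A minimal sketch covering only the field the snippet reads (the real parser likely defines more options):

import argparse

# hypothetical parser; --checkpoint is the only argument Example 5 uses
parser = argparse.ArgumentParser(description="DQN training with optional resume")
parser.add_argument("--checkpoint", type=str, default=None,
                    help="path to a saved model checkpoint to resume from")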
Example 6
def main(env, num_timesteps):
    # This is just a rough estimate
    num_iterations = float(num_timesteps) / 4.0

    # define learning rate and exploration schedules below
    lr_multiplier = 1.0
    lr_schedule = PiecewiseSchedule([
        (0, 1e-4 * lr_multiplier),
        (num_iterations / 10, 1e-4 * lr_multiplier),
        (num_iterations / 2,  5e-5 * lr_multiplier),
    ], outside_value=5e-5 * lr_multiplier)

    optimizer_spec = OptimizerSpec(
        constructor=optim.Adam,
        kwargs=dict(eps=1e-4),
        lr_schedule=lr_schedule
    )

    exploration_schedule = PiecewiseSchedule([
        (0, 1.0),
        (1e6, 0.1),
        (num_iterations / 2, 0.01),
    ], outside_value=0.01)

    dqn_learing(
        env=env,
        q_func=DQN,
        optimizer_spec=optimizer_spec,
        exploration=exploration_schedule,
        stopping_criterion=stopping_criterion(num_timesteps),  # assumes a module-level stopping_criterion factory (not shown)
        replay_buffer_size=REPLAY_BUFFER_SIZE,
        batch_size=BATCH_SIZE,
        gamma=GAMMA,
        learning_starts=LEARNING_STARTS,
        learning_freq=LEARNING_FREQ,
        frame_history_len=FRAME_HISTORY_LEN,
        target_update_freq=TARGER_UPDATE_FREQ,
        grad_norm_clipping=GRAD_NORM_CLIPPING
    )
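
PiecewiseSchedule, used above for both the learning rate and exploration, interpolates linearly between (timestep, value) endpoints and returns outside_value once t passes the last endpoint. A sketch of the Baselines-style semantics this snippet appears to assume:

class PiecewiseSchedule(object):
    def __init__(self, endpoints, outside_value=None):
        # endpoints: sorted list of (t, value) pairs
        self._endpoints = endpoints
        self._outside_value = outside_value

    def value(self, t):
        # linear interpolation between the surrounding endpoints
        for (l_t, l_v), (r_t, r_v) in zip(self._endpoints[:-1], self._endpoints[1:]):
            if l_t <= t < r_t:
                alpha = float(t - l_t) / (r_t - l_t)
                return l_v + alpha * (r_v - l_v)
        # before the first or after the last endpoint
        return self._outside_value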
Example 7
def main(env):

    optimizer_spec = OptimizerSpec(
        constructor=optim.RMSprop,
        kwargs=dict(lr=LEARNING_RATE, alpha=ALPHA, eps=EPS),
    )

    exploration_schedule = LinearSchedule(1000000, 0.1)

    dqn_learing(
        env=env,
        q_func=DQN,
        optimizer_spec=optimizer_spec,
        exploration=exploration_schedule,
        replay_buffer_size=REPLAY_BUFFER_SIZE,
        batch_size=BATCH_SIZE,
        gamma=GAMMA,
        learning_starts=LEARNING_STARTS,
        learning_freq=LEARNING_FREQ,
        frame_history_len=FRAME_HISTORY_LEN,
        target_update_freq=TARGER_UPDATE_FREQ,
        num_actions1=num_actions1,
        num_actions2=num_actions2
    )
Example 8
def main(env, num_timesteps):
    # Change the index to select a different game.
    task = benchmark.tasks[3]

    # Run training. Note that the env argument is replaced by a freshly
    # seeded environment here (and again before each of the two runs below).
    seed = random.randint(0, 100)  # randomize the seed on each invocation
    env = get_env(task, seed)

    def stopping_criterion(env):
        # notice that here t is the number of steps of the wrapped env,
        # which is different from the number of steps in the underlying env
        return get_wrapper_by_name(env, "Monitor").get_total_steps() >= num_timesteps

    optimizer_spec = OptimizerSpec(
        constructor=optim.RMSprop,
        kwargs=dict(lr=LEARNING_RATE, alpha=ALPHA, eps=EPS),
    )

    exploration_schedule = LinearSchedule(1000000, 0.1)

    # empty dict to hold all results
    Stats = {}

    new_lr = 0.001
    new_gamma = 0.999
    # candidate exploration schedules; note that only the default
    # exploration_schedule defined above is used in the runs below
    exploration_sches = [LinearSchedule(1000000, 0.1), ConstantSchedule(0.05),
                         ConstantSchedule(0.15), LinearSchedule(500000, 0.05)]

    optimizer_spec = OptimizerSpec(
        constructor=optim.RMSprop,
        kwargs=dict(lr=new_lr, alpha=ALPHA, eps=EPS),
    )

    env = get_env(task, seed)
    Stats["lr=0.001, gamma=0.999"] = dqn_learing(
        env=env,
        q_func=DQN,
        optimizer_spec=optimizer_spec,
        exploration=exploration_schedule,
        stopping_criterion=stopping_criterion,
        replay_buffer_size=REPLAY_BUFFER_SIZE,
        batch_size=BATCH_SIZE,
        gamma=new_gamma,
        learning_starts=LEARNING_STARTS,
        learning_freq=LEARNING_FREQ,
        frame_history_len=FRAME_HISTORY_LEN,
        target_update_freq=TARGER_UPDATE_FREQ,
        feature_tested="lr=0.001, gamma=0.999"
    )

    optimizer_spec = OptimizerSpec(
        constructor=optim.RMSprop,
        kwargs=dict(lr=LEARNING_RATE, alpha=ALPHA, eps=EPS),
    )

    env = get_env(task, seed)
    Stats["Default"] = dqn_learing(
        env=env,
        q_func=DQN,
        optimizer_spec=optimizer_spec,
        exploration=exploration_schedule,
        stopping_criterion=stopping_criterion,
        replay_buffer_size=REPLAY_BUFFER_SIZE,
        batch_size=BATCH_SIZE,
        gamma=GAMMA,
        learning_starts=LEARNING_STARTS,
        learning_freq=LEARNING_FREQ,
        frame_history_len=FRAME_HISTORY_LEN,
        target_update_freq=TARGER_UPDATE_FREQ,
        feature_tested=""
    )

    plt.clf()
    plt.xlabel('Timesteps')
    plt.ylabel('Mean Reward (past 100 episodes)')
    rewards = Stats["lr=0.001, gamma=0.999"]["mean_episode_rewards"]
    plt.plot(range(len(rewards)), rewards, label="lr=0.001, gamma=0.999")
    rewards = Stats["Default"]["mean_episode_rewards"]
    plt.plot(range(len(rewards)), rewards, label="Default")
    plt.legend()
    plt.title("Performance")
    plt.savefig('Final-Performance.png')