Example #1
def main():
    global_seed()

    map_name = "4x4"
    env = init_env(map_name)

    model = Model(
        policy=PolicyFullyConnected,
        observation_space=env.observation_space,
        action_space=env.action_space,
        learning_rate=1e-3,
        nsteps=3,
        decay=0.99
    )

    runner = Runner(
        env=env,
        model=model,
        num_steps=3,
        discount_rate=0.99,
        save_summary_steps=1000,
        performance_num_episodes=100
    )

    time_steps = 3000

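    # 3000 iterations; assuming each run() collects num_steps=3 transitions,
    # this amounts to roughly 9000 environment steps in total.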
    for _ in range(time_steps):
        runner.run()
Example #2
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--timesteps', type=int, default=int(1e6))
    parser.add_argument('--batch_size', type=int, default=128)
    parser.add_argument('--discount_rate', type=float, default=0.99)
    parser.add_argument('--summary_frequency', type=int, default=20000)
    parser.add_argument('--performance_num_episodes', type=int, default=10)
    parser.add_argument('--summary_log_dir', default="reinforce_lstm")
    args = parser.parse_args()
    global_seed(0)
    env = init_environment()

    model = Model(policy=LstmPolicy,
                  observation_space=env.observation_space,
                  action_space=env.action_space,
                  batch_size=args.batch_size)

    runner = Runner(env=env,
                    model=model,
                    batch_size=args.batch_size,
                    discount_rate=args.discount_rate,
                    summary_frequency=args.summary_frequency,
                    performance_num_episodes=args.performance_num_episodes,
                    summary_log_dir=args.summary_log_dir)

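    # timesteps // batch_size + 1 iterations; each run() is assumed to return
    # one batch of batch_size transitions, which is then passed to train().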
    for _ in range(0, args.timesteps // args.batch_size + 1):
        observations, states, rewards, terminals, actions = runner.run()
        model.train(observations, states, rewards, terminals, actions)
Example #3
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--timesteps', type=int, default=int(1e6))
    parser.add_argument('--learning_rate', type=float, default=2e-4)
    parser.add_argument('--num_steps', type=int, default=5)
    parser.add_argument('--discount_rate', type=float, default=0.99)
    parser.add_argument('--summary_frequency', type=int, default=20000)
    parser.add_argument('--performance_num_episodes', type=int, default=10)
    parser.add_argument('--summary_log_dir', default="a2c")
    args = parser.parse_args()
    global_seed(0)
    env = init_environment()

    model = Model(policy=PolicyFullyConnected,
                  observation_space=env.observation_space,
                  action_space=env.action_space,
                  learning_rate=args.learning_rate)

    runner = Runner(env=env,
                    model=model,
                    num_steps=args.num_steps,
                    discount_rate=args.discount_rate,
                    summary_frequency=args.summary_frequency,
                    performance_num_episodes=args.performance_num_episodes,
                    summary_log_dir=args.summary_log_dir)

    for _ in range(0, (args.timesteps // args.num_steps) + 1):
        runner.run()
Example #4
def run():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--timesteps', type=int, default=int(1e6))
    parser.add_argument('--num_steps', type=int, default=5)
    parser.add_argument('--discount_rate', type=float, default=0.99)
    parser.add_argument('--learning_rate', type=float, default=2e-4)
    parser.add_argument('--summary_frequency', type=int, default=20000)
    parser.add_argument('--performance_num_episodes', type=int, default=10)
    parser.add_argument('--summary_log_dir', default="a2c")
    args = parser.parse_args()

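    # pysc2 feature-layer resolutions: a 32x32 screen and a minimal 1x1 minimap.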
    dimensions = Dimensions(screen=(32, 32), minimap=(1, 1))
    interface_format = AgentInterfaceFormat(
        feature_dimensions=dimensions,
        use_feature_units=True,
    )

    global_seed(0)

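    # MoveToBeacon mini-game; step_mul=8 lets the agent act once every 8 game
    # frames, and the fixed random_seed keeps runs reproducible.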
    env = SC2Env(map_name="MoveToBeacon",
                 agent_interface_format=interface_format,
                 step_mul=8,
                 random_seed=1)

    env = EnvWrapper(env)

    model = Model(policy=PolicyFullyConnected,
                  observation_space=env.observation_space,
                  action_space=env.action_space,
                  learning_rate=args.learning_rate,
                  spatial_resolution=(5, 5))

    runner = Runner(env=env,
                    model=model,
                    batch_size=args.num_steps,
                    discount_rate=args.discount_rate,
                    summary_log_dir=args.summary_log_dir,
                    summary_frequency=args.summary_frequency,
                    performance_num_episodes=args.performance_num_episodes)

    for _ in range(0, (args.timesteps // args.num_steps) + 1):
        runner.run()
Example #5
def run():
    global_seed(0)
    env = init_env()

    model = Model(
        policy=PolicyFullyConnected,
        observation_space=(80, 80),
        action_space=3,
        learning_rate=2e-4
    )

    log_dir = "reinforce"

    runner = Runner(
        env=env,
        model=model,
        batch_size=128,
        timesteps=int(1e6),
        discount_rate=0.99,
        summary_frequency=20000,
        performance_num_episodes=100,
        summary_log_dir=log_dir
    )
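    # Here the Runner receives timesteps directly, so a single run() call is
    # assumed to drive the entire training loop.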
    runner.run()
Example #6
def run():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--timesteps', type=int, default=int(1e6))
    parser.add_argument('--num_steps', type=int, default=128)
    parser.add_argument('--entropy_coefficient', type=float, default=0.01)
    parser.add_argument('--learning_rate', type=float, default=2e-4)
    parser.add_argument('--gae_gamma', type=float, default=0.99)
    parser.add_argument('--gae_lambda', type=float, default=0.95)
    parser.add_argument('--num_batches', type=int, default=4)
    parser.add_argument('--num_training_epochs', type=int, default=4)
    parser.add_argument('--clip_range', type=float, default=0.2)
    parser.add_argument('--summary_frequency', type=int, default=20000)
    parser.add_argument('--performance_num_episodes', type=int, default=10)
    parser.add_argument('--summary_log_dir', default="ppo_fc")
    args = parser.parse_args()

    dimensions = Dimensions(screen=(32, 32), minimap=(1, 1))
    interface_format = AgentInterfaceFormat(
        feature_dimensions=dimensions,
        use_feature_units=True,
    )

    global_seed(0)
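    # With the defaults, each PPO minibatch covers 128 // 4 = 32 rollout steps.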
    batch_size = args.num_steps // args.num_batches
    env = SC2Env(map_name="MoveToBeacon",
                 agent_interface_format=interface_format,
                 step_mul=8,
                 random_seed=1)

    env = EnvWrapper(env)

    model = Model(policy=PolicyFullyConnected,
                  observation_space=env.observation_space,
                  action_space=env.action_space,
                  learning_rate=args.learning_rate,
                  spatial_resolution=(5, 5),
                  clip_range=args.clip_range,
                  entropy_coefficient=args.entropy_coefficient)

    runner = Runner(env=env,
                    model=model,
                    num_steps=args.num_steps,
                    advantage_estimator_gamma=args.gae_gamma,
                    advantage_estimator_lambda=args.gae_lambda,
                    summary_frequency=args.summary_frequency,
                    performance_num_episodes=args.performance_num_episodes,
                    summary_log_dir=args.summary_log_dir)

    assert args.num_steps % args.num_batches == 0

    for _ in range(0, (args.timesteps // args.num_steps) + 1):
        step = runner.run()
        observations = np.asarray(step[0])
        actions = np.asarray(step[1])
        available_actions = np.asarray(step[2])
        actions_spatial = np.asarray(step[3])
        actions_spatial_mask = np.asarray(step[4])
        advantage_estimations = np.asarray(step[5])
        values = np.asarray(step[6])
        probs = np.asarray(step[7])
        probs_spatial = np.asarray(step[8])
        indexes = np.arange(args.num_steps)

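        # Each training epoch re-shuffles the rollout indices and updates the
        # model on num_batches minibatches of batch_size steps.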
        for _ in range(args.num_training_epochs):
            np.random.shuffle(indexes)

            for i in range(0, args.num_steps, batch_size):
                shuffled_indexes = indexes[i:i + batch_size]
                model.train(
                    observations=[
                        observations[0][shuffled_indexes],
                        observations[1][shuffled_indexes],
                        observations[2][shuffled_indexes]
                    ],
                    actions=actions[shuffled_indexes],
                    available_actions_mask=available_actions[shuffled_indexes],
                    actions_spatial=actions_spatial[shuffled_indexes],
                    actions_spatial_mask=actions_spatial_mask[
                        shuffled_indexes],
                    advantages=advantage_estimations[shuffled_indexes],
                    values=values[shuffled_indexes],
                    probs=probs[shuffled_indexes],
                    probs_spatial=probs_spatial[shuffled_indexes])
Example #7
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--total_timesteps', type=int, default=int(1e6))
    parser.add_argument('--num_steps', type=int, default=128)
    parser.add_argument('--ent_coef', type=float, default=0.01)
    parser.add_argument('--learning_rate', type=float, default=3e-4)
    parser.add_argument('--vf_coef', type=float, default=0.5)
    parser.add_argument('--gae_gamma', type=float, default=0.99)
    parser.add_argument('--gae_lambda', type=float, default=0.95)
    parser.add_argument('--num_batches', type=int, default=4)
    parser.add_argument('--num_training_epochs', type=int, default=4)
    parser.add_argument('--clip_range', type=float, default=0.2)
    parser.add_argument('--summary_frequency', type=int, default=20000)
    parser.add_argument('--performance_num_episodes', type=int, default=10)
    parser.add_argument('--summary_log_dir', default="ppo_fc")
    args = parser.parse_args()

    global_seed(0)
    env = init_environment()
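    # 128 // 4 = 32 transitions per minibatch with the default arguments.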
    batch_size = args.num_steps // args.num_batches

    model = Model(policy=PolicyFullyConnected,
                  observation_space=env.observation_space,
                  action_space=env.action_space,
                  batch_size=batch_size,
                  ent_coef=args.ent_coef,
                  vf_coef=args.vf_coef)

    runner = Runner(env=env,
                    model=model,
                    num_steps=args.num_steps,
                    advantage_estimator_gamma=args.gae_gamma,
                    advantage_estimator_lambda=args.gae_lambda,
                    summary_frequency=args.summary_frequency,
                    performance_num_episodes=args.performance_num_episodes,
                    summary_log_dir=args.summary_log_dir)

    assert args.num_steps % args.num_batches == 0

    for _ in range(0, (args.total_timesteps // args.num_steps) + 1):
        (observations, advantages, masks, actions,
         values, log_probs) = runner.run()
        indexes = np.arange(args.num_steps)  # [0, 1, 2, ..., 127]

        for _ in range(args.num_training_epochs):
            np.random.shuffle(indexes)

            for i in range(0, args.num_steps, batch_size):
                # i = 0, 32, 64, 96: one minibatch of batch_size indices each
                shuffled_indexes = indexes[i:i + batch_size]

                model.train(learning_rate=args.learning_rate,
                            clip_range=args.clip_range,
                            observations=observations[shuffled_indexes],
                            advantages=advantages[shuffled_indexes],
                            actions=actions[shuffled_indexes],
                            values=values[shuffled_indexes],
                            log_probs=log_probs[shuffled_indexes])