def main(args):
    """Configure and start the Breakout pipeline with a CNN perception net.

    A probe environment is created first to read the observation and action
    dimensions; each agent is then handed the environment class plus the
    same construction arguments via ``set_env``.

    Args:
        args: parsed command-line arguments. Only ``args.name`` is read here;
            it is forwarded to ``get_settings``.
    """
    game = "Breakout-v0"
    num_agents = 16
    num_games = 8000
    im_height, im_width = 84, 84

    # Construction arguments shared by the probe environment and by the
    # environment assigned to every agent.
    env_kwargs = dict(contexts=4, height=im_height, width=im_width, gray=True)

    # This environment is only queried for its dimensions.
    probe_env = GymEnvImage(game, **env_kwargs)
    d, h, w = probe_env.observation_dims()["sensor"]
    num_actions = probe_env.action_dims()["action"]

    # 1. One agent per environment instance. Rewards are shaped with
    #    np.sign before being used by the agent.
    def make_agent():
        new_agent = SimpleRLAgent(num_games, reward_shaping_f=np.sign)
        new_agent.set_env(GymEnvImage, game_name=game, **env_kwargs)
        return new_agent

    agents = [make_agent() for _ in range(num_agents)]

    # 2. Perception network: three conv layers followed by one dense layer.
    #    With an 84x84 input and unpadded convolutions the spatial size goes
    #    84 -> 20 -> 9 -> 7, hence the 7 * 7 * 64 input to the linear layer.
    layers = [
        nn.Conv2d(d, 32, kernel_size=8, stride=4),
        nn.ReLU(),
        nn.Conv2d(32, 64, kernel_size=4, stride=2),
        nn.ReLU(),
        nn.Conv2d(64, 64, kernel_size=3, stride=1),
        nn.ReLU(),
        Flatten(),  # collapse the feature cube into a vector
        nn.Linear(7 * 7 * 64, 512),
        nn.ReLU(),
    ]
    cnn = nn.Sequential(*layers)

    # 3. Algorithm and learning settings.
    ct_settings = get_settings(
        cnn, (d, h, w), num_actions, num_agents, name=args.name)

    # 4. The Manager drives the whole pipeline.
    manager = Manager(ct_settings)
    manager.add_agents(agents)
    manager.start()
def main(args):
    """Configure and start the Breakout pipeline, one agent per environment.

    All environments are created up front; the last one is queried for the
    observation and action dimensions. Each agent is constructed around its
    own environment and registered with the Manager before it is started.

    Args:
        args: parsed command-line arguments. Only ``args.name`` is read here;
            it is forwarded to ``get_settings``.
    """
    game = "Breakout-v0"
    num_agents = 16
    num_games = 8000
    im_height, im_width = 84, 84

    # 1. Image environments, one per agent.
    envs = [
        GymEnvImage(game,
                    contexts=4,
                    height=im_height,
                    width=im_width,
                    gray=True)
        for _ in range(num_agents)
    ]

    # Dimensions are read from the last environment created.
    reference_env = envs[-1]
    d, h, w = reference_env.observation_dims()[0]
    num_actions = reference_env.action_dims()[0]

    # 2. Perception network: three conv layers followed by one dense layer.
    #    With an 84x84 input and unpadded convolutions the spatial size goes
    #    84 -> 20 -> 9 -> 7, hence the 7 * 7 * 64 input to the linear layer.
    layers = [
        nn.Conv2d(d, 32, kernel_size=8, stride=4),
        nn.ReLU(),
        nn.Conv2d(32, 64, kernel_size=4, stride=2),
        nn.ReLU(),
        nn.Conv2d(64, 64, kernel_size=3, stride=1),
        nn.ReLU(),
        Flatten(),  # collapse the feature cube into a vector
        nn.Linear(7 * 7 * 64, 512),
        nn.ReLU(),
    ]
    cnn = nn.Sequential(*layers)

    # 3. Algorithm and learning settings.
    ct_settings = get_settings(
        cnn, (d, h, w), num_actions, num_agents, name=args.name)

    # 4. The Manager drives the whole pipeline.
    manager = Manager(ct_settings)

    # 5. Register one agent per environment. Agents are added to the Manager
    #    before the pipeline is started.
    for single_env in envs:
        manager.add_agent(
            SimpleRLAgent(single_env, num_games, reward_shaping_f=np.sign))

    manager.start()
    def run(self, args):
        """Run the OffPolicyAC pipeline and return the mean total reward.

        Args:
            args: configuration object. The attributes read here are ``lr``,
                ``entropy_w``, ``gpu``, ``agents_per_batch``, ``history_len``,
                ``log_interval``, ``num_agents``, ``num_games`` and ``game``;
                ``args`` is also passed whole to ``self.make_model``.

        Returns:
            ``np.mean`` of the ``'total_reward'`` entries that the Manager
            holds under ``stats['All'].data_q`` once ``start()`` returns.
        """
        # 1. Model, optimizer and algorithm.
        net = self.make_model(args)
        optimizer = optim.RMSprop(net.parameters(), lr=args.lr)
        algorithm = OffPolicyAC(
            model=net,
            optim=optimizer,
            epsilon=0.2,
            prob_entropy_weight=args.entropy_w,
            gpu_id=args.gpu)

        # 2. Computation-task and logging settings.
        rl_settings = {
            "alg": algorithm,
            # sampling
            "agent_helper": OnlineHelper,
            "agents_per_batch": args.agents_per_batch,
            # each agent will call `learn()` every `sample_interval` steps
            "sample_interval": args.history_len,
        }
        ct_settings = {"RL": rl_settings}
        log_settings = {"print_interval": args.log_interval}

        # 3. Agents; every reward is divided by 100 before use.
        reward_shaping_f = lambda x: x / 100

        def spawn_agent():
            new_agent = SimpleRLAgent(args.num_games,
                                      reward_shaping_f=reward_shaping_f)
            new_agent.set_env(GymEnv, game_name=args.game)
            return new_agent

        agents = [spawn_agent() for _ in range(args.num_agents)]

        # 4. The Manager drives the whole pipeline.
        manager = Manager(ct_settings, log_settings)
        manager.add_agents(agents)
        manager.start()

        # 5. Average the recorded total rewards.
        recorded_rewards = manager.stats['All'].data_q['total_reward']
        return np.mean(recorded_rewards)
Exemple #4
0
                   gpu_id=1)

    # 3. Specify the settings for learning: data sampling strategy
    # (OnlineHelper here) and other settings used by
    # ComputationTask.
    ct_settings = {
        "RL":
        dict(
            algorithm=alg,
            hyperparas=dict(grad_clip=5.0),
            # sampling
            agent_helper=OnlineHelper,
            # each agent will call `learn()` every `sample_interval` steps
            sample_interval=5,
            num_agents=num_agents)
    }

    # 4. Create Manager that handles the running of the whole pipeline
    manager = Manager(ct_settings)

    # 5. Spawn one agent for each environment instance.
    #    The agent's behavior depends on the algorithm being used. Since we
    #    are using SimpleAC, the appropriate agent type is SimpleRLAgent.
    for env in envs:
        agent = SimpleRLAgent(env, num_games, reward_shaping_f=np.sign)
        # An Agent has to be added into the Manager before we can use it to
        # interact with environment and collect data
        manager.add_agent(agent)

    manager.start()
Exemple #5
0
    # (OnlineHelper here) and other settings used by
    # ComputationTask.
    ct_settings = {
        "RL": dict(
            algorithm=alg,
            hyperparas=dict(lr=5e-5),
            # sampling
            agent_helper=OnlineHelper,
            # each agent will call `learn()` every `sample_interval` steps
            sample_interval=4,
            num_agents=num_agents)
    }

    # 4. Create Manager that handles the running of the whole pipeline
    manager = Manager(
        ct_settings,
        log_settings=dict(
            print_interval=100, model_dir="/tmp/test", model_save_interval=10))

    # 5. Spawn one agent for each environment instance.
    #    The agent's behavior depends on the algorithm being used. Since we
    #    are using SimpleAC, the appropriate agent type is SimpleRLAgent.
    reward_shaping_f = lambda x: x / 100.0
    for env in envs:
        agent = SimpleRLAgent(env, num_games, reward_shaping_f)
        # An Agent has to be added into the Manager before we can use it to
        # interact with environment and collect data
        manager.add_agent(agent)

    manager.start()
Exemple #6
0
    # (OnlineHelper here) and other settings used by
    # ComputationTask.
    ct_settings = {
        "RL":
        dict(
            algorithm=alg,
            hyperparas=dict(lr=5e-5),
            # sampling
            agent_helper=OnlineHelper,
            # each agent will call `learn()` every `sample_interval` steps
            sample_interval=4,
            num_agents=num_agents)
    }

    # 4. Create Manager that handles the running of the whole pipeline
    manager = Manager(
        ct_settings, log_settings=dict(model_dir="/tmp/test", pass_num=0)
    )  ## if pass_num is less than 1, then load the newest model

    # 5. Spawn one agent for each environment instance.
    #    The agent's behavior depends on the algorithm being used. Since we
    #    are using SimpleAC, the appropriate agent type is SimpleRLAgent.
    reward_shaping_f = lambda x: x / 100.0
    for env in envs:
        agent = SimpleRLAgent(env, num_games, reward_shaping_f)
        # An Agent has to be added into the Manager before we can use it to
        # interact with environment and collect data
        manager.add_agent(agent)

    manager.start()