def main(args):
    game = "Breakout-v0"
    num_agents = 16
    num_games = 8000
    im_height, im_width = 84, 84
    env = GymEnvImage(game, contexts=4, height=im_height, width=im_width, gray=True)
    d, h, w = env.observation_dims()["sensor"]
    num_actions = env.action_dims()["action"]

    # 1. Spawn one agent for each instance of environment.
    #    Agent's behavior depends on the actual algorithm being used. Since we
    #    are using SimpleAC, a proper type of Agent is SimpleRLAgent.
    agents = []
    for _ in range(num_agents):
        agent = SimpleRLAgent(num_games, reward_shaping_f=np.sign)
        agent.set_env(GymEnvImage, game_name=game, contexts=4,
                      height=im_height, width=im_width, gray=True)
        agents.append(agent)

    # 2. Construct the network and specify the algorithm.
    #    We use a CNN as the perception net for the Actor-Critic algorithm
    cnn = nn.Sequential(
        nn.Conv2d(d, 32, kernel_size=8, stride=4),
        nn.ReLU(),
        nn.Conv2d(32, 64, kernel_size=4, stride=2),
        nn.ReLU(),
        nn.Conv2d(64, 64, kernel_size=3, stride=1),
        nn.ReLU(),
        Flatten(),  # flatten the CNN cube to a vector
        nn.Linear(7 * 7 * 64, 512),
        nn.ReLU())

    # 3. Specify the algorithm and settings for learning.
    ct_settings = get_settings(
        cnn, (d, h, w), num_actions, num_agents, name=args.name)

    # 4. Create Manager that handles the running of the whole pipeline
    manager = Manager(ct_settings)
    manager.add_agents(agents)
    manager.start()
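The `get_settings` helper called in step 3 is defined elsewhere and not shown in this listing. As a rough, hypothetical sketch of what it could return, modeled on the `SimpleAC` and `OnlineHelper` setup used in the other listings of this section (the learning rate, `agents_per_batch`, and `sample_interval` values are assumptions, not the helper's actual contents):

# Hypothetical sketch of get_settings(); values are assumptions and the real
# helper used by this example may differ.
def get_settings(cnn, dims, num_actions, num_agents, name=""):
    # `name` labels the experiment; unused in this sketch
    alg = SimpleAC(
        model=SimpleModelAC(
            dims=dims, num_actions=num_actions, perception_net=cnn),
        optim=(optim.RMSprop, dict(lr=1e-4)),  # assumed learning rate
        gpu_id=-1)  # assumed: run on CPU
    return {
        "RL": dict(
            alg=alg,
            # sampling setup mirrors the tuning example below
            agent_helper=OnlineHelper,
            agents_per_batch=num_agents,  # assumed batch granularity
            sample_interval=4)            # assumed sampling interval
    }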
def run(self, args):
    model = self.make_model(args)
    opt = optim.RMSprop(model.parameters(), lr=args.lr)
    alg = OffPolicyAC(
        model=model,
        optim=opt,
        epsilon=0.2,
        prob_entropy_weight=args.entropy_w,
        gpu_id=args.gpu)

    ct_settings = {
        "RL": dict(
            alg=alg,
            # sampling
            agent_helper=OnlineHelper,
            agents_per_batch=args.agents_per_batch,
            # each agent will call `learn()` every `sample_interval` steps
            sample_interval=args.history_len)
    }

    log_settings = dict(print_interval=args.log_interval)

    reward_shaping_f = lambda x: x / 100
    agents = []
    for _ in range(args.num_agents):
        agent = SimpleRLAgent(args.num_games, reward_shaping_f=reward_shaping_f)
        agent.set_env(GymEnv, game_name=args.game)
        agents.append(agent)

    # 4. Create Manager that handles the running of the whole pipeline
    manager = Manager(ct_settings, log_settings)
    manager.add_agents(agents)
    manager.start()

    # 5. compute last reward
    return np.mean(manager.stats['All'].data_q['total_reward'])
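Because `run()` returns the mean of `total_reward`, it can serve directly as an objective for hyperparameter search. A minimal sketch of such a driver, assuming a hypothetical `Experiment` class that owns `run()`/`make_model()` and an `argparse.Namespace` carrying the fields read above (all concrete values are illustrative):

import random
from argparse import Namespace

# Hypothetical driver: random search over the learning rate and entropy weight,
# scoring each trial by the mean total reward returned by run().
best = (None, float("-inf"))
for _ in range(10):
    args = Namespace(
        game="CartPole-v0", lr=10 ** random.uniform(-5, -3),
        entropy_w=random.choice([1e-2, 1e-3]), gpu=-1,
        num_agents=16, num_games=8000, agents_per_batch=16,
        history_len=4, log_interval=100)
    score = Experiment().run(args)  # Experiment is a stand-in for the enclosing class
    if score > best[1]:
        best = (args, score)
print("best mean reward:", best[1])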
    height=im_height, width=im_width, gray=True)
env = env_class(**env_args)
d, h, w = env.observation_dims()["sensor"]
num_actions = env.action_dims()["action"]

# 1. Spawn one agent for each instance of environment.
#    Agent's behavior depends on the actual algorithm being used. Since we
#    are using SimpleAC, a proper type of Agent is SimpleRLAgent.
agents = []
for _ in range(num_agents):
    agent = SimpleRLAgent(
        num_games, reward_shaping_f=np.sign)  # ignore reward magnitude
    agent.set_env(env_class, **env_args)
    agents.append(agent)

# 2. Construct the network and specify the algorithm.
#    Here we use a small CNN as the perception net for the Actor-Critic algorithm
cnn = nn.Sequential(
    nn.Conv2d(d, 32, kernel_size=5, padding=2),
    nn.ReLU(),
    nn.MaxPool2d(2, 2),
    nn.Conv2d(32, 32, kernel_size=5, padding=2),
    nn.ReLU(),
    nn.MaxPool2d(2, 2),
    nn.Conv2d(32, 64, kernel_size=3, padding=1),
    nn.ReLU(),
    nn.MaxPool2d(2, 2),
    nn.Conv2d(64, 64, kernel_size=3, padding=1),
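This listing is cut off partway through the `nn.Sequential`. One way to size the fully connected layer that would follow this max-pooling variant is to probe the convolutional stack with a dummy observation; a minimal, self-contained sketch (the 4x84x84 input shape matches the settings used above):

import torch
import torch.nn as nn

d, h, w = 4, 84, 84  # stacked gray frames, as configured above
probe = nn.Sequential(
    nn.Conv2d(d, 32, kernel_size=5, padding=2), nn.ReLU(), nn.MaxPool2d(2, 2),
    nn.Conv2d(32, 32, kernel_size=5, padding=2), nn.ReLU(), nn.MaxPool2d(2, 2),
    nn.Conv2d(32, 64, kernel_size=3, padding=1), nn.ReLU(), nn.MaxPool2d(2, 2),
    nn.Conv2d(64, 64, kernel_size=3, padding=1))
with torch.no_grad():
    out = probe(torch.zeros(1, d, h, w))
print(out.shape)  # torch.Size([1, 64, 10, 10]) -> 6400 features once flattened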
num_agents = 16
num_games = 8000
env = GymEnv(game)
state_shape = env.observation_dims()["sensor"]
num_actions = env.action_dims()["action"]

# 1. Spawn one agent for each instance of environment.
#    Agent's behavior depends on the actual algorithm being used. Since we
#    are using SimpleAC, a proper type of Agent is SimpleRLAgent.
reward_shaping_f = lambda x: x / 100.0
agents = []
for _ in range(num_agents):
    agent = SimpleRLAgent(num_games, reward_shaping_f=reward_shaping_f)
    agent.set_env(GymEnv, game_name=game)
    agents.append(agent)

# 2. Construct the network and specify the algorithm.
#    Here we use a small MLP and apply the Actor-Critic algorithm
mlp = nn.Sequential(
    nn.Linear(state_shape[0], 128),
    nn.ReLU(),
    nn.Linear(128, 128),
    nn.ReLU(),
    nn.Linear(128, 128),
    nn.ReLU())

alg = SimpleAC(
    model=SimpleModelAC(
        dims=state_shape, num_actions=num_actions, perception_net=mlp),
    optim=(optim.RMSprop, dict(lr=5e-5)),
    gpu_id=-1)  # use cpu

# 3. Specify the settings for learning: data sampling strategy
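The listing stops at step 3. The remaining steps follow the same pattern as the other examples in this section; a minimal sketch of the continuation, assuming the `OnlineHelper` sampling setup and `Manager` calls shown above (the `agents_per_batch`, `sample_interval`, and `print_interval` values are assumptions):

# Assumed continuation, mirroring the other listings in this section.
ct_settings = {
    "RL": dict(
        alg=alg,
        agent_helper=OnlineHelper,    # on-policy sampling
        agents_per_batch=num_agents,  # assumed batch granularity
        sample_interval=4)            # assumed: learn() every 4 steps
}
log_settings = dict(print_interval=100)  # assumed logging interval

# 4. Create Manager that handles the running of the whole pipeline
manager = Manager(ct_settings, log_settings)
manager.add_agents(agents)
manager.start()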