def main():
    """Train the conv A2C agent with an epsilon-greedy exploration schedule.

    Relies on module-level config (`env_name`, `l1`, `l2`, `lr`, `num_iter`,
    `num_epoch`) and helpers (`ConvNet`, `model.weight_init`, `process_frame`,
    `action_decide`, `Trajectory`, `A2C`, `reward_list`) defined elsewhere
    in this file/project.

    Returns:
        The shared `reward_list` with one total episode reward appended per epoch.
    """
    # Init env
    env = gym.make(env_name)
    # l_obs = env.observation_space.shape[0]
    l_obs = 1  # frames are preprocessed to a single channel by process_frame
    n_action = env.action_space.n

    # Epsilon-greedy schedule parameters.
    epsilon = 0.5
    epsilon_end = 0.01
    epsilon_div = 0.025
    # NOTE(review): (epsilon - epsilon_end) / epsilon_div = 19.6, which is far
    # larger than the whole [epsilon_end, epsilon] range, so epsilon collapses
    # to epsilon_end after a single iteration. This looks like a mixed-up
    # formula (a per-step decrement of `epsilon_div`, or a divisor equal to a
    # step *count*, was probably intended) — confirm intent.
    epsilon_step = ((epsilon - epsilon_end) / epsilon_div)

    # Init net model
    net = ConvNet(l1, l2, l_obs, n_action)
    net.apply(model.weight_init)
    optimizer = torch.optim.Adam(net.parameters(), lr=lr)

    # Iteration start
    for iteration in range(num_iter):
        buff = []
        print("iterations: ", iteration + 1, "/ ", num_iter)
        # Epoch start: collect num_epoch trajectories with the current policy.
        for epoch in range(num_epoch):
            obs = env.reset()
            next_obs = None
            reward = 0
            total_reward = 0
            done = False
            traj = Trajectory()
            while not done:
                if next_obs is None:
                    # First step of the episode: raw observation still needs
                    # preprocessing; afterwards obs is carried over from
                    # next_obs, which is already processed.
                    obs = process_frame(obs)
                    obs = torch.Tensor(obs).unsqueeze(0)
                action = action_decide(net, obs, n_action, epsilon)
                next_obs, reward, done, info = env.step(action)
                next_obs = process_frame(next_obs)
                next_obs = torch.Tensor(next_obs).unsqueeze(0)
                traj.add(obs, action, reward, next_obs)
                total_reward += reward
                obs = next_obs
                if iteration % 10 == 0 and epoch == 1:
                    env.render()
                if done:
                    buff.append(traj)
                    reward_list.append(total_reward)
                    print('reward: ', total_reward)
        # One policy/value update per iteration on the collected batch.
        A2C(net, optimizer, buff)
        # BUGFIX: the original decremented first and clamped only on the *next*
        # iteration, so epsilon spent a full iteration at a negative value.
        # Clamp at decrement time instead.
        epsilon = max(epsilon - epsilon_step, epsilon_end)
    env.close()
    return reward_list
def main():
    """Train an actor-critic (A2C) agent with separate actor and critic nets.

    Relies on module-level config (`env_name`, `num_fc_a`, `num_fc_c`,
    `num_iter`, `num_epoch`) and helpers (`Net`, `model.weight_init`,
    `action_decide`, `Trajectory`, `A2C`, `reward_list`) defined elsewhere
    in this file/project.

    Returns:
        The shared `reward_list` with one total episode reward appended per epoch.
    """
    # Init env
    env = gym.make(env_name)
    # GENERALIZED: derive the observation size from the environment instead of
    # hard-coding 4 (the previous constant only matched CartPole-style envs).
    l_obs = env.observation_space.shape[0]
    n_action = env.action_space.n

    # Init net model: separate actor (policy) and critic (value) networks.
    actor = Net(num_fc_a, num_fc_a, l_obs, n_action)
    critic = Net(num_fc_c, num_fc_c, l_obs, 1)
    actor.apply(model.weight_init)
    critic.apply(model.weight_init)
    optimizer_a = torch.optim.Adam(actor.parameters(), lr=0.01)
    optimizer_c = torch.optim.Adam(critic.parameters(), lr=0.01)

    # Iteration start
    for iteration in range(num_iter):
        buff = []
        # Epoch start: collect num_epoch trajectories with the current policy.
        for epoch in range(num_epoch):
            # Convert the reset observation once; subsequent observations are
            # carried over as tensors, avoiding the per-step re-wrap the
            # original code performed.
            obs = torch.Tensor(env.reset()).float()
            reward = 0
            total_reward = 0
            done = False
            traj = Trajectory()
            while not done:
                action = action_decide(actor, obs)
                next_obs, reward, done, info = env.step(action)
                next_obs = torch.tensor(next_obs).float()
                traj.add(obs, action, reward, next_obs)
                total_reward += reward
                obs = next_obs
                # if epoch == 1:
                #     env.render()
                if done:
                    buff.append(traj)
                    reward_list.append(total_reward)
        # One actor+critic update per iteration on the collected batch.
        A2C(actor, critic, optimizer_a, optimizer_c, buff)
    env.close()
    return reward_list
def main():
    """Train an LSTM-based A2C agent on a gym environment.

    Relies on module-level config (`env_name`, `num_fc_a`, `num_iter`,
    `num_epoch`) and helpers (`Net_LSTM`, `model.weight_init`,
    `action_decide`, `Trajectory`, `A2C`, `reward_list`) defined elsewhere
    in this file/project.

    Returns:
        The shared `reward_list` with one total episode reward appended per epoch.
    """
    # Init env
    env = gym.make(env_name)
    l_obs = env.observation_space.shape[0]
    n_action = env.action_space.n

    # Init net model
    net = Net_LSTM(num_fc_a, num_fc_a, l_obs, n_action)
    net.apply(model.weight_init)
    # BUGFIX: this line was commented out, but `optimizer` is passed to A2C()
    # below — the function raised NameError at the end of the first iteration
    # (unless an unseen global happened to shadow it).
    optimizer = torch.optim.Adam(net.parameters(), lr=0.01)

    # Iteration start
    for iteration in range(num_iter):
        buff = []
        # Epoch start: collect num_epoch trajectories with the current policy.
        for epoch in range(num_epoch):
            obs = env.reset()
            next_obs = None
            reward = 0
            total_reward = 0
            done = False
            traj = Trajectory()
            while not done:
                if next_obs is not None:
                    # After the first step, reuse the tensor from the previous
                    # transition as the current observation.
                    obs = next_obs
                obs = torch.Tensor(obs).float()
                action = action_decide(net, obs)
                next_obs, reward, done, info = env.step(action)
                next_obs = torch.tensor(next_obs).float()
                traj.add(obs, action, reward, next_obs)
                total_reward += reward
                # if iteration == 90:
                #     env.render()
                if done:
                    buff.append(traj)
                    reward_list.append(total_reward)
        # One policy/value update per iteration on the collected batch.
        A2C(net, optimizer, buff)
    env.close()
    return reward_list