def train(net, optimizer, discount_f):
    env = gym.make(env_name)
    reward_list = []
    # Iteration start
    for _ in range(num_iter):
        buff = []
        # Episode start
        for i_episode in range(num_episode):
            obs = env.reset()
            next_obs = None
            reward = None
            total_reward = 0
            done = False
            traj = Trajectory()
            while not done:
                if next_obs is not None:
                    obs = next_obs
                obs = torch.tensor(obs).float()
                action = action_decide(net, obs)
                next_obs, reward, done, info = env.step(action)
                traj.add(obs, action, reward)
                total_reward += reward
                # if _ % 5 == 0 and i_episode == 0:
                #     env.render()
                if done:
                    buff.append(traj)
                    reward_list.append(total_reward)
        # Update the policy on the batch of trajectories collected this iteration.
        pg(net, optimizer, buff, discount_f)
    env.close()
    return reward_list
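
# The scripts in this file assume a Trajectory container with an
# add(obs, action, reward) method; its real definition lives elsewhere in the
# repo. A minimal sketch of such a container is given below -- the field names
# (observations, actions, rewards) are assumptions, not the repo's actual code.
class Trajectory:
    """Stores one episode as parallel lists of observations, actions and rewards."""

    def __init__(self):
        self.observations = []
        self.actions = []
        self.rewards = []

    def add(self, obs, action, reward):
        # Append one environment transition to the episode record.
        self.observations.append(obs)
        self.actions.append(action)
        self.rewards.append(reward)

    def __len__(self):
        return len(self.rewards)
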
def main():
    epsilon = 0.5
    epsilon_end = 0.01
    epsilon_div = 1e4
    epsilon_step = (epsilon - epsilon_end) / epsilon_div
    env = atari_env(env_name)
    l_obs = env.observation_space.shape[0]
    n_action = env.action_space.n
    date = time.strftime("%Y_%m_%d_%H_%M_%S", time.localtime())
    dir = 'runs/Breakout_a2c_v2_experiment_epoch5000/' + date
    writer = SummaryWriter(dir)
    net = ConvNet_LSTM(l_obs, n_action).to(DEVICE)
    net.apply(model.weight_init)
    optimizer = torch.optim.Adam(net.parameters(), lr=LEARNING_RATE)
    results_reward = []
    for i_iteration in range(NUM_ITER):
        buff = []
        net.reset_lstm()
        obs = env.reset()
        obs = torch.Tensor(obs).unsqueeze(0)
        # obs = torch.Tensor(obs)
        total_reward = 0
        done = False
        traj = Trajectory()
        while not done:
            action = action_decide(net, obs, epsilon)
            next_obs, reward, done, _ = env.step(action)
            traj.add(obs, action, reward)
            total_reward += reward
            obs = next_obs
            obs = torch.Tensor(obs).unsqueeze(0)
            # if i_episode == 0:
            #     env.render()
            #     time.sleep(0.03)
            if done:
                results_reward.append(total_reward)
                writer.add_scalar("Reward/epoch", total_reward, i_iteration + 1)
                print('iteration: ', i_iteration + 1, '/ ', NUM_ITER,
                      ' reward: ', total_reward)
        A2C(net, optimizer, traj)
        if epsilon > epsilon_end:
            epsilon -= epsilon_step
        else:
            epsilon = epsilon_end
    env.close()
    writer.flush()
    writer.close()
    return results_reward
def main():
    env = gym.make(env_name)
    l_obs = 1
    n_action = env.action_space.n
    date = time.strftime("%Y_%m_%d_%H_%M_%S", time.localtime())
    dir = 'runs/Breakout_a2c_v2_experiment_epoch5000/' + date
    writer = SummaryWriter(dir)
    net = ConvNet_LSTM(l_obs, n_action, device=DEVICE).to(DEVICE)
    net.apply(model.weight_init)
    optimizer = torch.optim.Adam(net.parameters(), lr=LEARNING_RATE)
    results_reward = []
    for i_iteration in range(NUM_ITER):
        buff = []
        avg_reward = 0
        for i_episode in range(NUM_EPISODE):
            net.reset_lstm()
            obs = env.reset()
            obs = process_frame(obs)
            obs = torch.Tensor(obs).unsqueeze(0)
            total_reward = 0
            done = False
            traj = Trajectory()
            while not done:
                action = action_decide(net, obs)
                next_obs, reward, done, _ = env.step(action)
                traj.add(obs, action, reward)
                total_reward += reward
                obs = next_obs
                obs = process_frame(obs)
                obs = torch.Tensor(obs).unsqueeze(0)
                # if i_episode == 0:
                #     env.render()
                #     time.sleep(0.03)
                if done:
                    buff.append(traj)
                    results_reward.append(total_reward)
                    avg_reward += total_reward
                    writer.add_scalar(
                        "Reward/epoch", total_reward,
                        i_iteration * NUM_EPISODE + (i_episode + 1))
        print('iteration: ', i_iteration + 1, '/ ', NUM_ITER,
              ' average reward: ', avg_reward / NUM_EPISODE)
        A2C(net, optimizer, buff)
    env.close()
    writer.flush()
    writer.close()
    return results_reward
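
# process_frame (used above) is defined elsewhere in the repo; a common Atari
# preprocessing sketch is given below (grayscale, resize to 84x84, scale to
# [0, 1]). The exact crop and output size used by this repo may differ, so
# treat this as an assumption rather than the actual implementation.
import cv2
import numpy as np

def process_frame(frame):
    # Convert the RGB Atari frame to grayscale.
    gray = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
    # Downsample to the conventional 84x84 resolution.
    resized = cv2.resize(gray, (84, 84), interpolation=cv2.INTER_AREA)
    # Scale pixel values to [0, 1] so the network sees small floats.
    return resized.astype(np.float32) / 255.0
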
def main():
    env = gym.make(env_name)
    l_obs = env.observation_space.shape[0] - 1
    n_action = env.action_space.n
    date = time.strftime("%Y_%m_%d_%H_%M_%S", time.localtime())
    dir = 'runs/cartpole_pg_v6_experiment_epoch5000/' + date
    writer = SummaryWriter(dir)
    if use_lstm:
        net = Net_LSTM(64, 32, l_obs, n_action)
    else:
        net = Net(64, 32, l_obs, n_action)
    net.apply(model.weight_init)
    optimizer = torch.optim.Adam(net.parameters(), lr=LEARNING_RATE)
    results_reward = []
    for i_iteration in range(NUM_ITER):
        buff = []
        for i_episode in range(NUM_EPISODE):
            net.reset_lstm()
            obs = env.reset()
            obs = np.delete(obs, 1)
            obs = torch.Tensor(obs).unsqueeze(0).unsqueeze(0)
            next_obs = None
            reward = None
            total_reward = 0
            done = False
            traj = Trajectory()
            while not done:
                action = action_decide(net, obs)
                next_obs, reward, done, _ = env.step(action)
                traj.add(obs, action, reward)
                total_reward += reward
                next_obs = np.delete(next_obs, 1)
                obs = next_obs
                obs = torch.Tensor(obs).unsqueeze(0).unsqueeze(0)
                # if _ % 5 == 0 and i_episode == 0:
                #     env.render()
                if done:
                    print('iteration: ', i_iteration, ' episode: ', i_episode,
                          ' reward: ', total_reward)
                    buff.append(traj)
                    results_reward.append(total_reward)
                    writer.add_scalar("Reward/epoch", total_reward,
                                      i_iteration * NUM_EPISODE + (i_episode + 1))
        train(net, optimizer, buff)
    env.close()
    writer.flush()
    writer.close()
    return results_reward
def main():
    # Init
    # Init env
    env = gym.make('CartPole-v0')
    # Init net model
    # TODO: Find a method to get the shape of obs
    l_obs = 4
    n_action = env.action_space.n
    net = Net(128, 128, l_obs, n_action)
    net.apply(model.weight_init)
    # Init optim
    optimizer = torch.optim.Adam(net.parameters(), lr=0.01)
    reward_list = []
    # Episode start
    for i_episode in range(num_episode):
        obs = env.reset()
        next_obs = None
        reward = 0
        total_reward = 0
        done = False
        traj = Trajectory()
        while not done:
            if next_obs is not None:
                obs = next_obs
            obs = torch.tensor(obs).float()
            action = action_decide(net, obs)
            next_obs, reward, done, info = env.step(action)
            traj.add(obs, action, reward)
            total_reward += reward
            if i_episode % 100 == 0:
                env.render()
            if done:
                train(net, optimizer, traj)
                reward_list.append(total_reward)
    env.close()
    return reward_list
def main():
    # Init env
    env = gym.make(env_name)
    l_obs = env.observation_space.shape[0]
    n_action = env.action_space.n
    # Init net model
    net = Net(num_fc_a, num_fc_a, l_obs, n_action)
    net.apply(model.weight_init)
    optimizer = torch.optim.Adam(net.parameters(), lr=lr)
    reward_list = []
    # Iteration start
    for iteration in range(num_iter):
        obs = env.reset()
        next_obs = None
        total_reward = 0
        done = False
        traj = Trajectory()
        while not done:
            if next_obs is not None:
                obs = next_obs
            obs = torch.Tensor(obs).float()
            action = action_decide(net, obs)
            next_obs, reward, done, info = env.step(action)
            next_obs = torch.tensor(next_obs).float()
            traj.add(obs, action, reward)
            total_reward += reward
            # if iteration == 90:
            #     env.render()
            if done:
                reward_list.append(total_reward)
        A2C(net, optimizer, traj)
    env.close()
    return reward_list
def main(lr):
    # Init
    # Init env
    env = gym.make('LunarLander-v2')
    # Init net model
    # TODO: Find a method to get the shape of obs
    l_obs = env.observation_space.shape[0]
    n_action = env.action_space.n
    date = time.strftime("%Y_%m_%d_%H_%M_%S", time.localtime())
    dir = 'runs/cartpole_pg_v2_experiment_episode1000/' + date
    writer = SummaryWriter(dir)
    net = MetaNet(128, 128, l_obs, n_action)
    net.apply(model.weight_init)
    # # Inspect the net architecture
    # dummy_input = torch.rand(4)
    # writer.add_graph(net, dummy_input)
    # Init optim
    from utils import Adam_Optim
    optimizer = Adam_Optim(net)
    reward_list = []
    # Iteration start
    for _ in range(num_iter):
        buff = []
        # Episode start
        for i_episode in range(num_episode):
            obs = env.reset()
            next_obs = None
            reward = 0
            total_reward = 0
            done = False
            traj = Trajectory()
            while not done:
                if next_obs is not None:
                    obs = next_obs
                obs = torch.tensor(obs).float()
                action = action_decide(net, obs)
                next_obs, reward, done, info = env.step(action)
                traj.add(obs, action, reward)
                total_reward += reward
                if _ % 5 == 0 and i_episode == 0:
                    env.render()
                if done:
                    buff.append(traj)
                    reward_list.append(total_reward)
                    writer.add_scalar("Reward/epoch", total_reward,
                                      _ * num_episode + (i_episode + 1))
        train(net, optimizer, buff)
        # if epsilon > epsilon_end:
        #     epsilon -= epsilon_step
    env.close()
    writer.flush()
    writer.close()
    return reward_list
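
# action_decide is defined elsewhere in the repo and is called both as
# action_decide(net, obs) and action_decide(net, obs, epsilon). A plausible
# sketch is given below, assuming the net returns action logits for a discrete
# policy and that epsilon controls uniformly random exploration; the repo's
# actual network output format may differ.
import random
import torch

def action_decide(net, obs, epsilon=0.0):
    # Run the policy network without tracking gradients; the update functions
    # above re-run the network on the stored observations when they need grads.
    with torch.no_grad():
        logits = net(obs)  # assumed: net(obs) -> action logits
    # With probability epsilon, explore with a uniformly random action.
    if epsilon > 0.0 and random.random() < epsilon:
        return random.randrange(logits.shape[-1])
    # Otherwise sample from the categorical policy defined by the logits.
    dist = torch.distributions.Categorical(logits=logits.squeeze())
    return dist.sample().item()
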