def train():
    env = gym.make("CartPole-v1")
    agent = CartPoleDRQNAgent(DRQN, model=DRQN(), env=env,
                              exploration=LinearSchedule(10000, initial_p=1.0, final_p=0.02),
                              batch_size=4, memory_size=100000, min_mem=100)
    agent.saving_dir = '/home/ur5/thesis/rdd_rl/gym_test/cartpole/data/drqn'
    agent.train(10000, 10000, 100, False)
def train():
    env = gym.make("CartPole-v1")
    agent = CartPoleDRQNAgent(DRQN, model=DRQN(), env=env,
                              exploration=LinearSchedule(100000, initial_p=1.0, final_p=0.1),
                              batch_size=32, target_update_frequency=20, memory_size=1000)
    agent.saving_dir = '/home/ur5/thesis/rdd_rl/gym_test/data/drqn_cartpole_partial'
    agent.train(100000, 200, 100, False)
def plot(checkpoint):
    env = None
    agent = CartPoleDRQNAgent(DRQN, model=DRQN(), env=env,
                              exploration=LinearSchedule(100000, initial_p=1.0, final_p=0.1),
                              batch_size=32, target_update_frequency=20)
    agent.saving_dir = '/home/ur5/thesis/rdd_rl/gym_test/data/drqn_cartpole_partial'
    agent.loadCheckpoint(checkpoint)
    plotLearningCurve(agent.episode_rewards, window=20)
    plt.show()
def train():
    env = gym.make("CartPole-v1")
    agent = CartPoleDRQNAgent(DQN, model=DQN(), env=env,
                              exploration=LinearSchedule(10000, initial_p=1.0, final_p=0.02),
                              batch_size=32)
    agent.saving_dir = '/home/ur5/thesis/rdd_rl/gym_test/data/dqn_cartpole'
    agent.train(10000, 500, 10000, False)
def train():
    env = gym.make("CartPole-v1")
    agent = CartPoleDRQNSliceAgent(DRQN, model=DRQN(), env=env,
                                   exploration=LinearSchedule(100000, initial_p=1.0, final_p=0.02),
                                   batch_size=32, memory_size=100000, min_mem=10000,
                                   sequence_len=32)
    agent.saving_dir = '/home/ur5/thesis/rdd_rl/gym_test/cartpole/data/partial_drqn_slice'
    agent.loadCheckpoint('20190212204507')
    agent.train(100000, 500, 200, False)
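# The exploration schedules passed to the agents above anneal epsilon linearly
# from initial_p to final_p over the given number of steps. A minimal sketch of
# that behaviour, assuming the repo's LinearSchedule mirrors the OpenAI Baselines
# class of the same name (the constructor signature and value() method shown here
# are assumptions, not taken from this repo):
class LinearSchedule:
    def __init__(self, schedule_timesteps, final_p, initial_p=1.0):
        self.schedule_timesteps = schedule_timesteps
        self.final_p = final_p
        self.initial_p = initial_p

    def value(self, t):
        # fraction of the schedule completed, clamped to 1.0 after schedule_timesteps
        fraction = min(float(t) / self.schedule_timesteps, 1.0)
        return self.initial_p + fraction * (self.final_p - self.initial_p)

# e.g. LinearSchedule(10000, initial_p=1.0, final_p=0.02).value(5000) -> 0.51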
            final_mask_batch.append(
                torch.tensor(list(episode_transition.final_mask), dtype=torch.uint8))
            pad_mask_batch.append(
                torch.tensor(list(episode_transition.pad_mask), dtype=torch.uint8))

        state = (torch.stack(state_1_batch).to(self.device),
                 torch.stack(state_2_batch).to(self.device))
        action = torch.stack(action_batch).to(self.device)
        next_state = (torch.stack(next_state_1_batch).to(self.device),
                      torch.stack(next_state_2_batch).to(self.device))
        reward = torch.stack(reward_batch).to(self.device)
        final_mask = torch.stack(final_mask_batch).to(self.device)
        pad_mask = torch.stack(pad_mask_batch)
        # invert the padding mask so padded timesteps can be excluded from the loss
        non_pad_mask = 1 - pad_mask
        return state, action, next_state, reward, final_mask, non_pad_mask


if __name__ == '__main__':
    agent = ConvDRQNAgent(DRQN, model=DRQN(), env=ScoopEnv(),
                          exploration=LinearSchedule(10000, initial_p=1.0, final_p=0.1),
                          min_mem=1000)
    agent.loadCheckpoint('20190212192630')
    agent.train(100000, max_episode_steps=200)
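# The sampler above pads episodes to a common length and returns non_pad_mask so
# that padded timesteps can be dropped when the TD error is averaged. A minimal
# sketch of that masking step, assuming hypothetical tensors q_values and
# expected_q of shape (batch, seq_len); the agent's own update code may differ:
import torch
import torch.nn.functional as F

def masked_td_loss(q_values, expected_q, non_pad_mask):
    # keep only real (non-padded) timesteps when computing the loss
    mask = non_pad_mask.bool()
    return F.smooth_l1_loss(q_values[mask], expected_q[mask])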
    def forward(self, x, hidden=None):
        # x: (batch, seq_len, C, H, W) stack of frames, scaled to roughly [0, 1)
        x = x.float() / 256
        shape = x.shape
        # fold the sequence dimension into the batch dimension for the conv layers
        x = x.view(shape[0] * shape[1], shape[2], shape[3], shape[4])
        conv_out = self.conv(x)
        # restore (batch, seq_len, features) before the LSTM
        x = conv_out.view(shape[0], shape[1], -1)
        if hidden is None:
            x, hidden = self.lstm(x)
        else:
            x, hidden = self.lstm(x, hidden)
        x = self.fc(x)
        return x, hidden


if __name__ == '__main__':
    env = gym.make('PongNoFrameskip-v4')
    env = wrap_drqn(env)
    agent = DRQNAgent(DRQN, model=DRQN(env.observation_space.shape, env.action_space.n),
                      env=env,
                      exploration=LinearSchedule(100000, 0.02),
                      batch_size=1, target_update_frequency=1000,
                      memory_size=100000, min_mem=10000)
    agent.saving_dir = '/home/ur5/thesis/rdd_rl/gym_test/pong/data/drqn'
    agent.train(10000, 10000, save_freq=50)
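# A self-contained sketch of the reshape pattern used in forward() above: the
# conv stack is applied per timestep by folding the sequence into the batch,
# then the sequence is restored for the LSTM. The layer sizes below are made up
# for illustration; the real DRQN defines its own conv stack, LSTM width, and head.
import torch
import torch.nn as nn

conv = nn.Sequential(nn.Conv2d(4, 16, kernel_size=8, stride=4), nn.ReLU(), nn.Flatten())
lstm = nn.LSTM(input_size=16 * 20 * 20, hidden_size=256, batch_first=True)
fc = nn.Linear(256, 6)

x = torch.zeros(2, 8, 4, 84, 84)              # (batch, seq_len, C, H, W)
b, s = x.shape[:2]
feat = conv(x.view(b * s, *x.shape[2:]))      # fold seq into batch for the conv
out, hidden = lstm(feat.view(b, s, -1))       # restore (batch, seq_len, features)
q = fc(out)                                   # per-timestep Q-values: (2, 8, 6)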
        self.episode_rewards = checkpoint['episode_rewards']
        self.episode_lengths = checkpoint['episode_lengths']
        self.policy_net = HistoryDQN()
        self.policy_net.load_state_dict(checkpoint['policy_state_dict'])
        self.policy_net = self.policy_net.to(DEVICE)
        self.policy_net.train()
        self.target_net = HistoryDQN()
        self.target_net.load_state_dict(checkpoint['policy_state_dict'])
        self.target_net = self.target_net.to(DEVICE)
        self.target_net.eval()
        self.optimizer = optim.Adam(self.policy_net.parameters())
        self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])


if __name__ == '__main__':
    agent = SynDQNAgent(
        20005, 4, LinearSchedule(10000, 0.1), batch_size=256,
        saving_dir='/home/ur5/thesis/simple_task/scoop_grasp_2d/data/sync_dqn')
    agent.load_checkpoint('20181205143445')
    # agent.train(10000)
    plotLearningCurve(agent.episode_rewards)
    plt.show()
    plotLearningCurve(agent.episode_lengths, label='length', color='r')
    plt.show()
            action_batch.append(torch.cat(episode_transition.action))
            reward_batch.append(torch.cat(episode_transition.reward))
            final_mask_batch.append(
                torch.tensor(list(episode_transition.final_mask), dtype=torch.uint8))
            pad_mask_batch.append(
                torch.tensor(list(episode_transition.pad_mask), dtype=torch.uint8))

        state = (torch.stack(state_0_batch).to(self.device),
                 torch.stack(state_1_batch).to(self.device))
        action = torch.stack(action_batch).to(self.device)
        next_state = (torch.stack(next_state_0_batch).to(self.device),
                      torch.stack(next_state_1_batch).to(self.device))
        reward = torch.stack(reward_batch).to(self.device)
        final_mask = torch.stack(final_mask_batch).to(self.device)
        pad_mask = torch.stack(pad_mask_batch)
        non_pad_mask = 1 - pad_mask
        return state, action, next_state, reward, final_mask, non_pad_mask


if __name__ == '__main__':
    envs = []
    for i in range(1):
        env = ScoopEnv(19997 + i)
        envs.append(env)
    agent = Agent(DRQN(envs[0].observation_space[0].shape,
                       envs[0].observation_space[1].shape, 4),
                  envs, LinearSchedule(10000, 0.1), batch_size=128, min_mem=1000)
    agent.saving_dir = '/home/ur5/thesis/rdd_rl/scoop_vision/data/syn_drqn_dense'
    agent.loadCheckpoint('20190220221354')
    agent.train(100000, 200, save_freq=500)