def generate_config():
    config = DQN.generate_config({})
    config["frame_config"]["models"] = ["QNet", "QNet"]
    config["frame_config"]["model_kwargs"] = [{
        "state_dim": 4,
        "action_num": 2
    }] * 2
    config["train_env_config"] = {}
    config["test_env_config"] = {}
    return config
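# Usage sketch (an illustration, not part of the original snippet): the
# generated dict can be adjusted before it is handed to whatever launcher
# consumes it. The key layout mirrors generate_config() above; the
# overridden dimensions are hypothetical values for a different environment.
config = generate_config()
config["frame_config"]["model_kwargs"] = [{
    "state_dim": 8,   # hypothetical observation size
    "action_num": 4   # hypothetical action count
}] * 2  # one kwargs dict per model: online network and target network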
from machin.frame.algorithms import DQN
import torch as t
import torch.nn as nn

# env, observe_dim, action_num, max_episodes and max_steps are assumed to
# be defined by the surrounding script.


class QNet(nn.Module):
    def __init__(self, state_dim, action_num):
        super().__init__()
        self.fc1 = nn.Linear(state_dim, 16)
        self.fc2 = nn.Linear(16, 16)
        self.fc3 = nn.Linear(16, action_num)

    def forward(self, state):
        a = t.relu(self.fc1(state))
        a = t.relu(self.fc2(a))
        return self.fc3(a)


if __name__ == "__main__":
    # Online and target Q networks share the same architecture.
    q_net = QNet(observe_dim, action_num)
    q_net_t = QNet(observe_dim, action_num)
    dqn = DQN(q_net, q_net_t, t.optim.Adam, nn.MSELoss(reduction='sum'))

    episode, step, reward_fulfilled = 0, 0, 0
    smoothed_total_reward = 0

    while episode < max_episodes:
        episode += 1
        total_reward = 0
        terminal = False
        step = 0
        state = t.tensor(env.reset(), dtype=t.float32).view(1, observe_dim)

        while not terminal and step <= max_steps:
            step += 1
            with t.no_grad():
                old_state = state
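                # Sketch of the usual continuation (reconstructed from
                # context, not part of the excerpt): sample an
                # epsilon-greedy action, step the environment, and
                # accumulate the reward.
                action = dqn.act_discrete_with_noise({"state": old_state})
                state, reward, terminal, _ = env.step(action.item())
                state = t.tensor(state, dtype=t.float32).view(1, observe_dim)
                total_reward += reward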
# Variant of the previous snippet that also buffers each transition so the
# finished episode can be stored for replay (see the sketch after it).


class QNet(nn.Module):
    def __init__(self, state_dim, action_num):
        super().__init__()
        self.fc1 = nn.Linear(state_dim, 16)
        self.fc2 = nn.Linear(16, 16)
        self.fc3 = nn.Linear(16, action_num)

    def forward(self, state):
        a = t.relu(self.fc1(state))
        a = t.relu(self.fc2(a))
        return self.fc3(a)


if __name__ == "__main__":
    q_net = QNet(observe_dim, action_num)
    q_net_t = QNet(observe_dim, action_num)
    dqn = DQN(q_net, q_net_t, t.optim.Adam, nn.MSELoss(reduction="sum"))

    episode, step, reward_fulfilled = 0, 0, 0
    smoothed_total_reward = 0

    while episode < max_episodes:
        episode += 1
        total_reward = 0
        terminal = False
        step = 0
        state = t.tensor(env.reset(), dtype=t.float32).view(1, observe_dim)
        tmp_observations = []

        while not terminal and step <= max_steps:
            step += 1
            with t.no_grad():
                old_state = state
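                # Sketch (reconstructed, not in the excerpt): after acting
                # and stepping the environment exactly as in the previous
                # snippet, each transition is appended in machin's
                # dict-of-dicts sample format.
                tmp_observations.append({
                    "state": {"state": old_state},
                    "action": {"action": action},
                    "next_state": {"state": state},
                    "reward": reward,
                    "terminal": terminal or step == max_steps,
                })

        # When the episode ends, the trajectory is handed to the replay
        # buffer; the warm-up guard and per-step update count follow the
        # common pattern in machin's DQN examples.
        dqn.store_episode(tmp_observations)
        if episode > 100:
            for _ in range(step):
                dqn.update()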
# To mark the input/output device manually. This will not work if you
# move your model to other devices after wrapping.
# q_net = static_module_wrapper(q_net, "cpu", "cpu")
# q_net_t = static_module_wrapper(q_net_t, "cpu", "cpu")
# q_net = static_module_wrapper(q_net, device, device)
# q_net_t = static_module_wrapper(q_net_t, device, device)

# To mark the input/output device automatically. This will not work if
# your model is located on multiple devices.
q_net = dynamic_module_wrapper(q_net)
q_net_t = dynamic_module_wrapper(q_net_t)
dqn = DQN(q_net, q_net_t, t.optim.Adam, nn.MSELoss(reduction='sum'))


def fnTrain():
    episode, step, reward_fulfilled = 0, 0, 0
    smoothed_total_reward = 0
    iNumOfTrainSamples = env.fnNumIterations()
    afRewardArray = []
    fMaxRewardSum = -np.inf

    while episode < iNumOfTrainSamples:
        episode += 1
        total_reward = 0
        terminal = False
        step = 0
        state = t.tensor(env.reset(), dtype=t.float32).view(1, observe_dim)
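# Sketch (an assumption, not in the original): when pinning the networks
# to a GPU, move them before applying static_module_wrapper, since the
# comments above warn that moving a model after wrapping invalidates the
# recorded input/output devices. "cuda:0" is an assumed device name.
if t.cuda.is_available():
    device = "cuda:0"
    q_net = static_module_wrapper(q_net.to(device), device, device)
    q_net_t = static_module_wrapper(q_net_t.to(device), device, device)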
class QNet(nn.Module):
    def __init__(self, state_dim, hidden_dim, action_num):
        super(QNet, self).__init__()
        self.fc1 = nn.Linear(state_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, action_num)

    def forward(self, state):
        a = torch.relu(self.fc1(state))
        a = torch.relu(self.fc2(a))
        return self.fc3(a)


qnet = QNet(4, 20, num_actions)
qnet_t = QNet(4, 20, num_actions)
dqn = DQN(qnet, qnet_t, torch.optim.Adam, nn.MSELoss(reduction='sum'),
          discount=0.8,
          epsilon_decay=0.999,
          learning_rate=0.001,
          lr_scheduler=torch.optim.lr_scheduler.StepLR,
          lr_scheduler_kwargs=[{"step_size": 1000 * 128}])

num_eps = 5000
norm_factor = 10000000


def test_delta(n=10):
    # Heuristic baseline that bypasses the Q network: keep observation
    # components 0, 1, 2 and 4, so state[3] below refers to the original
    # observation's component 4.
    rew = []
    for i in range(n):
        state = env.reset()
        done = False
        state = state[[0, 1, 2, 4]]
        while not done:
            action = state[3] - env.h
            new_state, reward, done = env.step(action)
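# For reference (a standalone PyTorch sketch, not machin's internal
# wiring): StepLR multiplies the learning rate by gamma (default 0.1)
# every step_size scheduler steps, so step_size=1000 * 128 delays the
# first decay until 128,000 updates.
import torch
opt = torch.optim.Adam(qnet.parameters(), lr=0.001)
sched = torch.optim.lr_scheduler.StepLR(opt, step_size=1000 * 128)
opt.step()    # one optimizer update
sched.step()  # advance the schedule by one step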