def sample_from_Replay_Memory(self, batches, ReplayMemory, Net):
    '''
    Sample a batch of transitions and build Bellman targets for a Keras-style net
    :param batches: number of transitions to sample
    :param ReplayMemory: replay buffer exposing a sample() method
    :param Net: network exposing predict()
    :return: current states, actions, and target Q-values
    '''
    current_states = []
    actions = []
    q_values = []
    for samples in ReplayMemory.sample(batches):
        state, action, reward, next_state, is_done = \
            samples.state, samples.action, samples.reward, samples.next_state, samples.done
        current_states.append(state)
        actions.append(action)
        # predict() expects a batch dimension, so add one to both states
        state = np.expand_dims(np.asarray(state).astype(np.float64), axis=0)
        next_state = np.expand_dims(np.asarray(next_state).astype(np.float64), axis=0)
        # Bellman target: r for terminal transitions, r + gamma * max_a' Q(s', a') otherwise
        target = reward
        if not is_done:
            target = reward + self.gamma * np.amax(Net.predict(next_state)[0])
        target_f = Net.predict(state)[0]
        target_f[action] = target
        q_values.append(target_f)
    current_states = np.reshape(current_states, (-1, DIM_STATES))
    q_values = np.reshape(q_values, (-1, NUM_ACTIONS))
    return current_states, actions, q_values
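# Usage sketch (hypothetical names): given an agent whose `gamma` is set, a replay
# buffer `memory`, and a compiled Keras model `net`, one supervised fit step over a
# sampled batch could look like:
#   states, actions, q_values = agent.sample_from_Replay_Memory(BATCH_SIZE, memory, net)
#   net.fit(states, q_values, epochs=1, verbose=0)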
class LearnerDQN:
    '''
    Learner class - an abstraction which includes the experiment configuration,
    the necessary models, and all actions needed to run the experiment.
    '''

    def __init__(self,
                 clip_grad=True,
                 num_episodes=50,
                 trajectory_len=MAX_STEPS,
                 custom_func=None,
                 custom_func_args=None):
        '''
        Initialization
        :param clip_grad: bool: flag for clipping gradients to [-1, 1]
        :param num_episodes: number of episodes to run
        :param trajectory_len: maximal number of steps in each trajectory
        :param custom_func: custom reward function
        :param custom_func_args: custom reward function arguments
        '''
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.build_nn()
        self.num_episodes = num_episodes
        self.trajectory_len = trajectory_len
        self.replay = ReplayMemory(10000)
        self.steps_done = 0
        self.optimizer = optim.RMSprop(self.policy_net.parameters())
        self.clip_grad = clip_grad
        self.rewards = []
        self.modules = []
        self.env = ChainAgent(inventory_level=10,
                              fix_delay=1,
                              max_num_steps=MAX_STEPS + 10,
                              demand_generation_function=self.demand_generation_function,
                              custom_func=custom_func,
                              custom_func_args=custom_func_args)

    def select_action(self, state):
        '''
        Implementation of the epsilon-greedy approach
        :param state: input state to choose an appropriate action for
        :return: Tensor: action
        '''
        state = torch.Tensor(state)[None, :]
        sample = random.random()
        # Exponentially decay epsilon from EPS_START to EPS_END over EPS_DECAY steps
        eps_threshold = EPS_END + (EPS_START - EPS_END) * \
            math.exp(-1. * self.steps_done / EPS_DECAY)
        self.steps_done += 1
        if sample > eps_threshold:
            # greedy: pick the action with the highest predicted Q-value
            with torch.no_grad():
                return self.policy_net(state).max(1)[1].view(1, 1)
        else:
            # random exploration
            return torch.tensor([[random.randrange(N_ACTIONS)]],
                                device=self.device, dtype=torch.long)

    def demand_generation_function(self):
        '''
        Default function to generate demand
        :return: int: demand level
        '''
        return np.random.randint(0, 10)

    def optimize_model(self):
        '''
        Optimize the parameters of the neural net on one sampled batch
        :return:
        '''
        if len(self.replay) < BATCH_SIZE:
            return
        transitions = self.replay.sample(BATCH_SIZE)
        batch = Transition(*zip(*transitions))

        state_batch = torch.cat(batch.state)
        action_batch = torch.cat(batch.action)
        reward_batch = torch.cat(batch.reward)
        next_states = torch.cat(batch.next_state)

        # Q(s, a) for the actions that were actually taken
        state_action_values = self.policy_net(state_batch).gather(1, action_batch)
        # Bootstrap from the policy net itself (no separate target net here);
        # terminal next states are not masked out
        # next_state_values = self.target_net(next_states).max(1)[0].detach()
        next_state_values = self.policy_net(next_states).max(1)[0].detach()
        expected_state_action_values = (next_state_values * GAMMA) + reward_batch

        # Compute Huber loss
        loss = F.smooth_l1_loss(state_action_values,
                                expected_state_action_values.unsqueeze(1))

        # Optimize the model
        self.optimizer.zero_grad()
        loss.backward()
        if self.clip_grad:
            for param in self.policy_net.parameters():
                param.grad.data.clamp_(-1, 1)
        self.optimizer.step()

    def build_nn(self):
        '''
        Building the torch graph
        :return:
        '''
        self.policy_net = DQN().to(self.device)
        # self.target_net = DQN().to(self.device)
        # self.target_net.load_state_dict(self.policy_net.state_dict())
        # self.target_net.eval()

    def get_stat(self, state, next_state, reward, action):
        '''
        Print a short summary of a single transition
        :param state: current state
        :param next_state: next state
        :param reward: observed reward
        :param action: chosen action
        :return:
        '''
        print('=====')
        print('DEM: ', self.env.demand_next, 'ST: ', state, ' -> ', action)
        print('NXST: ', next_state, 'REW: ', reward)
        print('=====')
        print()

    def run(self):
        '''
        Main loop of training.
        Iterates over num_episodes * trajectory_len steps
        :return:
        '''
        for i_episode in range(self.num_episodes):
            state = self.env.reset()
            rewards = 0
            for step in range(self.trajectory_len):
                action = self.select_action(torch.Tensor(state))
                next_state, reward, done, _ = self.env.step(action.item())
                reward *= 1.  # cast reward to float
                rewards += reward
                reward = torch.tensor([reward], device=self.device)
                self.replay.push(torch.Tensor([state]), action,
                                 torch.Tensor([next_state]), reward)
                state = next_state
                self.optimize_model()
                if done:
                    break
            self.rewards.append(rewards)
            if i_episode % TARGET_UPDATE == 0:
                print(i_episode, ' : ', np.array(self.rewards[-100:]).mean())
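# Example entry point (a minimal sketch, assuming the constants and classes above,
# i.e. MAX_STEPS, TARGET_UPDATE, DQN, ReplayMemory, and ChainAgent, are defined in
# this module):
if __name__ == '__main__':
    learner = LearnerDQN(clip_grad=True, num_episodes=50)
    learner.run()
    # Average reward over the last 100 episodes as a crude convergence check
    print('mean reward: ', np.array(learner.rewards[-100:]).mean())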