import copy
import multiprocessing
import os
import time

import torch
import torch.optim as optim
from torch.autograd import Variable

# NOTE: DQN (the network wrapper exposing setOptimizer(), evaluate(), optimize()
# and state_dict()) is assumed to be defined elsewhere in this project and
# importable into this module.


class Evaluator(multiprocessing.Process):
    def __init__(self, shared, semaphore):
        multiprocessing.Process.__init__(self)
        # hyperparameters
        self.TRAIN_MAX = 50
        self.TRANSFER = 50
        self.BATCH_SIZE = 32
        self.GAMMA = 0.99
        self.SAMPLE_ALPHA = 0.5
        self.SAMPLE_EPSILON = 0.
        self.SAMPLE_BETA = 0.
        LEARNING_RATE = 0.00025
        MOMENTUM = 0.95
        SQUARED_MOMENTUM = 0.95
        MIN_SQUARED_GRAD = 0.01

        self.net = DQN()        # Q-network
        self.targetNet = DQN()  # target network
        self.copy_weights()
        self.net.setOptimizer(
            optim.RMSprop(self.net.parameters(),
                          lr=LEARNING_RATE,
                          momentum=MOMENTUM,
                          alpha=SQUARED_MOMENTUM,
                          eps=MIN_SQUARED_GRAD))

        self.shared = shared  # shared resources: {'memory', 'SENT_FLAG', 'weights'}
        self.semaphore = semaphore

    def minibatch(self, exp_replay, pretrain=False):
        batch = exp_replay.sample(self.BATCH_SIZE)
        # each transition is (state, action, reward, next_state, next_action, non_terminal)
        unzipped = list(zip(*batch))

        state_batch = Variable(torch.cat(list(unzipped[0])).clone())
        action_batch = Variable(torch.cat(list(unzipped[1])).clone())
        reward_batch = Variable(torch.cat(list(unzipped[2])).clone(),
                                requires_grad=False)

        if pretrain:
            # pretraining: the immediate reward is the target
            return state_batch, action_batch, reward_batch

        term_batch = Variable(torch.cat(list(unzipped[5])).clone(), volatile=True)
        next_action_batch = Variable(torch.cat(list(unzipped[4])).clone(),
                                     volatile=True)
        # value of the stored next action under the target network
        next_state_values = self.targetNet.evaluate(
            list(unzipped[3])).gather(1, next_action_batch)
        # zero out the value of terminal next states
        next_state_values = term_batch * next_state_values
        next_state_values.volatile = False
        target_batch = reward_batch + (self.GAMMA * next_state_values)
        return state_batch, action_batch, target_batch

    def copy_weights(self):
        self.targetNet.load_state_dict(self.net.state_dict())

    def run(self):
        # keep two nets: the Q-net and the target-net
        # keep looping:
        #   0. wait until SENT_FLAG is cleared
        #   1. loop for a fixed number of steps:
        #        sample a minibatch and compute its target values
        #        optimize the net parameters on this batch
        #        every TRANSFER steps, copy weights from Q-net to target-net
        #   2. copy the Q-net weights into the shared weights
        #      and set SENT_FLAG to true
        # TODO: pretrain in the first loop
        pretrain = True
        while True:
            while self.shared['SENT_FLAG']:  # wait until the flag is cleared
                print('sleeping... memory size: {}'.format(
                    len(self.shared['memory'])))
                time.sleep(0.1)
            print('training...')
            for step_i in range(1, self.TRAIN_MAX + 1):
                # sample a minibatch and compute its target values
                self.semaphore.acquire()
                memory = copy.deepcopy(self.shared['memory'])
                self.semaphore.release()
                if len(memory) < self.BATCH_SIZE:
                    continue
                batch_tuple = self.minibatch(memory, pretrain)
                loss = self.net.optimize(batch_tuple)
                if step_i % self.TRANSFER == 0:
                    self.copy_weights()
            self.shared['weights'] = self.net.state_dict()
            self.shared['SENT_FLAG'] = True
            pretrain = False
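# --- Usage sketch (not part of the original module) ---
# A minimal wiring example, assuming the Evaluator above: the shared dict is a
# multiprocessing.Manager dict holding the replay memory, the SENT_FLAG, and the
# exported weights, and a Semaphore guards access to the memory. ReplayMemory
# below is a hypothetical stand-in for whatever buffer the project actually
# uses; Evaluator only needs __len__() and sample() from it.
import random


class ReplayMemory(object):
    """Hypothetical minimal replay buffer, used only for this wiring example."""

    def __init__(self, capacity=10000):
        self.capacity = capacity
        self.transitions = []

    def push(self, transition):
        self.transitions.append(transition)
        if len(self.transitions) > self.capacity:
            self.transitions.pop(0)

    def sample(self, batch_size):
        return random.sample(self.transitions, batch_size)

    def __len__(self):
        return len(self.transitions)


if __name__ == '__main__':
    manager = multiprocessing.Manager()
    shared = manager.dict()
    shared['memory'] = ReplayMemory()
    shared['SENT_FLAG'] = False   # Evaluator trains while this is cleared
    shared['weights'] = None
    semaphore = multiprocessing.Semaphore(1)

    evaluator = Evaluator(shared, semaphore)
    evaluator.start()
    # An actor process would reassign shared['memory'] with new transitions
    # (reassignment is what makes the Manager dict proxy see the update) and
    # reload shared['weights'] whenever the evaluator sets SENT_FLAG.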
# Variant of the Evaluator: the replay memory is handed to the constructor
# directly instead of living in the shared dict, and minibatch() draws its
# batch with TD-error-weighted (prioritized) sampling over the whole memory.
class Evaluator(multiprocessing.Process):
    def __init__(self, memory, shared, semaphore):
        multiprocessing.Process.__init__(self)
        # hyperparameters
        self.TRAIN_MAX = 50
        self.TRANSFER = 50
        self.BATCH_SIZE = 32
        self.GAMMA = 0.99
        self.SAMPLE_ALPHA = 0.5
        self.SAMPLE_EPSILON = 0.
        self.SAMPLE_BETA = 0.
        LEARNING_RATE = 0.00025
        MOMENTUM = 0.95
        SQUARED_MOMENTUM = 0.95
        MIN_SQUARED_GRAD = 0.01

        self.net = DQN()        # Q-network
        self.targetNet = DQN()  # target network
        self.copy_weights()
        self.net.setOptimizer(
            optim.RMSprop(self.net.parameters(),
                          lr=LEARNING_RATE,
                          momentum=MOMENTUM,
                          alpha=SQUARED_MOMENTUM,
                          eps=MIN_SQUARED_GRAD))

        self.memory = memory
        self.shared = shared  # shared resources: {'SENT_FLAG', 'weights'}
        self.semaphore = semaphore

    def minibatch(self, exp_replay, pretrain=False):
        # compute targets for every stored transition, then sample BATCH_SIZE of
        # them with probability proportional to |TD error| ** SAMPLE_ALPHA
        unzipped = list(zip(*exp_replay))

        state_batch = Variable(torch.cat(list(unzipped[0])).clone(), volatile=True)
        action_batch = Variable(torch.cat(list(unzipped[1])).clone(), volatile=True)
        reward_batch = Variable(torch.cat(list(unzipped[2])).clone(), volatile=True)

        if pretrain:
            # pretraining: the immediate reward is the target
            target_batch = reward_batch
        else:
            term_batch = Variable(torch.cat(list(unzipped[5])).clone(),
                                  volatile=True)
            next_action_batch = Variable(torch.cat(list(unzipped[4])).clone(),
                                         volatile=True)
            # value of the stored next action under the target network
            next_state_values = self.targetNet.evaluate(
                list(unzipped[3])).gather(1, next_action_batch)
            # zero out the value of terminal next states
            next_state_values = term_batch * next_state_values
            next_state_values.volatile = False
            target_batch = reward_batch + (self.GAMMA * next_state_values)

        # sampling probability for each transition: |TD error| ** alpha
        state_values = self.net(state_batch).gather(1, action_batch)
        probability_batch = torch.pow(torch.abs(target_batch - state_values),
                                      self.SAMPLE_ALPHA).squeeze(1)
        sample_is = torch.multinomial(probability_batch, self.BATCH_SIZE)

        state_batch = state_batch.index_select(0, sample_is)
        action_batch = action_batch.index_select(0, sample_is)
        target_batch = target_batch.index_select(0, sample_is)
        state_batch.volatile = False
        state_batch.requires_grad = True
        action_batch.volatile = False
        target_batch.volatile = False
        return state_batch, action_batch, target_batch

    def copy_weights(self):
        self.targetNet.load_state_dict(self.net.state_dict())

    def run(self):
        # keep two nets: the Q-net and the target-net
        # keep looping:
        #   0. wait until SENT_FLAG is cleared
        #   1. loop for a fixed number of steps:
        #        sample a minibatch and compute its target values
        #        optimize the net parameters on this batch
        #        every TRANSFER steps, copy weights from Q-net to target-net
        #   2. copy the Q-net weights into the shared weights
        #      and set SENT_FLAG to true
        # TODO: pretrain in the first loop
        # reset CPU affinity so this worker may run on the first 8 cores
        os.system("taskset -p 0xff %d" % os.getpid())
        pretrain = True
        while True:
            while self.shared['SENT_FLAG']:  # wait until the flag is cleared
                print('sleeping...')
                time.sleep(0.1)
            print('training...')
            for step_i in range(1, self.TRAIN_MAX + 1):
                # sample a minibatch and compute its target values
                self.semaphore.acquire()
                memory = copy.deepcopy(self.memory)
                self.semaphore.release()
                if len(memory) < self.BATCH_SIZE:
                    continue
                batch_tuple = self.minibatch(memory, pretrain)
                loss = self.net.optimize(batch_tuple)
                if step_i % self.TRANSFER == 0:
                    self.copy_weights()
            self.shared['weights'] = self.net.state_dict()
            self.shared['SENT_FLAG'] = True
            pretrain = False
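# --- Illustration (not part of the original module) ---
# A standalone sketch of the TD-error-weighted sampling rule used in
# minibatch() above, written against plain tensors rather than the Variable
# API: each transition is drawn with probability proportional to
# |TD error| ** alpha via torch.multinomial. The priority_epsilon term is an
# assumption added here so that zero-error transitions stay sampleable; the
# class above declares SAMPLE_EPSILON but never applies it.
import torch


def prioritized_indices(q_values, targets, batch_size, alpha=0.5,
                        priority_epsilon=1e-6):
    """Sample batch_size indices with probability ~ |TD error| ** alpha.

    q_values, targets: 1-D tensors with one entry per stored transition.
    """
    td_error = torch.abs(targets - q_values)
    priorities = (td_error + priority_epsilon) ** alpha
    return torch.multinomial(priorities, batch_size, replacement=False)


# toy check: transitions 1 and 3 have the largest TD errors, so they are the
# most likely to be picked
q = torch.tensor([0.1, 0.5, 0.9, 0.2])
t = torch.tensor([0.1, 1.5, 0.8, 0.9])
print(prioritized_indices(q, t, batch_size=2))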