class Evaluator:
    def __init__(self, memory, env):
        # hyperparameters
        self.TRAIN_MAX = 1
        self.TRANSFER = 1
        self.BATCH_SIZE = 32
        self.GAMMA = 0.95
        self.SAMPLE_ALPHA = 0.5
        self.SAMPLE_EPISLON = 0.
        self.SAMPLE_BETA = 0.
        self.plotter = Plotter(folder='DQN/plot/cartpole_simple/singleP')
        #LEARNING_RATE = 0.00025
        LEARNING_RATE = 1e-3
        MOMENTUM = 0.95
        SQUARED_MOMENTUM = 0.95
        MIN_SQUARED_GRAD = 0.01
        self.EPS_START = 1.
        self.EPS_END = 0.05
        self.EPS_DECAY = 200  # larger DECAY: slower decay
        self.MEMORY_SIZE = 5000
        self.steps_done = 0
        self.net = DQN()  # Deep Net
        self.net.setOptimizer(
            optim.Adam(self.net.parameters(), lr=LEARNING_RATE))
        #self.net.setOptimizer(optim.RMSprop(self.net.parameters(), lr=LEARNING_RATE,
        #                                    momentum=MOMENTUM, alpha=SQUARED_MOMENTUM,
        #                                    eps=MIN_SQUARED_GRAD))
        self.memory = memory
        self.env = env

    def behavior_policy(self, state):
        # Epsilon-greedy: the randomness decays with steps_done.
        # The stored tensors are of size 1x1.
        sample = random.random()
        eps_threshold = self.EPS_END + (self.EPS_START - self.EPS_END) * \
            math.exp(-1. * self.steps_done / self.EPS_DECAY)
        if sample > eps_threshold:
            return self.policy(state)
        else:
            return self.env.action_space.sample()

    def policy(self, state):
        # return tensor of size 1x1 (index of the largest Q value)
        res = self.net(Variable(state, volatile=True).type(FloatTensor)).data
        return res.max(1)[1][0]
        #return res.max(1)[1].view(1,1)
        #return self.net(Variable(state, volatile=True).type(FloatTensor))\
        #    .data.max(1)[1].view(1,1)

    def minibatch(self, exp_replay, pretrain=False):
        batch = random.sample(list(exp_replay), self.BATCH_SIZE)
        unzipped = list(zip(*batch))
        state_batch = Variable(torch.from_numpy(np.array(
            unzipped[0])).type(FloatTensor), volatile=True)
        # previously there was no type conversion here, which caused an error
        action_batch = Variable(torch.from_numpy(np.array(
            unzipped[1])).type(LongTensor), volatile=True)
        reward_batch = Variable(torch.from_numpy(np.array(
            unzipped[2])).type(FloatTensor), volatile=True)
        target_batch = None
        if pretrain:
            # only use reward
            target_batch = reward_batch
        else:
            term_batch = Variable(torch.from_numpy(np.array(
                unzipped[4])).type(FloatTensor), volatile=True)
            next_state_batch = Variable(torch.from_numpy(np.array(
                unzipped[3])).type(FloatTensor), volatile=True)
            # previously there was no type conversion here, which caused an error
            next_state_values = self.net(next_state_batch).max(1)[0].unsqueeze(1)
            next_state_values = term_batch * next_state_values
            next_state_values.volatile = False
            target_batch = reward_batch + (self.GAMMA * next_state_values)
        state_batch.volatile = False
        state_batch.requires_grad = True
        action_batch.volatile = False
        target_batch.volatile = False
        return state_batch, action_batch, target_batch

    def run(self):
        i = 0
        while True:
            i = i + 1
            time_t = 0
            while True:
                time_t += 1
                state = self.env.get_state()
                state_torch = torch.from_numpy(state).type(
                    FloatTensor)  # convert to torch and normalize
                state_torch = state_torch.unsqueeze(0).type(FloatTensor)
                action = self.behavior_policy(state_torch)
                next_state, r, done = self.env.step(action)  # ~0.03s
                if len(self.memory) == self.MEMORY_SIZE:
                    self.memory.pop(0)
                self.memory.append((state, [action], [r],
                                    next_state, [1 - done]))
                if len(self.memory) < 1000:
                    # not enough experience yet: keep collecting
                    i = 0
                    if done:
                        break
                    else:
                        continue
                #batch_tuple = self.minibatch(self.memory)
                #loss = self.net.optimize(batch_tuple)
                if done:
                    self.steps_done += 1
                    print('episode: {}, score: {}'.format(i, time_t))
                    self.plotter.plot_train_rewards(time_t)
                    break
                if len(self.memory) >= 1000:
                    batch_tuple = self.minibatch(self.memory)
                    loss = self.net.optimize(batch_tuple)
        self.plotter.terminate()
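# The behavior_policy() above anneals exploration with an exponential
# schedule. Below is a minimal standalone sketch of that schedule, using
# the hyperparameters set in __init__ (EPS_START=1., EPS_END=0.05,
# EPS_DECAY=200); the values in the trailing comment are approximate.
import math

EPS_START, EPS_END, EPS_DECAY = 1.0, 0.05, 200
for steps_done in (0, 200, 1000):
    eps = EPS_END + (EPS_START - EPS_END) * math.exp(-1. * steps_done / EPS_DECAY)
    print(steps_done, round(eps, 3))  # -> 1.0, ~0.399, ~0.056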
class Evaluator(multiprocessing.Process):
    def __init__(self, memory, shared, semaphore):
        multiprocessing.Process.__init__(self)
        # hyperparameters
        self.TRAIN_MAX = 1
        self.TRANSFER = 1
        self.BATCH_SIZE = 32
        #self.BATCH_SIZE = 5
        self.GAMMA = 0.95
        #self.SAMPLE_ALPHA = 0.5
        #self.SAMPLE_EPISLON = 0.
        #self.SAMPLE_BETA = 0.
        #self.SAMPLE_S = 44.8
        self.SAMPLE_S = 10.0
        self.SAMPLE_Q = 1.0
        LEARNING_RATE = 1e-3
        MOMENTUM = 0.95
        SQUARED_MOMENTUM = 0.95
        MIN_SQUARED_GRAD = 0.01
        self.net = DQN()  # Deep Net
        self.targetNet = DQN()
        self.copy_weights()
        self.net.setOptimizer(
            optim.Adam(self.net.parameters(), lr=LEARNING_RATE))
        self.memory = memory
        self.shared = shared  # shared resources, {'memory', 'SENT_FLAG'}
        self.semaphore = semaphore

    def minibatch(self, exp_replay, pretrain=False):
        #batch = exp_replay.sample(self.BATCH_SIZE)
        #print(batch)
        unzipped = list(zip(*exp_replay))
        state_batch = Variable(torch.from_numpy(np.array(
            unzipped[0])).type(FloatTensor), volatile=True)
        action_batch = Variable(torch.from_numpy(np.array(
            unzipped[1])).type(LongTensor), volatile=True)
        reward_batch = Variable(torch.from_numpy(np.array(
            unzipped[2])).type(FloatTensor), volatile=True)
        target_batch = None
        if pretrain:
            # only use reward
            target_batch = reward_batch
        else:
            term_batch = Variable(torch.from_numpy(np.array(
                unzipped[4])).type(FloatTensor), volatile=True)
            next_state_batch = Variable(torch.from_numpy(np.array(
                unzipped[3])).type(FloatTensor), volatile=True)
            next_state_values = self.targetNet(next_state_batch).max(
                1)[0].unsqueeze(1)
            next_state_values = term_batch * next_state_values
            next_state_values.volatile = False
            target_batch = reward_batch + (self.GAMMA * next_state_values)

        # calculate the probability for each transition
        # calculate the feature distance matrix
        state_feature_batch = self.targetNet.getstate(state_batch)
        inner_product = state_feature_batch.matmul(
            state_feature_batch.transpose(1, 0))
        state_feature_batch_l2 = (state_feature_batch**2).sum(
            dim=1, keepdim=True).expand_as(inner_product)
        distance_matrix = state_feature_batch_l2 + \
            state_feature_batch_l2.transpose(1, 0) - 2 * inner_product
        # calculate the Q-value distance matrix
        # here the target value is used for the calculation
        Q_dist_matrix = target_batch.expand_as(distance_matrix)
        Q_dist_matrix = Q_dist_matrix - Q_dist_matrix.transpose(
            1, 0)  # not absolute value yet
        Q_dist_matrix = Q_dist_matrix.abs()
        #print('distance q')
        #print(Q_dist_matrix.data)
        # Number[i,j] = Number[i,j] + (D_f[i,j] <= sample_S^2 AND D_Q[i,j] <= sample_Q AND action[i]=action[j])
        # only consider same actions
        Action_Mask = (action_batch.expand_as(distance_matrix)) == (
            action_batch.transpose(1, 0).expand_as(distance_matrix))
        Mask = (distance_matrix.data <= (self.SAMPLE_S)) & \
               (Q_dist_matrix.data <= self.SAMPLE_Q) & Action_Mask.data
        Cluster = []
        #print('mask')
        counter = 0
        while True:
            # clustering by VERTEX-COVER-ALL-VERTEX, always pick the vertex of largest degree
            #print('counter = {}' . format(counter))
            counter += 1
            Number = Mask.sum(dim=1)
            value, indx = Number.max(dim=0)
            #print('indx= {}' . format(indx))
            if value[0] == 0:
                # already empty
                break
            v = Mask[indx]
            #print(v)
            #print(Mask)
            Cluster.append(v)
            # delete the covered vertices
            Delete = v.expand_as(Mask) | v.transpose(1, 0).expand_as(Mask)
            Delete = Delete ^ 1
            #Delete = v.transpose(1,0).matmul(v) ^ 1
            #print(Delete)
            Mask = Mask & Delete
        k = len(Cluster)
        Cluster = torch.cat(Cluster)
        #print('cluster')
        #print(Cluster)
        Number = Cluster.sum(dim=1).type(LongTensor)
        probability_batch = torch.ones(k) / float(k)
        cluster_is = torch.multinomial(probability_batch, self.BATCH_SIZE,
                                       replacement=True)
        # convert the cluster indices to the number of items drawn from each cluster
        Sample_num = torch.eye(k).index_select(
            0, cluster_is).sum(dim=0).type(LongTensor)
        #N = Cluster[0].size()[0]  # number of vertices
        state_sample = []
        action_sample = []
        target_sample = []
        for i in range(k):
            n = Sample_num[i]
            N = Number[i]
            if n == 0:
                continue
            cluster = Cluster[i]
            # get nonzero indices
            v_indices = cluster.nonzero().squeeze(1)
            if n == N:
                # pick up all
                state_sample.append(state_batch.index_select(0, v_indices))
                action_sample.append(action_batch.index_select(0, v_indices))
                target_sample.append(target_batch.index_select(0, v_indices))
                continue
            prob = torch.ones(v_indices.size()) / n
            if n < N:
                # uniformly pick without replacement
                v_indices_is = torch.multinomial(prob, n)
                v_indices = v_indices.index_select(0, v_indices_is)
                state_sample.append(state_batch.index_select(0, v_indices))
                action_sample.append(action_batch.index_select(0, v_indices))
                target_sample.append(target_batch.index_select(0, v_indices))
                continue
            # uniformly pick with replacement
            v_indices_is = torch.multinomial(prob, n, replacement=True)
            v_indices = v_indices.index_select(0, v_indices_is)
            state_sample.append(state_batch.index_select(0, v_indices))
            action_sample.append(action_batch.index_select(0, v_indices))
            target_sample.append(target_batch.index_select(0, v_indices))
        state_batch = torch.cat(state_sample)
        action_batch = torch.cat(action_sample)
        target_batch = torch.cat(target_sample)
        state_batch.volatile = False
        state_batch.requires_grad = True
        action_batch.volatile = False
        target_batch.volatile = False
        return state_batch, action_batch, target_batch

    def copy_weights(self):
        self.targetNet.load_state_dict(self.net.state_dict())

    def run(self):
        os.system("taskset -p 0xff %d" % os.getpid())
        pretrain = True
        i = 0
        while True:
            while self.shared['SENT_FLAG']:  # loop until it is set to 0
                print('sleeping...')
                time.sleep(1.)
            for step_i in range(1, self.TRAIN_MAX + 1):
                # minibatch, and get the target value
                #print('training... step {}' . format(step_i))
                #self.semaphore.acquire()
                #memory = copy.deepcopy(self.memory)
                memory = self.memory
                #self.semaphore.release()
                if len(memory) < 1000:
                    continue
                i += 1
                print('training... {}'.format(i))
                batch_tuple = self.minibatch(memory, pretrain)
                loss = self.net.optimize(batch_tuple)
                #print('loss: {}' . format(loss))
                #print('optimized')
                if step_i % self.TRANSFER == 0:
                    self.copy_weights()
            self.shared['weights'] = self.net.state_dict()
            self.shared['SENT_FLAG'] = True
            #if i == 50:
            #    pretrain = False
            pretrain = False
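# The clustering loop in minibatch() above is a greedy vertex-cover-style
# pass over a boolean adjacency mask: repeatedly take the row with the
# largest degree as a new cluster, then remove that vertex and all of its
# neighbours. Below is a minimal standalone sketch on a toy 4x4 mask (the
# mask values are made up for illustration, not taken from a replay buffer).
import torch

Mask = torch.tensor([[1, 1, 0, 0],
                     [1, 1, 0, 0],
                     [0, 0, 1, 0],
                     [0, 0, 0, 1]], dtype=torch.uint8)
Cluster = []
while True:
    Number = Mask.sum(dim=1)      # degree of each remaining vertex
    value, indx = Number.max(dim=0)
    if value.item() == 0:         # every vertex is covered
        break
    i = indx.item()
    v = Mask[i:i + 1]             # 1 x N row: the new cluster
    Cluster.append(v)
    # drop the rows and columns of all vertices in this cluster
    Delete = (v.expand_as(Mask) | v.t().expand_as(Mask)) ^ 1
    Mask = Mask & Delete
print(torch.cat(Cluster))         # one row per cluster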
class Evaluator(multiprocessing.Process):
    def __init__(self, memory, shared, semaphore):
        multiprocessing.Process.__init__(self)
        # hyperparameters
        self.TRAIN_MAX = 1
        self.TRANSFER = 1
        self.BATCH_SIZE = 32
        self.GAMMA = 0.95
        self.SAMPLE_ALPHA = 0.5
        self.SAMPLE_EPISLON = 0.
        self.SAMPLE_BETA = 0.
        #LEARNING_RATE = 0.00025
        LEARNING_RATE = 1e-3
        MOMENTUM = 0.95
        SQUARED_MOMENTUM = 0.95
        MIN_SQUARED_GRAD = 0.01
        self.net = DQN()  # Deep Net
        self.targetNet = DQN()
        self.copy_weights()
        self.net.setOptimizer(
            optim.Adam(self.net.parameters(), lr=LEARNING_RATE))
        #self.net.setOptimizer(optim.RMSprop(self.net.parameters(), lr=LEARNING_RATE,
        #                                    momentum=MOMENTUM, alpha=SQUARED_MOMENTUM,
        #                                    eps=MIN_SQUARED_GRAD))
        self.memory = memory
        self.shared = shared  # shared resources, {'memory', 'SENT_FLAG'}
        self.semaphore = semaphore

    def minibatch(self, exp_replay, pretrain=False):
        batch = random.sample(list(exp_replay), self.BATCH_SIZE)
        unzipped = list(zip(*batch))
        state_batch = Variable(torch.from_numpy(np.array(
            unzipped[0])).type(FloatTensor), volatile=True)
        # previously there was no type conversion here, which caused an error
        action_batch = Variable(torch.from_numpy(np.array(
            unzipped[1])).type(LongTensor), volatile=True)
        reward_batch = Variable(torch.from_numpy(np.array(
            unzipped[2])).type(FloatTensor), volatile=True)
        target_batch = None
        if pretrain:
            # only use reward
            target_batch = reward_batch
        else:
            term_batch = Variable(torch.from_numpy(np.array(
                unzipped[4])).type(FloatTensor), volatile=True)
            next_state_batch = Variable(torch.from_numpy(np.array(
                unzipped[3])).type(FloatTensor), volatile=True)
            # previously there was no type conversion here, which caused an error
            next_state_values = self.targetNet(next_state_batch).max(
                1)[0].unsqueeze(1)
            next_state_values = term_batch * next_state_values
            next_state_values.volatile = False
            target_batch = reward_batch + (self.GAMMA * next_state_values)
        state_batch.volatile = False
        state_batch.requires_grad = True
        action_batch.volatile = False
        target_batch.volatile = False
        return state_batch, action_batch, target_batch

    def copy_weights(self):
        self.targetNet.load_state_dict(self.net.state_dict())

    def run(self):
        # keep two nets: Q-net and target-net
        # keep looping:
        #   0. loop until SENT_FLAG is cleared
        #
        #   1. loop for a fixed # of steps:
        #        minibatch, and get the target value for the batch
        #        optimize the net parameters by this batch
        #        every TRANSFER steps, copy weights from Q-net to target-net
        #
        #   2. copy weights from Q-net to the shared weights
        #      set SENT_FLAG to true
        # TODO: pretrain in the first loop
        os.system("taskset -p 0xff %d" % os.getpid())
        pretrain = True
        i = 0
        while True:
            while self.shared['SENT_FLAG']:  # loop until it is set to 0
                print('sleeping...')
                time.sleep(1.)
            for step_i in range(1, self.TRAIN_MAX + 1):
                memory = self.memory
                if len(memory) < 1000:
                    continue
                i += 1
                print('training... {}'.format(i))
                batch_tuple = self.minibatch(memory, pretrain)
                loss = self.net.optimize(batch_tuple)
                #print('loss: {}' . format(loss))
                #print('optimized')
                if step_i % self.TRANSFER == 0:
                    self.copy_weights()
            self.shared['weights'] = self.net.state_dict()
            self.shared['SENT_FLAG'] = True
            pretrain = False
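# Evaluator.run() above hands weights to the acting process through the
# manager dict: it trains only while shared['SENT_FLAG'] is cleared, then
# publishes net.state_dict() under shared['weights'] and sets the flag
# again. Below is a minimal standalone sketch of that handshake, with a
# toy consumer() function standing in for the project's Improver (here the
# flag starts cleared and plain integers stand in for the state dict).
import time
import multiprocessing
from multiprocessing.managers import SyncManager


def consumer(shared):
    for _ in range(3):
        while not shared['SENT_FLAG']:
            time.sleep(0.1)            # wait until fresh weights are published
        print('consumer got:', shared['weights'])
        shared['SENT_FLAG'] = False    # let the trainer produce again


if __name__ == '__main__':
    manager = SyncManager()
    manager.start()
    shared = manager.dict({'SENT_FLAG': False, 'weights': None})
    p = multiprocessing.Process(target=consumer, args=(shared,))
    p.start()
    for step in range(3):              # stand-in for the training loop
        while shared['SENT_FLAG']:     # wait until the last update was consumed
            time.sleep(0.1)
        shared['weights'] = step       # stand-in for self.net.state_dict()
        shared['SENT_FLAG'] = True
    p.join()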
import os
import time
import multiprocessing
from multiprocessing.managers import SyncManager

# DQN, ReplayMemory, Improver, Evaluator and myGym are assumed to be
# imported from the project's own modules (imports not shown in this snippet).


def wait(T):
    # count down T seconds, printing the remaining time
    for i in list(range(T))[::-1]:
        print(i + 1)
        time.sleep(1)


if __name__ == '__main__':
    # hyperparameters
    os.system("taskset -p 0xff %d" % os.getpid())
    # https://stackoverflow.com/questions/15639779/why-does-multiprocessing-use-only-a-single-core-after-i-import-numpy
    MEMORY_SIZE = 5000
    imp_net = DQN()

    # populate memory
    # let improver populate first
    manager = SyncManager()
    manager.start()
    memory = ReplayMemory(MEMORY_SIZE)
    s = multiprocessing.Semaphore(1)
    #memory = multiprocessing.Queue(MEMORY_SIZE)
    memory = manager.list()
    shared = manager.dict({'SENT_FLAG': True, 'weights': None})
    #shared = manager.dict({'memory':memory, 'SENT_FLAG':True, 'weights':None})
    #improver = Improver(imp_net, shared, myGym(), s)
    improver = Improver(imp_net, MEMORY_SIZE, memory, shared, myGym(), s)
    # improver is executed by the main process
    evaluator = Evaluator(memory, shared, s)
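# Presumed wiring for the two workers above (a sketch, not part of the
# original script): Evaluator subclasses multiprocessing.Process, so it
# would be launched with start() while the Improver keeps the main process
# busy; Improver.run() is assumed here by analogy with Evaluator.run().
#
#     evaluator.start()   # runs Evaluator.run() in a child process
#     improver.run()      # main process keeps collecting experience
#     evaluator.join()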