def prepare_batch(self, target_network, q_network):
    batch_size = min(self.length, self.args.batch_size)
    sample = self.memory.sample(batch_size)

    s = t.tensor(sample['obs'])
    a = t.tensor(sample['act'])
    r = t.tensor(sample['rew'])
    ns = t.tensor(sample['next_obs'])
    term = t.tensor(sample['terminal'])

    # Move the batch to the training device; observations are stored HWC and
    # permuted to CHW for the convolutional networks.
    states = s.permute(0, 3, 1, 2).to(Device.get_device())
    actions = a.type(t.int64).to(Device.get_device())
    rewards = r.to(Device.get_device())
    next_states = ns.permute(0, 3, 1, 2).to(Device.get_device())
    terminals = term.to(Device.get_device())

    indexes = sample["indexes"]

    # Recompute TD errors to refresh the priorities of the sampled transitions.
    # Note: 'terminals' multiplies the bootstrap term, so it is assumed to act as
    # a continuation mask (non-zero while the episode continues).
    with t.no_grad():
        target = rewards + terminals * self.args.gamma * target_network(next_states).max()
        predicted = q_network(states).gather(1, actions)

    new_priorities = f.smooth_l1_loss(predicted, target, reduction='none').cpu().numpy()
    new_priorities[new_priorities < 1] = 1
    self.memory.update_priorities(indexes, new_priorities)

    return states, actions, rewards, next_states, terminals
def getDeviceForDynAnalysis():
    dev_list = Device.get_devices_list()
    devNum = len(dev_list)

    if devNum <= 0:
        logger.error("No device has been detected! Connect your device and restart the application!")
        return None

    if devNum == 1:
        return Device.get_device(dev_list[0])

    choice = None
    if devNum > 1:
        print "Select the device to use for analysis:\n"
        for i in xrange(0, devNum):
            print "%d. %s\n" % ((i + 1), dev_list[i])
        while not choice:
            try:
                choice = int(raw_input())
                if choice not in range(1, devNum + 1):
                    choice = None
                    print 'Invalid choice! Choose right number!'
            except ValueError:
                print 'Invalid Number! Choose right number!'
    return Device.get_device(dev_list[choice - 1])
def prepare_input_for_f_backup(node, action, reward):
    memory = node.tensors.memory
    child_memory = node.variables.children[action].tensors.memory
    action = to_one_hot(action, SIMULATOR.n_actions).to(Device.get_device())
    reward = t.tensor([reward]).float().to(Device.get_device())
    return memory, child_memory, action, reward
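The snippet above calls a to_one_hot helper that is not included in this collection. A minimal sketch of what it is assumed to do (encode a scalar action index as a float one-hot vector of length SIMULATOR.n_actions); the real helper may differ:

import torch as t

def to_one_hot(index, n_actions):
    # Hypothetical helper: float vector of zeros with a 1 at the given action index.
    vec = t.zeros(n_actions, dtype=t.float32)
    vec[index] = 1.0
    return vec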
def performer(idx, model, SIMULATOR):
    # allocate a device
    n_gpu = t.cuda.device_count()
    if n_gpu > 0:
        Device.set_device(idx % n_gpu)

    q_network = deepcopy(model)
    q_network.to(Device.get_device())
    q_network.eval()

    simulator = SIMULATOR()

    state = simulator.reset()
    episode_reward = 0
    terminal = False

    while not terminal:
        action = q_network(as_tensor(state)).argmax().item()
        next_state, reward, terminal = simulator.step(action)
        episode_reward += reward
        state = next_state

    return episode_reward
def state_to_tensor(self, state):
    key = str(state)
    tensor = self.tensor_cache.get(key)
    if tensor is None:
        tensor = SIMULATOR.state_to_tensor(state).to(Device.get_device())
        self.tensor_cache[key] = tensor
    return tensor
def calculate_loss(training_data, action, args):
    # find the predictions, embeddings and sampled actions
    predictions, logits, actions = training_data

    # duplicate action len(predictions) times to get loss after each simulation
    action = t.tensor([action] * len(predictions)).long().to(Device.get_device())
    predictions = t.stack(predictions)

    # Compute cross entropy loss to train differentiable parts
    loss = f.cross_entropy(predictions[-1].unsqueeze(0), action[-1].unsqueeze(0))
    loss += args.beta * t.sum(t.softmax(predictions[-1], dim=0) * t.log_softmax(predictions[-1], dim=0))

    # Compute decrease in loss after each simulation
    l_m = f.cross_entropy(predictions, action, reduction='none').clone().detach()
    r_m = l_m[:-1] - l_m[1:]

    # compute geometric sum for difference in loss
    for i in reversed(range(0, len(r_m) - 1)):
        r_m[i] = r_m[i] + args.gamma * r_m[i + 1]

    # calculate loss for tree search actions
    for l_m, logits_m, action_m in zip(r_m, logits, actions):
        action_m = t.tensor(action_m).long().to(Device.get_device())

        # find logits
        logits_m = t.stack(logits_m)

        # find negative likelihood to minimise
        negative_log_likelihood = f.cross_entropy(logits_m, action_m, reduction='sum') * l_m

        # add it to loss
        loss += negative_log_likelihood

    return loss
def collector(idx, shared_model, shared_dataset, hyperparameters, lock):
    try:
        writer = SummaryWriter('runs/{}/collector:{:02}'.format(datetime.now().strftime("%d|%m_%H|%M"), idx))
        logging.basicConfig(filename='logs/collector:{:02}.log'.format(idx),
                            filemode='w',
                            format='%(message)s',
                            level=logging.DEBUG)

        # allocate a device
        n_gpu = t.cuda.device_count()
        if n_gpu > 0:
            Device.set_device(idx % n_gpu)

        local_model = deepcopy(shared_model)
        local_model.to(Device.get_device())
        local_model.eval()

        simulator = SIMULATOR()

        for itr in tqdm(count(), position=idx, desc='collector:{:02}'.format(idx)):
            local_model.load_state_dict(shared_model.state_dict())

            state = simulator.reset()
            episode_reward = 0

            for i in range(50):
                # Find the expert action for input belief
                expert_action, _ = expert(state, hyperparameters)

                lock.acquire()
                shared_dataset.append((state, expert_action))
                lock.release()

                # Simulate the learner's action
                action, _ = local_model.search(state, hyperparameters)
                state, reward, terminal = simulator.step(action)
                episode_reward += reward

                if terminal:
                    break

            logging.debug('Episode reward: {:.2f}'.format(episode_reward))
            writer.add_scalar('episode_reward', episode_reward, itr)

        writer.close()

    except KeyboardInterrupt:
        print('exiting collector:{:02}'.format(idx))
def perform(self, args):
    # allocate a device
    n_gpu = t.cuda.device_count()
    if n_gpu > 0:
        Device.set_device(1)

    q_network = deepcopy(self.model)
    q_network.to(Device.get_device())
    q_network.eval()

    num_reached = 0

    for n in range(args.n_tests):
        state = self.simulator.reset()
        state_processed = np.concatenate((state.front_rgb, state.wrist_rgb), axis=2)
        episode_reward = 0
        terminal = False

        for i in range(800):
            # Epsilon-greedy action selection with a fixed 10% exploration rate
            if np.random.RandomState().rand() < 0.1:
                action = np.random.RandomState().randint(self.simulator.n_actions())
            else:
                action = q_network(as_tensor(state_processed)).argmax().item()

            next_state, reward, terminal = self.simulator.step(action, state)
            episode_reward += reward

            state_processed = np.concatenate((next_state.front_rgb, next_state.wrist_rgb), axis=2)
            state = next_state

            if terminal:
                print("\nTrial {} reached the goal!".format(n + 1))
                num_reached += 1
                break

        print("\nEpisode reward: {}".format(episode_reward))

    print("\n\nSuccess rate: {}/{}".format(num_reached, args.n_tests))
def worker(idx, solver, args):
    n_gpu = t.cuda.device_count()
    if n_gpu > 0:
        Device.set_device(idx % n_gpu)

    solver.to(Device.get_device())

    rewards = []
    with t.no_grad():
        for _ in tqdm(range(args.n_samples),
                      position=idx,
                      desc='worker_{:02}'.format(idx),
                      file=sys.stdout):
            rewards.append(performer(solver, args))

    return rewards
def main():
    dev = Device.get_device("303195BA0D4D00EC")
    messages = Queue.Queue()

    seccon_producer = SecconMessageProducer(messages, dev)
    seccon_consumer = SecconMessageProcessor(messages)
    seccon_producer.setDaemon(False)
    seccon_consumer.setDaemon(False)

    seccon_producer.start()
    seccon_consumer.start()

    time.sleep(60)
    print "Time is finished!!!"

    seccon_producer.stopThread()
    seccon_consumer.stopThread()
    seccon_producer.join()
    seccon_consumer.join()
def optimiser(idx, shared_model, shared_dataset, hyperparameters, lock):
    try:
        writer = SummaryWriter('runs/{}/optimiser:{:02}'.format(datetime.now().strftime("%d|%m_%H|%M"), idx))
        logging.basicConfig(filename='logs/optimiser:{:02}.log'.format(idx),
                            filemode='w',
                            format='%(message)s',
                            level=logging.DEBUG)

        optimiser = t.optim.SGD(params=shared_model.parameters(), lr=hyperparameters.lr)

        # allocate a device
        n_gpu = t.cuda.device_count()
        if n_gpu > 0:
            Device.set_device(idx % n_gpu)

        local_model = deepcopy(shared_model)
        local_model.to(Device.get_device())
        local_model.train()

        for itr in tqdm(count(), position=idx, desc='optimiser:{:02}'.format(idx)):
            # Sync local model with shared model
            if itr % hyperparameters.sync_frequency == 0:
                local_model.load_state_dict(shared_model.state_dict())

            # Sample a data point from dataset
            state, expert_action = choice(shared_dataset)

            # Find the predicted action
            action, training_info = local_model.search(state, hyperparameters)

            # Optimise for the sample
            loss = calculate_loss(training_info, expert_action, hyperparameters)
            optimise_model(shared_model, local_model, loss, optimiser, lock)

            # Log the results
            logging.debug('Sample loss: {:.2f}'.format(loss.item()))
            writer.add_scalar('loss/sample_loss', loss.item(), itr)

        writer.close()

    except KeyboardInterrupt:
        print('exiting optimiser:{:02}'.format(idx))
def optimiser(idx, shared_model, SIMULATOR, args, lock):
    try:
        writer = SummaryWriter('runs/{}/optimiser:{:02}'.format(datetime.now().strftime("%d|%m_%H|%M"), idx))
        logging.basicConfig(filename='logs/optimiser:{:02}.log'.format(idx),
                            filemode='w',
                            format='%(message)s',
                            level=logging.DEBUG)

        sgd = t.optim.SGD(params=shared_model.parameters(), lr=args.lr)

        # allocate a device
        n_gpu = t.cuda.device_count()
        if n_gpu > 0:
            Device.set_device(idx % n_gpu)

        q_network = deepcopy(shared_model)
        q_network.to(Device.get_device())
        q_network.train()

        target_network = deepcopy(q_network)
        target_network.to(Device.get_device())
        target_network.eval()

        buffer = deque(maxlen=args.buffer_size)
        simulator = SIMULATOR()

        for itr in tqdm(count(), position=idx, desc='optimiser:{:02}'.format(idx)):
            state = simulator.reset()
            episode_reward = 0

            for e in count():
                # Epsilon-greedy exploration with an epsilon that decays per episode
                if np.random.RandomState().rand() < max(args.eps ** itr, args.min_eps):
                    action = np.random.RandomState().randint(simulator.n_actions())
                else:
                    action = q_network(as_tensor(state)).argmax().item()

                next_state, reward, terminal = simulator.step(action)
                buffer.append(transition_to_tensor(state, action, reward, next_state, terminal))

                episode_reward += reward
                state = next_state

                # Sample a data point from dataset
                batch = prepare_batch(buffer, args.batch_size)

                # Sync local model with shared model
                q_network.load_state_dict(shared_model.state_dict())

                # Calculate loss for the batch
                loss = calculate_loss(q_network, target_network, batch, args)

                # Optimise for the batch
                loss = optimise_model(shared_model, q_network, loss, sgd, args, lock)

                # Log the results
                logging.debug('Batch loss: {:.2f}'.format(loss))
                writer.add_scalar('batch/loss', loss, e)

                if terminal:
                    break

            logging.debug('Episode reward: {:.2f}'.format(episode_reward))
            writer.add_scalar('episode_reward', episode_reward, itr)

            # Periodically refresh the target network from the online network
            if itr % args.target_update_frequency == 0:
                target_network.load_state_dict(q_network.state_dict())

        writer.close()

    except KeyboardInterrupt:
        print('exiting optimiser:{:02}'.format(idx))
def as_tensor(x, dtype=t.float32):
    return t.tensor(x, dtype=dtype, device=Device.get_device())
solvers = {'mctsnet': MCTSnet(), 'mcts': MCTS()}

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--solver', dest='solver', default='mctsnet', help='Solver to use')
    parser.add_argument('--load_model', dest='load_model', default='models/checkpoint.model', help='Path to load model file')
    parser.add_argument('--n_simulations', dest='n_simulations', default=10, type=int, help='Number of tree simulations')

    args = parser.parse_args()
    args.training = False

    if t.cuda.is_available():
        Device.set_device(0)

    model = solvers[args.solver]
    model.load(args.load_model)
    model.to(Device.get_device())

    with t.no_grad():
        print('Episode reward:', performer(model, args, render=True))
def optimise(idx, shared_model, queues, args, lock):
    try:
        writer = SummaryWriter('runs/o{}'.format(idx))
        logging.basicConfig(filename='logs/optimiser:{:02}.log'.format(idx),
                            filemode='w',
                            format='%(message)s',
                            level=logging.DEBUG)

        sgd = t.optim.Adam(params=shared_model.parameters(), lr=args.lr)

        # allocate a device
        n_gpu = t.cuda.device_count()
        if n_gpu > 0:
            Device.set_device(0)

        q_network = deepcopy(shared_model)
        q_network.to(Device.get_device())
        q_network.train()

        target_network = deepcopy(q_network)
        target_network.to(Device.get_device())
        target_network.eval()

        buffer = ReplayBuffer(args)

        for itr in tqdm(count(), position=idx, desc='optimiser:{:02}'.format(idx)):
            buffer.load_queues(queues, q_network, target_network, lock, args)

            # Keep draining the worker queues until the buffer holds enough warm-up transitions
            while len(buffer) < min(args.n_workers * args.episode_length * args.warmup,
                                    args.buffer_size / 2):
                buffer.load_queues(queues, q_network, target_network, lock, args)

            # Sample a data point from dataset
            batch = buffer.prepare_batch(target_network, q_network)

            # Sync local model with shared model
            q_network.load_state_dict(shared_model.state_dict())

            # Calculate loss for the batch
            loss = calculate_loss(q_network, target_network, batch, args, Device.get_device())

            # Optimise for the batch
            loss = optimise_model(shared_model, q_network, loss, sgd, args, lock)

            # Log the results
            logging.debug('Batch loss: {:.2f}, Buffer size: {}'.format(loss, len(buffer)))
            writer.add_scalar('Batch loss', loss, itr)

            if itr % args.target_update_frequency == 0:
                target_network.load_state_dict(q_network.state_dict())

        writer.close()

    except KeyboardInterrupt:
        print('exiting optimiser:{:02}'.format(idx))
def as_tensor(x, dtype=t.float32):
    return t.tensor(x, dtype=dtype, device=Device.get_device(), requires_grad=False)
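The PyTorch snippets above all rely on a shared Device helper (set_device / get_device) whose implementation is not included here; it is distinct from the Android device module used by getDeviceForDynAnalysis and main. A minimal sketch, assuming it is a small singleton wrapping torch.device selection (the actual class may differ):

import torch as t

class Device:
    # Hypothetical singleton; the real Device class used in the snippets above is not shown.
    _device = t.device('cuda:0' if t.cuda.is_available() else 'cpu')

    @classmethod
    def set_device(cls, idx):
        # Pin this process to a specific CUDA device by index.
        cls._device = t.device('cuda:{}'.format(idx))

    @classmethod
    def get_device(cls):
        # Return the currently selected torch.device.
        return cls._device

Under this assumption, as_tensor above would allocate its tensors directly on whichever device the worker or optimiser process selected.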