def expert_policy(idx, n_samples, args):
    data = []
    my_simulator = SIMULATOR()
    progress = tqdm(range(n_samples), position=idx, desc='worker_{:02}'.format(idx))

    while len(data) < n_samples:
        state = my_simulator.reset()
        root = None

        for e in range(50):
            action, root = MCTS.search(state, args, root=root)
            data.append((state, action))

            state, reward, terminal = my_simulator.step(action)
            root = root.children[action]

            if terminal:
                break

            progress.update(1)

    if not os.path.exists(args.dir):
        os.makedirs(args.dir)

    with open('{}/{:02}.data'.format(args.dir, idx), 'wb') as file:
        pickle.dump(data, file)
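# --- Hedged sketch (not part of the original source): one way the expert_policy
# workers above could be launched in parallel and their pickled shards merged.
# The helper names launch_expert_workers and load_expert_dataset are illustrative
# assumptions, not functions defined elsewhere in this repository.
import multiprocessing as mp
import os
import pickle


def launch_expert_workers(n_workers, samples_per_worker, args):
    # spawn one expert_policy process per worker index
    workers = [mp.Process(target=expert_policy, args=(idx, samples_per_worker, args))
               for idx in range(n_workers)]
    for w in workers:
        w.start()
    for w in workers:
        w.join()


def load_expert_dataset(args):
    # concatenate the per-worker '<idx>.data' pickle shards into one list of (state, action) pairs
    data = []
    for name in sorted(os.listdir(args.dir)):
        if name.endswith('.data'):
            with open(os.path.join(args.dir, name), 'rb') as f:
                data.extend(pickle.load(f))
    return data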
def collector(idx, shared_model, shared_dataset, hyperparameters, lock):
    try:
        writer = SummaryWriter('runs/{}/collector:{:02}'.format(
            datetime.now().strftime("%d|%m_%H|%M"), idx))
        logging.basicConfig(filename='logs/collector:{:02}.log'.format(idx),
                            filemode='w', format='%(message)s', level=logging.DEBUG)

        # allocate a device
        n_gpu = t.cuda.device_count()
        if n_gpu > 0:
            Device.set_device(idx % n_gpu)

        local_model = deepcopy(shared_model)
        local_model.to(Device.get_device())
        local_model.eval()

        simulator = SIMULATOR()

        for itr in tqdm(count(), position=idx, desc='collector:{:02}'.format(idx)):
            local_model.load_state_dict(shared_model.state_dict())

            state = simulator.reset()
            episode_reward = 0

            for i in range(50):
                # Find the expert action for the input belief
                expert_action, _ = expert(state, hyperparameters)

                lock.acquire()
                shared_dataset.append((state, expert_action))
                lock.release()

                # Simulate the learner's action
                action, _ = local_model.search(state, hyperparameters)
                state, reward, terminal = simulator.step(action)
                episode_reward += reward

                if terminal:
                    break

            logging.debug('Episode reward: {:.2f}'.format(episode_reward))
            writer.add_scalar('episode_reward', episode_reward, itr)

        writer.close()
    except KeyboardInterrupt:
        print('exiting collector:{:02}'.format(idx))
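# --- Hedged sketch (assumption, not from the original source): how the collector
# processes above might be spawned with torch.multiprocessing, sharing the model
# parameters, a manager-backed dataset list and a lock between workers.
# launch_collectors is an illustrative helper, not a function defined in this repository.
import torch.multiprocessing as t_mp


def launch_collectors(n_collectors, shared_model, hyperparameters):
    shared_model.share_memory()          # make the parameters visible to all workers
    manager = t_mp.Manager()
    shared_dataset = manager.list()      # aggregated (state, expert_action) pairs
    lock = manager.Lock()

    workers = [t_mp.Process(target=collector,
                            args=(idx, shared_model, shared_dataset, hyperparameters, lock))
               for idx in range(n_collectors)]
    for w in workers:
        w.start()
    return workers, shared_dataset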
def search(state, args, root=None):
    if root is None:
        root = Node(state)

    for i in range(args.n_simulations):
        node = root
        path = []

        # run one simulation: descend the tree and add a new child at the frontier
        terminal = False
        while not terminal:
            # choose which branch to explore/exploit with the UCB1 rule
            Q = node.Q + MCTS.c * np.sqrt(np.log(node.N) / node.N_a)
            action = int(np.argmax(Q))

            # simulate with the action to get the next state
            state, reward, terminal = SIMULATOR.simulate(node.state, action)
            path.append((node, action, reward))

            if node.children.get(action) is None:
                # the branch is unexplored: add a new child and stop this simulation
                node.children[action] = Node(state, terminal)
                break
            else:
                # the child exists: keep traversing
                node = node.children[action]

        # backup values through the path to the root
        for node, action, reward in reversed(path):
            node.N += 1
            node.N_a[action] += 1
            node.Q[action] += (reward + np.max(node.children[action].Q)
                               - node.Q[action]) / node.N_a[action]

    return int(np.argmax(root.Q)), root
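# --- Hedged sketch (illustration only): the selection rule used above is UCB1,
#     score(a) = Q[a] + c * sqrt(ln(N) / N_a[a]),
# where N is the node's total visit count and N_a[a] the per-action visit count.
# Initialising N to n_actions and N_a to ones (see the Node class below) keeps
# both the logarithm and the division well defined on the first selection.
import numpy as np


def ucb1_scores(Q, N, N_a, c):
    # exploitation term Q plus an exploration bonus that shrinks as N_a grows
    return Q + c * np.sqrt(np.log(N) / N_a)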
def search(self, state, args):
    root = self.new_node(state)

    predictions = [self.f_readout(root.tensors.memory)]
    logits = []
    actions = []

    for i in range(args.n_simulations):
        node = root
        path = []

        logits_m = []
        actions_m = []

        # run one simulation: descend the tree and add a new child at the frontier
        terminal = False
        while not terminal:
            # choose which branch to explore/exploit based on the node memory
            p_actions = self.f_policy(node)
            action = Categorical(logits=p_actions).sample().item()

            # store the logits and action for the policy-gradient term
            if args.training:
                logits_m.append(p_actions)
                actions_m.append(action)

            # simulate with the action to get the next state
            next_state, reward, terminal = SIMULATOR.simulate(
                node.variables.state, action)
            path.append(Path(node, action, reward))

            if node.variables.children.get(action) is None:
                # the branch is unexplored: create a new child for it and stop
                node.variables.children[action] = self.new_node(next_state)
                break
            else:
                # the child exists: traverse to it
                node = node.variables.children[action]

        # backup memories through the path to the root
        for node, action, reward in reversed(path):
            node.tensors.memory = self.f_backup(
                *prepare_input_for_f_backup(node, action, reward))
            node.tensors.children[action] = node.variables.children[action].tensors.memory

        # store the root readout after the m-th simulation
        predictions.append(self.f_readout(root.tensors.memory))

        # store the logits and actions taken during the m-th simulation
        logits.append(logits_m)
        actions.append(actions_m)

    return Categorical(logits=predictions[-1]).sample().item(), (predictions, logits, actions)
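# --- Hedged sketch (assumption, not the repository's training code): one way the
# (predictions, logits, actions) returned above could be turned into a loss.
# Each per-simulation readout is scored against the expert action with
# cross-entropy, and the in-tree policy logits receive a REINFORCE-style term
# whose reward is the decrease in readout loss, in the spirit of MCTSnet training.
# imitation_loss is an illustrative name, not a function defined in this repository.
import torch as t
import torch.nn.functional as F


def imitation_loss(predictions, logits, actions, expert_action):
    target = t.tensor([expert_action], device=predictions[0].device)

    # cross-entropy of every readout (after 0..M simulations) against the expert action
    ce = t.stack([F.cross_entropy(p.view(1, -1), target) for p in predictions])
    readout_loss = ce.mean()

    # REINFORCE term for the tree policy: reward each simulation by the loss drop it caused
    r = (ce[:-1] - ce[1:]).detach()
    pg_terms = []
    for m, (logits_m, actions_m) in enumerate(zip(logits, actions)):
        for p_a, a in zip(logits_m, actions_m):
            log_p = F.log_softmax(p_a.view(-1), dim=0)[a]
            pg_terms.append(-r[m] * log_p)
    policy_loss = t.stack(pg_terms).mean() if pg_terms else t.zeros((), device=target.device)

    return readout_loss + policy_loss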
def __init__(self):
    super().__init__()
    channels, _, _ = SIMULATOR.tensor_shape()

    self.memory = nn.Sequential(nn.Conv2d(channels, 64, 1, 1),
                                ResidualConv(64),
                                ResidualConv(64),
                                nn.Conv2d(64, 128, 1, 1),
                                nn.AdaptiveMaxPool2d((1, 1)),
                                nn.Flatten(),
                                nn.Linear(128, d_memory))
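# --- Hedged sketch (assumption): ResidualConv is not defined in this excerpt.
# A typical residual convolution block compatible with the nn.Sequential above
# (same channel count in and out, spatial size preserved) might look like this.
import torch.nn as nn


class ResidualConv(nn.Module):
    def __init__(self, channels):
        super().__init__()
        self.block = nn.Sequential(
            nn.Conv2d(channels, channels, 3, 1, padding=1),
            nn.ReLU(),
            nn.Conv2d(channels, channels, 3, 1, padding=1),
        )
        self.activation = nn.ReLU()

    def forward(self, x):
        # residual connection: add the block's output back onto its input
        return self.activation(self.block(x) + x)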
def state_to_tensor(self, state):
    # cache tensors keyed by the state's string representation, so repeated
    # visits to the same state do not rebuild and re-upload the tensor
    key = str(state)
    tensor = self.tensor_cache.get(key)

    if tensor is None:
        tensor = SIMULATOR.state_to_tensor(state).to(Device.get_device())
        self.tensor_cache[key] = tensor

    return tensor
def __init__(self, state, terminal=False):
    self.state = state
    self.children = {}

    # visit counts start at n_actions (total) and one per action, so the UCB
    # term log(N) / N_a is well defined on the first selection
    self.N = SIMULATOR.n_actions
    self.N_a = np.ones(SIMULATOR.n_actions)

    # seed the action values with one rollout estimate per action
    self.Q = np.zeros(SIMULATOR.n_actions)
    if not terminal:
        for i in range(SIMULATOR.n_actions):
            self.Q[i] = SIMULATOR.rollout(state, i)

    self.terminal = terminal
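# --- Hedged sketch (assumption): SIMULATOR.rollout is not shown in this excerpt.
# A common implementation takes one forced first action and then plays uniformly
# random actions up to a depth limit, returning the (optionally discounted) return,
# e.g. random_rollout(SIMULATOR.simulate, SIMULATOR.n_actions, state, i).
import numpy as np


def random_rollout(simulate, n_actions, state, first_action, depth=20, gamma=1.0):
    total, discount = 0.0, 1.0
    action = first_action
    for _ in range(depth):
        # simulate has the same signature as SIMULATOR.simulate(state, action)
        state, reward, terminal = simulate(state, action)
        total += discount * reward
        discount *= gamma
        if terminal:
            break
        action = np.random.randint(n_actions)
    return total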
def performer(solver, args, render=False):
    my_simulator = SIMULATOR()
    state = my_simulator.reset()
    episode_reward = 0

    if render:
        my_simulator.render()

    for i in range(MAX_EPISODE_LENGTH):
        action, _ = solver.search(state, args)
        state, reward, terminal = my_simulator.step(action)

        if render:
            print(SIMULATOR.ACTIONS[action], reward)
            my_simulator.render()

        episode_reward += reward

        if terminal:
            break

    return episode_reward
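# --- Hedged sketch (illustration only): averaging performer() over several
# episodes to estimate a solver's mean episode reward. evaluate is an
# illustrative helper, not a function defined in this repository.
import numpy as np


def evaluate(solver, args, n_episodes=20):
    rewards = [performer(solver, args) for _ in range(n_episodes)]
    return float(np.mean(rewards)), float(np.std(rewards))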
def run_exper(model, steps, get_features, pre_proc_features):
    from environment import SIMULATOR

    # initializing our environment
    my_sim = SIMULATOR()

    # beginning of an episode
    state_temp = my_sim.reset()
    observation = my_sim.state_to_tensor(state_temp)

    r_tup, e_tup, rover_poss = [], [], []

    # main loop
    prev_input = None
    total_moves = 0
    MAX_MOVES = 25

    for i in range(steps):
        total_moves += 1
        start = time.perf_counter()

        # preprocess the observation: strip the border cells and locate the rover
        cur_input = observation
        x = cur_input.astype(float).ravel() if prev_input is not None else np.zeros(70)
        x = x[10:80] if prev_input is not None else x
        x = np.array([x[i] for i in range(len(x)) if not (i % 10 == 0)])
        x = np.array([x[i] for i in range(len(x)) if not ((i - 8) % 9 == 0)])

        x, rov_pos = get_rover_pos(x, r_tup, e_tup, rover_poss)
        x = np.array(x)
        rover_poss.append(rov_pos)

        x_t = pre_proc_features.fit_transform(x.reshape(-1, 1))
        x_t = x_t.reshape(1, INPUT_SIZE)[0]
        print("Shape = ", x_t.shape)
        prev_input = cur_input

        # forward the policy network and pick the most probable action
        proba = model.predict(np.expand_dims(x_t, axis=1).T)
        end = time.perf_counter()
        action = proba[0].argmax()
        print("Time taken = ", end - start)

        # run one step
        state_temp, reward, done, r_tup, e_tup = my_sim.step(action)
        observation = my_sim.state_to_tensor(state_temp)
        my_sim.render()
        time.sleep(1)

        if total_moves == MAX_MOVES:
            total_moves = 0
            done = True

        # if the episode is over, reset to the beginning
        if done:
            state_temp = my_sim.reset()
            observation = my_sim.state_to_tensor(state_temp)
            my_sim.render()
            rover_poss = []
def run_exper(model, steps, get_features, pre_proc_features):
    r_tup, e_tup = [], []
    rover_poss = []
    total_stats = {'total': 0, 'good': 0}

    from environment import SIMULATOR

    # initializing our environment
    my_sim = SIMULATOR()

    # beginning of an episode
    state_temp = my_sim.reset()
    observation = my_sim.state_to_tensor(state_temp)
    state_obs = observation
    total_moves = 0

    # main loop
    prev_input = None
    for i in range(steps):
        # preprocess the observation: strip the border cells and locate the rover
        cur_input = observation
        x = cur_input.astype(float).ravel() if prev_input is not None else np.zeros(70)
        x = x[10:80] if prev_input is not None else x
        x = np.array([x[i] for i in range(len(x)) if not (i % 10 == 0)])
        x = np.array([x[i] for i in range(len(x)) if not ((i - 8) % 9 == 0)])
        prev_input = cur_input

        x, rover_pos = get_rover_pos(x, r_tup, e_tup, rover_poss)
        rover_poss.append(rover_pos)
        x = np.array(x)

        x_t = pre_proc_features.fit_transform(x.reshape(-1, 1))
        x_t = x_t.reshape(1, INPUT_SIZE)[0]

        # forward the policy network and pick the most probable action
        proba = model.predict(np.expand_dims(x_t, axis=1).T)
        action = proba.argmax()

        # run one step
        state_temp, reward, done, r_tup, e_tup = my_sim.step(action)
        observation = my_sim.state_to_tensor(state_temp)

        total_moves += 1
        if total_moves == MAX_STEPS:
            done = True
            total_moves = 0

        # if the episode is over, record whether it was successful and reset
        if done:
            total_stats['total'] += 1
            so = np.asarray(state_obs).ravel().tolist()
            o = np.asarray(observation).ravel().tolist()

            try:
                index_obs = so.index(7.0)
            except ValueError:
                index_obs = -1
            try:
                index_curr = o.index(7.0)
            except ValueError:
                index_curr = -1

            # the cell encoded as 7.0 was present in the initial observation but is gone now
            if index_obs != -1 and index_curr == -1:
                total_stats['good'] += 1

            state_temp = my_sim.reset()
            observation = my_sim.state_to_tensor(state_temp)
            state_obs = observation
            rover_poss = []

    return total_stats
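# --- Hedged sketch (assumption): get_model and pre_proc_features are defined
# elsewhere in the repository. The calls above (model.predict returning
# per-action probabilities, pre_proc_features.fit_transform on a column vector)
# are consistent with, for example, a Keras classifier and a scikit-learn
# scaler, roughly as sketched here; build_policy_model is illustrative only.
from sklearn.preprocessing import MinMaxScaler
from tensorflow import keras


def build_policy_model(input_size, n_actions):
    model = keras.Sequential([
        keras.layers.Dense(64, activation='relu', input_shape=(input_size,)),
        keras.layers.Dense(n_actions, activation='softmax'),
    ])
    model.compile(optimizer='adam', loss='categorical_crossentropy')
    return model


# a column-wise scaler with an interface matching pre_proc_features.fit_transform
pre_proc_features = MinMaxScaler()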
import argparse

import numpy as np
import gym

from environment import SIMULATOR

parser = argparse.ArgumentParser()
parser.add_argument('--data_type', default='sparse', type=str,
                    help='Choose between encoded or sparse')
args = parser.parse_args()
data_type = args.data_type

model = get_model(data_type)

# environment initialization
my_sim = SIMULATOR()
state_temp = my_sim.reset()
observation = my_sim.state_to_tensor(state_temp)
prev_input = None

# hyperparameter used to compute discounted rewards
gamma = 0.99

# initialization of variables used in the main loop
x_train, y_train, y_pred, rewards, r_tup, e_tup, rover_poss = [], [], [], [], [], [], []
reward_sum = 0
episode_nb = 0
resume = True
running_reward = None
EPOCHS_BEFORE_SAVING = 50
moves_count = 0
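# --- Hedged sketch (illustration only): gamma above is declared for discounting
# rewards; a standard discounted-return computation over the per-step rewards
# collected in the `rewards` list might look like this.
def discount_rewards(rewards, gamma=0.99):
    # walk backwards so each entry accumulates gamma-weighted future rewards
    discounted = np.zeros(len(rewards))
    running = 0.0
    for i in reversed(range(len(rewards))):
        running = rewards[i] + gamma * running
        discounted[i] = running
    return discounted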