def make_subset_buffer(buffer_path, max_examples=100000, frame_height=40, frame_width=40):
    # keep max_examples < 100000 to enable knn search
    # states [top of image:bottom of image,:]
    # in breakout - can safely reduce size to be 80x80 of the given image
    # try to get an even number of each type of reward
    small_path = buffer_path.replace('.npz', '_%06d.npz' % max_examples)
    if os.path.exists(small_path):
        print('loading small buffer path')
        print(small_path)
        load_buffer = ReplayMemory(load_file=small_path)
    else:
        load_buffer = ReplayMemory(load_file=buffer_path)
        print('loading prescribed buffer path')
        print(buffer_path)
    # TODO if frame size is wrong - we arent handling
    if load_buffer.count > max_examples:
        print('creating small buffer')
        # actions for breakout:
        # ['NOOP', 'FIRE', 'RIGHT', 'LEFT']
        sbuffer = ReplayMemory(max_examples,
                               frame_height=frame_height,
                               frame_width=frame_width,
                               agent_history_length=load_buffer.agent_history_length)
        # remove ends because they are scary
        ends = np.where(load_buffer.terminal_flags == 1)[0][1:-1]
        random_state.shuffle(ends)
        for tidx in ends:
            if sbuffer.count >= max_examples:
                print('stopping after %s examples' % sbuffer.count)
                continue
            else:
                # start after the last terminal
                i = tidx + 1
                # while there isnt a new terminal flag
                while not load_buffer.terminal_flags[i + 1]:
                    frame = cv2.resize(load_buffer.frames[i][:, :, None],
                                       (frame_height, frame_width))
                    sbuffer.add_experience(action=load_buffer.actions[i],
                                           frame=frame,
                                           reward=load_buffer.rewards[i],
                                           terminal=load_buffer.terminal_flags[i])
                    i += 1
                    if not i % 100:
                        print(sbuffer.count)
        sbuffer.save_buffer(small_path)
        load_buffer = sbuffer
    assert load_buffer.count > 10
    return load_buffer, small_path
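# --- Usage sketch (not from the original source): the buffer path below is a
# hypothetical placeholder. make_subset_buffer() returns both the (possibly
# reduced) ReplayMemory and the path of the subset file it loads or writes.
train_buffer, train_subset_path = make_subset_buffer('train_buffer.npz', max_examples=50000)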
def create_empty_memory_buffer(self, seed, buffer_size):
    return ReplayMemory(
        size=buffer_size,
        frame_height=self.frame_height,
        frame_width=self.frame_width,
        agent_history_length=self.history_length,
        batch_size=self.cfg['DQN']['batch_size'],
        num_heads=self.cfg['DQN']['n_ensemble'],
        bernoulli_probability=self.cfg['DQN']['bernoulli_probability'],
        seed=seed,
        use_pred_frames=self.cfg['DQN']['use_pred_frames'],
        # details needed for online max pooling
        maxpool=self.maxpool,
        trim_before=trim_before,
        trim_after=trim_after,
        kernel_size=kernel_size,
        reduction_function=reduction_fn,
    )
def load_memory_buffer(self, phase, load_previously_saved=True):
    """
    phase: string should be "train" or "eval" to indicate which memory buffer to load

    function will load latest experience in the model_savedir/name or create a
    random replay buffer of specified size to start from
    """
    assert phase in ['train', 'eval']
    buffer_size = self.cfg['RUN']['%s_buffer_size' % phase]
    seed = self.cfg['RUN']['%s_seed' % phase]
    init_empty_with_random = self.cfg['DQN']['load_random_%s_buffer' % phase]
    self.num_random_steps = self.cfg['DQN']['num_pure_random_steps_%s' % phase]
    if load_previously_saved:
        buffer_path = self.search_for_latest_replay_buffer(phase)
        if buffer_path != "":
            print("loading buffer from past experience:%s" % buffer_path)
            return ReplayMemory(load_file=buffer_path)
    if not init_empty_with_random:
        # no buffer file was found, and we want an empty buffer
        print("creating empty replay buffer")
        return self.create_empty_memory_buffer(seed, buffer_size)
    #####################################################
    # from here on - we assume we need random values
    # load a presaved random buffer if it is available
    #random_buffer_path = self.get_random_buffer_path(phase, seed)
    #if os.path.exists(random_buffer_path):
    #    print("loading random replay buffer:%s"%random_buffer_path)
    #    return ReplayMemory(load_file=random_buffer_path)
    #else:
    #    # no buffer file was found, and we want an empty buffer
    #    print('did not find saved replay buffers')
    #    print('cannot find a suitable random replay buffers... creating one - this will take some time')
    # did not find any checkpoints - load random buffer
    empty_memory_buffer = self.create_empty_memory_buffer(seed, buffer_size)
    #env = self.create_environment(seed)
    # save the random buffer
    #random_memory_buffer.save_buffer(random_buffer_path)
    return empty_memory_buffer
def gen_fake(generator, agent, trainSample, batch_size, embed_dim, device,
             write_item, write_target, write_reward, write_action, action_num,
             max_length=5, recom_length=None):
    for stidx in range(0, trainSample.length(), batch_size):
        click_batch, length, _, reward_batch, action_batch = getBatch_dis(
            stidx, stidx + batch_size, trainSample, embed_dim, recom_length)
        click_batch = click_batch.to(device)
        reward_batch = reward_batch.to(device)
        action_batch = action_batch.to(device)
        if recom_length == None:
            recom_length = action_batch.size(1)
        replay = ReplayMemory(generator, agent, len(length), max_length,
                              action_num, recom_length)
        with torch.no_grad():
            replay.init_click_sample((click_batch, length), reward_batch, action_batch)
            replay.gen_sample(batch_size, False)
            seq_samples, lengths, seq_rewards, seq_actions = \
                replay.clicks, replay.lengths, replay.tgt_rewards, replay.actions
            seq_rewards = torch.round(seq_rewards)
        write_tensor(seq_samples, lengths, write_item, write_target, 'dis', real=False)
        write_tensor_reward(seq_rewards, lengths, write_reward)
        write_tensor_action(seq_actions, lengths, write_action)
    return seq_samples, lengths, seq_rewards, seq_actions
class Learner(object):
    def __init__(self, params, param_set_id, status_dict, shared_state, remote_mem):
        self.params = params
        self.param_set_id = param_set_id
        self.status_dict = status_dict
        self.shared_state = shared_state
        self.remote_mem = remote_mem
        gpu = 0
        torch.cuda.set_device(gpu)
        ep = params['env']
        ap = params['actor']
        lp = params['learner']
        rmp = params["replay_memory"]
        model_formula = f'model.{lp["model"]}(self.state_shape, self.action_dim).to(self.device)'
        optimizer_formula = lp["optimizer"].format('self.Q.parameters()')
        self.conn = psycopg2.connect(params["db"]["connection_string"])
        self.conn.autocommit = True
        self.cur = self.conn.cursor()
        self.device = torch.device("cuda:{}".format(gpu) if 0 <= gpu and torch.cuda.is_available() else "cpu")
        self.state_shape = ep['state_shape']
        self.batch_size = lp['replay_sample_size']
        self.action_dim = ep['action_dim']
        self.q_target_sync_freq = lp['q_target_sync_freq']
        self.num_q_updates = 0
        self.take_offsets = (torch.arange(self.batch_size) * self.action_dim).to(self.device)
        self.Q = eval(model_formula)
        self.Q_target = eval(model_formula)  # Target Q network which is a slow-moving replica of self.Q
        self.optimizer = eval(optimizer_formula)
        self.replay_memory = ReplayMemory(rmp)
        self.train_num = 0
        self.model_file_name = lp['load_saved_state']
        if self.model_file_name and os.path.isfile(self.model_file_name):
            print(f'Loading {self.model_file_name}')
            saved_state = torch.load(self.model_file_name)
            self.Q.load_state_dict(saved_state['module'])
            self.optimizer.load_state_dict(saved_state['optimizer'])
            self.train_num = saved_state['train_num']
        self.shared_state['Q_state_dict'] = (self.state_dict_to_cpu(self.Q.state_dict()),
                                             self.state_dict_to_cpu(self.Q_target.state_dict()))
        self.status_dict['Q_state_dict_stored'] = True
        self.last_Q_state_dict_id = 1
        self.status_dict['Q_state_dict_id'] = self.last_Q_state_dict_id
        self.status_dict['train_num'] = self.train_num
        self.gamma_n = params['actor']['gamma']**params['actor']['num_steps']

    def state_dict_to_cpu(self, state_dict):
        d = OrderedDict()
        for k, v in state_dict.items():
            d[k] = v.cpu()
        return d

    def add_experience_to_replay_mem(self):
        while self.remote_mem.qsize():
            priorities, batch = self.remote_mem.get()
            self.replay_memory.add(priorities, batch)

    def compute_loss_and_priorities(self, batch_size):
        indices, n_step_transition_batch, before_priorities = self.replay_memory.sample(batch_size)
        s = n_step_transition_batch[0].to(self.device)
        a = n_step_transition_batch[1].to(self.device)
        r = n_step_transition_batch[2].to(self.device)
        a_latest = n_step_transition_batch[3].to(self.device)
        s_latest = n_step_transition_batch[4].to(self.device)
        terminal = n_step_transition_batch[5].to(self.device)
        q = self.Q(s)
        q_a = q.take(self.take_offsets + a).squeeze()
        with torch.no_grad():
            self.Q_target.eval()
            Gt = r + (1.0 - terminal) * self.gamma_n * \
                self.Q_target(s_latest).take(self.take_offsets + a_latest).squeeze()
        td_error = Gt - q_a
        loss = F.smooth_l1_loss(q_a, Gt)
        # loss = td_error**2 / 2
        # Compute the new priorities of the experience
        after_priorities = td_error.data.abs().cpu().numpy()
        self.replay_memory.set_priorities(indices, after_priorities)
        return loss, q, before_priorities, after_priorities, indices

    def update_Q(self, loss):
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        self.num_q_updates += 1
        if self.num_q_updates % self.q_target_sync_freq == 0:
            self.Q_target.load_state_dict(self.Q.state_dict())
            print('Target Q synchronized.')
            return True
        else:
            return False

    def learn(self):
        t = tables.LearnerData()
        record_type = t.get_record_type()
        record_insert = t.get_insert()
        cur = self.cur
        param_set_id = self.param_set_id
        now = datetime.datetime.now
        step_num = 0
        target_sync_num = 0
        send_param_num = 0
        min_replay_mem_size = self.params['learner']["min_replay_mem_size"]
        print('learner waiting for replay memory.')
        while self.replay_memory.size() <= min_replay_mem_size:
            self.add_experience_to_replay_mem()
            time.sleep(0.01)
        step_num = 0
        print('learner start')
        while not self.status_dict['quit']:
            self.add_experience_to_replay_mem()
            # 4. Sample a prioritized batch of transitions
            # 5. & 7. Apply double-Q learning rule, compute loss and experience priorities
            # 8. Update priorities
            loss, q, before_priorities, after_priorities, indices = self.compute_loss_and_priorities(self.batch_size)
            if step_num % 10 == 0:
                print(f'loss : {loss}')
            #print("\nLearner: step_num=", step_num, "loss:", loss, "RPM.size:", self.replay_memory.size(), end='\r')
            # 6. Update parameters of the Q network(s)
            if self.update_Q(loss):
                target_sync_num += 1
            if step_num % 5 == 0:
                self.shared_state['Q_state_dict'] = (self.state_dict_to_cpu(self.Q.state_dict()),
                                                     self.state_dict_to_cpu(self.Q_target.state_dict()))
                self.last_Q_state_dict_id += 1
                self.status_dict['Q_state_dict_id'] = self.last_Q_state_dict_id
                print('Send params to actors.')
                send_param_num += 1
            # 9. Periodically remove old experience from replay memory
            step_num += 1
            self.train_num += 1
            self.status_dict['train_num'] = self.train_num
            # register the training record in the DB
            r = record_type(param_set_id, now(), self.train_num, step_num, loss.item(), q[0].tolist(),
                            before_priorities.tolist(), after_priorities.tolist(), indices.tolist(),
                            target_sync_num, send_param_num)
            record_insert(cur, r)
        print('learner end')
        state_dict = {'module': self.Q.state_dict(),
                      'optimizer': self.optimizer.state_dict(),
                      'train_num': self.train_num}
        torch.save(state_dict, self.model_file_name)
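# --- Note on the take_offsets indexing used in compute_loss_and_priorities()
# above (added explanation, not from the original source): torch.Tensor.take()
# indexes the flattened tensor, so take_offsets + a selects q[i, a[i]] for each
# row i, which is equivalent to q.gather(1, a.unsqueeze(1)).squeeze(1). A tiny check:
import torch
q = torch.arange(12.).view(3, 4)   # batch_size=3, action_dim=4
a = torch.tensor([2, 0, 3])
offsets = torch.arange(3) * 4      # same construction as self.take_offsets
assert torch.equal(q.take(offsets + a), q.gather(1, a.unsqueeze(1)).squeeze(1))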
class StateManager():
    def __init__(self):
        self.reward_space = [-1, 0, 1]
        self.latent_representation_function = None

    def create_new_state_instance(self, config_handler, phase):
        self.ch = config_handler
        self.save_time = time.time() - 100000
        self.phase = phase
        self.step_number = 0
        self.end_step_number = -1
        self.episode_number = 0
        self.seed = self.ch.cfg['RUN']['%s_seed' % self.phase]
        self.random_state = np.random.RandomState(self.seed)
        self.heads = np.arange(self.ch.cfg['DQN']['n_ensemble'])
        self.episodic_reward = []
        self.episodic_reward_avg = []
        self.episodic_step_count = []
        self.episodic_step_ends = []
        self.episodic_loss = []
        self.episodic_times = []
        self.episodic_eps = []
        self.env = self.ch.create_environment(self.seed)
        self.memory_buffer = self.ch.load_memory_buffer(self.phase)
        # TODO should you load the count from the memory buffer - ?
        self.step_number = self.memory_buffer.count
        self.setup_eps()

    def setup_eps(self):
        if self.phase == 'train':
            self.eps_init = self.ch.cfg['DQN']['eps_init']
            self.eps_final = self.ch.cfg['DQN']['eps_final']
            self.eps_annealing_steps = self.ch.cfg['DQN']['eps_annealing_steps']
            self.last_annealing_step = self.eps_annealing_steps + \
                self.ch.cfg['DQN']['num_pure_random_steps_train']
            if self.eps_annealing_steps > 0:
                self.slope = -(self.eps_init - self.eps_final) / self.eps_annealing_steps
                self.intercept = self.eps_init - self.slope * \
                    self.ch.cfg['DQN']['num_pure_random_steps_train']

    def load_checkpoint(self, filepath, config_handler=''):
        # load previously saved state file
        fh = open(filepath, 'rb')
        fdict = pickle.load(fh)
        fh.close()
        if config_handler != '':
            # use given config handler
            del fdict['ch']
            self.ch = config_handler
        self.__dict__.update(fdict)
        self.heads = np.arange(self.ch.cfg['DQN']['n_ensemble'])
        self.random_state = np.random.RandomState()
        self.random_state.set_state(fdict['state_random_state'])
        # TODO NOTE this does not restart at same env state
        self.seed = self.ch.cfg['RUN']['%s_seed' % self.phase]
        self.env = self.ch.create_environment(self.seed)
        buffer_path = filepath.replace('.pkl', '.npz')
        self.memory_buffer = ReplayMemory(load_file=buffer_path)
        # TODO should you load the count from the memory buffer - ?
        # TODO what about episode number - it will be off now
        self.step_number = self.memory_buffer.count
        self.setup_eps()

    def save_checkpoint(self, checkpoint_basepath):
        # pass in step number because we always want to use training step number as reference
        self.save_time = time.time()
        self.plot_progress(checkpoint_basepath)
        # TODO save this class - except for random state i assume
        self.memory_buffer.save_buffer(checkpoint_basepath + '.npz')
        # TOO big - prob need to save specifics
        ## preserve random state -
        self.state_random_state = self.random_state.get_state()
        save_dict = {
            'episodic_reward': self.episodic_reward,
            'episodic_reward_avg': self.episodic_reward_avg,
            'episodic_step_count': self.episodic_step_count,
            'episodic_step_ends': self.episodic_step_ends,
            'episodic_loss': self.episodic_loss,
            'episodic_times': self.episodic_times,
            'state_random_state': self.state_random_state,
            'episode_number': self.episode_number,
            'step_number': self.step_number,
            'phase': self.phase,
            'save_time': self.save_time,
            'ch': self.ch,
            'episodic_eps': self.episodic_eps,
        }
        fh = open(checkpoint_basepath + '.pkl', 'wb')
        pickle.dump(save_dict, fh)
        fh.close()
        print('finished pickle in', time.time() - self.save_time)

    def end_episode(self):
        # catalog
        self.end_time = time.time()
        self.end_step_number = deepcopy(self.step_number)
        # add to lists
        self.episodic_reward.append(np.sum(self.episode_rewards))
        self.episodic_step_count.append(self.end_step_number - self.start_step_number)
        self.episodic_step_ends.append(self.end_step_number)
        self.episodic_loss.append(np.mean(self.episode_losses))
        self.episodic_times.append(self.end_time - self.start_time)
        try:
            self.episodic_eps.append(self.eps)
        except:
            self.episodic_eps = [1.0 for x in range(len(self.episodic_times))]
        # smoothed reward over last 100 episodes
        self.episodic_reward_avg.append(
            np.mean(self.episodic_reward[-self.ch.cfg['PLOT']['num_prev_steps_avg']:]))
        num_steps = self.episodic_step_count[-1]
        print("*** %s E%05d S%010d AH%s-R%s num random/total steps:%s/%s***" %
              (self.phase, self.episode_number, self.step_number, self.active_head,
               self.episodic_reward[-1], self.num_random_steps, num_steps))
        self.episode_active = False
        self.episode_number += 1

    def start_episode(self):
        self.start_time = time.time()
        self.random_state.shuffle(self.heads)
        self.active_head = self.heads[0]
        self.end_step_number = -1
        self.episode_losses = []
        self.episode_actions = []
        self.episode_rewards = []
        self.start_step_number = deepcopy(self.step_number)
        self.num_random_steps = 0
        # restart counters
        self.terminal = False
        self.life_lost = True
        self.episode_reward = 0
        state = self.env.reset()
        self.prev_action = 0
        self.prev_reward = 0
        for i in range(state.shape[0] + 1):
            # add enough memories to use the memory buffer
            # not sure if this is correct
            self.memory_buffer.add_experience(
                action=0,
                frame=state[-1],  # use last frame in state bc it is only nonzero one
                reward=0,
                terminal=0,
                end=0,
            )
        # get correctly formatted last state
        batch = self.memory_buffer.get_history_minibatch(indices='last')
        # get state
        self.state = batch[0][0]
        if self.state.shape != (self.ch.num_prev_steps,
                                self.memory_buffer.agent_history_length,
                                self.memory_buffer.frame_height,
                                self.memory_buffer.frame_width):
            print("start shape wrong")
            embed()
        self.episode_active = True
        return self.state

    def plot_current_episode(self, plot_basepath=''):
        if plot_basepath == '':
            plot_basepath = self.get_plot_basepath()
        plot_dict = {
            'mean loss': self.episode_losses,
            'actions': self.episode_actions,
            'rewards': self.episode_rewards,
        }
        suptitle = 'E%s S%s-%s R%s' % (self.episode_number, self.start_step_number,
                                       self.end_step_number, self.episodic_reward[-1])
        plot_path = plot_basepath + '_ep%06d.png' % self.episode_number
        #step_range = np.arange(self.start_step_number, self.end_step_number)
        #self.plot_data(plot_path, plot_dict, suptitle, xname='episode steps', xdata=step_range)
        self.plot_data(plot_path, plot_dict, suptitle, xname='episode steps')  #, xdata=step_range)
        ep_steps = self.end_step_number - self.start_step_number
        self.plot_histogram(plot_basepath + '_ep_histrewards_%06d.png' % self.episode_number,
                            data=self.episode_rewards, bins=self.reward_space,
                            title='rewards TR%s' % self.episode_reward)
        self.plot_histogram(plot_basepath + '_ep_histactions_%06d.png' % self.episode_number,
                            data=self.episode_actions, bins=self.env.action_space,
                            title='actions acthead:%s nrand:%s/%s' %
                                  (self.active_head, self.num_random_steps, ep_steps))

    def plot_last_episode(self):
        ep_steps = self.end_step_number - self.start_step_number
        ep_states, ep_actions, ep_rewards, ep_next_states, ep_terminals, ep_masks, indexes = \
            self.memory_buffer.get_last_n_states(ep_steps)
        plot_basepath = self.get_plot_basepath() + '_episode_states_frames'
        self.plot_episode_movie(plot_basepath, ep_states, ep_actions, ep_rewards,
                                ep_next_states, ep_terminals, ep_masks, indexes)

    def plot_episode_movie(self, plot_basepath, states, actions, rewards, next_states,
                           terminals, masks, indexes):
        if not os.path.exists(plot_basepath):
            os.makedirs(plot_basepath)
        n_steps = states.shape[0]
        print('plotting episode of length %s' % n_steps)
        if self.latent_representation_function == None:
            n_cols = 2
        else:
            pred_next_states, zs, latents = self.latent_representation_function(
                states, actions, rewards, self.ch)
            n_cols = 4
            latent_image_path = os.path.join(plot_basepath, 'latent_step_%05d.png')
        ep_reward = sum(rewards)
        movie_path = plot_basepath + '_movie_R%04d.mp4' % ep_reward
        print('starting to make movie', movie_path)
        # write frame by frame then use ffmpeg to generate movie
        #image_path = os.path.join(plot_basepath, 'step_%05d.png')
        #w_path = plot_basepath+'_write_movie_R%04d.sh'%ep_reward
        #a = open(w_path, 'w')
        #cmd = "ffmpeg -n -r 30 -i %s -c:v libx264 -pix_fmt yuv420p %s"%(os.path.abspath(image_path),os.path.abspath(movie_path))
        #a.write(cmd)
        #a.close()
        #w,h = states[0,3].shape
        #treward = 0
        #for step in range(min(n_steps, 100)):
        #    f, ax = plt.subplots(1, n_cols)
        #    if not step%20:
        #        print('plotting step', step)
        #    ax[0].imshow(states[step, 3], cmap=plt.cm.gray)
        #    #ax[0].set_title('OS-A%s' %(actions[step]))
        #    ax[1].imshow(next_states[step, 3], cmap=plt.cm.gray)
        #    treward+=rewards[step]
        #    if self.latent_representation_function != None:
        #        ax[2].imshow(pred_next_states[step], cmap=plt.cm.gray)
        #        z = np.hstack((zs[step,0], zs[step,1], zs[step,2]))
        #        ax[3].imshow(z)
        #    for aa in range(n_cols):
        #        ax[aa].set_xticks([])
        #        ax[aa].set_yticks([])
        #    f.suptitle('%sA%sR%sT%sD%s'%(step, actions[step], rewards[step], treward, int(terminals[step])))
        #    plt.savefig(image_path%step)
        #    plt.close()
        # generate movie directly
        max_frames = 5000
        n = min(n_steps, max_frames)
        for step in range(n):
            if self.latent_representation_function != None:
                z = np.hstack((zs[step, 0], zs[step, 1], zs[step, 2]))
                zo = resize(z, (84, 84), cval=0, order=0)
                # TODO - is imwrite clipping zo since it is not a uint8?
                img = np.hstack((states[step, 3], next_states[step, 3],
                                 pred_next_states[step], zo))
            else:
                img = np.hstack((states[step, 3], next_states[step, 3]))
            if not step:
                movie = np.zeros((n, img.shape[0], img.shape[1]))
                latent_movie = np.zeros((n, z.shape[0], z.shape[1]))
            movie[step] = img
            latent_movie[step] = z
        vwrite(movie_path, movie)

    def plot_histogram(self, plot_path, data, bins, title=''):
        n, bins, _ = plt.hist(data, bins=bins)
        plt.xticks(bins, bins)
        plt.yticks(n, n)
        plt.xlim(min(bins), max(bins) + 1)
        plt.title(title)
        plt.savefig(plot_path)
        plt.close()

    def plot_progress(self, plot_basepath=''):
        if plot_basepath == '':
            plot_basepath = self.get_plot_basepath()
        det_plot_dict = {
            'episodic step count': self.episodic_step_count,
            'episodic time': self.episodic_times,
            'mean episodic loss': self.episodic_loss,
            'eps': self.episodic_eps,
        }
        suptitle = 'Details E%s S%s' % (self.episode_number, self.end_step_number)
        edet_plot_path = plot_basepath + '_details_episodes.png'
        sdet_plot_path = plot_basepath + '_details_episodes.png'
        if self.end_step_number > 1:
            #exdata = np.arange(self.episode_number)
            #self.plot_data(edet_plot_path, det_plot_dict, suptitle, xname='episode', xdata=exdata)
            #self.plot_data(sdet_plot_path, det_plot_dict, suptitle, xname='steps', xdata=self.episodic_step_ends)
            self.plot_data(edet_plot_path, det_plot_dict, suptitle, xname='episode')  #, xdata=exdata)
            self.plot_data(sdet_plot_path, det_plot_dict, suptitle, xname='steps',
                           xdata=self.episodic_step_ends)
        rew_plot_dict = {
            'episodic reward': self.episodic_reward,
            'smooth episodic reward': self.episodic_reward_avg,
        }
        suptitle = 'Reward E%s S%s R%s' % (self.episode_number, self.end_step_number,
                                           self.episodic_reward[-1])
        erew_plot_path = plot_basepath + '_reward_episodes.png'
        srew_plot_path = plot_basepath + '_reward_steps.png'
        #self.plot_data(erew_plot_path, rew_plot_dict, suptitle, xname='episode', xdata=np.arange(self.episode_number))
        #self.plot_data(srew_plot_path, rew_plot_dict, suptitle, xname='steps', xdata=self.episodic_step_ends)
        self.plot_data(erew_plot_path, rew_plot_dict, suptitle, xname='episode')  #, xdata=np.arange(self.episode_number))
        self.plot_data(srew_plot_path, rew_plot_dict, suptitle, xname='steps',
                       xdata=self.episodic_step_ends)

    def plot_data(self, savepath, plot_dict, suptitle, xname, xdata=None):
        st = time.time()
        print('starting plot data')
        n = len(plot_dict.keys())
        f, ax = plt.subplots(n, 1, figsize=(6, 3 * n))
        #f,ax = plt.subplots(n,1)
        try:
            for xx, name in enumerate(sorted(plot_dict.keys())):
                if xdata is not None:
                    ax[xx].plot(xdata, plot_dict[name])
                else:
                    ax[xx].plot(plot_dict[name])
                ax[xx].set_title('%s' % (name))
                ax[xx].set_ylabel(name)
                print(name, xname, st - time.time())
                ax[xx].set_xlabel(xname)
            f.suptitle('%s %s' % (self.phase, suptitle))
            print('end sup', st - time.time())
            f.savefig(savepath)
            print("saved: %s" % savepath)
            plt.close()
            print('finished')
        except Exception:
            print("plot")
            embed()

    def get_plot_basepath(self):
        return self.ch.get_checkpoint_basepath(self.step_number) + '_%s' % self.phase

    def handle_plotting(self, plot_basepath='', force_plot=False):
        # will plot at beginning of episode
        #if not self.episode_number % self.ch.cfg['PLOT']['plot_episode_every_%s_episodes'%self.phase]:
        # dont plot first episode
        plot_basepath = self.get_plot_basepath()
        if self.episode_number:
            if force_plot:
                self.plot_current_episode(plot_basepath)
                self.plot_progress(plot_basepath)
            if self.episode_number == 1 or not self.episode_number % \
                    self.ch.cfg['PLOT']['plot_episode_every_%s_episodes' % self.phase]:
                self.plot_current_episode(plot_basepath)
            if self.episode_number == 1 or not self.episode_number % \
                    self.ch.cfg['PLOT']['plot_every_%s_episodes' % self.phase]:
                self.plot_progress(plot_basepath)

    def step(self, action):
        next_state, reward, self.life_lost, self.terminal = self.env.step(action)
        self.prev_action = action
        self.prev_reward = np.sign(reward)
        # the replay buffer will convert the observed state as needed
        self.memory_buffer.add_experience(
            action=action,
            frame=next_state[-1],
            reward=self.prev_reward,
            terminal=self.life_lost,
            end=self.terminal,
        )
        self.episode_actions.append(self.prev_action)
        self.episode_rewards.append(self.prev_reward)
        self.step_number += 1
        batch = self.memory_buffer.get_history_minibatch(indices='last')
        # get state
        self.state = batch[0][0]
        #self.state = self.memory_buffer.get_last_state()
        if self.state.shape[1] == 0:
            print('handler state chan 0')
            embed()

    def set_eps(self):
        # TODO function to find eps - for now use constant
        if self.step_number <= self.ch.cfg['DQN']['num_pure_random_steps_%s' % self.phase]:
            self.eps = 1.0
        if self.phase == 'train':
            self.eps = self.eps_final
            if self.step_number < self.last_annealing_step:
                self.eps = self.slope * self.step_number + self.intercept
        else:
            self.eps = self.ch.cfg['EVAL']['eps_eval']

    def random_action(self):
        self.num_random_steps += 1
        # pass action_idx to env.action_space
        return self.random_state.choice(range(self.env.num_actions))

    def is_random_action(self):
        self.set_eps()
        r = self.random_state.rand()
        if r < self.eps:
            return True
        else:
            return False
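# --- Standalone restatement (sketch, not from the original source) of the
# linear epsilon annealing that setup_eps()/set_eps() implement above: hold
# eps at 1.0 during the pure-random warmup, decay linearly from eps_init to
# eps_final over eps_annealing_steps, then hold at eps_final. (The original
# set_eps() lets the train branch override the warmup value; here the warmup
# case simply returns early.)
def linear_eps_schedule(step, eps_init, eps_final, eps_annealing_steps, num_pure_random_steps):
    if step <= num_pure_random_steps:
        return 1.0
    slope = -(eps_init - eps_final) / eps_annealing_steps
    intercept = eps_init - slope * num_pure_random_steps
    if step < eps_annealing_steps + num_pure_random_steps:
        return slope * step + intercept
    return eps_final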
class DQN:
    def __init__(self, env, hparams):
        self.hparams = hparams
        self.env = env
        self.n = env.action_space.n
        self.Q = DCNN(4, self.n)
        self.T = DCNN(4, self.n)
        self.T.load_state_dict(self.Q.state_dict())
        self.T.eval()
        self.memory = ReplayMemory(hparams.memory_size)
        self.steps = 0
        self.state = env.reset()
        self.optimizer = torch.optim.RMSprop(self.Q.parameters(),
                                             lr=hparams.lr,
                                             momentum=hparams.momentum)
        self.n_episodes = 0

    @torch.no_grad()
    def select_action(self):
        hparams = self.hparams
        start = hparams.eps_start
        end = hparams.eps_end
        time = hparams.eps_time
        steps = self.steps
        self.steps += 1
        if steps < time:
            epsilon = start - (start - end) * steps / time
        else:
            epsilon = end
        sample = random.random()
        if sample > epsilon:
            return self.Q(s2t(self.state).to(device)).max(1)[1].item()
        else:
            return self.env.action_space.sample()

    def sample_step(self, fs_min=2, fs_max=6):
        """repeats a single action between fs_min and fs_max (inclusive) times"""
        fs = random.randint(fs_min, fs_max)
        action = self.select_action()
        r = 0
        for _ in range(fs):
            new_state, reward, done, _ = self.env.step(action)
            self.memory.push(self.state, action, new_state if not done else None, reward)
            r += reward
            self.state = self.env.reset() if done else new_state
            if done:
                self.n_episodes += 1
        return r

    def optimize(self):
        hparams = self.hparams
        transitions = self.memory.sample(hparams.batch_size)
        batch = Transition(*zip(*transitions))
        states = torch.cat([s2t(state) for state in batch.state]).to(device)
        actions = torch.tensor(batch.action).unsqueeze(1).to(device)
        target_values = torch.tensor(batch.reward).unsqueeze(1).to(device).float()
        non_terminal_next_states = torch.cat(
            [s2t(state) for state in batch.next_state if state is not None]).to(device)
        non_terminal_mask = torch.tensor(
            [state is not None for state in batch.next_state]).to(device).unsqueeze(1)
        values = self.Q(states).gather(1, actions).float()
        target_values[non_terminal_mask] += hparams.gamma * \
            self.T(non_terminal_next_states).detach().max(1)[0].float()
        #print(values.dtype,target_values.dtype)
        loss = F.smooth_l1_loss(values, target_values)
        self.optimizer.zero_grad()
        loss.backward()
        for param in self.Q.parameters():
            param.grad.data.clamp_(-1, 1)  # maybe try sign_?
        self.optimizer.step()
        return loss
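# --- Minimal driver sketch for the agent above (not part of the original
# source): `env`, `hparams`, and the step budget are assumed to exist, and the
# ReplayMemory is assumed to expose __len__ as in the standard PyTorch DQN
# tutorial it resembles.
agent = DQN(env, hparams)
for _ in range(100_000):                       # hypothetical training budget
    agent.sample_step()                        # act with frame skip, store transitions
    if len(agent.memory) >= hparams.batch_size:
        agent.optimize()                       # one batch of Q-learning updates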
</Mission>'''

# Create default Malmo objects:
agent_host = MalmoPython.AgentHost()
try:
    agent_host.parse(sys.argv)
except RuntimeError as e:
    print('ERROR:', e)
    print(agent_host.getUsage())
    exit(1)
if agent_host.receivedArgument("help"):
    print(agent_host.getUsage())
    exit(0)

net = cnn.Net()
memory = ReplayMemory(1000)

for episode in range(5):
    my_mission = MalmoPython.MissionSpec(missionXML, True)
    my_mission_record = MalmoPython.MissionRecordSpec()
    # Attempt to start a mission:
    max_retries = 3
    for retry in range(max_retries):
        try:
            agent_host.startMission(my_mission, my_mission_record)
            break
        except RuntimeError as e:
            if retry == max_retries - 1:
                print("Error starting mission:", e)
                exit(1)
def train(sess, environment, actor, critic, embeddings, history_length, ra_length,
          buffer_size, batch_size, discount_factor, nb_episodes, filename_summary,
          nb_rounds, **env_args):
    ''' Algorithm 3 in article. '''

    # Set up summary operators
    def build_summaries():
        episode_reward = tf.Variable(0.)
        tf.summary.scalar('reward', episode_reward)
        episode_max_Q = tf.Variable(0.)
        tf.summary.scalar('max_Q_value', episode_max_Q)
        critic_loss = tf.Variable(0.)
        tf.summary.scalar('critic_loss', critic_loss)
        summary_vars = [episode_reward, episode_max_Q, critic_loss]
        summary_ops = tf.summary.merge_all()
        return summary_ops, summary_vars

    summary_ops, summary_vars = build_summaries()
    sess.run(tf.global_variables_initializer())
    writer = tf.summary.FileWriter(filename_summary, sess.graph)

    # '2: Initialize target network f′ and Q′'
    actor.init_target_network()
    critic.init_target_network()

    # '3: Initialize the capacity of replay memory D'
    replay_memory = ReplayMemory(buffer_size)  # Memory D in article
    replay = False

    start_time = time.time()
    for i_session in range(nb_episodes):  # '4: for session = 1, M do'
        session_reward = 0
        session_Q_value = 0
        session_critic_loss = 0

        # '5: Reset the item space I' is useless because unchanged.
        nb_env = 10
        envs = np.asarray([Environment(**env_args) for i in range(nb_env)])
        # u = [e.current_user for e in envs]
        # print(u)
        # input()
        states = np.array([env.current_state for env in envs])  # '6: Initialize state s_0 from previous sessions'

        # if (i_session + 1) % 10 == 0:  # Update average parameters every 10 episodes
        #     environment.groups = environment.get_groups()

        exploration_noise = OrnsteinUhlenbeckNoise(history_length * embeddings.size())

        for t in range(nb_rounds):  # '7: for t = 1, T do'
            # '8: Stage 1: Transition Generating Stage'
            # '9: Select an action a_t = {a_t^1, ..., a_t^K} according to Algorithm 2'
            actions, item_idxes = actor.get_recommendation_list(
                ra_length,
                states.reshape(nb_env, -1),  # TODO + exploration_noise.get().reshape(1, -1),
                embeddings)

            # '10: Execute action a_t and observe the reward list {r_t^1, ..., r_t^K} for each item in a_t'
            for env, state, action, items in zip(envs, states, actions, item_idxes):
                sim_results, rewards, next_state = env.step(action, items)

                # '19: Store transition (s_t, a_t, r_t, s_t+1) in D'
                replay_memory.add(state.reshape(history_length * embeddings.size()),
                                  action.reshape(ra_length * embeddings.size()),
                                  [rewards],
                                  next_state.reshape(history_length * embeddings.size()))

                state = next_state  # '20: Set s_t = s_t+1'
                session_reward += rewards

            # '21: Stage 2: Parameter Updating Stage'
            if replay_memory.size() >= batch_size * nb_env:  # Experience replay
                replay = True
                replay_Q_value, critic_loss = experience_replay(
                    replay_memory, batch_size, actor, critic, embeddings, ra_length,
                    history_length * embeddings.size(), ra_length * embeddings.size(),
                    discount_factor)
                session_Q_value += replay_Q_value
                session_critic_loss += critic_loss

        summary_str = sess.run(summary_ops,
                               feed_dict={summary_vars[0]: session_reward,
                                          summary_vars[1]: session_Q_value,
                                          summary_vars[2]: session_critic_loss})
        writer.add_summary(summary_str, i_session)

        '''
        print(state_to_items(embeddings.embed(data['state'][0]), actor, ra_length, embeddings),
              state_to_items(embeddings.embed(data['state'][0]), actor, ra_length, embeddings, True))
        '''

        str_loss = str('Loss=%0.4f' % session_critic_loss)
        print(('Episode %d/%d Reward=%d Time=%ds ' + (str_loss if replay else 'No replay')) %
              (i_session + 1, nb_episodes, session_reward, time.time() - start_time))
        start_time = time.time()

    writer.close()
    tf.train.Saver().save(sess, 'models.h5', write_meta_graph=False)
def init_train():
    """ use args to setup inplace training """
    train_data_path = args.train_buffer
    valid_data_path = args.valid_buffer
    data_dir = os.path.split(train_data_path)[0]
    # we are starting from scratch training this model
    if args.model_loadpath == "":
        run_num = 0
        model_base_filedir = os.path.join(data_dir, args.savename + '%02d' % run_num)
        while os.path.exists(model_base_filedir):
            run_num += 1
            model_base_filedir = os.path.join(data_dir, args.savename + '%02d' % run_num)
        os.makedirs(model_base_filedir)
        model_base_filepath = os.path.join(model_base_filedir, args.savename)
        print("MODEL BASE FILEPATH", model_base_filepath)
        info = {
            'model_train_cnts': [],
            'model_train_losses': {},
            'model_valid_cnts': [],
            'model_valid_losses': {},
            'model_save_times': [],
            'model_last_save': 0,
            'model_last_plot': 0,
            'NORM_BY': 255.0,
            'MODEL_BASE_FILEDIR': model_base_filedir,
            'model_base_filepath': model_base_filepath,
            'model_train_data_file': train_data_path,
            'model_valid_data_file': valid_data_path,
            'NUM_TRAINING_EXAMPLES': args.num_training_examples,
            'NUM_K': args.num_k,
            'NR_LOGISTIC_MIX': args.nr_logistic_mix,
            'NUM_PCNN_FILTERS': args.num_pcnn_filters,
            'NUM_PCNN_LAYERS': args.num_pcnn_layers,
            'ALPHA_REC': args.alpha_rec,
            'ALPHA_ACT': args.alpha_act,
            'ALPHA_REW': args.alpha_rew,
            'MODEL_BATCH_SIZE': args.batch_size,
            'NUMBER_CONDITION': args.num_condition,
            'CODE_LENGTH': args.code_length,
            'NUM_MIXTURES': args.num_mixtures,
            'REQUIRE_UNIQUE_CODES': args.require_unique_codes,
        }
        ## size of latents flattened - dependent on architecture
        #info['float_condition_size'] = 100*args.num_z
        ## 3x logistic needed for loss
        ## TODO - change loss
    else:
        print('loading model from: %s' % args.model_loadpath)
        model_dict = torch.load(args.model_loadpath, map_location=lambda storage, loc: storage)
        info = model_dict['model_info']
        model_base_filedir = os.path.split(args.model_loadpath)[0]
        model_base_filepath = os.path.join(model_base_filedir, args.savename)
        info['loaded_from'] = args.model_loadpath
        info['MODEL_BATCH_SIZE'] = args.batch_size
    info['DEVICE'] = DEVICE
    info['MODEL_SAVE_EVERY'] = args.save_every
    info['MODEL_LOG_EVERY_BATCHES'] = args.log_every_batches
    info['model_loadpath'] = args.model_loadpath
    info['MODEL_SAVENAME'] = args.savename
    info['MODEL_LEARNING_RATE'] = args.learning_rate
    # create replay buffer
    train_buffer = make_subset_buffer(train_data_path,
                                      max_examples=info['NUM_TRAINING_EXAMPLES'])
    valid_buffer = make_subset_buffer(valid_data_path,
                                      max_examples=int(info['NUM_TRAINING_EXAMPLES'] * .1))
    valid_buffer = ReplayMemory(load_file=valid_data_path)
    # if train buffer is too large - make random subset
    # 27588 places in 1e6 buffer where reward is nonzero
    info['num_actions'] = train_buffer.num_actions()
    info['size_training_set'] = train_buffer.num_examples()
    info['hsize'] = train_buffer.frame_height
    info['wsize'] = train_buffer.frame_width
    info['num_rewards'] = train_buffer.num_rewards()
    info['HISTORY_SIZE'] = 4
    rewards_weight = 1 - np.array(train_buffer.percentages_rewards())
    actions_weight = 1 - np.array(train_buffer.percentages_actions())
    actions_weight = torch.FloatTensor(actions_weight).to(DEVICE)
    rewards_weight = torch.FloatTensor(rewards_weight).to(DEVICE)
    info['actions_weight'] = actions_weight
    info['rewards_weight'] = rewards_weight
    # output mixtures should be 2*nr_logistic_mix + nr_logistic mix for each
    # decorelated channel
    info['num_output_mixtures'] = (2 * args.nr_logistic_mix + args.nr_logistic_mix) * info['HISTORY_SIZE']
    nmix = int(info['num_output_mixtures'] / info['HISTORY_SIZE'])
    info['nmix'] = nmix
    #encoder_model = ConvVAE(info['CODE_LENGTH'], input_size=args.num_condition,
    #                        encoder_output_size=args.encoder_output_size,
    #                        num_output_channels=nmix,
    #                        ).to(DEVICE)
    encoder_model = ConvVAE(info['CODE_LENGTH'],
                            input_size=args.num_condition,
                            encoder_output_size=args.encoder_output_size,
                            num_output_channels=1).to(DEVICE)
    prior_model = PriorNetwork(
        size_training_set=info['NUM_TRAINING_EXAMPLES'],
        code_length=info['CODE_LENGTH'],
        n_mixtures=info['NUM_MIXTURES'],
        k=info['NUM_K'],
        require_unique_codes=info['REQUIRE_UNIQUE_CODES'],
    ).to(DEVICE)
    pcnn_decoder = GatedPixelCNN(input_dim=1,
                                 dim=info['NUM_PCNN_FILTERS'],
                                 n_layers=info['NUM_PCNN_LAYERS'],
                                 n_classes=info['num_actions'],
                                 float_condition_size=info['CODE_LENGTH'],
                                 last_layer_bias=0.5,
                                 hsize=info['hsize'],
                                 wsize=info['wsize']).to(DEVICE)
    parameters = list(encoder_model.parameters()) + list(prior_model.parameters()) + \
        list(pcnn_decoder.parameters())
    parameters = list(encoder_model.parameters()) + list(prior_model.parameters())
    opt = optim.Adam(parameters, lr=info['MODEL_LEARNING_RATE'])
    if args.model_loadpath != '':
        print("loading weights from:%s" % args.model_loadpath)
        encoder_model.load_state_dict(model_dict['encoder_model_state_dict'])
        prior_model.load_state_dict(model_dict['prior_model_state_dict'])
        pcnn_decoder.load_state_dict(model_dict['pcnn_decoder_state_dict'])
        #encoder_model.embedding = model_dict['model_embedding']
        opt.load_state_dict(model_dict['opt_state_dict'])
    model_dict = {
        'encoder_model': encoder_model,
        'prior_model': prior_model,
        'pcnn_decoder': pcnn_decoder,
        'opt': opt
    }
    data_buffers = {'train': train_buffer, 'valid': valid_buffer}
    if args.sample:
        sample_acn(info, model_dict, data_buffers,
                   num_samples=args.num_samples,
                   teacher_force=args.teacher_force)
    else:
        train_acn(info, model_dict, data_buffers)
class Agent:
    def __init__(self, env, env_w, device, config: Config):
        self.env = env
        self.env_w = env_w
        self.device = device
        self.cfg = config
        self.n_actions = config.n_actions
        self.policy_net = config.policy_net
        self.target_net = config.target_net
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()
        self.optimizer = optim.RMSprop(self.policy_net.parameters())
        self.memory = ReplayMemory(10000)
        self.steps_done = 0
        self.episode_durations = []

    def select_action(self, state):
        self.steps_done += 1
        sample = random.random()
        eps_threshold = self.cfg.EPS_END + (self.cfg.EPS_START - self.cfg.EPS_END) * \
            math.exp(-1. * self.steps_done / self.cfg.EPS_DECAY)
        if sample < eps_threshold:
            with torch.no_grad():
                # t.max(1) will return largest column value of each row.
                # second column on max result is index of where max element was
                # found, so we pick action with the larger expected reward.
                # action = self.policy_net(state).max(1)[1]
                action = self.policy_net(state).argmax() % self.n_actions
        else:
            action = random.randrange(self.n_actions)
        return torch.tensor([[action]], device=self.device, dtype=torch.long)

    def optimize_model(self):
        if len(self.memory) < self.cfg.BATCH_SIZE:
            return
        transitions = self.memory.sample(self.cfg.BATCH_SIZE)
        # Transpose the batch (see https://stackoverflow.com/a/19343/3343043 for
        # detailed explanation). This converts batch-array of Transitions
        # to Transition of batch-arrays.
        batch = Transition(*zip(*transitions))
        # Compute a mask of non-final states and concatenate the batch elements
        # (a final state would've been the one after which simulation ended)
        non_final_mask = torch.tensor(tuple(map(lambda s: s is not None, batch.next_state)),
                                      device=self.device, dtype=torch.bool)
        non_final_next_states = torch.cat([s for s in batch.next_state if s is not None])
        state_batch = torch.cat(batch.state)
        action_batch = torch.cat(batch.action)
        reward_batch = torch.cat(batch.reward)
        # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
        # columns of actions taken. These are the actions which would've been taken
        # for each batch state according to policy_net
        state_action_values = self.policy_net(state_batch).gather(1, action_batch)
        # Compute V(s_{t+1}) for all next states.
        # Expected values of actions for non_final_next_states are computed based
        # on the "older" target_net; selecting their best reward with max(1)[0].
        # This is merged based on the mask, such that we'll have either the expected
        # state value or 0 in case the state was final.
        next_state_values = torch.zeros(self.cfg.BATCH_SIZE, device=self.device)
        next_state_values[non_final_mask] = self.target_net(non_final_next_states).max(1)[0].detach()
        # Compute the expected Q values
        expected_state_action_values = (next_state_values * self.cfg.GAMMA) + reward_batch
        # Compute Huber loss
        loss = F.smooth_l1_loss(state_action_values, expected_state_action_values.unsqueeze(1))
        # Optimize the model
        self.optimizer.zero_grad()
        loss.backward()
        for param in self.policy_net.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optimizer.step()

    def step(self, i_episode):
        # Initialize the environment and state
        self.env.reset()
        last_screen = self.env_w.get_screen()
        current_screen = self.env_w.get_screen()
        state = current_screen - last_screen
        for t in count():
            # Select and perform an action
            action = self.select_action(state)
            obs, reward, done, obs_ = self.env.step(action.item())
            # reward = torch.tensor([reward], device=self.device)
            reward = torch.tensor([-abs(obs[2])], device=self.device, dtype=torch.float)
            # Observe new state
            last_screen = current_screen
            current_screen = self.env_w.get_screen()
            if not done:
                next_state = current_screen - last_screen
            else:
                next_state = None
            # Store the transition in memory
            self.memory.push(state, action, next_state, reward)
            # Move to the next state
            state = next_state
            # Perform one step of the optimization (on the target network)
            self.optimize_model()
            if done:
                self.episode_durations.append(t + 1)
                self.env_w.plot_durations(self.episode_durations)
                break
        # Update the target network, copying all weights and biases in DQN
        if i_episode % self.cfg.TARGET_UPDATE == 0:
            self.target_net.load_state_dict(self.policy_net.state_dict())
memoryCapacity = 10000
numEpisodes = 10000
maxStepPerEpisode = 2000
learningRate = 0.03

# rendering
render = True
renderStepDuration = 50
renderEpisodeDuration = 20

# initialization
policyNet = TetrisDQN()
targetNet = TetrisDQN()
policyNet.to(device)
targetNet.to(device)
memory = ReplayMemory(memoryCapacity)
tetris = Tetris()
optimizer = optim.RMSprop(policyNet.parameters(), lr=learningRate)
numSteps = 0
bestEpisodeReward = -1000
done = True

# save results
currentTime = datetime.datetime.now()
timeString = currentTime.strftime("%Y-%m-%d-%H-%M-%S")
if not os.path.exists("results"):
    os.mkdir("results")
directory = "results/" + timeString + "/"
os.mkdir(directory)
configFile = open(directory + "config.txt", "w")
# create environment
env = Environment(rom_file=info['GAME'],
                  frame_skip=info['FRAME_SKIP'],
                  num_frames=info['HISTORY_SIZE'],
                  no_op_start=info['MAX_NO_OP_FRAMES'],
                  rand_seed=info['SEED'],
                  dead_as_end=info['DEAD_AS_END'],
                  max_episode_steps=info['MAX_EPISODE_STEPS'])

# create replay buffer
replay_memory = ReplayMemory(
    action_space=env.action_space,
    size=info['BUFFER_SIZE'],
    frame_height=info['OBS_SIZE'][0],
    frame_width=info['OBS_SIZE'][1],
    agent_history_length=info['HISTORY_SIZE'],
    batch_size=info['BATCH_SIZE'],
    num_heads=info['N_ENSEMBLE'],
    bernoulli_probability=info['BERNOULLI_PROBABILITY'],
    latent_frame_height=info['LATENT_SIZE'],
    latent_frame_width=info['LATENT_SIZE'])

# latent_replay_memory = ReplayMemory(action_space=env.action_space,
#                                     size=info['BUFFER_SIZE'],
#                                     frame_height=info['LATENT_SIZE'],
#                                     frame_width=info['LATENT_SIZE'],
#                                     agent_history_length=info['HISTORY_SIZE'],
#                                     batch_size=info['BATCH_SIZE'],
#                                     num_heads=info['N_ENSEMBLE'],
#                                     bernoulli_probability=info['BERNOULLI_PROBABILITY'])

random_state = np.random.RandomState(info["SEED"])
def train_dqn(env, num_steps, *, replay_size, batch_size, exploration, gamma,
              train_freq=1, print_freq=100, target_network_update_freq=500,
              t_learning_start=1000):
    """
    DQN algorithm.

    Compared to previous training procedures, we will train for a given number
    of time-steps rather than a given number of episodes. The number of
    time-steps will be in the range of millions, which still results in many
    episodes being executed.

    Args:
        - env: The openai Gym environment
        - num_steps: Total number of steps to be used for training
        - replay_size: Maximum size of the ReplayMemory
        - batch_size: Number of experiences in a batch
        - exploration: an ExponentialSchedule
        - gamma: The discount factor

    Returns: (dqn_model, returns, lengths, losses)
        - dqn_model: The trained DQN model
        - returns: Numpy array containing the return of each training episode
        - lengths: Numpy array containing the length of each training episode
        - losses: Numpy array containing the loss of each training batch
    """
    # check that environment states are compatible with our DQN representation
    assert (isinstance(env.observation_space, gym.spaces.Box)
            and len(env.observation_space.shape) == 1)

    # get the state_size from the environment
    state_size = env.observation_space.shape[0]

    # initialize the DQN and DQN-target models
    dqn_model = DQN(state_size, env.action_space.n)
    dqn_target = DQN.custom_load(dqn_model.custom_dump())

    # initialize the optimizer
    optimizer = torch.optim.Adam(dqn_model.parameters(), lr=5e-4)

    # initialize the replay memory
    memory = ReplayMemory(replay_size, state_size)

    # initiate lists to store returns, lengths and losses
    rewards = []
    returns = []
    lengths = []
    losses = []
    last_100_returns = deque(maxlen=100)
    last_100_lengths = deque(maxlen=100)

    # initiate structures to store the models at different stages of training
    saved_models = {}

    i_episode = 0
    t_episode = 0
    state = env.reset()

    # iterate for a total of `num_steps` steps
    for t_total in range(num_steps):
        # use t_total to indicate the time-step from the beginning of training
        if t_total >= t_learning_start:
            eps = exploration.value(t_total - t_learning_start)
        else:
            eps = 1.0
        action = select_action_epsilon_greedy(dqn_model, state, eps, env)
        next_state, reward, done, _ = env.step(action)
        memory.add(state, action, reward, next_state, done)
        rewards.append(reward)
        state = next_state

        if t_total >= t_learning_start and t_total % train_freq == 0:
            batch = memory.sample(batch_size)
            loss = train_dqn_batch(optimizer, batch, dqn_model, dqn_target, gamma)
            losses.append(loss)

        # update target network
        if t_total >= t_learning_start and t_total % target_network_update_freq == 0:
            dqn_target.load_state_dict(dqn_model.state_dict())

        if done:
            # Calculate episode returns
            G = 0
            for i in range(len(rewards)):
                G += rewards[i] * pow(gamma, i)
            # Collect results
            lengths.append(t_episode + 1)
            returns.append(G)
            last_100_returns.append(G)
            last_100_lengths.append(t_episode + 1)
            if i_episode % print_freq == 0:
                logger.record_tabular("time step", t_total)
                logger.record_tabular("episodes", i_episode)
                logger.record_tabular("step", t_episode + 1)
                logger.record_tabular("return", G)
                logger.record_tabular("mean reward", np.mean(last_100_returns))
                logger.record_tabular("mean length", np.mean(last_100_lengths))
                logger.record_tabular("% time spent exploring", int(100 * eps))
                logger.dump_tabular()
            # End of episode so reset time, reset rewards list
            t_episode = 0
            rewards = []
            # Environment terminated so reset it
            state = env.reset()
            # Increment the episode index
            i_episode += 1
        else:
            t_episode += 1

    return (
        dqn_model,
        np.array(returns),
        np.array(lengths),
        np.array(losses),
    )
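# --- Usage sketch (not from the original source): the environment id and the
# ExponentialSchedule constructor arguments below are hypothetical; the only
# property train_dqn() relies on is exploration.value(t).
env = gym.make('CartPole-v1')
exploration = ExponentialSchedule(1.0, 0.05, 1_000_000)   # signature assumed
dqn_model, returns, lengths, losses = train_dqn(
    env, num_steps=1_500_000, replay_size=200_000, batch_size=64,
    exploration=exploration, gamma=0.99)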
class DDQN_separated_net(Agent_segment):
    def __init__(self, epsilon=0.3, memory_size=300, batch_size=16, model=navigation_model,
                 target_update_interval=1, tau=0.005):
        super(DDQN_separated_net, self).__init__(epsilon=epsilon, random_can_stop=False)

        # Memory
        self.memory = ReplayMemory(memory_size)

        # Batch size when learning
        self.batch_size = batch_size

        # number of time steps before an update of the delayed target Q network
        self.target_update_interval = target_update_interval

        # soft update weight of the delayed Q network
        self.tau = tau

    def learned_act(self, s, pred_oracle=True, online=False):
        if online:
            if pred_oracle:
                return torch.cat([self.model(s), oracle(s).unsqueeze(1)], 1)
        with torch.no_grad():
            if pred_oracle:
                return torch.cat([self.target_model(s), oracle(s).unsqueeze(1)], 1)
                # to do without oracle

    def reinforce(self, s_, a_, n_s_, r_, game_over_, env_steps_):
        # Two steps: first memorize the states, second learn from the pool
        self.memory.remember(s_, a_, n_s_, r_, game_over_)
        transitions = self.memory.sample(self.batch_size)
        batch = Transition(*zip(*transitions))
        # Compute a mask of non-final states and concatenate the batch elements
        # (a final state would've been the one after which simulation ended)
        # non_final_mask = torch.tensor(torch.cat(batch.game_over), device=device)==False
        non_final_mask = torch.cat(batch.game_over) == False
        non_final_next_states = torch.cat(batch.next_state)[non_final_mask]
        state_batch = torch.cat(batch.state)
        action_batch = torch.cat(batch.action).view(-1, 2)
        reward_batch = torch.cat(batch.reward)
        # non_final_next_states = torch.cat(batch.next_state)[non_final_index]
        # print(state_batch.shape)
        state_values = self.learned_act(state_batch, online=True)
        state_action_values = torch.cat(
            [s[a[0].item(), a[1].item()].unsqueeze(0) for s, a in zip(state_values, batch.action)])
        next_state_values = torch.zeros(self.batch_size, device=device)
        if len(non_final_next_states) > 0:
            with torch.no_grad():
                argmax_online = (self.learned_act(non_final_next_states, online=True)).view(
                    non_final_next_states.shape[0], -1).argmax(1)
                # print(torch.tensor(range(self.batch_size), device=device)[non_final_mask])
                # print(self.learned_act(non_final_next_states, online=False).view(-1, 2*SEGMENT_LENGTH).shape)
                next_state_values[non_final_mask] = \
                    self.learned_act(non_final_next_states, online=False).view(
                        non_final_next_states.shape[0], -1)[range(len(non_final_next_states)), argmax_online]
        expected_state_action_values = next_state_values + reward_batch
        loss = F.smooth_l1_loss(state_action_values[non_final_mask],
                                expected_state_action_values[non_final_mask])  # .unsqueeze(1))
        # loss = F.mse_loss(state_action_values[non_final_mask], expected_state_action_values[non_final_mask])
        # Optimize the model
        self.optimizer.zero_grad()
        loss.backward()
        for param in self.model.parameters():
            # HINT: clip to avoid exploding gradients -- clipping is a bit tighter here
            param.grad.data.clamp_(-1e-6, 1e-6)
        self.optimizer.step()
        if env_steps_ % self.target_update_interval == 0:
            soft_update(self.target_model, self.model, self.tau)
        return float(loss)

    def save_model(self, model_path='model.pickle'):
        try:
            torch.save(self.model, model_path)
        except:
            pass

    def load_model(self, model_path='model.pickle', local=True):
        if local:
            self.model = navigation_model()
            self.target_model = navigation_model()
            hard_update(self.target_model, self.model)
        else:
            self.model = torch.load('model.pickle')
            self.target_model = torch.load('model.pickle')
        if torch.cuda.is_available():
            print('Using GPU')
            self.model.cuda()
            self.target_model.cuda()
        else:
            print('Using CPU')
        self.optimizer = optim.RMSprop(self.model.parameters(), lr=1e-5)
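# --- soft_update() and hard_update() are called above but not defined in this
# snippet; a common Polyak-averaging sketch consistent with how they are used
# (tau blends the online network into the target, hard_update copies it
# outright) would be:
def soft_update(target_model, online_model, tau):
    for target_param, online_param in zip(target_model.parameters(), online_model.parameters()):
        target_param.data.copy_(tau * online_param.data + (1.0 - tau) * target_param.data)

def hard_update(target_model, online_model):
    target_model.load_state_dict(online_model.state_dict())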
def init_train():
    train_data_file = args.train_buffer
    data_dir = os.path.split(train_data_file)[0]
    valid_data_file = args.valid_buffer
    #valid_data_file = '/usr/local/data/jhansen/planning/model_savedir/FRANKbootstrap_priorfreeway00/valid_set_small.npz'
    if args.model_loadpath == '':
        train_cnt = 0
        run_num = 0
        model_base_filedir = os.path.join(data_dir, args.savename + '%02d' % run_num)
        while os.path.exists(model_base_filedir):
            run_num += 1
            model_base_filedir = os.path.join(data_dir, args.savename + '%02d' % run_num)
        os.makedirs(model_base_filedir)
        model_base_filepath = os.path.join(model_base_filedir, args.savename)
        print("MODEL BASE FILEPATH", model_base_filepath)
        info = {
            'model_train_cnts': [],
            'model_train_losses_list': [],
            'model_valid_cnts': [],
            'model_valid_losses_list': [],
            'model_save_times': [],
            'model_last_save': 0,
            'model_last_plot': 0,
            'NORM_BY': 255.0,
            'model_model_loadpath': args.model_loadpath,
            'MODEL_MODEL_BASE_FILEDIR': model_base_filedir,
            'model_model_base_filepath': model_base_filepath,
            'model_train_data_file': train_data_file,
            'model_SAVENAME': args.savename,
            'DEVICE': DEVICE,
            'NUM_Z': args.num_z,
            'NUM_K': args.num_k,
            'NR_LOGISTIC_MIX': args.nr_logistic_mix,
            'BETA': args.beta,
            'ALPHA_REC': args.alpha_rec,
            'ALPHA_ACT': args.alpha_act,
            'ALPHA_REW': args.alpha_rew,
            'MODEL_BATCH_SIZE': args.batch_size,
            'NUMBER_CONDITION': args.number_condition,
            'MODEL_LEARNING_RATE': args.learning_rate,
            'MODEL_SAVE_EVERY': args.save_every,
        }
        ## size of latents flattened - dependent on architecture of vqvae
        #info['float_condition_size'] = 100*args.num_z
        ## 3x logistic needed for loss
        ## TODO - change loss
    else:
        print('loading model from: %s' % args.model_loadpath)
        model_dict = torch.load(args.model_loadpath)
        info = model_dict['model_info']
        model_base_filedir = os.path.split(args.model_loadpath)[0]
        model_base_filepath = os.path.join(model_base_filedir, args.savename)
        train_cnt = info['model_train_cnts'][-1]
        info['loaded_from'] = args.model_loadpath
        info['MODEL_BATCH_SIZE'] = args.batch_size
        #if 'reward_weights' not in info.keys():
        #    info['reward_weights'] = [1, 100]
    # create replay buffers
    train_buffer = ReplayMemory(load_file=train_data_file)
    valid_buffer = ReplayMemory(load_file=valid_data_file)
    info['num_actions'] = train_buffer.num_actions()
    info['size_training_set'] = train_buffer.num_examples()
    info['hsize'] = train_buffer.frame_height
    info['wsize'] = train_buffer.frame_width
    info['num_rewards'] = train_buffer.num_rewards()
    # reweight loss terms by inverse class frequency
    rewards_weight = 1 - np.array(train_buffer.percentages_rewards())
    actions_weight = 1 - np.array(train_buffer.percentages_actions())
    actions_weight = torch.FloatTensor(actions_weight).to(DEVICE)
    rewards_weight = torch.FloatTensor(rewards_weight).to(DEVICE)
    info['actions_weight'] = actions_weight
    info['rewards_weight'] = rewards_weight
    # output mixtures should be 2*nr_logistic_mix + nr_logistic_mix for each
    # decorrelated channel
    info['HISTORY_SIZE'] = 4
    info['num_output_mixtures'] = (2 * args.nr_logistic_mix + args.nr_logistic_mix) * info['HISTORY_SIZE']
    nmix = int(info['num_output_mixtures'] / info['HISTORY_SIZE'])
    info['nmix'] = nmix
    vqvae_model = VQVAErl(
        num_clusters=info['NUM_K'],
        encoder_output_size=info['NUM_Z'],
        num_output_mixtures=info['num_output_mixtures'],
        in_channels_size=info['NUMBER_CONDITION'],
        n_actions=info['num_actions'],
        int_reward=info['num_rewards'],
    ).to(DEVICE)
    print('using args', args)
    parameters = list(vqvae_model.parameters())
    opt = optim.Adam(parameters, lr=info['MODEL_LEARNING_RATE'])
    if args.model_loadpath != '':
        print("loading weights from: %s" % args.model_loadpath)
        vqvae_model.load_state_dict(model_dict['modelvae_state_dict'])
        opt.load_state_dict(model_dict['model_optimizer'])
        vqvae_model.embedding = model_dict['model_embedding']
    #args.pred_output_size = 1*80*80
    ## 10 is result of structure of network
    #args.z_input_size = 10*10*args.num_z
    #train_cnt = train_vqvae(train_cnt, vqvae_model, opt, info, train_data_loader, valid_data_loader)
    run_vqvae(info, vqvae_model, opt, train_buffer, valid_buffer,
              num_samples_to_train=args.num_samples_to_train,
              save_every_samples=args.save_every)
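# run_vqvae is not shown here, but for the resume branch in init_train to work,
# whatever saves the checkpoint must write the keys read back above
# ('model_info', 'modelvae_state_dict', 'model_optimizer', 'model_embedding').
# A hypothetical helper sketching that checkpoint layout:
def save_vqvae_checkpoint(path, vqvae_model, opt, info, train_cnt):
    info['model_train_cnts'].append(train_cnt)
    torch.save({
        'model_info': info,
        'modelvae_state_dict': vqvae_model.state_dict(),
        'model_optimizer': opt.state_dict(),
        'model_embedding': vqvae_model.embedding,
    }, path)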
# Create default Malmo objects: agent_host = MalmoPython.AgentHost() try: agent_host.parse(sys.argv) except RuntimeError as e: print('ERROR:', e) print(agent_host.getUsage()) exit(1) if agent_host.receivedArgument("help"): print(agent_host.getUsage()) exit(0) net = cnn.Net(farm_size, 4) memory = ReplayMemory(1000) for episode in range(5): my_mission = MalmoPython.MissionSpec(missionXML, True) my_mission_record = MalmoPython.MissionRecordSpec() # Attempt to start a mission: max_retries = 3 for retry in range(max_retries): try: agent_host.startMission(my_mission, my_mission_record) break except RuntimeError as e: if retry == max_retries - 1: print("Error starting mission:", e) exit(1)
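# The Malmo snippet above stops right after startMission. In the standard Malmo
# examples the next step is to poll the world state until the mission has begun;
# a sketch of that loop, assuming the usual MalmoPython API:
import time

world_state = agent_host.getWorldState()
while not world_state.has_mission_begun:
    time.sleep(0.1)
    world_state = agent_host.getWorldState()
    for error in world_state.errors:
        print("Error:", error.text)
print("Mission running")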
env = Environment(rom_file=info['GAME'], frame_skip=info['FRAME_SKIP'], num_frames=info['HISTORY_SIZE'], no_op_start=info['MAX_NO_OP_FRAMES'], rand_seed=info['SEED'], dead_as_end=info['DEAD_AS_END'], max_episode_steps=info['MAX_EPISODE_STEPS']) # create replay buffer if info['REPLAY_MEMORY_LOADPATH'] == 0: replay_memory = ReplayMemory( action_space=env.action_space, size=info['BUFFER_SIZE'], frame_height=info['OBS_SIZE'][0], frame_width=info['OBS_SIZE'][1], agent_history_length=info['HISTORY_SIZE'], batch_size=info['BATCH_SIZE'], num_heads=info['N_ENSEMBLE'], bernoulli_probability=info['BERNOULLI_PROBABILITY'], #latent_frame_height=info['LATENT_SIZE'], #latent_frame_width=info['LATENT_SIZE']) ) else: replay_memory = ReplayMemory(load_file=info['REPLAY_MEMORY_LOADPATH']) valid_replay_memory = ReplayMemory( load_file=info['REPLAY_MEMORY_VALID_LOADPATH']) start_step_number = replay_memory.count random_state = np.random.RandomState(info["SEED"]) if args.model_loadpath != '': # load data from loadpath - save model load for later. we need some of
lambda: GCNBoard(env.n_resources + 2, 8, env.n_resources, env.n_nodes, 0.2), representation, env, 'board') trainer_node = [] for i in range(env.adj.shape[0]): trainer_node.append( Trainer( lambda: GCNNode(env.n_resources + 2, 8, env.degree[i], env. n_nodes, 0.2, 'node' + str(i)), representation, env, 'node')) trainer_score = ScorePredictionTrainer( lambda: ScorePredictionFunc(env.n_resources + 2, 8, env.n_nodes, 0.2), representation, env) mem_scores = ReplayMemory(500, { "sts": [env.adj.shape[0], env.n_resources], "features": [env.features.shape[0], env.features.shape[1]], "scores": [] }, batch_size=20) mem_board = ReplayMemory( 100, { "sts": [env.adj.shape[0], env.n_resources], "features": [env.features.shape[0], env.features.shape[1]], "pi": [env.n_resources + 1], "return": [] }) mem_node = [] for i in range(env.adj.shape[0]): mem_node.append( ReplayMemory( 100, { "sts": [env.adj.shape[0], env.n_resources],
def setup(data_dir, savename, train_data_file, model_loadpath=''):
    data_dir = os.path.split(train_data_file)[0]
    train_buffer = ReplayMemory(load_file=train_data_file)
    if args.model_loadpath == '':
        train_cnt = 0
        run_num = 0
        model_base_filedir = os.path.join(data_dir, savename + '%02d' % run_num)
        while os.path.exists(model_base_filedir):
            run_num += 1
            model_base_filedir = os.path.join(data_dir, savename + '%02d' % run_num)
        os.makedirs(model_base_filedir)
        model_base_filepath = os.path.join(model_base_filedir, savename)
        print("MODEL BASE FILEPATH", model_base_filepath)
        info = {
            'train_cnts': [],
            'train_losses_list': [],
            'valid_cnts': [],
            'valid_losses_list': [],
            'save_times': [],
            'savename': savename,
            'data_dir': data_dir,
            #'args': [args],
            'last_save': 0,
            'last_plot': 0,
            'reward_weights': [1, 100],  # should be same as num_rewards
        }
    else:
        print('loading model from: %s' % model_loadpath)
        model_dict = torch.load(model_loadpath)
        info = model_dict['info']
        model_base_filedir = os.path.split(model_loadpath)[0]
        model_base_filepath = os.path.join(model_base_filedir, args.savename)
        train_cnt = info['train_cnts'][-1]
        info['loaded_from'] = model_loadpath
        if 'reward_weights' not in info.keys():
            info['reward_weights'] = [1, 100]
    # dataset statistics come from the loaded replay buffer
    num_actions = info['num_actions'] = train_buffer.num_actions()
    num_rewards = info['num_rewards'] = train_buffer.num_rewards()
    args.size_training_set = train_buffer.num_examples()
    hsize = train_buffer.frame_height
    wsize = train_buffer.frame_width
    info['hsize'] = hsize
    info['num_channels'] = num_actions + 1 + 1
    # !!!! TODO save this in npz and pull out
    #num_k = info['num_k'] = 512
    ############################################
    # load vq model
    vq_model_dict = torch.load(args.vq_model_loadpath,
                               map_location=lambda storage, loc: storage)
    vq_info = vq_model_dict['info']
    vq_largs = vq_info['args'][-1]
    nmix = int(vq_info['num_output_mixtures'] / 2)
    ############################################
    num_k = vq_largs.num_k
    vqvae_model = VQVAErl(num_clusters=num_k,
                          encoder_output_size=vq_largs.num_z,
                          num_output_mixtures=vq_info['num_output_mixtures'],
                          in_channels_size=vq_largs.number_condition,
                          n_actions=vq_info['num_actions'],
                          int_reward=vq_info['num_rewards']).to(DEVICE)
    vqvae_model.load_state_dict(vq_model_dict['vqvae_state_dict'])
    vqvae_model.eval()
    #conv_forward_model = ForwardResNet(BasicBlock, data_width=info['hsize'],
    #                                   num_channels=info['num_channels'],
    #                                   num_actions=num_actions,
    #                                   num_output_channels=num_k,
    #                                   num_rewards=num_rewards,
    #                                   dropout_prob=args.dropout_prob).to(DEVICE)
    conv_forward_model = ForwardResNet(BasicBlock,
                                       data_width=info['hsize'],
                                       num_channels=info['num_channels'],
                                       num_output_channels=num_k,
                                       dropout_prob=args.dropout_prob).to(DEVICE)
    # reweight the data based on its frequency
    info['actions_weight'] = 1 - np.array(train_buffer.percentages_actions())
    info['rewards_weight'] = 1 - np.array(train_buffer.percentages_rewards())
    actions_weight = torch.FloatTensor(info['actions_weight']).to(DEVICE)
    rewards_weight = torch.FloatTensor(info['rewards_weight']).to(DEVICE)
    parameters = list(conv_forward_model.parameters())
    opt = optim.Adam(parameters, lr=args.learning_rate)
    if args.model_loadpath != '':
        conv_forward_model.load_state_dict(model_dict['conv_forward_model'])
        opt.load_state_dict(model_dict['optimizer'])
    #args.pred_output_size = 1*80*80
    ## 10 is result of structure of network
    #args.z_input_size = 10*10*args.num_z
    train_cnt = train_forward(train_cnt)
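# The 1 - class-frequency weights computed in setup are typically fed to a
# weighted cross-entropy so that rare actions/rewards contribute more to the
# loss. A self-contained illustration with placeholder tensors (not the
# project's training loop):
import torch
import torch.nn.functional as F

num_actions, batch = 4, 8
actions_weight = torch.tensor([0.1, 0.9, 0.5, 0.5])      # 1 - per-class frequency
pred_action_logits = torch.randn(batch, num_actions)     # stand-in model output
true_actions = torch.randint(0, num_actions, (batch,))
action_loss = F.cross_entropy(pred_action_logits, true_actions, weight=actions_weight)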
def train_gen_pg_each(generator, agent, discriminator, epoch, trainSample, subnum, optimizer_agent, optimizer_usr, batch_size, embed_dim, recom_length, max_length, real_label_num, device, gen_ratio, pretrain = False, shuffle_index=None): generator.train() agent.train() print('\nTRAINING : Epoch ' + str(epoch)) generator.train() all_costs = [] logs = [] decay = 0.95 gamma = 0.9 max_norm=5 all_num=0 last_time = time.time() #Adjust the learning rate if epoch>1: optimizer_agent.param_groups[0]['lr'] = optimizer_agent.param_groups[0]['lr'] * decay optimizer_usr.param_groups[0]['lr'] = optimizer_usr.param_groups[0]['lr'] * decay print('Learning rate_agent : {0}'.format(optimizer_agent.param_groups[0]['lr'])) print('Learning rate_usr : {0}'.format(optimizer_usr.param_groups[0]['lr'])) #Generate subsamples trainSample_sub = Sample() trainSample_sub.subSample_copy(subnum, trainSample, shuffle_index) for stidx in range(0, trainSample_sub.length(), batch_size): # prepare batch embed_batch, length, _, reward_batch, action_batch = getBatch_dis(stidx, stidx + batch_size, trainSample_sub, embed_dim, recom_length) embed_batch, reward_batch, action_batch = Variable(embed_batch.to(device)), Variable(reward_batch.to(device)), Variable(action_batch.to(device)) k = embed_batch.size(0) #Actual batch size replay = ReplayMemory(generator, agent, int((1+gen_ratio)*k), max_length, real_label_num, action_batch.size(1)) replay.init_click((embed_batch, length), reward_batch, action_batch) replay.gen_sample(batch_size, True, discriminator) tgt_reward, gen_reward, usr_prob, agent_prob = replay.tgt_rewards.type(torch.FloatTensor).to(device), replay.gen_rewards.type(torch.FloatTensor).to(device), replay.usr_probs.to(device), replay.agent_probs.to(device) tgt_prob = torch.abs(1.0-torch.round(tgt_reward)-tgt_reward) tgt_reward = torch.round(tgt_reward) if not pretrain: loss_usr = -((torch.log(usr_prob + 1e-12) + torch.log(tgt_prob + 1e-12)) * gen_reward).sum()/k #Calculate the cumulative reward tgt_reward = gen_reward * (1 + tgt_reward) tgt_value = generator.value(tgt_reward) #loss_agent = -(torch.log(agent_prob + 1e-12) * (gen_reward + tgt_value)).sum()/k #+ 1e-18 loss_agent = -(torch.log(agent_prob + 1e-12) * (tgt_value)).sum()/k #+ 1e-18 all_costs.append(loss_agent.data.cpu().numpy()) # backward optimizer_agent.zero_grad() optimizer_usr.zero_grad() if not pretrain: loss_usr.backward(retain_graph=True) #Print gradients for each layer ''' print("Gradients for user behavior models:") print("Embedding:") generator.embedding.print_grad() print("Encoder:") generator.encoder.print_grad() print("MLPlayer:") print(generator.enc2out.weight.grad) ''' #Gradient clipping clip_grad_value_(filter(lambda p: p.requires_grad, generator.parameters()), 1) #clip_grad_norm_(filter(lambda p: p.requires_grad, generator.parameters()), 5) optimizer_usr.step() loss_agent.backward() #Gradient clipping clip_grad_value_(filter(lambda p: p.requires_grad, agent.parameters()), 1) #clip_grad_norm_(filter(lambda p: p.requires_grad, agent.parameters()), 5) # optimizer step optimizer_agent.step() # Printing if len(all_costs) == 100: logs.append( '{0} ; loss {1} ; seq/s {2}'.format(stidx, round(np.mean(all_costs),2), int(len(all_costs) * batch_size / (time.time() - last_time)))) print(logs[-1]) last_time = time.time() all_costs = [] return all_costs
info['num_rewards'] = len(info['REWARD_SPACE']) # create environment env = Environment(rom_file=info['GAME'], frame_skip=info['FRAME_SKIP'], num_frames=info['HISTORY_SIZE'], no_op_start=info['MAX_NO_OP_FRAMES'], rand_seed=info['SEED'], dead_as_end=info['DEAD_AS_END'], max_episode_steps=info['MAX_EPISODE_STEPS']) # create replay buffer replay_memory = ReplayMemory( size=info['BUFFER_SIZE'], frame_height=info['OBS_SIZE'][0], frame_width=info['OBS_SIZE'][1], agent_history_length=info['HISTORY_SIZE'], batch_size=info['BATCH_SIZE'], num_heads=info['N_ENSEMBLE'], bernoulli_probability=info['BERNOULLI_PROBABILITY']) latent_replay_memory = ReplayMemory( size=info['BUFFER_SIZE'], frame_height=info['LATENT_SIZE'], frame_width=info['LATENT_SIZE'], agent_history_length=info['HISTORY_SIZE'], batch_size=info['BATCH_SIZE'], num_heads=info['N_ENSEMBLE'], bernoulli_probability=info['BERNOULLI_PROBABILITY']) random_state = np.random.RandomState(info["SEED"]) if args.model_loadpath != '':
def make_random_subset_buffers(dataset_path, buffer_path, train_max_examples=100000,
                               kernel_size=(2, 2), trim_before=0, trim_after=0):
    sys.path.append('../agents')
    from replay import ReplayMemory
    # keep max_examples < 100000 to enable knn search
    # states [top of image:bottom of image,:]
    # in breakout - can safely reduce size to be 40x40 of the given image
    # try to get an even number of each type of reward
    if not os.path.exists(dataset_path):
        os.makedirs(dataset_path)
    buffer_name = os.path.split(buffer_path)[1]
    buffers = {}
    paths = {}
    for phase in ['valid', 'train']:
        if phase == 'valid':
            max_examples = int(0.15 * train_max_examples)
        else:
            max_examples = train_max_examples
        small_name = buffer_name.replace(
            '.npz', '_random_subset_%06d_%sx%stb%sta%s_%s.npz' % (
                max_examples, kernel_size[0], kernel_size[1], trim_before, trim_after, phase))
        small_path = os.path.join(dataset_path, small_name)
        paths[phase] = small_path
        if os.path.exists(small_path):
            print('loading small buffer path')
            print(small_path)
            sbuffer = ReplayMemory(load_file=small_path)
            sbuffer.init_unique()
            buffers[phase] = sbuffer
    # if we dont have both train and valid - make completely new train/valid set
    if not len(buffers.keys()) == 2:
        print('creating new train/valid buffers')
        load_buffer = ReplayMemory(load_file=buffer_path)
        orig_states = []
        small_states = []
        for index in range(10, 400):
            if load_buffer.is_valid_index(index):
                s, _ = load_buffer._get_state(index)
                orig_states.append(s[-1])
                small_states.append(
                    load_buffer.online_shrink_frame_size(
                        s[-1], trim_before, kernel_size, trim_after))
        bdir = small_path.replace('.npz', '')
        if not os.path.exists(bdir):
            os.makedirs(bdir)
        image_path = os.path.join(bdir, 'step_%03d.png')
        movie_path = os.path.join(bdir, 'movie.mp4')
        for index in range(len(orig_states)):
            f, ax = plt.subplots(1, 2)
            ax[0].matshow(orig_states[index])
            ax[1].matshow(small_states[index])
            plt.savefig(image_path % index)
            plt.close()
        cmd = "ffmpeg -n -r 10 -i %s -c:v libx264 -pix_fmt yuv420p %s" % (
            os.path.abspath(image_path), os.path.abspath(movie_path))
        print(cmd)
        os.system(cmd)
        if max(list(kernel_size) + [trim_before, trim_after]) > 1:
            load_buffer.shrink_frame_size(kernel_size=kernel_size,
                                          reduction_function=np.max,
                                          trim_before=trim_before,
                                          trim_after=trim_after)
        #for r in range(states.shape[0]):
        #    imwrite('mp%s.png'%r, states[r,-1])
        load_buffer.reset_unique()
        # history_length + 1 for every random example
        frame_multiplier = (load_buffer.agent_history_length + 1)
        #frame_multiplier = 2
        total_frames_needed = int((max_examples * 1.15) * frame_multiplier) + 1
        # not sure why we weren't allowing overlapping frames
        #total_frames_needed = int((max_examples*1.15))
        if load_buffer.count < total_frames_needed % load_buffer.size:
            raise ValueError(
                'load buffer is not large enough (%s) to collect number of examples (%s)'
                % (load_buffer.count, total_frames_needed))
        print('loading prescribed buffer path.... this may take a while')
        print(buffer_path)
        for phase in ['valid', 'train']:
            if phase == 'valid':
                max_examples = int(0.15 * train_max_examples)
            else:
                max_examples = train_max_examples
            print('creating small %s buffer with %s examples' % (phase, max_examples))
            # actions for breakout:
            # ['NOOP', 'FIRE', 'RIGHT', 'LEFT']
            frames_needed = max_examples * frame_multiplier
            sbuffer = ReplayMemory(frames_needed,
                                   frame_height=load_buffer.frame_height,
                                   frame_width=load_buffer.frame_width,
                                   agent_history_length=load_buffer.agent_history_length)
            num_examples = 0
            while num_examples < max_examples:
                batch = load_buffer.get_unique_minibatch(1)
                states, actions, rewards, next_states, real_terminal_flags, _, unique_indices, index_indices = batch
                bs, num_hist, h, w = states.shape
                # action is the action that was used to get from state to next state
                # t-4, t-3, t-2, t-1, t
                # s-4, s-3, s-2, s-1
                # s-3, s-2, s-1, s
                past_indices = np.arange(unique_indices - (num_hist), unique_indices + 1)
                for batch_idx in range(bs):
                    # get t-4 thru t=0
                    # size is bs,5,h,w
                    all_states = np.hstack((states[:, 0:1], next_states))
                    for ss in range(num_hist + 1):
                        # only use batch size 1 in minibatch
                        # frame is "next state" in replay buffer
                        frame = all_states[batch_idx, ss]
                        action = load_buffer.actions[past_indices[ss]]
                        reward = load_buffer.rewards[past_indices[ss]]
                        if ss == num_hist:
                            # this is the observed state and the only one we will
                            # use a true action/reward for
                            #action = actions[batch_idx]
                            #reward = rewards[batch_idx]
                            terminal_flag = True
                            end_flag = True
                            num_examples += 1
                            if not num_examples % 5000:
                                print('added %s examples to %s buffer' % (num_examples, phase))
                        else:
                            # use this to debug and assert that all actions/rewards
                            # in sampled minibatch of sbuffer are < 99
                            terminal_flag = False
                            end_flag = False
                        sbuffer.add_experience(action, frame, reward, terminal_flag, end_flag)
            sbuffer.rewards = sbuffer.rewards.astype(np.int32)
            sbuffer.init_unique()
            sbuffer.save_buffer(paths[phase])
            buffers[phase] = sbuffer
    return buffers, paths
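# A minimal usage sketch for make_random_subset_buffers; the paths and sizes
# below are placeholders, not files that ship with this code:
buffers, paths = make_random_subset_buffers(
    dataset_path='datasets/breakout_subsets',
    buffer_path='model_savedir/breakout00/training_buffer.npz',
    train_max_examples=50000,
    kernel_size=(2, 2), trim_before=0, trim_after=0)
train_buffer, valid_buffer = buffers['train'], buffers['valid']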