def train(self):
    for t in range(self.num_iterations):
        actions, returns, losses = self.sample_model_reward_return(t)
        self.sess.run(self.train_op, feed_dict={
            self.action_placeholder: actions,
            self.advantage_placeholder: returns - losses})

        # avg_acc = np.mean(returns)
        # data_mean / data_std: normalization constants assumed to be defined at module level
        avg_acc = (np.mean(returns) * data_std + data_mean) / 100000.

        # calculate number of used models:
        used = 0
        for key in self._used_dict.keys():
            used += self._used_dict[key]
        # used = np.sum(self._used_dict)
        self._num_used_models.append(used)

        self.log_acc.append(avg_acc)
        # sigma_reward = np.sqrt(np.var(returns) / len(total_rewards))
        msg = "Average accuracy within a batch: {:04.2f}".format(avg_acc * 100)
        self.logger.info(msg)
        # print(actions)

    self.logger.info("- Training done.")
    # export_plot(self.log_acc, "Batch_Accuracy", 'NAS-DNN', "./batch_accuracy.png",
    #             self._num_used_models, "Sampled Model")
    export_plot(self.log_acc, "Score", 'NAS-DNN', "./batch_accuracy.png")
    export_plot(self._num_used_models, "Models Sampled", 'NAS-DNN', "./used_models.png")
    print('log_acc')
    print(self.log_acc)
    print('_num_used_models')
    print(self._num_used_models)
def train(self): """ Performs training You do not have to change or use anything here, but take a look to see how all the code you've written fits together! """ last_eval = 0 last_record = 0 scores_eval = [] self.init_averages() scores_eval = [] # list of scores computed at iteration time for t in range(self.config.num_batches): # collect a minibatch of samples paths, total_rewards = self.sample_path(self.env) scores_eval = scores_eval + total_rewards observations = np.concatenate( [path["observation"] for path in paths]) actions = np.concatenate([path["action"] for path in paths]) rewards = np.concatenate([path["reward"] for path in paths]) # compute Q-val estimates (discounted future returns) for each time # step returns = self.get_returns(paths) advantages = self.calculate_advantage(returns, observations) # run training operations if self.config.use_baseline: self.update_baseline(returns, observations) self.sess.run(self.train_op, feed_dict={ self.observation_placeholder: observations, self.action_placeholder: actions, self.advantage_placeholder: advantages}) # tf stuff if (t % self.config.summary_freq == 0): self.update_averages(total_rewards, scores_eval) self.record_summary(t) # compute reward statistics for this batch and log avg_reward = np.mean(total_rewards) sigma_reward = np.sqrt(np.var(total_rewards) / len(total_rewards)) msg = "Average reward: {:04.2f} +/- {:04.2f}".format( avg_reward, sigma_reward) self.logger.info(msg) if self.config.record and (last_record > self.config.record_freq): self.logger.info("Recording...") last_record = 0 self.record() self.logger.info("- Training done.") export_plot( scores_eval, "Score", config.env_name, self.config.plot_output)
def train(self, lr_schedule):
    """
    Performs training of Q

    Args:
        lr_schedule: Schedule for learning rate
    """
    self.init_averages()

    t = last_eval = curri_idx = 0  # time control of nb of steps
    scores_eval = []  # list of scores computed at iteration time

    prog = Progbar(target=self.config.nsteps_train)
    rcopy = RepeatCopy(num_bits=self.config.num_bits,
                       batch_size=self.config.batch_size,
                       min_length=self.config.min_length,
                       max_length=self.config.max_length,
                       min_repeats=self.config.min_repeats,
                       max_repeats=self.config.max_repeats)

    # interact with environment
    while t < self.config.nsteps_train:
        t += 1
        last_eval += 1
        config = self.config
        batch_data = rcopy()

        # perform a training step
        loss_eval, grad_eval = self.train_step(t, lr_schedule.epsilon, batch_data)

        # logging stuff
        if ((t % config.log_freq == 0) and (t % config.learning_freq == 0)):
            self.update_averages(scores_eval)
            lr_schedule.update(t)
            prog.update(t + 1, exact=[("Loss", loss_eval), ("Grads", grad_eval),
                                      ("lr", lr_schedule.epsilon)])

        if t >= config.nsteps_train:
            break

        if last_eval >= config.eval_freq:
            # evaluate our policy
            last_eval = 0
            print("")
            self.logger.info("Global step: %d" % (t))
            scores_eval += [self.evaluate()]

    # last words
    self.logger.info("- Training done.")
    self.save(t)
    scores_eval += [self.evaluate()]
    export_plot(scores_eval, "Scores", self.config.plot_output)
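# The trainers in this file only assume that `lr_schedule` (and, where used,
# `exp_schedule` / `beta_schedule`) exposes a current value via `.epsilon` and is
# advanced with `.update(t)`. A minimal linear-annealing sketch of that assumed
# interface; the class name and fields are illustrative, not taken from this codebase.
class LinearSchedule(object):
    def __init__(self, eps_begin, eps_end, nsteps):
        self.epsilon = eps_begin
        self.eps_begin = eps_begin
        self.eps_end = eps_end
        self.nsteps = nsteps

    def update(self, t):
        # linearly interpolate from eps_begin to eps_end over nsteps, then clamp
        if t < self.nsteps:
            frac = float(t) / self.nsteps
            self.epsilon = self.eps_begin + frac * (self.eps_end - self.eps_begin)
        else:
            self.epsilon = self.eps_end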
def train(self):
    self.baseline = -1000.0
    for t in range(self.num_iterations):
        # print('iterations:', t)
        actions, con_actions, returns, losses = self.sample_model_reward_return(t)
        returns = returns * 2
        # self.baseline = (t*self.baseline + np.mean(returns)) / (t+1)
        if self.baseline == -1000.0:
            self.baseline = np.mean(returns)
        else:
            self.baseline = 0.6 * self.baseline + 0.4 * np.mean(returns)
        self.sess.run(self.train_op, feed_dict={
            self.action_placeholder: actions,
            self.con_action_placeholder: con_actions,
            self.advantage_placeholder: returns - self.baseline
        })

        avg_acc = np.mean(returns)
        used = len(self._used_models)
        self._num_used_models.append(used)

        self.log_acc.append(avg_acc)
        # sigma_reward = np.sqrt(np.var(returns) / len(total_rewards))
        msg = "Average accuracy within a batch: {:04.2f}".format(avg_acc)
        self.logger.info(msg)
        # print(actions)

    self.logger.info("- Training done.")
    export_plot(self.log_acc, "Score", 'NAS-CNN', "./batch_accuracy.png")
    export_plot(self._num_used_models, "Number of distinct models sampled",
                'NAS-CNN', "./used_models.png")
def train(self):
    for t in range(self.num_batches):
        actions, returns = self.sample_model_reward_return()
        # self.baseline = (t*self.baseline + np.mean(returns)) / (t+1)
        if self.baseline == -1000.0:
            self.baseline = np.mean(returns)
        else:
            self.baseline = 0.6 * self.baseline + 0.4 * np.mean(returns)
        self.sess.run(self.train_op, feed_dict={
            self.action_placeholder: actions,
            self.advantage_placeholder: returns  # not using the baseline here
        })

        avg_acc = np.mean(returns)

        # calculate number of used models (sum the per-model counts):
        used = sum(self._used_dict.values())
        self._num_used_models.append(used)

        self.log_acc.append(avg_acc)
        # sigma_reward = np.sqrt(np.var(returns) / len(total_rewards))
        msg = "Average accuracy within a batch: {:04.2f}".format(avg_acc)
        self.logger.info(msg)
        print(actions)

    self.logger.info("- Training done.")
    # export_plot(self.log_acc, "Batch_Accuracy", 'NAS-DNN', "./batch_accuracy.png",
    #             self._num_used_models, "Sampled Model")
    export_plot(self.log_acc, "Score", 'NAS-DNN', "./batch_accuracy.png")
    export_plot(self._num_used_models, "Models Sampled", 'NAS-DNN', "./used_models.png")
def train(self, model_a, exp_schedule, lr_schedule):
    """
    Performs training of Q

    Args:
        exp_schedule: Exploration instance s.t.
            exp_schedule.get_action(best_action) returns an action
        lr_schedule: Schedule for learning rate
    """
    # initialize replay buffer and variables
    replay_buffer = ReplayBuffer(self.config.buffer_size, self.config.state_history)
    rewards = deque(maxlen=self.config.num_episodes_test)
    max_q_values = deque(maxlen=1000)
    q_values = deque(maxlen=1000)
    self.init_averages()

    t = last_eval = last_record = 0  # time control of nb of steps
    scores_eval = []  # list of scores computed at iteration time
    # scores_eval += [self.evaluate()]

    prog = Progbar(target=self.config.nsteps_train)
    self.env.state.is_render_image = self.config.render_train
    orientation_map = [np.array([0, 1]), np.array([-1, 0]),
                       np.array([0, -1]), np.array([1, 0])]

    # interact with environment
    while t < self.config.nsteps_train:
        total_reward = 0
        flag = True
        while flag:
            state = self.env.reset()  # h x w x c
            agent_location = self.env.state.agent_location
            if self.env.teacher.dist_map[agent_location[1], agent_location[0]] != np.inf:
                flag = False
        model_a.env.reset()
        model_a.env.state.copy_state(model_a.env.agent, self.env.state)

        h_state = (np.zeros([1, self.config.h_size]), np.zeros([1, self.config.h_size]))
        h_state_a = (np.zeros([1, model_a.config.h_size]), np.zeros([1, model_a.config.h_size]))
        slen = np.ones(1).astype('int32')
        action = 0

        for i in range(200):
            t += 1
            last_eval += 1
            last_record += 1

            raw_goal_state, goal_state = self.convert_state_to_goal_state(state)  #### for replay_buffer

            # replay memory stuff
            idx = replay_buffer.store_frame(raw_goal_state)
            q_input = replay_buffer.encode_recent_observation()

            # chose action according to current Q and exploration
            best_action, q_vals, h_state = self.get_best_action(
                [q_input], h_state, slen, [action])
            action = exp_schedule.get_action(best_action)

            # store q values
            max_q_values.append(max(q_vals))
            q_values += list(q_vals)

            reward = 0

            #### perform action in env ####
            #### update goal obs image ####
            if action == 1:
                if self.config.render_train:
                    self.env.teacher.update_goal_obs_image(self.env.state)
            if self.config.render_train:
                self.env.render()

            #### teacher move ####
            agent_location = self.env.state.agent_location
            agent_orientation = self.env.state.agent_orientation
            goal_location = agent_location + agent_orientation
            gt_action = self.env.teacher.action_map[agent_location[1], agent_location[0]]
            if np.dot(agent_orientation, orientation_map[gt_action]) == 1:
                new_state, reward_i, done = self.env.step(0)
            else:
                tmp = np.cross(agent_orientation, orientation_map[gt_action])
                if tmp == 1:
                    new_state, reward_i, done = self.env.step(3)
                else:
                    new_state, reward_i, done = self.env.step(2)

            #### issue command ####
            if action == 1:
                model_a.env.teacher.set_goal(goal_state, goal_location)
                reward_a = model_a.navi_goal(h_state_a, goal_state)
                if model_a.env.teacher.goal_finish:
                    reward += reward_i
                reward += reward_a
                reward += -1
                self.env.state.teleport(self.env.agent,
                                        model_a.env.state.agent_location,
                                        model_a.env.state.agent_orientation)
                new_state = self.env.state.onehot_state

            # store the transition
            replay_buffer.store_effect(idx, action, reward, done)
            state = new_state

            # perform a training step
            loss_eval, grad_eval = self.train_step(t, replay_buffer, lr_schedule.epsilon)

            # logging stuff
            if ((t > self.config.learning_start) and (t % self.config.log_freq == 0)
                    and (t % self.config.learning_freq == 0)):
                self.update_averages(rewards, max_q_values, q_values, scores_eval)
                exp_schedule.update(t)
                lr_schedule.update(t)
                if len(rewards) > 0:
                    prog.update(t + 1, exact=[("Loss", loss_eval),
                                              ("Avg R", self.avg_reward),
                                              ("Max R", np.max(rewards)),
                                              ("eps", exp_schedule.epsilon),
                                              ("Grads", grad_eval),
                                              ("Max Q", self.max_q),
                                              ("lr", lr_schedule.epsilon)])
            elif (t < self.config.learning_start) and (t % self.config.log_freq == 0):
                sys.stdout.write("\rPopulating the memory {}/{}...".format(
                    t, self.config.learning_start))
                sys.stdout.flush()

            # count reward
            total_reward += reward
            if done or t >= self.config.nsteps_train:
                break

        # updates to perform at the end of an episode
        rewards.append(total_reward)

        if (t > self.config.learning_start) and (last_eval > self.config.eval_freq):
            # evaluate our policy
            last_eval = 0
            print("")
            self.logger.info("Global step: %d" % (t))
            scores_eval += [self.evaluate(model_a)]

    # last words
    self.logger.info("- Training done.")
    self.save(t)
    scores_eval += [self.evaluate(model_a)]
    export_plot(scores_eval, "Scores", self.config.plot_output)
def train(self, exp_schedule, lr_schedule, exp_schedule1, env=None):
    """
    Performs training of Q only on agent 0

    Args:
        exp_schedule: Exploration instance s.t.
            exp_schedule.get_action(best_action) returns an action
        lr_schedule: Schedule for learning rate
    """
    if env is None:
        env = self.env

    # initialize replay buffer and variables
    rewards = deque(maxlen=self.config.num_episodes_test)
    rewardsB = deque(maxlen=self.config.num_episodes_test)
    self.model_0.rewards = rewards
    self.model_1.rewards = rewardsB
    # self.init_averages()

    t = last_eval = last_record = 0  # time control of nb of steps
    scores_eval = []  # list of scores computed at iteration time
    scores_eval += [self.evaluate()]

    prog = Progbar(target=self.config.nsteps_train)

    self.model_0.train_init()
    self.model_1.train_init()
    # next_fire_B = False

    # interact with environment
    while t < self.config.nsteps_train:
        total_reward = 0
        state = self.env.reset()
        # need_new_ball = False
        while True:
            t += 1
            last_eval += 1
            last_record += 1
            if self.config.render_train:
                env.render()

            action_0 = self.model_0.train_step_pre(state, exp_schedule)
            # if exp_schedule.epsilon == 1:
            #     action_1 = exp_schedule.get_action(0, 3)  # agent altogether
            # else:
            action_1 = self.model_1.train_step_pre(state[:, ::-1], exp_schedule1)
            cur_action = actions.trans(action_0, action_1)

            # perform action in env
            new_state, reward, done, info = env.step(cur_action)
            # print("Reward", reward)  # Problem

            loss_e0, grad_e0 = self.model_0.train_step_post(reward, done, t, lr_schedule, True)
            self.model_1.train_step_post(-reward, done, t, lr_schedule, False)

            state = new_state

            # logging stuff
            if ((t > self.config.learning_start) and (t % self.config.log_freq == 0)
                    and (t % self.config.learning_freq == 0)):
                # self.update_averages(rewards, max_q_values, q_values, scores_eval)
                exp_schedule.update(t)
                lr_schedule.update(t)
                if len(rewards) > 0:
                    prog.update(t + 1, exact=[
                        ("Loss", loss_e0),
                        ("Avg R", np.mean(rewards)),
                        ("Max R", np.max(rewards)),
                        ("Min R", np.min(rewards)),
                        ("eps", exp_schedule.epsilon),
                        ("Grads", grad_e0),
                        ("Max Q", np.mean(self.model_0.max_q_values)),
                        ("lr", lr_schedule.epsilon)
                    ])
            elif (t < self.config.learning_start) and (t % self.config.log_freq == 0):
                sys.stdout.write("\rPopulating the memory {}/{}...".format(
                    t, self.config.learning_start))
                sys.stdout.flush()

            # count reward
            total_reward += reward
            if done or t >= self.config.nsteps_train:
                break

        # updates to perform at the end of an episode
        rewards.append(total_reward)
        rewardsB.append(-total_reward)

        if (t > self.config.learning_start) and (last_eval > self.config.eval_freq):
            # evaluate our policy
            last_eval = 0
            print("")
            scores_eval += [self.evaluate()]

        if (t > self.config.learning_start) and self.config.record and (
                last_record > self.config.record_freq):
            self.logger.info("Recording...")
            last_record = 0
            self.record(exp_schedule)
            self.model_0.save(t)  # save the models
            self.model_1.save(t)  # save the models

    # last words
    self.logger.info("- Training done.")
    self.model_0.save()  # save the models
    self.model_1.save()  # save the models
    scores_eval += [self.evaluate()]
    export_plot(scores_eval, "Scores", self.config.plot_output)
def train(self, exp_schedule, lr_schedule): """ Performs training of Q Args: exp_schedule: Exploration instance s.t. exp_schedule.get_action(best_action) returns an action lr_schedule: Schedule for learning rate """ # initialize replay buffer and variables replay_buffer = ReplayBuffer(self.config.buffer_size, self.config.state_history) rewards = deque(maxlen=self.config.num_episodes_test) last_frames = deque(maxlen=4) max_q_values = deque(maxlen=1000) q_values = deque(maxlen=1000) self.init_averages() t = last_eval = last_record = 0 # time control of nb of steps scores_eval = [] # list of scores computed at iteration time scores_eval += [] extractor = PongExtractor() prog = Progbar(target=self.config.nsteps_train) # interact with environment while t < self.config.nsteps_train: total_reward = 0 state = self.env.reset() last_frame = state last_frames.append(state) while True: t += 1 last_eval += 1 last_record += 1 if self.config.render_train: self.env.render() feats = extractor.extract(np.squeeze(state)) # replay memory stuff idx = replay_buffer.store_frame(state) q_input = replay_buffer.encode_recent_observation() # chose action according to current Q and exploration best_action, q_values = self.get_best_action(q_input) embedding = self.sess.run(self.hidden, feed_dict={self.s: [q_input]})[0] action = exp_schedule.get_action(best_action) # store q values max_q_values.append(max(q_values)) q_values += list(q_values) if t % 100 == 0: # print state.shape # frame = np.zeros(np.squeeze(state).shape) # for f in last_frames: # frame = frame + np.squeeze(f) # frame = frame / len(last_frames) frame = np.squeeze(state) last_frame = np.squeeze(last_frame) pickle.dump( last_frames, open('frames/embedding/atari{}.p'.format(t), 'w')) for i in range(4): f = np.squeeze(last_frames[i]) scipy.misc.imsave( 'frames/embedding/atari{}.png'.format(t - 3 + i), f) # scipy.misc.imsave('frames/atari{}.png'.format(t-1),last_frame) # posfile = open('frames/atari{}.txt'.format(t),'w') # posfile.write('Opp Paddle:\t{}\n'.format(oppY)) # posfile.write('Player Paddle:\t{}\n'.format(playerY)) # posfile.write('ball x:\t{}\n'.format(ballX)) # posfile.write('ball y:\t{}\n'.format(ballY)) # posfile.close() np.savetxt('frames/embedding/pong{}.txt'.format(t), feats, fmt='%.2f') # perform action in env new_state, reward, done, info = self.env.step(action) # print "state shape:",state.shape() # store the transition replay_buffer.store_effect(idx, action, reward, done) last_frame = state state = new_state last_frames.append(state) # perform a training step loss_eval, grad_eval = self.train_step(t, replay_buffer, lr_schedule.epsilon) # logging stuff if ((t > self.config.learning_start) and (t % self.config.log_freq == 0) and (t % self.config.learning_freq == 0)): self.update_averages(rewards, max_q_values, q_values, scores_eval) exp_schedule.update(t) lr_schedule.update(t) if len(rewards) > 0: prog.update(t + 1, exact=[("Loss", loss_eval), ("Avg R", self.avg_reward), ("Max R", np.max(rewards)), ("eps", exp_schedule.epsilon), ("Grads", grad_eval), ("Max Q", self.max_q), ("lr", lr_schedule.epsilon)]) elif (t < self.config.learning_start) and ( t % self.config.log_freq == 0): sys.stdout.write("\rPopulating the memory {}/{}...".format( t, self.config.learning_start)) sys.stdout.flush() # count reward total_reward += reward if done or t >= self.config.nsteps_train: break # updates to perform at the end of an episode rewards.append(total_reward) if (t > self.config.learning_start) and (last_eval > self.config.eval_freq): # evaluate our 
policy last_eval = 0 print("") scores_eval += [self.evaluate()] if (t > self.config.learning_start) and self.config.record and ( last_record > self.config.record_freq): self.logger.info("Recording...") last_record = 0 self.record() # last words self.logger.info("- Training done.") self.save() scores_eval += [self.evaluate()] export_plot(scores_eval, "Scores", self.config.plot_output)
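# The DQN-style loops in this file rely on a small replay-buffer interface:
# store_frame() -> index, encode_recent_observation() -> stacked state, and
# store_effect(idx, action, reward, done). A minimal in-memory sketch of that
# assumed interface follows; it is illustrative only, and the real ReplayBuffer
# in this project additionally handles frame stacking and sampling for train_step.
class MinimalReplayBuffer(object):
    def __init__(self, size):
        self.size = size
        self.next_idx = 0
        self.frames = {}
        self.effects = {}

    def store_frame(self, frame):
        # store the raw observation in a ring-buffer slot and return its index
        idx = self.next_idx % self.size
        self.frames[idx] = frame
        self.next_idx += 1
        return idx

    def encode_recent_observation(self):
        # here: just the most recent frame; the real buffer stacks state_history frames
        return self.frames[(self.next_idx - 1) % self.size]

    def store_effect(self, idx, action, reward, done):
        # record the transition outcome for the frame stored at idx
        self.effects[idx] = (action, reward, done)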
def train(self, model_i, lr_schedule):
    """
    Performs training of Q

    Args:
        lr_schedule: Schedule for learning rate
    """
    self.init_averages()

    t = last_eval = curri_idx = 0  # time control of nb of steps
    scores_eval = []  # list of scores computed at iteration time

    prog = Progbar(target=self.config.nsteps_train)

    # interact with environment
    while t < self.config.nsteps_train:
        t += 1
        last_eval += 1

        encoding_batch = []
        predflag_batch = []
        target_action_batch = []
        slen_batch = []
        max_len = 0
        for i in range(self.config.batch_size):
            # config = self.config
            # config.n_node, config.k_ring, config.p_rewiring, config.path_len_limit, \
            #     config.planning_len = cr_schedule[curri_idx]
            # self.env.reset(config)  # h x w x c
            encoding, target_action, predflag = model_i.gen_sample_seq(
                self.config.ndigits, self.config.nway)
            encoding_batch.append(encoding[None])
            predflag_batch.append(predflag[None])
            target_action_batch.append(target_action[None])
            slen_batch.append(encoding.shape[0])
            if encoding.shape[0] > max_len:
                max_len = encoding.shape[0]

        batch_data = DatasetTensors(
            np.concatenate([np.concatenate(
                [x, np.zeros([1, max_len - x.shape[1], x.shape[2]])], axis=1)
                for x in encoding_batch], axis=0),
            np.concatenate([np.concatenate(
                [x, np.zeros([1, max_len - x.shape[1], x.shape[2]])], axis=1)
                for x in target_action_batch], axis=0),
            np.concatenate([np.concatenate(
                [x, np.zeros([1, max_len - x.shape[1]])], axis=1)
                for x in predflag_batch], axis=0),
            np.array(slen_batch).astype('int32'))

        # perform a training step
        loss_eval, grad_eval = self.train_step(t, lr_schedule.epsilon, batch_data)

        # logging stuff
        if ((t % self.config.log_freq == 0) and (t % self.config.learning_freq == 0)):
            self.update_averages(scores_eval)
            lr_schedule.update(t)
            prog.update(t + 1, exact=[("Loss", loss_eval), ("Grads", grad_eval),
                                      ("lr", lr_schedule.epsilon)])

        if t >= self.config.nsteps_train:
            break

        if last_eval >= self.config.eval_freq:
            # evaluate our policy
            last_eval = 0
            print("")
            self.logger.info("Global step: %d" % (t))
            scores_eval += [self.evaluate(model_i)]
            '''
            if scores_eval[-1] > 0.8:
                curri_idx += 1
                msg = "Upgrade to lesson {:d}".format(int(curri_idx))
                self.logger.info(msg)
                self.logger.info("----------Start Computing Final Score----------")
                scores_eval += [self.evaluate(model_i)]
                self.logger.info("----------Finish Computing Final Score----------")
            '''

    # last words
    self.logger.info("- Training done.")
    self.save(t)
    scores_eval += [self.evaluate(model_i)]
    export_plot(scores_eval, "Scores", self.config.plot_output)
def export_score(self, scores_eval):
    export_plot(scores_eval, "Scores",
                self.config.plot_output + "scores_" + str(self.index) + ".png")
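# export_plot() is called throughout but defined elsewhere. The calls here use
# either export_plot(ys, ylabel, filename) or export_plot(ys, ylabel, title, filename).
# A minimal matplotlib-based sketch compatible with both call shapes; this is an
# assumption about the helper, not its actual implementation.
import matplotlib
matplotlib.use('Agg')  # render without a display
import matplotlib.pyplot as plt

def export_plot_sketch(ys, ylabel, title_or_filename, filename=None):
    # disambiguate the 3-argument form (no title) from the 4-argument form
    if filename is None:
        title, filename = None, title_or_filename
    else:
        title = title_or_filename
    plt.figure()
    plt.plot(range(len(ys)), ys)
    plt.xlabel("Training Episode")
    plt.ylabel(ylabel)
    if title is not None:
        plt.title(title)
    plt.savefig(filename)
    plt.close()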
def train(self, model_a, exp_schedule, lr_schedule):
    """
    Performs training of Q

    Args:
        exp_schedule: Exploration instance s.t.
            exp_schedule.get_action(best_action) returns an action
        lr_schedule: Schedule for learning rate
    """
    # initialize replay buffer and variables
    replay_buffer = ReplayBuffer(self.config.buffer_size, self.config.state_history)
    rewards = deque(maxlen=self.config.num_episodes_test)
    max_q_values = deque(maxlen=1000)
    q_values = deque(maxlen=1000)
    self.init_averages()

    t = last_eval = last_record = 0  # time control of nb of steps
    scores_eval = []  # list of scores computed at iteration time
    # scores_eval += [self.evaluate()]

    prog = Progbar(target=self.config.nsteps_train)
    self.env.state.is_render_image = self.config.render_train
    model_a.env.state.is_render_image = model_a.config.render_train
    orientation_map = [np.array([0, 1]), np.array([-1, 0]),
                       np.array([0, -1]), np.array([1, 0])]

    npath = self.config.npath    # paths to generate in each environment
    nquery = self.config.nquery  # query to generate in each environment
    max_plan_len = self.config.max_plan_len
    ndigits = self.config.ndigits
    nway = self.config.nway
    num_classes = len(self.env.state.xmap.item_class_id)

    # three steps:
    # 1. sample paths from the teacher environment and pass to dnc
    # 2. get immediate reward from whether agent could reach the subgoal
    # 3. sample query paths and ask agent to follow the plan, get the final big reward
    # -- train one step after each teacher's move

    # interact with environment
    while t < self.config.nsteps_train:
        total_reward = 0
        self.env.reset()
        model_a.env.reset()
        model_a.env.state.copy_state(model_a.env.agent, self.env.state)

        dnc_state = DNC.zero_state(self.config, batch_size=1)
        h_state = (np.zeros([1, self.config.h_size]), np.zeros([1, self.config.h_size]))
        slen = np.ones(1).astype('int32')
        action = 0

        # sample paths
        for i in range(npath):
            state_seq, path_loc, path_ori = self.env.teacher.gen_sample_seq(self.env.state)
            state_seq_encoding = DRQN_planner.encode_state(state_seq, ndigits, nway)
            goal_state_seq = np.reshape(
                state_seq,
                [state_seq.shape[0], 4, 3, 3, num_classes + 2]).astype('bool')
            #### missing could be everything ####
            goal_state_seq = np.tile(goal_state_seq[:, :, :, :, [num_classes]],
                                     [1, 1, 1, 1, num_classes + 2]) + goal_state_seq
            #### treat missing observation as correct observation ####
            goal_state_seq[:, :, :, :, num_classes] = True
            #### transpose ####
            goal_state_seq = np.transpose(goal_state_seq, [0, 2, 3, 4, 1])

            path_len = state_seq.shape[0]
            mask_seq = np.logical_not(state_seq[:, :3, :, num_classes])
            flag_seq = np.zeros([path_len])
            flag_seq[-1] = 1
            model_a.env.state.teleport(model_a.env.agent, path_loc[0],
                                       orientation_map[path_ori[0]])

            for j in range(path_len):
                # get agate from dnc
                cur_dnc_in = np.concatenate(
                    [state_seq_encoding[j].reshape(-1), mask_seq[j].reshape(-1),
                     np.array([0, flag_seq[j]])], axis=0)
                agate_dnc_val = self.sess.run(
                    self.agate_dnc,
                    feed_dict={self.s_dnc: cur_dnc_in[None], self.hs_dnc: dnc_state})
                agate_dnc_val = agate_dnc_val[0, 0]

                # get q value and sample action
                idx = replay_buffer.store_frame(state_seq[j])
                q_input = replay_buffer.encode_recent_observation()
                best_action, q_vals, h_state = self.get_best_action(
                    [q_input], h_state, slen, [action], [agate_dnc_val])
                action = exp_schedule.get_action(best_action)

                # store q values
                max_q_values.append(max(q_vals))
                q_values += list(q_vals)

                # take action and update dnc
                cur_dnc_in[-2] = action
                dnc_state = self.sess.run(
                    self.hs_out_dnc,
                    feed_dict={self.s_dnc: cur_dnc_in[None], self.hs_dnc: dnc_state})

                # acquire reward
                reward = 0
                done = False
                if action == 1:
                    h_state_a = (np.zeros([1, model_a.config.h_size]),
                                 np.zeros([1, model_a.config.h_size]))
                    model_a.env.teacher.set_goal(goal_state_seq[j], path_loc[j])
                    reward_a = model_a.navi_goal(h_state_a, goal_state_seq[j])
                    if not model_a.env.teacher.goal_finish:
                        reward += -0.05
                    reward += -0.05
                model_a.env.state.teleport(model_a.env.agent, path_loc[j],
                                           orientation_map[path_ori[j]])

                # acquire final reward
                if i == npath - 1 and j == path_len - 1:
                    done = True
                    reward_list = list()
                    for k in range(nquery):
                        reward_list.append(0)
                        src_inputs, tgt_inputs, src_loc, tgt_loc, goal_obs_onehot_state = \
                            self.env.teacher.gen_sample_query(self.env.state)
                        src_inputs = DRQN_planner.encode_state(src_inputs, ndigits, nway)
                        tgt_inputs = DRQN_planner.encode_state(tgt_inputs, ndigits, nway)
                        path_dnc_val, target_ldm_dnc_val = self.sess.run(
                            [self.path_dnc, self.target_ldm_dnc],
                            feed_dict={self.hs_dnc: dnc_state,
                                       self.src_inputs_dnc: src_inputs[None],
                                       self.tgt_inputs_dnc: tgt_inputs[None],
                                       self.max_len_dnc: max_plan_len})
                        path_dnc_val = DRQN_planner.decode_state(
                            np.reshape(path_dnc_val[0], [max_plan_len, 3, 3, -1]),
                            ndigits, nway, num_classes + 2)
                        target_ldm_dnc_val = DRQN_planner.decode_state(
                            np.reshape(target_ldm_dnc_val[0], [3, 3, -1]),
                            ndigits, nway, num_classes + 2)
                        path_dnc_val_inner = np.argmax(path_dnc_val, axis=3)
                        target_ldm_dnc_val_inner = np.argmax(target_ldm_dnc_val, axis=2)
                        cur_len = max_plan_len
                        for l in range(max_plan_len):
                            if (path_dnc_val_inner[l] == target_ldm_dnc_val_inner).all():
                                cur_len = l + 1
                                break
                        path_dnc_val = path_dnc_val[:cur_len]
                        path_dnc_val = np.concatenate(
                            [path_dnc_val, goal_obs_onehot_state[None]], 0)
                        #### modify goal state ####
                        #### missing could be everything ####
                        path_dnc_val = np.tile(path_dnc_val[:, :, :, [num_classes]],
                                               [1, 1, 1, num_classes + 2]) + path_dnc_val
                        #### treat missing observation as correct observation ####
                        path_dnc_val[:, :, :, num_classes] = True

                        model_a.env.state.teleport(model_a.env.agent, src_loc,
                                                   np.array([0, 1]))
                        h_state_a = (np.zeros([1, model_a.config.h_size]),
                                     np.zeros([1, model_a.config.h_size]))
                        for l in range(path_dnc_val.shape[0]):
                            cur_goal_state = path_dnc_val[l]
                            cur_goal_state = np.expand_dims(cur_goal_state, 3)
                            cur_goal_state = np.concatenate(
                                [np.rot90(cur_goal_state, 0), np.rot90(cur_goal_state, 1),
                                 np.rot90(cur_goal_state, 2), np.rot90(cur_goal_state, 3)], 3)
                            model_a.env.teacher.set_goal(cur_goal_state, tgt_loc)
                            reward_list[-1] += model_a.navi_goal(h_state_a, cur_goal_state)
                        if model_a.env.teacher.goal_finish:
                            reward_list[-1] += 10
                    reward += sum(reward_list) / len(reward_list)

                # store everything into replay buffer
                replay_buffer.store_effect(idx, action, agate_dnc_val, reward, done)

                t += 1
                last_eval += 1
                last_record += 1

                # perform a training step
                loss_eval, grad_eval = self.train_step(t, replay_buffer, lr_schedule.epsilon)

                # logging stuff
                if ((t > self.config.learning_start) and (t % self.config.log_freq == 0)
                        and (t % self.config.learning_freq == 0)):
                    self.update_averages(rewards, max_q_values, q_values, scores_eval)
                    exp_schedule.update(t)
                    lr_schedule.update(t)
                    if len(rewards) > 0:
                        prog.update(t + 1, exact=[("Loss", loss_eval),
                                                  ("Avg R", self.avg_reward),
                                                  ("Max R", np.max(rewards)),
                                                  ("eps", exp_schedule.epsilon),
                                                  ("Grads", grad_eval),
                                                  ("Max Q", self.max_q),
                                                  ("lr", lr_schedule.epsilon)])
                elif (t < self.config.learning_start) and (t % self.config.log_freq == 0):
                    sys.stdout.write("\rPopulating the memory {}/{}...".format(
                        t, self.config.learning_start))
                    sys.stdout.flush()

                # count reward
                total_reward += reward
                if done or t >= self.config.nsteps_train:
                    break

        # updates to perform at the end of an episode
        rewards.append(total_reward)

        if (t > self.config.learning_start) and (last_eval > self.config.eval_freq):
            # evaluate our policy
            last_eval = 0
            print("")
            self.logger.info("Global step: %d" % (t))
            scores_eval += [self.evaluate(model_a)]

    # last words
    self.logger.info("- Training done.")
    self.save(t)
    scores_eval += [self.evaluate(model_a)]
    export_plot(scores_eval, "Scores", self.config.plot_output)
def train(self): """ Performs training You do not have to change or use anything here, but take a look to see how all the code you've written fits together! """ last_eval = 0 last_record = 0 scores_eval = [] self.init_averages() scores_eval = [] # list of scores computed at iteration time previous_avg_reward = -np.Inf for t in range(self.config.num_batches): # collect a minibatch of samples paths, total_rewards = self.sample_path(self.env) scores_eval = scores_eval + total_rewards observations = np.concatenate( [path["observation"] for path in paths]) actions = np.concatenate([path["action"] for path in paths]) rewards = np.concatenate([path["reward"] for path in paths]) # compute Q-val estimates (discounted future returns) for each time step returns = self.get_returns(paths) advantages = self.calculate_advantage(returns, observations) # run training operations if self.config.use_baseline: self.update_baseline(returns, observations) if self.config.use_sgd is True: print("USE SGD") feed_dict = { self.observation_placeholder: observations, self.action_placeholder: actions, self.sgd_lr_placeholder: self.sgd_lr, self.advantage_placeholder: advantages } old_params = self.sess.run(self.get_pi_params) # 1 time is enough even if policy is stochastic: checks ok ... results are very consistent old_actions = self.sess.run( self.sampled_action, feed_dict={self.observation_placeholder: observations}) start = timer() old_penalty = env.penalty(observations, old_actions) end = timer() # Takes 17 seconds on my setup print("...Time to compute penalty: {}".format(end - start)) print("...len: {} {} {}".format(len(old_params), len(observations), len(old_actions))) print("...old_penalty {}".format(old_penalty)) sgd_gradient = self.sess.run( self.gradient, feed_dict) # !!! DO NOT SWAP LINES !!! 
self.sess.run(self.sgd_train_op, feed_dict) new_actions = self.sess.run( self.sampled_action, feed_dict={self.observation_placeholder: observations}) sgd_penalty = env.penalty(observations, new_actions) print("...sgd_penalty {}".format(sgd_penalty)) sgd_params = self.sess.run(self.get_pi_params) xxx_params = old_params - self.sgd_lr * sgd_gradient print(np.linalg.norm(xxx_params - sgd_params, ord=2)) #assert 2==1 # checks ongoing if sgd_penalty > old_penalty: start = timer() backtrack_lr = copy.copy(self.sgd_lr) for i in range(self.config.backtrack_iters): backtrack_lr *= self.config.backtrack_decay self.sess.run(self.set_pi_params, feed_dict={ self.v_ph: old_params - backtrack_lr * sgd_gradient }) new_actions = self.sess.run( self.sampled_action, feed_dict={ self.observation_placeholder: observations }) bt_penalty = env.penalty(observations, new_actions) print( "...BACKTRACKING bt_penalty {}".format(bt_penalty)) if bt_penalty < sgd_penalty: print( "BACKTRACKING: improvement at iter {} bt_penalty={} sgd_penalty={}" .format(i, bt_penalty, sgd_penalty)) break if i == self.config.backtrack_iters - 1: # Nothing better found during backtracking, restore sgd_params self.sess.run(self.set_pi_params, feed_dict={self.v_ph: sgd_params}) end = timer() print("...Backtracking Time: {}".format(end - start)) else: print("USE ADAM") self.sess.run(self.train_op, feed_dict={ self.observation_placeholder: observations, self.action_placeholder: actions, self.advantage_placeholder: advantages }) # tf stuff if (t % self.config.summary_freq == 0): self.update_averages(total_rewards, scores_eval) self.record_summary(t) # compute reward statistics for this batch and log avg_reward = np.mean(total_rewards) sigma_reward = np.sqrt(np.var(total_rewards) / len(total_rewards)) msg = "Average reward: {:04.2f} +/- {:04.2f}".format( avg_reward, sigma_reward) self.logger.info(msg) #if self.config.use_sgd is True and avg_reward < previous_avg_reward: # self.sgd_lr = self.sgd_lr * 0.9 # print("Decay SGD LR to {}".format(self.sgd_lr)) #previous_avg_reward = avg_reward if self.config.record and (last_record > self.config.record_freq): self.logger.info("Recording...") last_record = 0 self.record() self.logger.info("- Training done.") export_plot(scores_eval, "Score", config.env_name, self.config.plot_output)
def train(self, beta_schedule, lr_schedule, cr_schedule):
    """
    Performs training of Q

    Args:
        beta_schedule: Schedule for mixing predicted and ground-truth actions
        lr_schedule: Schedule for learning rate
        cr_schedule: Curriculum schedule over graph/task difficulty
    """
    self.init_averages()

    t = last_eval = curri_idx = 0  # time control of nb of steps
    scores_eval = []  # list of scores computed at iteration time
    curriculum_batch_size = np.ceil(
        self.config.nsteps_train / cr_schedule.n_curriculum).astype('int32')

    prog = Progbar(target=self.config.nsteps_train)

    # interact with environment
    while t < self.config.nsteps_train:
        t += 1
        last_eval += 1

        config = self.config
        config.n_node, config.k_ring, config.p_rewiring, config.path_len_limit = \
            cr_schedule[curri_idx]
        self.env.reset(config)  # h x w x c
        h_state = DNC.zero_state(config, batch_size=1)

        encoding, predflag, target_action = self.env.prepare_seq()
        slen = np.array(encoding.shape[0]).astype('int32')

        # describe graph, query and planning
        h_state = self.sess.run(self.hs_out, feed_dict={
            self.s: encoding[None],
            self.hs: h_state,
            self.slen: slen
        })

        past_state = -1
        past_action_onehot = -1
        encoding_a = np.zeros([config.max_step_len, encoding.shape[1]])
        predflag_a = np.zeros(config.max_step_len)
        target_action_a = np.zeros([config.max_step_len, target_action.shape[1]])

        for i in range(config.max_step_len):
            current_encoding = GraphWorld.convert_triplets_to_encoding(
                np.array([[past_state, self.env.current_state,
                           past_action_onehot]]).astype('int32'),
                config.ndigits, config.nway)
            current_encoding = np.concatenate(
                [current_encoding, np.array([[1, 0]])], axis=1)
            pred_action, h_state = self.sess.run(
                [self.q, self.hs_out],
                feed_dict={
                    self.s: current_encoding[None],
                    self.hs: h_state,
                    self.slen: np.ones(1).astype('int32')
                })
            gt_action = self.env.get_gt_action()
            action = self.get_action(pred_action[0], gt_action, beta_schedule.epsilon)

            past_state = self.env.current_state
            _, done, past_action_onehot = self.env.step(action)

            encoding_a[i, :] = current_encoding[0]
            predflag_a[i] = 1
            target_action_a[i] = gt_action
            slen += 1
            if done:
                break

        batch_data = (np.concatenate([encoding, encoding_a], axis=0)[None],
                      np.concatenate([predflag, predflag_a], axis=0),
                      np.concatenate([target_action, target_action_a], axis=0),
                      slen)

        # perform a training step
        loss_eval, grad_eval = self.train_step(t, lr_schedule.epsilon, batch_data)

        # logging stuff
        if ((t % config.log_freq == 0) and (t % config.learning_freq == 0)):
            self.update_averages(scores_eval)
            beta_schedule.update(t)
            lr_schedule.update(t)
            prog.update(t + 1, exact=[("Loss", loss_eval), ("Grads", grad_eval),
                                      ("lr", lr_schedule.epsilon)])

        if t >= config.nsteps_train:
            break

        if last_eval >= config.eval_freq:
            # evaluate our policy
            last_eval = 0
            print("")
            self.logger.info("Global step: %d" % (t))
            scores_eval += [self.evaluate(cr_schedule, curri_idx)]
            if scores_eval[-1] > 0.8:
                curri_idx += 1
                msg = "Upgrade to lesson {:d}".format(int(curri_idx))
                self.logger.info(msg)
                self.logger.info("----------Start Computing Final Score----------")
                scores_eval += [self.evaluate(cr_schedule)]
                self.logger.info("----------Finish Computing Final Score----------")

    # last words
    self.logger.info("- Training done.")
    self.save(t)
    scores_eval += [self.evaluate(cr_schedule)]
    export_plot(scores_eval, "Scores", self.config.plot_output)
def train(self, exp_schedule, lr_schedule):
    # Initialize replay buffer and variables
    replay_buffer = ReplayBuffer(self.FLAGS.buffer_size, self.FLAGS.state_hist)
    rewards = deque(maxlen=self.FLAGS.num_test)
    max_q_values = deque(maxlen=1000)
    q_values = deque(maxlen=1000)
    self.init_averages()

    t = 0  # time control of nb of steps
    loss_eval = grad_eval = 0
    scores_eval = []  # list of scores computed at iteration time
    scores_eval += [self.evaluate(self.env, self.FLAGS.num_test)]

    self.prog = Progbar(target=self.FLAGS.train_steps)
    continual_crash = 0  # persists across episodes so repeated crashes are detected

    # Train for # of train steps
    while t < self.FLAGS.train_steps:
        try:
            total_reward = 0
            ep_len = 0
            state = self.env.reset()

            # Run for 1 episode and update the buffer
            while True:
                ep_len += 1

                # replay memory stuff
                idx = replay_buffer.store_frame(state)
                q_input = replay_buffer.encode_recent_observation()

                # chose action according to current Q and exploration
                best_action, q_vals = self.network.get_best_action(q_input)
                action = exp_schedule.get_action(best_action)

                # store q values
                max_q_values.append(max(q_vals))
                q_values += list(q_vals)

                # perform action in env
                new_state, reward, done, info = self.env.step(action)

                # store the transition
                replay_buffer.store_effect(idx, action, reward, done)
                state = new_state

                # Count reward
                total_reward += reward

                # Stop at end of episode
                if done:
                    break

            # Store episodic rewards
            if ep_len > 1:
                rewards.append(total_reward)

            # Learn using replay
            while True:
                t += 1
                ep_len -= 1

                # Make train step if necessary
                if ((t > self.FLAGS.learn_start) and (t % self.FLAGS.learn_every == 0)):
                    loss_eval, grad_eval = self.network.update_step(
                        t, replay_buffer, lr_schedule.epsilon, self.summary)
                    exp_schedule.update(t)
                    lr_schedule.update(t)

                if (t % self.FLAGS.target_every == 0):
                    self.network.update_target_params()

                # Update logs if necessary
                if ((t > self.FLAGS.learn_start) and (t % self.FLAGS.log_every == 0)
                        and (len(rewards) > 0)):
                    self.update_averages(rewards, max_q_values, q_values, scores_eval)
                    self.update_logs(t, loss_eval, rewards, exp_schedule.epsilon,
                                     grad_eval, lr_schedule.epsilon)
                # Update logs if necessary
                elif (t < self.FLAGS.learn_start) and (t % self.FLAGS.log_every == 0):
                    sys.stdout.write("\rPopulating the memory {}/{}...".format(
                        t, self.FLAGS.learn_start))
                    sys.stdout.flush()

                if ((t > self.FLAGS.learn_start) and (t % self.FLAGS.check_every == 0)):
                    # Evaluate current model
                    scores_eval += [self.evaluate(self.env, self.FLAGS.num_test)]

                    # Save current Model
                    self.network.save()

                    # Record video of current model
                    if self.FLAGS.record:
                        self.record()

                if ep_len <= 0 or t >= self.FLAGS.train_steps:
                    break

            continual_crash = 0

        except Exception as e:
            continual_crash += 1
            self.logger.info(e)
            if continual_crash >= 10:
                self.logger.info("Crashed 10 times -- stopping u suck")
                raise e
            else:
                t -= 1
                self.logger.info("Env crash, making new env")
                time.sleep(60)
                self.env = create_slither_env(self.FLAGS.state_type)
                self.env = Unvectorize(self.env)
                self.env.configure(fps=self.FLAGS.fps,
                                   remotes=self.FLAGS.remotes,
                                   start_timeout=15 * 60,
                                   vnc_driver='go',
                                   vnc_kwargs={'encoding': 'tight',
                                               'compress_level': 0,
                                               'fine_quality_level': 50})
                time.sleep(60)

    # End of training
    self.logger.info("- Training done.")
    self.network.save()
    scores_eval += [self.evaluate(self.env, self.FLAGS.num_test)]
    export_plot(scores_eval, "Scores", self.FLAGS.plot_path)
def train(self):
    last_record = 0
    self.init_averages()
    scores_eval = []

    self.plot = {
        'room' + str(i): {j: [] for j in range(config.num_sub_policies)}
        for i in range(4)
    }

    for t in range(self.config.num_batches):
        # print(t, self.get_epsilon(t))
        paths, total_rewards = self.sample_path(env=self.env)
        scores_eval += total_rewards

        if str(config.env_name).startswith("Fourrooms"):
            observations = np.expand_dims(
                np.concatenate([path["observation"] for path in paths]), axis=1)
        else:
            observations = np.concatenate([path["observation"] for path in paths])
        actions = np.concatenate([path["action"] for path in paths])
        rewards = np.concatenate([path["reward"] for path in paths])

        returns = self.get_returns(paths)
        advantages = self.calculate_advantage(returns, observations)

        if self.config.use_baseline:
            self.update_baseline(returns, observations)

        self.sess.run(self.train_op, feed_dict={
            self.observation_placeholder: observations,
            self.action_placeholder: actions,
            self.advantage_placeholder: advantages
        })

        if t % self.config.summary_freq == 0:
            self.update_averages(total_rewards, scores_eval)
            self.record_summary(self.batch_counter)

        self.batch_counter = self.batch_counter + 1

        avg_reward = np.mean(total_rewards)
        sigma_reward = np.sqrt(np.var(total_rewards) / len(total_rewards))
        msg = "Average reward: {:04.2f} +/- {:04.2f}".format(avg_reward, sigma_reward)
        self.logger.info(msg)

        last_record += 1
        if self.config.record and (last_record > self.config.record_freq):
            self.logger.info("Recording...")
            last_record = 0
            self.record()

        if t % config.record_freq == 0:
            self.save_model_checkpoint(
                self.sess, self.saver,
                os.path.join(self.config.output_path, 'model.ckpt'), t)

    self.logger.info("- Training done.")
    export_plot(scores_eval, "Score", config.env_name, self.config.plot_output)

    if str(config.env_name).startswith("Fourrooms") and config.examine_master:
        import matplotlib.pyplot as plt
        plt.rcParams["figure.figsize"] = [12, 12]
        f, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, sharex='col', sharey='row')
        axes = {'room0': ax1, 'room1': ax2, 'room2': ax3, 'room3': ax4}
        for room in self.plot:
            axes[room].set_title(room, size=20)
            for sub in range(config.num_sub_policies):
                prob_list = self.plot[room][sub]
                axes[room].plot(range(len(prob_list)), prob_list, linewidth=5)
            axes[room].legend(
                ['subpolicy' + str(sub) for sub in range(config.num_sub_policies)],
                loc='upper left', prop={'size': 20})
        plt.tight_layout()
        plt.savefig('Rooms and Subs', dpi=300)
def train(self, exp_schedule, lr_schedule, choose_teacher_strategy=None):
    """
    Performs training of Q

    Args:
        exp_schedule: Exploration instance s.t.
            exp_schedule.get_action(best_action) returns an action
        lr_schedule: Schedule for learning rate
    """
    replay_buffer = ReplayBuffer(self.config.buffer_size, self.config.state_history)
    rewards = deque(maxlen=self.config.num_episodes_test)
    max_q_values = deque(maxlen=1000)
    q_values = deque(maxlen=1000)
    self.init_averages()

    t = last_eval = last_record = 0  # time control of nb of steps
    scores_eval = []  # list of scores computed at iteration time
    scores_eval += [self.evaluate()]

    prog = Progbar(target=self.config.nsteps_train)

    # interact with environment
    allsteps = []
    while t < self.config.nsteps_train:
        total_reward = 0
        state = self.env.reset()
        while True:
            if self.config.state_subspace is not None:
                out_of_bounds = False
                if self.config.state_subspace in ['ball_top_half', 'ball_bottom_half']:
                    image = self.env.unwrapped._get_obs()
                    ball_position = ball_half_screen_position(image)
                    # check if ball is in top half but we're restricted to bottom half
                    if ball_position == 1 and self.config.state_subspace == 'ball_bottom_half':
                        out_of_bounds = True
                    # check if ball is in bottom half but we're restricted to top half
                    elif ball_position == 0 and self.config.state_subspace == 'ball_top_half':
                        out_of_bounds = True
                else:
                    raise NotImplementedError
                if out_of_bounds:
                    # current state is outside of this agent's state subspace
                    # perform action in env
                    state, reward, done, info = self.env.step(action)

            t += 1
            last_eval += 1
            last_record += 1
            if self.config.render_train:
                self.env.render()

            # replay memory stuff
            idx = replay_buffer.store_frame(state)
            q_input = replay_buffer.encode_recent_observation()
            # self.q_inputs.append(q_input)

            # chose action according to current Q and exploration
            best_action, q_vals = self.get_best_action(q_input)
            action = exp_schedule.get_action(best_action)

            # store q values
            max_q_values.append(max(q_vals))
            q_values += list(q_vals)

            # perform action in env
            new_state, reward, done, info = self.env.step(action)

            # store the transition
            replay_buffer.store_effect(idx, action, reward, done)
            state = new_state

            if choose_teacher_strategy is not None:
                # store the reward with the teacher choice strategy
                choose_teacher_strategy.store_reward(reward, q_input)

            # perform a training step
            loss_eval, grad_eval = self.train_step(
                t, replay_buffer, lr_schedule.epsilon, choose_teacher_strategy)

            # logging stuff
            if ((t > self.config.learning_start) and (t % self.config.log_freq == 0)
                    and (t % self.config.learning_freq == 0)):
                self.update_averages(rewards, max_q_values, q_values, scores_eval)
                exp_schedule.update(t)
                lr_schedule.update(t)
                if choose_teacher_strategy is not None:
                    choose_teacher_strategy.update_schedule(t)
                if len(rewards) > 0:
                    exact = [("Loss", loss_eval), ("Avg R", self.avg_reward),
                             ("Max R", np.max(rewards)), ("eps", exp_schedule.epsilon),
                             ("Grads", grad_eval), ("Max Q", self.max_q),
                             ("lr", lr_schedule.epsilon)]
                    if choose_teacher_strategy is not None and hasattr(
                            choose_teacher_strategy, 'eps_schedule'):
                        exact.append(("Choose teacher eps",
                                      choose_teacher_strategy.eps_schedule.epsilon))
                    prog.update(t + 1, exact=exact)
            elif ((t > self.config.learning_start)
                  and (t % self.config.save_teacher_choice_freq == 0)
                  and (choose_teacher_strategy is not None)):
                choose_teacher_strategy.save(self.config.teacher_choice_output_path)
            elif (t < self.config.learning_start) and (t % self.config.log_freq == 0):
                sys.stdout.write("\rPopulating the memory {}/{}...".format(
                    t, self.config.learning_start))
                sys.stdout.flush()

            # count reward
            total_reward += reward
            if done or t >= self.config.nsteps_train:
                break

        # updates to perform at the end of an episode
        rewards.append(total_reward)

        if (t > self.config.learning_start) and (last_eval > self.config.eval_freq):
            # evaluate our policy
            last_eval = 0
            print("")
            scores_eval += [self.evaluate()]

        if (t > self.config.learning_start) and self.config.record and (
                last_record > self.config.record_freq):
            self.logger.info("Recording...")
            last_record = 0
            self.record()

    # last words
    self.logger.info("- Training done.")
    self.save()
    scores_eval += [self.evaluate()]
    export_plot(scores_eval, "Scores", self.config.plot_output)

    if choose_teacher_strategy is not None:
        choose_teacher_strategy.save(self.config.teacher_choice_output_path)
def train(self): """ Performs training You do not have to change or use anything here, but take a look to see how all the code you've written fits together! """ last_eval = 0 last_record = 0 self.init_averages() scores_eval = [] # list of scores computed at iteration time episode = 0 episode_reward = 0 step = 0 episode_rewards = [] paths = [] observation = self.env.reset() observation = np.tile(observation, (self.config.batch_size, 1)) self.config.batch_size for t in range(self.config.num_batches*self.config.batch_size): if self.discrete: actions = np.arange(self.action_dim).astype(float)[:, None] actions = np.reshape(actions, (1, self.config.batch_size)) obs = np.tile(observation, (self.action_dim, 1)) self.sess.run(self.train_op, feed_dict={ self.observation_placeholder : obs, self.action_placeholder : actions}) for i in range(observation.shape[0]): action = self.sess.run(self.sampled_actions, feed_dict={self.observation_placeholder : observation[None]})[0] next_observation, reward, done, info = env.step(action) next_action = self.sess.run(self.target_sampled_actions, feed_dict={self.next_observation_placeholder : next_observation[None]})[0] episode_reward += reward step += 1 action = np.array([action])[None] next_action = np.array([next_action])[None] reward = np.array([reward])[None] done = np.array([done])[None] self.update_critic(action, next_action, observation[None], next_observation[None], reward, done) if (t > 0 and t % self.config.update_critic_freq == 0): self.sess.run(self.update_target_op, feed_dict={}) if (done or step == self.config.max_ep_len-1): episode_rewards.append(episode_reward) observation = self.env.reset() observation = np.tile(observation, (self.config.batch_size, 1)) episode_reward = 0 episode += 1 step = 0 else: observation = next_observation # tf stuff if (t % (self.config.summary_freq*self.config.batch_size) == 0 and t > 0): self.update_averages(episode_rewards, scores_eval) self.record_summary(t) if (t % self.config.batch_size == 0 and t > 0): # compute reward statistics for this batch and log avg_reward = np.mean(episode_rewards) sigma_reward = np.sqrt(np.var(episode_rewards) / len(episode_rewards)) msg = "Average reward: {:04.2f} +/- {:04.2f}".format(avg_reward, sigma_reward) self.logger.info(msg) scores_eval = scores_eval + episode_rewards episode_rewards = [] if self.config.record and (last_record > (self.config.record_freq*self.config.batch_size)): self.logger.info("Recording...") last_record =0 self.record() self.logger.info("- Training done.") export_plot(scores_eval, "Score", config.env_name, self.config.plot_output)
def train(self, exp_schedule, lr_schedule):
    # Initialize replay buffer and variables
    replay_buffer = ReplayBufferAC(self.FLAGS.buffer_size, self.FLAGS.state_hist)
    rewards = deque(maxlen=self.FLAGS.num_test)
    max_q_values = deque(maxlen=1000)
    q_values = deque(maxlen=1000)
    self.init_averages()

    t = 0  # time control of nb of steps
    loss_eval = grad_eval = 0
    scores_eval = []  # list of scores computed at iteration time
    # scores_eval += [self.evaluate(self.env, self.FLAGS.num_test)]

    self.prog = Progbar(target=self.FLAGS.train_steps)
    self.prog2 = Progbar(target=self.FLAGS.train_steps)

    # Train for # of train steps
    while t < self.FLAGS.train_steps:
        total_reward = 0
        ep_len = 0
        state = self.env.reset()
        reward = 0
        first = 1
        q_input = None

        # Run for 1 episode and update the buffer
        while True:
            ep_len += 1

            # replay memory stuff
            if first == 1:
                first = 0
                idx = replay_buffer.store_frame(state)
                q_input = replay_buffer.encode_recent_observation()

            # chose action according to current Q and exploration
            best_action, q_vals = self.network.get_best_action(q_input)
            action = exp_schedule.get_action(best_action)
            orig_val = self.network.calcState(q_input)

            # store q values
            max_q_values.append(max(q_vals))
            q_values += list(q_vals)

            # perform action in env
            new_state, new_reward, done, info = self.env.step(action)

            idx = replay_buffer.store_frame(state)
            q_input = replay_buffer.encode_recent_observation()
            new_val = self.network.calcState(q_input)

            orig_val = orig_val[0][0]
            new_val = new_val[0][0]
            print(orig_val, new_reward, done, new_val, ep_len)

            if not done:
                # Non-terminal state.
                target = reward + (self.FLAGS.gamma * new_val)
            else:
                target = reward + (self.FLAGS.gamma * new_reward)
            best_val = max((orig_val), target)
            actor_delta = new_val - orig_val

            replay_buffer.store_effect(idx - 1, action, new_reward, done,
                                       best_val, actor_delta)
            state = new_state
            if done:
                replay_buffer.store_effect(idx, action, 0, done, 0, 0)

            # Count reward
            total_reward += new_reward
            reward = new_reward

            # Stop at end of episode
            if done:
                break

        old_t = t
        temp_ep_len = ep_len
        while True:
            t += 1
            temp_ep_len -= 1

            if ((t > self.FLAGS.learn_start) and (t % self.FLAGS.learn_every == 0)):
                if replay_buffer.can_sample(self.FLAGS.batch_size):
                    loss_eval, grad_eval = self.network.update_critic_step(
                        t, replay_buffer, lr_schedule.epsilon, self.summary)

            # Update logs if necessary
            if ((t > self.FLAGS.learn_start) and (t % self.FLAGS.log_every == 0)):
                self.update_logs2(t, loss_eval, rewards, exp_schedule.epsilon,
                                  grad_eval, lr_schedule.epsilon)

            if temp_ep_len <= 0 or t >= self.FLAGS.train_steps:
                break

        rewards.append(total_reward)

        # Learn using replay
        while True:
            t += 1
            ep_len -= 1

            # Make train step if necessary
            if ((t > self.FLAGS.learn_start) and (t % self.FLAGS.learn_every == 0)):
                if replay_buffer.can_sample(self.FLAGS.batch_size):
                    loss_eval, grad_eval = self.network.update_actor_step(
                        t, replay_buffer, lr_schedule.epsilon, self.summary)
                    exp_schedule.update(t)
                    lr_schedule.update(t)

            # Update logs if necessary
            if ((t > self.FLAGS.learn_start) and (t % self.FLAGS.log_every == 0)):
                self.update_averages(rewards, max_q_values, q_values, scores_eval)
                self.update_logs(t, loss_eval, rewards, exp_schedule.epsilon,
                                 grad_eval, lr_schedule.epsilon)
            # Update logs if necessary
            elif (t < self.FLAGS.learn_start) and (t % self.FLAGS.log_every == 0):
                sys.stdout.write("\rPopulating the memory {}/{}...".format(
                    t, self.FLAGS.learn_start))
                sys.stdout.flush()

            if ((t > self.FLAGS.learn_start) and (t % self.FLAGS.check_every == 0)):
                # Evaluate current model
                scores_eval += [self.evaluate(self.env, self.FLAGS.num_test)]

                # Save current Model
                self.network.save()

                # Record video of current model
                if self.FLAGS.record:
                    self.record()

            if ep_len <= 0 or t >= self.FLAGS.train_steps:
                break

        # Update episodic rewards

    # End of training
    self.logger.info("- Training done.")
    self.network.save()
    scores_eval += [self.evaluate(self.env, self.FLAGS.num_test)]
    export_plot(scores_eval, "Scores", self.FLAGS.plot_path)
def export_score(self, scores_eval):
    export_plot(scores_eval, "Scores", self.config.plot_output)
def train(self): """ Performs training """ last_eval = 0 last_record = 0 scores_eval = [] self.init_averages() scores_eval = [] # list of scores computed at iteration time # Update learning rate if self.max_roll_distance > 400.0: self.learning_rate = pow(self.learning_rate, 0.9) for t in range(self.config.num_batches): # collect a minibatch of samples paths, total_rewards, rollout_distances = self.sample_path() scores_eval = scores_eval + total_rewards observations = np.concatenate( [path["observation"] for path in paths]) actions = np.concatenate([path["action"] for path in paths]) rewards = np.concatenate([path["reward"] for path in paths]) # compute Q-val estimates (discounted future returns) for each time step returns = self.get_returns(paths) advantages = self.calculate_advantage(returns, observations) #Check if current model is best: if max(rollout_distances) > self.max_max_roll_distance: print('New best model found! Saving under: ', self.config.best_model_output) self.saver.save(self.sess, self.config.best_model_output) # run training operations if self.config.use_baseline: self.update_baseline(returns, observations) self.sess.run(self.train_op, feed_dict={ self.observation_placeholder: observations, self.action_placeholder: actions, self.advantage_placeholder: advantages, self.lr: self.learning_rate }) # tf stuff if (t % self.config.summary_freq == 0): self.update_averages(total_rewards, scores_eval, rollout_distances) self.record_summary(t) print("Learning rate:", self.learning_rate) # compute reward statistics for this batch and log avg_reward = np.mean(total_rewards) sigma_reward = np.sqrt(np.var(total_rewards) / len(total_rewards)) msg = "Average reward: {:04.2f} +/- {:04.2f}".format( avg_reward, sigma_reward) self.logger.info(msg) self.saver.save(self.sess, self.config.model_output) self.logger.info("- Training done.") export_plot(scores_eval, "Score", config.env_name, self.config.plot_output)
def train(self, exp_schedule, lr_schedule):
    """
    Performs training of Q

    Args:
        exp_schedule: Exploration instance s.t.
            exp_schedule.get_action(best_action) returns an action
        lr_schedule: Schedule for learning rate
    """
    # initialize replay buffer and variables
    if not self.config.batch:
        replay_buffer = ReplayBuffer(
            self.config.buffer_size, self.config.state_history)
    else:
        self.logger.info(
            'Loading replay buffer from {}'.format(self.config.buffer_path))
        replay_buffer = ReplayBuffer.load(self.config.buffer_path)
        self.logger.info(
            'Loaded buffer with {} observations and {} in buffer'.format(
                len(replay_buffer.obs), replay_buffer.num_in_buffer))
    rewards = deque(maxlen=self.config.num_episodes_test)
    max_q_values = deque(maxlen=1000)
    q_values = deque(maxlen=1000)
    episode_lengths = deque(maxlen=1000)
    max_episode_length = 0
    self.init_averages()

    t = last_eval = last_record = 0  # time control of nb of steps
    scores_eval = []  # list of scores computed at iteration time
    scores_eval += [self.evaluate()]

    prog = Progbar(target=self.config.nsteps_train)

    # interact with environment
    while t < self.config.nsteps_train:
        total_reward = 0
        if not self.config.batch:
            state = self.env.reset()
        episode_step = 0
        avg_episode_length = (
            np.nan if len(episode_lengths) == 0 else np.mean(episode_lengths))
        while True:
            t += 1
            episode_step += 1
            last_eval += 1
            last_record += 1
            if self.config.render_train:
                self.env.render()

            if not self.config.batch:
                get_action = functools.partial(
                    exp_schedule.get_action,
                    episode_num=len(episode_lengths),
                    episode_step=episode_step,
                    avg_episode_length=avg_episode_length)
                state, reward, done, _q_values = self.interact(
                    replay_buffer, state, get_action)
            else:
                reward = 0
                done = True
                _q_values = [0]

            # store q values
            max_q_values.append(max(_q_values))
            q_values.extend(list(_q_values))

            # perform a training step
            loss_eval, grad_eval = self.train_step(
                t, replay_buffer, lr_schedule.epsilon)

            # logging stuff
            learning = (t > self.config.learning_start)
            learning_and_logging = (
                learning and (t % self.config.log_freq == 0)
                and (t % self.config.learning_freq == 0))

            if learning_and_logging:
                self.update_averages(
                    rewards, max_q_values, q_values, scores_eval,
                    episode_lengths, max_episode_length)
                exp_schedule.update(t)
                lr_schedule.update(t)
                if len(rewards) > 0:
                    if self.config.batch:
                        exact = [
                            ("Loss", loss_eval),
                            ("Grads", grad_eval),
                            ("lr", lr_schedule.epsilon),
                        ]
                    else:
                        exact = [
                            ("Loss", loss_eval),
                            ("Avg_R", self.avg_reward),
                            ("Max_R", np.max(rewards)),
                            ("eps", exp_schedule.epsilon),
                            ("Grads", grad_eval),
                            ("Max_Q", self.max_q),
                            ("lr", lr_schedule.epsilon),
                            ("avg_ep_len", avg_episode_length)
                        ]
                    prog.update(t + 1, exact=exact)
            elif not learning and (t % self.config.log_freq == 0):
                sys.stdout.write(
                    "\rPopulating the memory {}/{}...".format(
                        t, self.config.learning_start))
                sys.stdout.flush()

            # count reward
            total_reward += reward
            if done or t >= self.config.nsteps_train:
                episode_lengths.append(episode_step)
                if episode_step > max_episode_length:
                    max_episode_length = episode_step
                    # retrain the clusters every time the max episode
                    # length changes
                    if hasattr(self, 'reset_counts'):
                        self.reset_counts(
                            n_clusters=max_episode_length,
                            states=replay_buffer.get_encoded_states(),
                            actions=replay_buffer.get_actions())
                break

        # updates to perform at the end of an episode
        rewards.append(total_reward)

        should_evaluate = (
            (t > self.config.learning_start)
            and (last_eval > self.config.eval_freq))
        if should_evaluate:
            # evaluate our policy
            last_eval = 0
            print("")
            scores_eval.append(self.evaluate())

        should_record = (
            (t > self.config.learning_start)
            and self.config.record
            and (last_record > self.config.record_freq))
        if should_record:
            self.logger.info("Recording...")
            last_record = 0
            self.record()

    # last words
    self.logger.info("- Training done.")
    self.save()
    scores_eval.append(self.evaluate())
    export_plot(scores_eval, "Scores", self.config.plot_output)

    if not self.config.batch:
        # save replay buffer
        self.logger.info(
            'Saving buffer to {}'.format(self.config.buffer_path))
        replay_buffer.save(self.config.buffer_path)
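# A minimal sketch of schedule objects exposing the interface the loops above
# assume: `.epsilon`, `.update(t)`, and `.get_action(best_action, ...)`.
# Class names, the linear decay shape, and `num_actions` are assumptions made
# for illustration only.
import numpy as np

class LinearScheduleSketch(object):
    """Linearly anneal `epsilon` from eps_begin to eps_end over nsteps."""

    def __init__(self, eps_begin, eps_end, nsteps):
        self.epsilon = eps_begin
        self.eps_begin = eps_begin
        self.eps_end = eps_end
        self.nsteps = nsteps

    def update(self, t):
        frac = min(float(t) / self.nsteps, 1.0)
        self.epsilon = self.eps_begin + frac * (self.eps_end - self.eps_begin)


class LinearExplorationSketch(LinearScheduleSketch):
    """Epsilon-greedy wrapper: keep the greedy action or pick a random one."""

    def __init__(self, num_actions, eps_begin, eps_end, nsteps):
        super(LinearExplorationSketch, self).__init__(eps_begin, eps_end, nsteps)
        self.num_actions = num_actions

    def get_action(self, best_action, **kwargs):
        # extra keyword arguments (episode_num, episode_step, ...) are ignored
        if np.random.random() < self.epsilon:
            return np.random.randint(self.num_actions)
        return best_action

# Usage sketch: model.train(LinearExplorationSketch(4, 1.0, 0.1, 100000),
#                           LinearScheduleSketch(0.001, 0.0001, 100000))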
def train(self, model_a, exp_schedule, lr_schedule):
    """
    Performs training of Q

    Args:
        exp_schedule: Exploration instance s.t.
            exp_schedule.get_action(best_action) returns an action
        lr_schedule: Schedule for learning rate
    """
    # initialize replay buffer and variables
    replay_buffer = ReplayBuffer(self.config.buffer_size, self.config.state_history)
    rewards = deque(maxlen=self.config.num_episodes_test)
    max_q_values = deque(maxlen=1000)
    q_values = deque(maxlen=1000)
    self.init_averages()

    t = last_eval = last_record = 0  # time control of nb of steps
    scores_eval = []  # list of scores computed at iteration time
    #scores_eval += [self.evaluate()]

    prog = Progbar(target=self.config.nsteps_train)

    self.env.state.is_render_image = self.config.render_train
    model_a.env.state.is_render_image = model_a.config.render_train

    orientation_map = [np.array([0, 1]), np.array([-1, 0]),
                       np.array([0, -1]), np.array([1, 0])]

    # interact with environment
    while t < self.config.nsteps_train:
        total_reward = 0

        flag = True
        while flag:
            state = self.env.reset()  # h x w x c
            agent_location = self.env.state.agent_location
            if self.env.teacher.dist_map[agent_location[1], agent_location[0]] != np.inf:
                flag = False

        model_a.env.reset()
        model_a.env.state.copy_state(model_a.env.agent, self.env.state)

        h_state_fw = (np.zeros([1, self.config.h_size]), np.zeros([1, self.config.h_size]))
        h_state_bw = (np.zeros([1, self.config.h_size]), np.zeros([1, self.config.h_size]))

        state_batch = list()
        goal_state_batch = list()
        goal_obs_image_batch = list()
        path_loc = list()
        path_ori = list()
        done_batch = list()

        width, height = self.env.state.xmap.dim['width'], self.env.state.xmap.dim['height']
        side_radius = min(self.config.visible_radius_unit_side, max(width - 1, height - 1))
        block_size = self.env.state.image_block_size

        for i in range(200):
            #### teacher rotate ####
            agent_location = self.env.state.agent_location
            agent_orientation = self.env.state.agent_orientation
            goal_location = agent_location + agent_orientation
            gt_action = self.env.teacher.action_map[agent_location[1], agent_location[0]]
            if np.dot(agent_orientation, orientation_map[gt_action]) != 1:
                tmp = np.cross(agent_orientation, orientation_map[gt_action])
                if tmp == 1:
                    state, reward_i, done = self.env.step(3)
                else:
                    state, reward_i, done = self.env.step(2)
                continue

            path_loc.append(copy.deepcopy(goal_location))
            path_ori.append(copy.deepcopy(agent_orientation))

            raw_goal_state, goal_state = self.convert_state_to_goal_state(state)
            state_batch.append(raw_goal_state[None][None])
            goal_state_batch.append(goal_state)
            if self.config.render_train:
                goal_obs_image_batch.append(
                    self.env.state.image[:3 * block_size,
                                         (side_radius - 1) * block_size:(side_radius + 2) * block_size, :])

            state, reward_i, done = self.env.step(0)
            done_batch.append(done)
            if done:
                break

        slen = np.array([len(state_batch)]).astype('int32')
        state_batch = np.concatenate(state_batch, axis=1)
        best_action_batch, q_values_batch, h_state_fw, h_state_bw = self.get_best_action_batch(
            state_batch, h_state_fw, h_state_bw, slen)
        action_batch = exp_schedule.get_action_batch(best_action_batch)

        for i in range(q_values_batch.shape[0]):
            max_q_values.append(max(q_values_batch[i]))
            q_values += list(q_values_batch[i])

        reward_batch = list()
        for i, action in enumerate(action_batch):
            if action == 0:
                reward_batch.append(0)
            else:
                if self.config.render_train:
                    model_a.env.teacher.goal_obs_image = goal_obs_image_batch[i]
                h_state_a = (np.zeros([1, model_a.config.h_size]),
                             np.zeros([1, model_a.config.h_size]))
                model_a.env.teacher.set_goal(goal_state_batch[i], path_loc[i])
                reward_a = model_a.navi_goal(h_state_a, goal_state_batch[i])
                if model_a.env.teacher.goal_finish:
                    reward_batch.append(-0.05)
                else:
                    reward_batch.append(-0.1)
                #model_a.env.state.teleport(model_a.env.agent, path_loc[i], path_ori[i])

        if action_batch[-1] == 1 and model_a.env.teacher.goal_finish:
            reward_batch[-1] += 1
        else:
            if self.config.render_train:
                model_a.env.teacher.goal_obs_image = goal_obs_image_batch[-1]
            h_state_a = (np.zeros([1, model_a.config.h_size]),
                         np.zeros([1, model_a.config.h_size]))
            model_a.env.teacher.set_goal(goal_state_batch[-1], path_loc[-1])
            reward_a = model_a.navi_goal(h_state_a, goal_state_batch[-1])
            if model_a.env.teacher.goal_finish:
                reward_batch[-1] += 1

        for i in range(action_batch.shape[0]):
            idx = replay_buffer.store_frame(state_batch[0][i])
            replay_buffer.store_effect(idx, action_batch[i], reward_batch[i], done_batch[i])

        for i in range(action_batch.shape[0]):
            t += 1
            last_eval += 1
            last_record += 1

            # perform a training step
            loss_eval, grad_eval = self.train_step(t, replay_buffer, lr_schedule.epsilon)

            # logging stuff
            if ((t > self.config.learning_start) and (t % self.config.log_freq == 0)
                    and (t % self.config.learning_freq == 0)):
                self.update_averages(rewards, max_q_values, q_values, scores_eval)
                exp_schedule.update(t)
                lr_schedule.update(t)
                if len(rewards) > 0:
                    prog.update(t + 1, exact=[
                        ("Loss", loss_eval), ("Avg R", self.avg_reward),
                        ("Max R", np.max(rewards)), ("eps", exp_schedule.epsilon),
                        ("Grads", grad_eval), ("Max Q", self.max_q),
                        ("lr", lr_schedule.epsilon)])
            elif (t < self.config.learning_start) and (t % self.config.log_freq == 0):
                sys.stdout.write("\rPopulating the memory {}/{}...".format(
                    t, self.config.learning_start))
                sys.stdout.flush()

        # count reward
        total_reward = sum(reward_batch)

        # updates to perform at the end of an episode
        rewards.append(total_reward)

        if (t > self.config.learning_start) and (last_eval > self.config.eval_freq):
            # evaluate our policy
            last_eval = 0
            print("")
            self.logger.info("Global step: %d" % (t))
            scores_eval += [self.evaluate(model_a)]

    # last words
    self.logger.info("- Training done.")
    self.save(t)
    scores_eval += [self.evaluate(model_a)]
    export_plot(scores_eval, "Scores", self.config.plot_output)
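# A minimal sketch of the batched epsilon-greedy helper the loop above calls
# as exp_schedule.get_action_batch(best_action_batch). The standalone function
# name and the explicit `epsilon` / `num_actions` arguments are assumptions
# for illustration; the real schedule object keeps these internally.
import numpy as np

def get_action_batch_sketch(best_action_batch, epsilon, num_actions):
    """Per element, keep the greedy action or replace it with a random action
    with probability epsilon."""
    best_action_batch = np.asarray(best_action_batch)
    random_actions = np.random.randint(num_actions, size=best_action_batch.shape)
    explore = np.random.random(best_action_batch.shape) < epsilon
    return np.where(explore, random_actions, best_action_batch)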
def train(self, exp_schedule, lr_schedule):
    """
    Performs training of Q

    Args:
        exp_schedule: Exploration instance s.t.
            exp_schedule.get_action(best_action) returns an action
        lr_schedule: Schedule for learning rate
    """
    # initialize replay buffer and variables
    if self.config.use_memory:
        replay_buffer = ReplayBuffer(self.config.buffer_size, self.config.state_history,
                                     memory_size=self.config.memory_unit_size)
    else:
        replay_buffer = ReplayBuffer(self.config.buffer_size, self.config.state_history)
    rewards = deque(maxlen=self.config.num_episodes_test)
    max_q_values = deque(maxlen=1000)
    q_values = deque(maxlen=1000)
    self.init_averages()

    t = last_eval = last_record = 0  # time control of nb of steps
    scores_eval = []  # list of scores computed at iteration time
    scores_eval += [self.evaluate()[0]]

    prog = Progbar(target=self.config.nsteps_train)

    evaluation_result_list = []
    oos_evaluation_result_list = []

    # interact with environment
    prev_time = time.time()
    while t < self.config.nsteps_train:
        total_reward = 0
        state = self.env.reset()
        while True:
            t += 1
            last_eval += 1
            last_record += 1
            if self.config.render_train:
                self.env.render()

            # replay memory stuff
            idx = replay_buffer.store_frame(state)
            q_input = replay_buffer.encode_recent_observation()

            # note: the returned values are bound to q_vals (not q_values) so
            # that the rolling q_values deque above is not clobbered each step
            if self.config.use_memory:
                prev_memory = replay_buffer.encode_recent_memory()
                best_action, q_vals, _, next_memory = self.get_best_action_with_memory(
                    q_input, prev_memory)
                next_memory = np.squeeze(next_memory)
            else:
                best_action, q_vals = self.get_best_action(q_input)

            # chose action according to current Q and exploration
            action = exp_schedule.get_action(best_action)

            # store q values
            max_q_values.append(max(q_vals))
            q_values += list(q_vals)

            # perform action in env
            new_state, reward, done, info = self.env.step(action)

            # store the transition
            replay_buffer.store_effect(idx, action, reward, done)
            if self.config.use_memory:
                replay_buffer.store_memory(idx, next_memory)
            state = new_state

            # perform a training step
            loss_eval, grad_eval = self.train_step(t, replay_buffer, lr_schedule.epsilon)

            # logging stuff
            time_log_freq = 1000
            if t % time_log_freq == 0:
                with open(self.config.output_path + 'time_log.txt', 'a') as of:
                    of.write('{}\n'.format(time.time() - prev_time))
                    of.write('\n')
                prev_time = time.time()
            if ((t > self.config.learning_start) and (t % self.config.log_freq == 0)
                    and (t % self.config.learning_freq == 0)):
                self.update_averages(rewards, max_q_values, q_values, scores_eval)
                exp_schedule.update(t)
                lr_schedule.update(t)
                if len(rewards) > 0:
                    prog.update(t + 1, exact=[
                        ("Loss", loss_eval), ("Avg_R", self.avg_reward),
                        ("Max_R", np.max(rewards)), ("eps", exp_schedule.epsilon),
                        ("Grads", grad_eval), ("Max_Q", self.max_q),
                        ("lr", lr_schedule.epsilon)])
            elif (t < self.config.learning_start) and (t % self.config.log_freq == 0):
                sys.stdout.write("\rPopulating the memory {}/{}...".format(
                    t, self.config.learning_start))
                sys.stdout.flush()

            # count reward
            total_reward += reward
            if done or t >= self.config.nsteps_train:
                break

        # updates to perform at the end of an episode
        rewards.append(total_reward)

        if (t > self.config.learning_start) and (last_eval > self.config.eval_freq):
            # evaluate our policy
            last_eval = 0
            print("")
            score, complete, length = self.evaluate()
            if complete > 0:
                evaluation_result_list += [(score, complete, length)]
            if score > self.config.extended_eval_threshold:
                self.logger.info('Extended in-sample evaluation...')
                self.evaluate(num_episodes=1000)
                for _ in range(10):
                    self.logger.info('Extended out-of-sample evaluation...')
                    oos_result = self.evaluate(
                        EnvMaze(n=self.config.maze_size), num_episodes=100)
                    oos_evaluation_result_list += [oos_result]
            scores_eval += [score]

        if (t > self.config.learning_start) and self.config.record and (
                last_record > self.config.record_freq):
            self.logger.info("Recording...")
            last_record = 0
            self.record()

    # last words
    self.logger.info("- Training done.")
    self.save()
    scores_eval += [self.evaluate()[0]]
    export_plot(scores_eval, "Scores", self.config.plot_output)
    return evaluation_result_list, oos_evaluation_result_list
def train(self, exp_schedule, lr_schedule):
    """
    Performs training of Q

    Args:
        exp_schedule: Exploration instance s.t.
            exp_schedule.get_action(best_action) returns an action
        lr_schedule: Schedule for learning rate
    """
    # initialize replay buffer and variables
    replay_buffer = ReplayBuffer(self.config.buffer_size, self.config.state_history, self.config)
    rewards = deque(maxlen=self.config.num_episodes_test)
    max_q_values = deque(maxlen=1000)
    q_values = deque(maxlen=1000)
    self.init_averages()

    t = last_eval = last_record = 0  # time control of nb of steps
    scores_eval = []  # list of scores computed at iteration time
    scores_eval += [self.evaluate()]

    prog = Progbar(target=self.config.nsteps_train)

    # interact with environment
    while t < self.config.nsteps_train:
        total_reward = 0
        state = self.env.reset()
        while True:
            t += 1
            last_eval += 1
            last_record += 1
            if self.config.render_train:
                self.env.render()

            # replay memory stuff
            idx = replay_buffer.store_frame(state)
            q_input = replay_buffer.encode_recent_observation()

            # chose action according to current Q and exploration
            # (q_vals avoids shadowing the q_values deque defined above)
            best_action, q_vals = self.get_best_action(q_input)
            action, explore = exp_schedule.get_action(best_action)

            # store q values
            max_q_values.append(max(q_vals))
            q_values += list(q_vals)

            # perform action in env
            new_state, reward, done, info = self.env.step(action)

            # store the transition
            replay_buffer.store_effect(idx, action, reward, done, explore)
            state = new_state

            # perform a training step
            loss_eval, grad_eval = self.train_step(t, replay_buffer, lr_schedule.epsilon,
                                                   exp_schedule.epsilon)

            # logging stuff
            if ((t > self.config.learning_start) and (t % self.config.log_freq == 0)
                    and (t % self.config.learning_freq == 0)):
                self.update_averages(rewards, max_q_values, q_values, scores_eval)
                exp_schedule.update(t)
                lr_schedule.update(t)
                if len(rewards) > 0:
                    prog.update(t + 1, exact=[
                        ("Loss", loss_eval), ("Avg R", self.avg_reward),
                        ("Max R", np.max(rewards)), ("eps", exp_schedule.epsilon),
                        ("Grads", grad_eval), ("Max Q", self.max_q),
                        ("lr", lr_schedule.epsilon)])
            elif (t < self.config.learning_start) and (t % self.config.log_freq == 0):
                sys.stdout.write("\rPopulating the memory {}/{}...".format(
                    t, self.config.learning_start))
                sys.stdout.flush()

            # count reward
            total_reward += reward
            if done or t >= self.config.nsteps_train:
                break

        # updates to perform at the end of an episode
        rewards.append(total_reward)

        if (t > self.config.learning_start) and (last_eval > self.config.eval_freq):
            # evaluate our policy
            last_eval = 0
            print("")
            scores_eval += [self.evaluate()]

        if (t > self.config.learning_start) and self.config.record and (
                last_record > self.config.record_freq):
            self.logger.info("Recording...")
            last_record = 0
            self.record()

    # last words
    self.logger.info("- Training done.")
    self.save()
    scores_eval += [self.evaluate()]
    export_plot(scores_eval, "Scores", self.config.plot_output)
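# A toy stand-in for the ReplayBuffer interface these loops rely on
# (store_frame -> idx, encode_recent_observation, store_effect). It is a
# simplified assumption for illustration: no frame-history stacking and no
# minibatch sampling, just a fixed-size ring buffer.
import numpy as np

class SimpleReplayBufferSketch(object):
    def __init__(self, size):
        self.size = size
        self.next_idx = 0
        self.num_in_buffer = 0
        self.frames = [None] * size
        self.effects = [None] * size

    def store_frame(self, frame):
        # store the raw observation and return its slot index
        idx = self.next_idx
        self.frames[idx] = np.asarray(frame)
        self.next_idx = (self.next_idx + 1) % self.size
        self.num_in_buffer = min(self.num_in_buffer + 1, self.size)
        return idx

    def encode_recent_observation(self):
        # most recently stored frame (no history stacking in this sketch)
        return self.frames[(self.next_idx - 1) % self.size]

    def store_effect(self, idx, action, reward, done, explore=False):
        # record what happened after the frame stored at `idx`
        self.effects[idx] = (action, reward, done, explore)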
def train(self, exp_schedule, lr_schedule):
    """
    Performs training of Q

    Args:
        exp_schedule: Exploration instance s.t.
            exp_schedule.get_action(best_action) returns an action
        lr_schedule: Schedule for learning rate
    """
    # initialize replay buffer and variables
    replay_buffer = ReplayBuffer(self.config.buffer_size, self.config.state_history)
    rewards = deque(maxlen=self.config.num_episodes_test)
    max_q_values = deque(maxlen=1000)
    q_values = deque(maxlen=1000)
    self.init_averages()

    t = last_eval = last_record = 0  # time control of nb of steps
    scores_eval = []  # list of scores computed at iteration time
    #scores_eval += [self.evaluate()]

    prog = Progbar(target=self.config.nsteps_train)

    self.env.state.is_render_image = self.config.render_train

    # interact with environment
    while t < self.config.nsteps_train:
        total_reward = 0
        state = self.env.reset()  # h x w x c
        goal_state = self.env.teacher.goal_obs_onehot_state  # h x w x c
        h_state = (np.zeros([1, self.config.h_size]), np.zeros([1, self.config.h_size]))
        slen = np.ones(1).astype('int32')
        action = 0
        for i in range(200):
            t += 1
            last_eval += 1
            last_record += 1
            if self.config.render_train:
                self.env.render()

            #### for replay_buffer
            # replay memory stuff
            idx = replay_buffer.store_frame(state, goal_state)
            q_input = replay_buffer.encode_recent_observation()

            # chose action according to current Q and exploration
            # (q_vals avoids shadowing the q_values deque defined above)
            curr_attention = np.equal(
                np.sum(np.equal(q_input, goal_state[None][None][None]), 3), q_input.shape[3])
            best_action, q_vals, h_state = self.get_best_action(
                [q_input], curr_attention[None], h_state, slen, [action])
            #best_action, q_vals, h_state = self.get_best_action([q_input], goal_state[None][None], h_state, slen, [action])
            action = exp_schedule.get_action(best_action)

            # store q values
            max_q_values.append(max(q_vals))
            q_values += list(q_vals)

            # perform action in env
            new_state, reward, done = self.env.step(action)

            # store the transition
            replay_buffer.store_effect(idx, action, reward, done)
            state = new_state

            # perform a training step
            loss_eval, grad_eval = self.train_step(t, replay_buffer, lr_schedule.epsilon)

            # logging stuff
            if ((t > self.config.learning_start) and (t % self.config.log_freq == 0)
                    and (t % self.config.learning_freq == 0)):
                self.update_averages(rewards, max_q_values, q_values, scores_eval)
                exp_schedule.update(t)
                lr_schedule.update(t)
                if len(rewards) > 0:
                    prog.update(t + 1, exact=[
                        ("Loss", loss_eval), ("Avg R", self.avg_reward),
                        ("Max R", np.max(rewards)), ("eps", exp_schedule.epsilon),
                        ("Grads", grad_eval), ("Max Q", self.max_q),
                        ("lr", lr_schedule.epsilon)])
            elif (t < self.config.learning_start) and (t % self.config.log_freq == 0):
                sys.stdout.write("\rPopulating the memory {}/{}...".format(
                    t, self.config.learning_start))
                sys.stdout.flush()

            # count reward
            total_reward += reward
            if done or t >= self.config.nsteps_train:
                break

        # updates to perform at the end of an episode
        rewards.append(total_reward)

        if (t > self.config.learning_start) and (last_eval > self.config.eval_freq):
            # evaluate our policy
            last_eval = 0
            print("")
            self.logger.info("Global step: %d" % (t))
            scores_eval += [self.evaluate()]

    # last words
    self.logger.info("- Training done.")
    self.save(t)
    scores_eval += [self.evaluate()]
    export_plot(scores_eval, "Scores", self.config.plot_output)