                    help='Save the experiment at the end')
args = parser.parse_args()

try:
    N = int(args.training)
except ValueError:
    if args.training == 'full':
        N = 60000
    else:
        print("Couldn't cast the number of training samples; using 10000.")
        N = 10000

max_epochs = 15
max_epochs_ft = args.epochs
if args.debug:
    N = N // 10
    max_epochs = 10
    max_epochs_ft = max(10, max_epochs_ft)

model = Experience(N, name=args.name, disp=args.img, noise=args.noise)
if not model.exists:
    model.pretrain(epochs=max_epochs, lr=0.1)
    model.save()
    print('Pretraining done\n')

model.fine_tune(epochs=max_epochs_ft, lr=0.05, dropout=args.dropout,
                lcost=args.lcost)
model.eval_perf()
model.save()
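# NOTE: the argparse setup that precedes this fragment is truncated above.
# Below is a hypothetical reconstruction based purely on the attribute
# accesses in the fragment (args.training, args.epochs, args.debug,
# args.name, args.img, args.noise, args.dropout, args.lcost); the flag
# names, types, and defaults are assumptions, not the original code.
import argparse

parser = argparse.ArgumentParser(description='Pretrain and fine-tune a model.')
parser.add_argument('--training', default='10000',
                    help="Number of training samples, or 'full' for 60000")
parser.add_argument('--epochs', type=int, default=15,
                    help='Number of fine-tuning epochs')
parser.add_argument('--debug', action='store_true',
                    help='Shrink the dataset and epoch counts for debugging')
parser.add_argument('--name', default=None, help='Name of the experiment')
parser.add_argument('--img', action='store_true', help='Display images')
parser.add_argument('--noise', type=float, default=0.0,
                    help='Noise level applied to the inputs')
parser.add_argument('--dropout', type=float, default=0.5,
                    help='Dropout rate used during fine-tuning')
parser.add_argument('--lcost', type=float, default=0.0,
                    help='Cost-term weight used during fine-tuning')
parser.add_argument('--save', action='store_true',
                    help='Save the experiment at the end')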
def run_episode(self):
    self.env.reset()
    done = False
    experiences = []

    time_count = 0
    while (not done) and time_count <= Config.TIME_MAX:
        # Very first few frames: the frame queue is still empty, so take a
        # random action until a full current_state is available.
        if len(self.env.current_state) == 0:
            reward, done, p_mask, region_now = self.env.step(
                np.random.randint(0, 4, 1))
            if Config.PLAY_MODE and not done:
                draw_gif_sequences_test(time_count, region_now,
                                        self.env.img_name, save_boolean=1)
            elif Config.PLAY_MODE and done:
                drawing_gif(self.env.img_name)
            time_count += 1
            continue

        prediction, value = self.predict(self.env.current_state)
        action = self.select_action(prediction)

        if time_count < Config.TIME_MAX:
            reward, done, p_mask, region_now = self.env.step(action)
            exp = Experience(self.env.previous_state, action, reward, reward,
                             done, p_mask)
        else:
            # Out of time: force the terminating action (4).
            reward, done, p_mask, region_now = self.env.step(4)
            exp = Experience(self.env.previous_state, 4, reward, reward,
                             done, p_mask)
        experiences.append(exp)

        if Config.PLAY_MODE and not done:
            draw_gif_sequences_test(time_count, region_now,
                                    self.env.img_name, save_boolean=1)
        elif Config.PLAY_MODE and done:
            drawing_gif(self.env.img_name)

        if done:
            # The episode is over, so the terminal reward is 0 (no
            # bootstrapping from the value estimate is needed).
            terminal_reward = 0
            updated_exps = ProcessAgent._accumulate_rewards(
                experiences, self.discount_factor, terminal_reward)
            x_, r_, r0_, a_, p_mask_ = self.convert_data(updated_exps)
            # Reset the buffer; the episode is over.
            experiences = []
            yield x_, r_, r0_, a_, p_mask_, self.env.img_name

        time_count += 1
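# select_action() is called above but not shown. In GA3C-style agents it
# usually samples from the policy's output distribution during training; a
# minimal sketch under that assumption (the greedy branch for PLAY_MODE is
# a guess, not confirmed by this file):
def select_action(self, prediction):
    # prediction: 1-D array of action probabilities from the policy head
    if Config.PLAY_MODE:
        return int(np.argmax(prediction))
    return int(np.random.choice(len(prediction), p=prediction))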
        for x in range(LEARNING_SAMPLE_SIZE):
            picked.append(self.storage[random.randint(0, self.index - 1)])
        return picked

    def print_storage(self, range_lower=None, range_upper=None):
        # For testing purposes
        if range_lower is None and range_upper is None:
            print(self.storage)
        else:
            print(self.storage[range_lower:range_upper])


if __name__ == "__main__":
    # Perform tests
    ds = ReplayMemory()
    for x in range(3):
        y = Experience(x, x, x, x)
        ds.store(y)
    ds.print_storage(0, 4)
    print(ds.get_random(32))

# Numpy array results, n = 30,000,000:
#   real 0m13.407s
#   user 0m13.078s
#   sys  0m0.279s
#
# Python list results, n = 30,000,000:
#   real 0m15.228s
#   user 0m14.727s
#   sys  0m0.499s
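# The timings above read like output from the shell's `time` command. A
# self-contained sketch of how such a store-throughput comparison might be
# reproduced (the loop body used for the original numbers is unknown):
import time
import numpy as np

def bench_store(make_storage, n=30_000_000):
    storage = make_storage(n)
    start = time.perf_counter()
    for i in range(n):
        storage[i] = i
    return time.perf_counter() - start

# Example: bench_store(lambda n: np.empty(n, dtype=object)) vs.
#          bench_store(lambda n: [None] * n)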
def run_episode(self):
    # Initialize
    self.env.reset()
    game_over = False
    experiences = [[] for _ in range(Config.MAX_NUM_AGENTS_IN_ENVIRONMENT)]
    updated_exps = [None for _ in range(Config.MAX_NUM_AGENTS_IN_ENVIRONMENT)]
    updated_leftover_exps = [None for _ in range(Config.MAX_NUM_AGENTS_IN_ENVIRONMENT)]
    time_counts = np.zeros(Config.MAX_NUM_AGENTS_IN_ENVIRONMENT)
    reward_sum_logger = np.zeros(Config.MAX_NUM_AGENTS_IN_ENVIRONMENT)
    which_agents_done_and_trained = np.full(
        Config.MAX_NUM_AGENTS_IN_ENVIRONMENT, False, dtype=bool)

    while not game_over:
        # (An initial null-action step -- stepping with action -1 until
        # self.env.current_state is available -- is currently disabled.)

        # Prediction and action selection for every learning agent
        actions = {}
        predictions = np.empty((Config.MAX_NUM_AGENTS_IN_ENVIRONMENT,
                                Config.NUM_ACTIONS))
        values = np.empty(Config.MAX_NUM_AGENTS_IN_ENVIRONMENT)
        for i, agent_observation in enumerate(self.env.latest_observations):
            is_agent_running_ga3c = agent_observation[0]
            if not is_agent_running_ga3c:
                continue
            prediction, value = self.predict(agent_observation)
            action = self.select_action(prediction)
            predictions[i] = prediction
            values[i] = value
            actions[i] = action

        # Take action --> receive reward, done (this also stores
        # self.env.previous_state for access below)
        rewards, game_over, infos = self.env.step([actions], self.pid,
                                                  self.count)
        rewards = rewards[0]  # Only use 1 env from the VecEnv
        if Config.TRAIN_SINGLE_AGENT:
            # Make the single agent's reward look like a list of agents' rewards
            rewards = np.expand_dims(rewards, axis=0)
        which_agents_done = infos[0]['which_agents_done']
        which_agents_learning = infos[0]['which_agents_learning']
        num_agents_running_ga3c = np.sum(list(which_agents_learning.values()))

        # Loop through all feedback from the environment (which may not be
        # equal to Config.MAX_NUM_AGENTS)
        for i in which_agents_learning.keys():
            if not which_agents_learning[i]:
                continue

            # Reward
            reward_sum_logger[i] += rewards[i]

            prediction = predictions[i]
            value = values[i]
            action = actions[i]
            reward = rewards[i]
            done = which_agents_done[i]

            # Add to experience
            exp = Experience(self.env.previous_state[0, i, :], action,
                             prediction, reward, done)
            experiences[i].append(exp)

            # If the episode is done:
            # Config.TIME_MAX controls how often data is yielded/sent back
            # to the for loop in run().
            # It is used to ensure, for games with long episodes, that data
            # is sent back to the trainers sufficiently often. The shorter
            # Config.TIME_MAX is, the more often the data queue is updated.
            if which_agents_done[i] or (time_counts[i] == Config.TIME_MAX
                                        and not which_agents_done_and_trained[i]):
                if which_agents_done[i]:
                    terminal_reward = 0
                    which_agents_done_and_trained[i] = True
                else:
                    terminal_reward = value
                updated_exps[i], updated_leftover_exps[i] = self._accumulate_rewards(
                    experiences[i], self.discount_factor, terminal_reward,
                    which_agents_done[i])

                x_, r_, a_ = self.convert_to_nparray(updated_exps[i])
                # Send data back without quitting the current function.
                yield x_, r_, a_, reward_sum_logger[i] / num_agents_running_ga3c
                # NOTE: total_reward_logger in self.run() accumulates
                # reward_sum_logger, so it is correct to reset it here.
                reward_sum_logger[i] = 0.0

                if updated_leftover_exps[i] is not None:
                    x_, r_, a_ = self.convert_to_nparray(updated_leftover_exps[i])
                    # TODO (minor): figure out what to send back in terms of
                    # rnn_state. Technically it should be rnn_state[-1].
                    yield x_, r_, a_, reward_sum_logger[i]

                # Reset the tmax count
                time_counts[i] = 0
                # Keep the last experience for the next batch
                experiences[i] = [experiences[i][-1]]

            time_counts[i] += 1
        self.count += 1
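# convert_to_nparray() is referenced above but not shown. A minimal sketch,
# assuming each Experience carries .state, .action, and .reward fields (with
# .reward already replaced by the discounted return in _accumulate_rewards)
# and that actions are one-hot encoded for the policy-gradient update:
def convert_to_nparray(self, experiences):
    x_ = np.array([exp.state for exp in experiences])
    r_ = np.array([exp.reward for exp in experiences])
    a_ = np.eye(Config.NUM_ACTIONS)[[exp.action for exp in experiences]]
    return x_, r_, a_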
def run_episode(self):
    self.env.reset()
    game_done = False
    experiences = []
    time_count = 0
    frame_count = 0
    reward_sum_logger = 0.0

    if Config.USE_OPTIONS:
        self.option_terminated = True

    if Config.USE_RNN:
        # Input states for prediction
        rnn_state = CustomLayers.RNNInputStateHandler.get_rnn_dict(
            init_with_zeros=True,
            n_lstm_layers_total=self.model.n_lstm_layers_total)
        # Input states for training
        init_rnn_state = CustomLayers.RNNInputStateHandler.get_rnn_dict(
            init_with_zeros=True,
            n_lstm_layers_total=self.model.n_lstm_layers_total)
    else:
        rnn_state = None
        init_rnn_state = None

    if self.id == 0 and self.is_option_tracker_on:
        self.option_tracker._reset_tracker(vis_count)

    while not game_done:
        # Initial step (used to ensure frame_q is full before trying to grab
        # a current_state for prediction)
        if Config.USE_AUDIO and (self.env.current_state[0] is None
                                 and self.env.current_state[1] is None):
            self.env.step(0)  # Action 0 corresponds to the null action
            continue
        elif self.env.current_state is None:
            self.env.step(0)  # Action 0 corresponds to the null action
            continue

        if self.is_option_tracker_on:
            agt_loc = self.env.game.agent_loc

        # Option prediction
        if Config.USE_OPTIONS:
            if self.option_terminated:
                i_option = 0  # NOTE: fake option input
                prediction_dict = self.predict(self.env.current_state,
                                               rnn_state, i_option)
                # NOTE: the option is selected correctly in here
                i_option = self.select_option(prediction_dict)
        else:
            i_option = None

        # Primitive action prediction (for option and non-option cases)
        if self.id == 0:
            print("frame_count {}, i_option: {}".format(frame_count, i_option))
        prediction_dict = self.predict(self.env.current_state, rnn_state,
                                       i_option)

        # Update rnn_state
        if Config.USE_RNN:
            rnn_state = prediction_dict['rnn_state_out']

        # Visualize the training or test process
        if self.id == 0:
            if Config.USE_ATTENTION:
                self.vis_attention_i.append(prediction_dict['attn'][0])
                self.vis_attention_a.append(prediction_dict['attn'][1])
            else:
                self.vis_attention_i = None
                self.vis_attention_a = None
            self.env.visualize_env(self.vis_attention_i,
                                   self.vis_attention_a, vis_count)

        # Select action
        i_action = self.select_action(prediction_dict)

        # Take action --> receive reward, game_done (this also stores
        # self.env.previous_state for access below)
        reward, game_done = self.env.step(i_action)
        reward = np.clip(reward, Config.REWARD_MIN, Config.REWARD_MAX)
        if Config.USE_OPTIONS:
            # Deliberation cost: penalize terminating (switching) options.
            reward -= (float(self.option_terminated)
                       * self.model.option_cost_delib
                       * float(frame_count > 1))
            self.option_terminated = (
                prediction_dict['option_term_probs'][i_option]
                > np.random.rand())
        reward_sum_logger += reward

        # Add to experience
        if Config.USE_AUDIO:
            exp = Experience(self.env.previous_state[0],
                             self.env.previous_state[1], i_action, i_option,
                             reward, game_done)
        else:
            exp = Experience(self.env.previous_state, None, i_action,
                             i_option, reward, game_done)
        experiences.append(exp)

        # Plot option trajectories
        if self.id == 0 and self.is_option_tracker_on:
            self.option_tracker._update_tracker(agt_loc, i_option,
                                                self.option_terminated)
            self.option_tracker._plot_tracker()

        # Config.TIME_MAX controls how often data is yielded/sent back to
        # the for loop in run().
        # It is used to ensure, for games with long episodes, that data is
        # sent back to the trainers sufficiently often. The shorter
        # Config.TIME_MAX is, the more often the data queue is updated.
        if game_done or time_count == Config.TIME_MAX:  # or self.option_terminated:
            if Config.USE_OPTIONS:
                if self.option_terminated:
                    value = (prediction_dict['option_v_model']
                             - self.model.option_cost_delib
                             * float(frame_count > 1))
                else:
                    value = prediction_dict['option_q_model'][i_option]
                terminal_reward = 0 if game_done else value
            else:
                # Ref: A3C Algorithm S2 (n-step Q-learning)
                terminal_reward = 0 if game_done else prediction_dict['v']

            updated_exps, updated_leftover_exp = ProcessAgent._accumulate_rewards(
                experiences, self.discount_factor, terminal_reward, game_done)

            # NOTE: if Config.USE_AUDIO is False, audio_ is None
            x_, audio_, r_, a_, o_ = self.convert_to_nparray(updated_exps)
            yield x_, audio_, r_, a_, o_, init_rnn_state, reward_sum_logger
            # NOTE: total_reward_logger in self.run() accumulates
            # reward_sum_logger, so reset it here.
            reward_sum_logger = 0.0

            if updated_leftover_exp is not None:
                x_, audio_, r_, a_, o_ = self.convert_to_nparray(
                    updated_leftover_exp)
                yield x_, audio_, r_, a_, o_, init_rnn_state, reward_sum_logger

            # Reset the tmax count
            time_count = 0
            # Keep the last experience for the next batch
            experiences = [experiences[-1]]
            if Config.USE_RNN:
                init_rnn_state = rnn_state

        time_count += 1
        frame_count += 1
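# _accumulate_rewards() is referenced above but not shown. The core of the
# GA3C version is n-step return accumulation: walk the batch backwards from
# the bootstrapped terminal_reward, replacing each experience's reward with
# its discounted return. A minimal sketch of that core; the game_done /
# leftover-experience handling of the 4-argument variant called above is
# this codebase's own extension and is not reproduced here:
@staticmethod
def _accumulate_rewards(experiences, discount_factor, terminal_reward):
    reward_sum = terminal_reward
    for t in reversed(range(len(experiences) - 1)):
        reward_sum = discount_factor * reward_sum + experiences[t].reward
        experiences[t].reward = reward_sum
    # The final experience was only needed to bootstrap the return.
    return experiences[:-1]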
def logArrival(self, simtime, stageId):
    # Record the arrival at this stage; wait/system times are unknown until
    # departure, so use NaN as a sentinel value.
    self._experience[stageId] = Experience(stageId, simtime)
    self._currLocation = stageId
    self._totalWaitTime = math.nan
    self._totalSystemTime = math.nan
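# The NaN sentinels above are presumably resolved once the entity leaves the
# stage. logDeparture() below is a hypothetical companion method (not in the
# original source) illustrating one way the sentinels could be filled in,
# assuming the Experience created in logArrival stores its arrival time:
def logDeparture(self, simtime, stageId):
    exp = self._experience[stageId]
    self._totalSystemTime = simtime - exp.arrivalTime  # hypothetical field
    self._totalWaitTime = self._totalSystemTime - exp.serviceTime  # hypothetical field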