def run_bbox(verbose=False):
    '''
    Play one Blackbox level while updating the tabular Q-function in place.

    Every step looks up Q(state, action), performs the action, and blends
    the stored value with ``reward + gamma * Q(next_state, next_action)``
    using the module-level ``alpha`` and ``gamma`` settings.

    :param verbose: forwarded to ``get_action`` for both action lookups
    '''
    prepare_bbox()

    running = True
    while running:
        ## Key of the state the agent is in before acting
        cur_key = get_state_tuple(bbox.get_state())

        ## Action chosen for that state
        chosen = get_action(cur_key, verbose=verbose, is_current=True)

        ## Score reported by the environment before the action is applied
        reward = bbox.get_score()
        print('Reward = ' + str(reward))

        ## Value currently stored for this (state, action) pair
        old_q = q_function[cur_key][chosen]
        print('Current Q = ' + str(old_q))

        ## Step the environment and look at where we ended up
        running = bbox.do_action(chosen)
        nxt_key = get_state_tuple(bbox.get_state())

        ## Value of the action that would be taken from the new state
        nxt_action = get_action(nxt_key, verbose=verbose, is_current=False)
        nxt_q = q_function[nxt_key][nxt_action]

        ## Blend the old estimate with the one-step target
        q_function[cur_key][chosen] = (1 - alpha) * old_q + alpha * (reward + gamma * nxt_q)
        print('Updated Q = ' + str(q_function[cur_key][chosen]))

    bbox.finish(verbose=True)
def get_state(self):
    """Return the current state exactly as reported by ``bbox.get_state()``."""
    current = bbox.get_state()
    return current
def eval_game(game: Game, dqn: DQN, action, q_vals, queue, root_index, root=True): """ Called by look_ahead function. Used to evaluate a state, update Q value, enumerate and enqueue possible child actions. By default, this treats the root actions first. Args: game, A Game object to be evaluated dqn, A deep Q learning network object to evaluate state action, A tuple representing an action and its optional target q_vals, A shared mem array for the global Q vales queue, A Queue to store child actions root_index, The index of the root action in q_vals root(=True), Whether or not these are the root actions Returns: self """ # (local) copy game object, perform action, get state feature vector, evaluate perform_action(action, game.current_player, game) state = get_state(game) #Pass to Tensorflow here to evaluate s_val = dqn.get_q_value(state, "dqn") print("Action:", action) print("Q value: %f", s_val) """
def run_bbox():
    """
    Run the test level with a blend of two linear models.

    Per step, a four-entry score vector is assembled from the two-output
    model (``lr_coefs_1`` / ``lr_free_coefs_1``), adjusted by ``state[35]``,
    mixed with the four-output model (``lr_coefs_0`` / ``lr_free_coefs_0``)
    and the arg-max entry is played.  An action repeated for more than
    ``crit_len`` consecutive steps gets a small penalty.
    """
    f_35_penalty = 0.15
    k = 0
    w0 = 0.13
    crit_len = 150

    bbox.load_level("levels/test_level.data", verbose=0)

    act = -1
    act_len = 0
    pair = np.zeros(2)
    blend = np.zeros(4)  # entry 3 is never written and stays 0

    has_next = True
    while has_next:
        last_act = act
        state = bbox.get_state()
        s35 = state[35]

        pair[:2] = np.dot(lr_coefs_1, state[:-1]) + lr_free_coefs_1

        # The sign of state[35] decides which prediction feeds entries 1 and 2.
        if s35 > 0:
            blend[1], blend[2] = pair[0] + k, -pair[0] + k
        elif s35 < 0:
            blend[1], blend[2] = -pair[1] + k, pair[1] + k
        elif s35 == 0:
            blend[1], blend[2] = pair[0] + k, pair[1] + k

        blend[0] = (blend[1] + blend[2]) / 2 + k
        blend[1] -= f_35_penalty * s35
        blend[2] += f_35_penalty * s35

        # Discourage an action that has been held for too long.
        if act_len > crit_len:
            blend[last_act] -= 0.0078125

        linear = np.dot(lr_coefs_0, state) + lr_free_coefs_0
        act = (w0 * linear / 6.366 + (1 - w0) * blend).argmax()
        has_next = bbox.do_action(act)

        act_len = act_len + 1 if last_act == act else 0

    bbox.finish(verbose=1)
def run_bbox(rnet_model, train_data, train_level=True, verbose=True):
    """
    Play one complete black box session driven by ``rnet_model``.

    :param rnet_model: model exposing ``get_action(state)``
    :param train_data: DataSet object whose buffer accumulates each state
    :param train_level: boolean, passed to ``prepare_bbox`` to pick the level
    :param verbose: boolean, print time/score every 5000 steps if True
    :return: the value returned by ``bbox.finish``
    """
    prepare_bbox(train_level)
    train_data.clear_buffer()

    running = 1
    while running:
        tick = bbox.get_time()

        # Push the newest state, then act on the whole buffered history.
        train_data.update_buffer(bbox.get_state())
        chosen = rnet_model.get_action(train_data.get_buffer())
        running = bbox.do_action(chosen)

        if verbose and tick % 5000 == 0:
            print("time = %d, score = %f" % (tick, bbox.get_score()))

    return bbox.finish(verbose=1)
def run_bbox(rnet_model, train_data, train_level=True, verbose=True):
    """
    Drive a single black box session with the given model.

    :param rnet_model: model with a get_action(state) method
    :param train_data: DataSet object used as the rolling state buffer
    :param train_level: boolean, forwarded to ``prepare_bbox``
    :param verbose: boolean, report progress every 5000 steps if True
    :return: the session score as returned by ``bbox.finish``
    """
    prepare_bbox(train_level)
    train_data.clear_buffer()

    while True:
        now = bbox.get_time()
        train_data.update_buffer(bbox.get_state())
        buffered = train_data.get_buffer()
        keep_going = bbox.do_action(rnet_model.get_action(buffered))

        if verbose and now % 5000 == 0:
            print("time = %d, score = %f" % (now, bbox.get_score()))

        if not keep_going:
            break

    session_score = bbox.finish(verbose=1)
    return session_score
def run_bbox(verbose=False):
    """
    Play one level greedily with the network built by ``prepare_agent``.

    The network output is compiled into a Theano function once, then used
    every step to pick the arg-max action.  Progress is printed every
    10000 steps and the network parameters are written to ``model.npz``.

    :param verbose: unused by this function
    """
    prepare_bbox()

    # Symbolic (1, n_features) view of the current state features
    input_var = T.reshape(T.dvector('in_state'), (1, n_features))

    # Build the agent and compile its forward pass
    agent = prepare_agent(input_var)
    eval_fn = theano.function([input_var],
                              lasagne.layers.get_output(agent),
                              on_unused_input='ignore')

    started = time.time()
    error = 0  # never updated here, so the reported "training loss" stays 0
    steps = 0
    has_next = 1
    while has_next:
        features = np.reshape(bbox.get_state(), (1, n_features))
        action = np.argmax(eval_fn(features))
        steps += 1
        if steps % 10000 == 0:
            score = bbox.get_score()
            print("Steps: {}".format(steps))
            print(" training loss: {}".format(error/steps))
            print(" current score: {}".format(score))
        has_next = bbox.do_action(action)

    print("Time to run: {} seconds".format(time.time()-started))
    print("{} steps total".format(steps))
    np.savez('model.npz', *lasagne.layers.get_all_param_values(agent))
    bbox.finish(verbose=1)
def run_bbox(verbose=False):
    '''
    Play one level, then dump the recorded interactions to a CSV file.

    The module-level ``interaction_list`` is consumed: its first row is
    popped and used as the CSV header.

    :param verbose: forwarded to ``get_action_by_state``
    '''
    ## Load the game level
    prepare_bbox()

    ## do_action() returns False once the level is finished
    running = True
    while running:
        observation = bbox.get_state()
        running = bbox.do_action(get_action_by_state(observation, verbose=verbose))

    ## First row holds the column names, the rest are data rows
    column_names = interaction_list.pop(0)
    frame = pd.DataFrame(interaction_list, columns=column_names)

    ## File name carries the current UTC epoch seconds
    stamp = int(calendar.timegm(time.gmtime()))
    out_filename = '../output/interaction_' + str(stamp) + '.csv'
    frame.to_csv(out_filename, index=False)
    print('Saved to file: ' + out_filename)

    ## finish() must be called when submitting; it reports the total score
    bbox.finish(verbose=True)
def reset(self):
    """Restart the level (loading it on first use) and cache the initial state."""
    if not bbox.is_level_loaded():
        bbox.load_level("../../../levels/train_level.data", verbose=1)
    else:
        bbox.reset_level()
    self.state = bbox.get_state()
def update(self, action):
    """
    Apply ``action[0]`` to the environment and refresh the cached step data.

    :param action: indexable whose first element is the action to perform
    :return: tuple ``(self.state, self.reward(), self.is_over)``
    """
    chosen = action[0]
    self._actions_log.append(chosen)
    self._steps += 1

    # Remember the score before acting.
    self._prev_score = bbox.get_score()

    still_running = bbox.do_action(chosen)
    self._is_over = not still_running
    self._state = bbox.get_state().reshape(self._state_shape)
    return self.state, self.reward(), self.is_over
def act(self, action):
    """
    Perform ``action`` in the environment and refresh the cached step data.

    :param action: action passed straight to ``bbox.do_action``
    :return: tuple ``(self.state, self.reward(), self.is_over)``
    """
    self._actions_log.append(action)
    self._steps += 1

    # Score snapshot taken before the action is applied.
    self._prev_score = bbox.get_score()

    level_continues = bbox.do_action(action)
    self._is_over = not level_continues

    row_shape = (1, self.n_features)
    self._state = bbox.get_state().reshape(row_shape)
    return self.state, self.reward(), self.is_over
def action_lookup(model, train_data, step_inc):
    """
    Score every possible first action by simulating ahead and pick the best.

    For each of the ``action_n`` actions the environment performs that
    action, then follows ``model`` for ``step_inc`` further steps; the score
    gained over that rollout is compared across actions.  The environment
    and the DataSet buffer are rolled back after every rollout.

    :param model: object with a get_action method for action inference
    :param train_data: DataSet object used for bbox state buffering
    :param step_inc: int, number of model-driven steps after the first action
    :return: (int, float), best first action and its score gain
    """
    # Snapshots to roll back to after each simulated rollout
    checkpoint = bbox.create_checkpoint()
    train_data.backup_buffer()

    best_score = -1e9
    best_action = -1

    for candidate in xrange(action_n):
        score_before = bbox.get_score()

        # Forced first move
        bbox.do_action(candidate)
        train_data.update_buffer(bbox.get_state())

        # Let the model play out the remaining steps
        for _ in xrange(step_inc):
            follow_up = model.get_action(train_data.get_buffer())
            bbox.do_action(follow_up)
            train_data.update_buffer(bbox.get_state())

        gain = bbox.get_score() - score_before
        if gain > best_score:
            best_score, best_action = gain, candidate

        # Undo the rollout before trying the next candidate
        bbox.load_from_checkpoint(checkpoint)
        train_data.restore_buffer()

    return best_action, best_score
def run_bbox(verbose=False):
    """
    Play one level, choosing each action with ``get_action_by_state``.

    :param verbose: unused by this function
    """
    prepare_bbox()

    running = 1
    while running:
        observation = bbox.get_state()
        running = bbox.do_action(get_action_by_state(observation))

    bbox.finish(verbose=1)
def run_bbox():
    """Load the level and ``reg_coefs.txt``, then play until the level ends."""
    prepare_bbox()
    load_regression_coefs("reg_coefs.txt")

    keep_playing = 1
    while keep_playing:
        chosen = get_action_by_state(bbox.get_state())
        keep_playing = bbox.do_action(chosen)

    bbox.finish(verbose=1)
def run_bbox():
    """
    Build the global network ensemble, load its weights and play one level.

    Time and score are printed whenever the environment time is a multiple
    of 10000.
    """
    global ensamble

    prepare_bbox()

    layer_layouts = [[36, 64, 4], [16, 4], [16, 4], [36, 64, 4]]
    ensamble = Ensemble.NN_Ensemble(n_features, 4, layer_layouts, n_actions)
    ensamble.read_weights("weights")

    running = 1
    while running:
        running = bbox.do_action(get_action_by_state(bbox.get_state()))
        if bbox.get_time() % 10000 == 0:
            print(" ".join([str(bbox.get_time()), str(bbox.get_score())]))

    bbox.finish(verbose=1)
def run_bbox():
    """
    Play one level with the module-level ``ensamble`` of networks.

    The ensemble is (re)created here, its weights are read from "weights",
    and a "<time> <score>" line is printed every 10000 time units.
    """
    global ensamble

    prepare_bbox()
    ensamble = Ensemble.NN_Ensemble(
        n_features,
        4,
        [[36, 64, 4], [16, 4], [16, 4], [36, 64, 4]],
        n_actions,
    )
    ensamble.read_weights("weights")

    while True:
        observation = bbox.get_state()
        level_continues = bbox.do_action(get_action_by_state(observation))
        if bbox.get_time() % 10000 == 0:
            print(str(bbox.get_time()) + " " + str(bbox.get_score()))
        if not level_continues:
            break

    bbox.finish(verbose=1)
def run_bbox():
    """
    Play one level with stored regression coefficients and print the
    elapsed wall-clock time in seconds.
    """
    started = time.time()

    prepare_bbox()
    coefs = load_regression_coefs("star 13-best_coefs_score=2980.401123046875_sigma=0.0010000000474974513_level=train_level.txt")

    # One slot more than the environment state; the last entry is never
    # overwritten and stays 1 (presumably a bias input -- confirm against
    # get_action_by_state).
    augmented = np.ones(n_features + 1)

    running = 1
    while running:
        augmented[:-1] = bbox.get_state()
        running = bbox.do_action(get_action_by_state(augmented, coefs))

    bbox.finish(verbose=1)

    print(time.time() - started)
def run_bbox(verbose=False):
    """
    Run a full game simulation: load the level, act until it ends, finish.

    :param verbose: unused by this function
    """
    # Load the game level
    prepare_bbox()

    # do_action() returns False once the level is finished, True otherwise
    level_running = 1
    while level_running:
        observation = bbox.get_state()
        chosen = get_action_by_state(observation)
        level_running = bbox.do_action(chosen)

    # finish() must be called when submitting a solution; with verbose=1 it
    # prints the earned reward
    bbox.finish(verbose=1)
def run_bbox(verbose=False):
    """
    Play the training level greedily with pickled utility models and save
    the trajectory.

    Each model in ``utility_models.pkl`` predicts a utility for the current
    state; the index of the highest prediction is the action played.  After
    the level ends, scores, actions and states are written as raw arrays
    into the first unused ``run_<i>`` directory.

    :param verbose: if true, print ``(step, score)`` every 10000 steps
    """
    bbox.load_level("../levels/train_level.data", verbose=True)

    # NOTE(security): unpickling executes arbitrary code from the file.
    # Only load a utility_models.pkl produced by a trusted run.
    with open('utility_models.pkl', 'rb') as f:
        utility_models = pickle.load(f)

    states, actions, scores = [], [], []
    step = 0
    has_next = 1
    while has_next:
        step += 1
        state = bbox.get_state()

        # Greedy choice over the per-model utility predictions.  (The
        # previous version also drew a random action here that was
        # overwritten before use.)
        utilities = [m.predict([state]) for m in utility_models]
        action = np.argmax(utilities)

        # Do action and bookkeeping
        has_next = bbox.do_action(action)
        states.append(np.array(state))
        actions.append(action)
        score = bbox.get_score()
        scores.append(score)
        if verbose and step % 10000 == 0:
            print(step, score)

    # Pick the first run_<i> directory that does not exist yet
    i = 1
    get_outdir = 'run_{}'.format
    outdir = get_outdir(i)
    while os.path.exists(outdir):
        i += 1
        outdir = get_outdir(i)
    os.mkdir(outdir)
    print('saving to {}'.format(outdir))

    scores = np.array(scores, dtype=np.float32)
    scores.tofile(os.path.join(outdir, 'scores'))
    actions = np.array(actions, dtype=np.int8)
    actions.tofile(os.path.join(outdir, 'actions'))
    states = np.array(states, dtype=np.float32)
    states.tofile(os.path.join(outdir, 'states'))

    bbox.finish(verbose=True)
def run_bbox(verbose=False):
    """
    Load the level, play it with ``get_action_by_state`` and finish.

    :param verbose: unused by this function
    """
    # Load the game level.
    # NOTE(review): this calls prepare_box(), not prepare_bbox() -- confirm
    # that a helper with this exact name exists in the module.
    prepare_box()

    # do_action() returns False when the level is finished, True otherwise
    still_playing = 1
    while still_playing:
        current = bbox.get_state()
        still_playing = bbox.do_action(get_action_by_state(current))

    # Always call finish() when submitting; verbose=1 prints the reward
    bbox.finish(verbose=1)
def run_bbox(verbose=False):
    """
    Play one level with the memory-based agent from ``prepare_agent``.

    A ``(memtime, n_f + 2)`` memory matrix is fed to the network each step:
    row 0 holds the newest state plus, once known, the chosen action and a
    score-derived "consequence" value.  ``forget`` is called first each step
    to make room for the new row.

    :param verbose: unused by this function
    """
    prepare_bbox()

    # Symbolic (memtime, n_f+2) memory input
    input_var = T.reshape(T.dvector('in_state'), (memtime, n_f+2))

    # Network and its compiled forward pass (first row of the output)
    agent = prepare_agent(input_var)
    test_fn = theano.function([input_var], lasagne.layers.get_output(agent)[0])

    memory = np.zeros(shape=(memtime, n_f+2))
    started = time.time()
    consequence = 0
    steps = 0
    has_next = 1
    while has_next:
        memory = forget(memory)

        # Newest state goes in row 0; the last two slots are filled below
        memory[0][:-2] = bbox.get_state()
        action = np.argmax(test_fn(memory))

        has_next = bbox.do_action(action)
        score = bbox.get_score()
        # NOTE(review): this subtracts the previous consequence, not the
        # previous score -- confirm this matches how the model was trained.
        consequence = score - consequence
        memory[0][-2:] = [action, consequence]

        steps += 1
        if steps % 10000 == 0:
            score = bbox.get_score()
            print("Steps: {}".format(steps))
            print(" current score: {}".format(score))

    print("Final Score: {}".format(score))
    print("Time to run: {} seconds".format(time.time()-started))
    bbox.finish(verbose=1)
def run_bbox(verbose=False):
    """
    Play one level while training the network on per-action score targets.

    Every step ``get_all_scores(state)`` supplies one target value per
    action.  The network is trained towards those targets (mean squared
    error, Nesterov momentum) and the action with the highest target is
    played.  The training error is printed every step.

    :param verbose: unused by this function
    """
    has_next = 1
    prepare_bbox()

    # Symbolic (1, n_features) state input
    input_var = T.dvector('in_state')
    input_var = T.reshape(input_var, (1, n_features))

    # Symbolic (1, n_actions) target: one score per action
    target_var = T.dvector('scores')
    target_var = T.reshape(target_var, (1, n_actions))

    # Network, its prediction and the loss against the targets
    agent = prepare_agent(input_var)
    attempt = lasagne.layers.get_output(agent)
    punish = lasagne.objectives.squared_error(attempt, target_var)
    punish = punish.mean()

    # Parameter updates driven by the loss
    params = lasagne.layers.get_all_params(agent, trainable=True)
    teach = lasagne.updates.nesterov_momentum(punish, params, learning_rate=.1, momentum=.9)

    # One call = one training step; returns the loss
    train_fn = theano.function([input_var, target_var], punish,
                               updates=teach, on_unused_input='ignore')

    start = time.time()
    while has_next:
        state = bbox.get_state()
        r_state = np.reshape(state, (1, n_features))
        scores = get_all_scores(state)
        r_scores = np.reshape(scores, (1, n_actions))

        # np.argmax yields a concrete index; T.argmax would only build a
        # symbolic Theano expression, which is not a usable action.
        action = np.argmax(scores)

        error = train_fn(r_state, r_scores)
        print(error)
        has_next = bbox.do_action(action)

    print("Time to run: {} seconds".format(time.time()-start))
    bbox.finish(verbose=1)
def learn_bbox(rnet_model, train_data, update_inc=5000, lookup_inc=250, seed_data=False):
    """
    Collect training instances for train_data during one bbox session.

    :param rnet_model: model object with get_lreg_action and get_action
        methods
    :param train_data: DataSet object used to buffer states and to commit
        new training instances
    :param update_inc: int, number of steps between network training runs
    :param lookup_inc: int, number of forward steps used by action_lookup
    :param seed_data: boolean, if True the reference action comes from
        rnet_model.get_lreg_action instead of action_lookup
    :return: int, number of sampled steps (since the last training run)
        where the model's action differed from the reference action
    """
    def advance(chosen):
        # Perform one action and push the resulting state into the buffer.
        still_running = bbox.do_action(chosen)
        observed = bbox.get_state()
        train_data.update_buffer(observed)
        return still_running, observed

    running = 1
    mismatches = 0
    sampled = 0
    cursor = rand_n  # forces a fresh batch of random draws on the first step

    prepare_bbox()

    # Seed the state buffer so historical states are part of every commit
    train_data.clear_buffer()
    current_state = bbox.get_state()
    train_data.update_buffer(current_state)

    while running:
        # Two draws are consumed per step; refill when fewer than two remain
        if cursor >= (rand_n - 1):
            draws = numpy.random.random_sample(size=(rand_n))
            cursor = 0

        step_count = bbox.get_time()

        # Model's action for the currently buffered states
        action = rnet_model.get_action(train_data.get_buffer())

        if step_count % update_inc == 0:
            # NOTE(review): trains ``rn_model``, not the ``rnet_model``
            # parameter -- confirm a module-level rn_model is intended.
            rn_model.run_training(train_data, max_steps=update_nnet, restore=True)
            mismatches = 0
            sampled = 0
        elif draws[cursor] <= sample_prob:
            # Sample this step: find the reference action and record the
            # state when the model disagrees with it
            if seed_data:
                best_action = rnet_model.get_lreg_action(current_state)
                score_delta = 0.1
            else:
                best_action, score_delta = action_lookup(rnet_model, train_data, lookup_inc)
            if action != best_action:
                train_data.commit_buffer(best_action, score_delta)
                mismatches += 1
            sampled += 1

        if draws[cursor+1] <= perturb_prob:
            # Perturbation: hold one random action for a random number of steps
            action = numpy.random.randint(0,4)
            step_inc = numpy.random.randint(rand_min, rand_max)
            for _ in xrange(step_inc):
                running, current_state = advance(action)
        else:
            running, current_state = advance(action)

        cursor += 2

        if step_count % 5000 == 0:
            print("time = %d, score = %f" % (step_count, bbox.get_score()))
            print("errors = %d, samples = %d" % (mismatches, sampled))

    bbox.finish(verbose=1)
    return mismatches
def run_bbox(verbose=False):
    """
    Train the memory-based agent for ``epochs`` passes over the level.

    Each step the agent (or, with probability ``1 - trust``, a random
    choice) picks an action, and the network is trained towards the
    per-action values returned by ``get_all_score_diffs``.  Per-epoch
    results are appended to ``epoch_data.txt`` and the parameters are saved
    to ``model_LSTM_cost.npz`` after every epoch.

    :param verbose: unused by this function
    """
    prepare_bbox()

    # Symbolic (memtime, 1, n_f+2) memory input
    input_var = T.reshape(T.tensor3('memory'), (memtime, 1, n_f+2))
    # Target values supplied at training time
    reality = T.vector('score_diffs')

    # Network, its evaluation (first row of the output) and the loss
    agent = prepare_agent(input_var)
    evaluation = lasagne.layers.get_output(agent)[0]
    reward = lasagne.objectives.squared_error(evaluation, reality).mean()

    # Parameter updates from the loss
    params = lasagne.layers.get_all_params(agent, trainable=True)
    teach = lasagne.updates.nesterov_momentum(reward, params, learning_rate=0.01, momentum=0.9)

    # Compiled forward pass and training step
    decide_fn = theano.function([input_var], evaluation)
    train_fn = theano.function([input_var, reality], reward,
                               updates=teach, on_unused_input='ignore')

    started = time.time()
    for epoch in range(epochs):
        memory = np.zeros(shape=(memtime, 1, n_f+2))
        e_time = time.time()
        has_next = 1

        # Per-epoch tracking
        consequence = error = 0
        steps = 0
        trust = 0.00+.02*epoch
        good = 0

        while has_next:
            # Make room for the new row, then store the new state in it
            memory = forget(memory)
            state = bbox.get_state()
            actuals = get_all_score_diffs(state)
            memory[0][0][:-2] = state

            # Random action until the agent is trusted enough
            if rand.random() > trust:
                action = rand.randint(0, n_a-1)
            else:
                action = np.argmax(decide_fn(memory))

            if action == np.argmax(actuals):
                good = good+1

            has_next = bbox.do_action(action)

            score = bbox.get_score()
            # NOTE(review): subtracts the previous consequence rather than
            # the previous score -- confirm this is intended.
            consequence = score-consequence

            # Complete the newest memory row and train on it
            memory[0][0][-2:] = [action, consequence]
            error += train_fn(memory, actuals)

            steps += 1

            # Periodic progress report
            if steps % 10000 == 0:
                score = bbox.get_score()
                print("Epoch: {}".format(epoch))
                print("Steps: {}".format(steps))
                print(" current trust: {}".format(trust))
                print(" avg error: {}".format(error/steps))
                print(" bad choices: {}%".format(100-float(good)/100))
                print(" current score: {}".format(score))
                if trust < .95:
                    trust = trust+.02
                bbox.clear_all_checkpoints()
                good = 0

        # End-of-epoch report
        score = bbox.get_score()
        with open("epoch_data.txt", "a") as f:
            f.write("Epoch: {} Final Score: {} Average Error: {} Time to Run: {} min\n".format(epoch, score, error/steps, (time.time()-e_time)/60))

        # Save model parameters
        np.savez('model_LSTM_cost.npz', *lasagne.layers.get_all_param_values(agent))

        # Reset the level for every epoch except the last
        if epoch < epochs-1:
            bbox.reset_level()

    print("Time to run: {} hours".format((time.time()-started)/3600))
    bbox.finish(verbose=1)
# Entry script: either train (exploration epochs with decaying epsilon,
# then learning epochs at a fixed epsilon) or play one level greedily with
# previously saved weights.
load_weights = True
if not training:
    has_next = 1
    # Load the game level and the stored network weights
    prepare_bbox()
    model.load_weights('_my_model_weights.h5')
    while has_next:
        # Current environment state
        state = copy.copy(bbox.get_state())
        # Predicted value of every possible action in this state
        qval = model.predict(state.reshape(1, n_features), batch_size=1)
        # Play the action with the highest predicted value
        action = np.argmax(qval)
        has_next = bbox.do_action(action)
    # Finish the game simulation
    bbox.finish(verbose=1)
else:
    # Exploration phase: epsilon shrinks each epoch while it is above 0.1
    for i in range(exploration_epochs):
        print(i, epsilon, gamma, action_repeat, update_frequency, batchSize, buffer)
        run_bbox(verbose=0,
                 epsilon=epsilon,
                 gamma=gamma,
                 action_repeat=action_repeat,
                 update_frequency=update_frequency,
                 batchSize=batchSize,
                 buffer=buffer,
                 load_weights=False,
                 save_weights=True)
        if epsilon > 0.1:
            epsilon -= (1.0/exploration_epochs)

    # Learning phase: fixed epsilon; weights are loaded on the first epoch only
    for i in range(learning_epochs):
        epsilon = 0.1
        print(i, epsilon, gamma, action_repeat, update_frequency, batchSize, buffer)
        run_bbox(verbose=0,
                 epsilon=epsilon,
                 gamma=gamma,
                 action_repeat=action_repeat,
                 update_frequency=update_frequency,
                 batchSize=batchSize,
                 buffer=buffer,
                 load_weights=load_weights,
                 save_weights=True)
        load_weights = False
def run_bbox(verbose=False):
    """
    Play the training level while fitting one SGD regressor per action.

    The first ``random_steps`` steps are fully random; afterwards the action
    with the highest predicted utility is played, with a 10% chance of a
    random action.  After every step the regressor of the action taken is
    updated with the discounted reward.  The trajectory is written as raw
    arrays into the first unused ``run_<i>`` directory.

    :param verbose: if true, print ``(step, score)`` every 1000 steps
    """
    bbox.load_level("../levels/train_level.data", verbose=True)

    states, actions, scores, rewards = [], [], [], []
    utility_models = [SGDRegressor(learning_rate='constant')
                      for _ in range(n_actions)]
    zero_utilities = np.zeros([n_actions])

    n_past_act = 1
    n_past_st = 0  # past states used in addition to the current one
    discount = 0.9
    random_steps = 10000

    step = 0
    has_next = 1
    while has_next:
        step += 1
        state = bbox.get_state()
        utilities = zero_utilities

        # Predict utilities once the purely random phase is over
        if step > random_steps:
            if n_past_st:
                model_input = np.concatenate(states[-n_past_st:] + [state])
            else:
                model_input = state
            try:
                utilities = np.array(
                    [m.predict([model_input])[0] for m in utility_models])
            except NotFittedError:
                # Models that have not been fitted yet: keep zero utilities
                pass

        # Explore with probability 0.1, and always during the random phase
        if np.random.rand() < 0.1 or step <= random_steps:
            action = np.random.choice(n_actions)
        else:
            action = np.argmax(utilities)

        # Act and record the outcome
        has_next = bbox.do_action(action)
        states.append(np.array(state))
        actions.append(action)
        score = bbox.get_score()
        if scores:
            rewards.append(score - scores[-1])
        else:
            rewards.append(score)
        scores.append(score)

        # Update the regressor of the action that earned the reward
        if len(rewards) >= n_past_act + n_past_st:
            total_reward = sum(r * np.power(discount, i)
                               for i, r in enumerate(rewards[-n_past_act:]))
            window_start = -(n_past_act + n_past_st)
            if n_past_act == 1:
                fit_input = np.concatenate(states[window_start:])
            else:
                fit_input = np.concatenate(
                    states[window_start:-n_past_act + 1])
            trained_model = utility_models[actions[-n_past_act]]
            trained_model.partial_fit([fit_input], [total_reward])

        if verbose and step % 1000 == 0:
            print(step, score)

    # First run_<i> directory that does not exist yet
    i = 1
    get_outdir = 'run_{}'.format
    outdir = get_outdir(i)
    while os.path.exists(outdir):
        i += 1
        outdir = get_outdir(i)
    os.mkdir(outdir)
    print('saving to {}'.format(outdir))

    scores = np.array(scores, dtype=np.float32)
    scores.tofile(os.path.join(outdir, 'scores'))
    actions = np.array(actions, dtype=np.int8)
    actions.tofile(os.path.join(outdir, 'actions'))
    states = np.array(states, dtype=np.float32)
    states.tofile(os.path.join(outdir, 'states'))

    bbox.finish(verbose=True)
def main():
    """
    Train a small dense Q-network on the training level with experience
    replay and save its weights to ``q_model.h5``.

    States are passed through the scaler stored in ``scaler.pkl`` before
    they reach the network (presumably a pre-fitted StandardScaler --
    confirm against the code that produced the file).
    """
    # Hyper-parameters
    epsilon = .1  # exploration rate
    num_actions = 4
    input_size = 36
    hidden_size = 24
    activation = 'relu'
    max_memory = 2000
    batch_size = 50
    mini_epoch = 5
    epoch = 10
    report_steps = 100

    # Two hidden layers, one output per action
    model = Sequential()
    model.add(Dense(hidden_size, input_shape=[input_size], activation=activation))
    model.add(Dense(hidden_size, activation=activation))
    model.add(Dense(num_actions))
    model.compile('adam', 'mse')

    # Environment
    bbox.load_level('../levels/train_level.data', verbose=True)

    # Replay memory
    exp_replay = ExperienceReplay(max_memory=max_memory)

    with open('scaler.pkl', 'rb') as f:
        scaler = pickle.load(f)

    def get_state():
        # Current environment state after scaling
        return scaler.transform(np.array([bbox.get_state()]))[0]

    for e in range(epoch):
        loss = 0.
        bbox.reset_level()
        game_over = False
        input_t = get_state()
        score = 0
        step = 0

        while not game_over:
            step += 1
            input_tm1 = input_t

            # Epsilon-greedy action selection
            if np.random.rand() <= epsilon:
                action = np.random.randint(0, num_actions, size=1)
            else:
                q = model.predict(np.array([input_tm1]))[0]
                action = np.argmax(q)

            # Act, then observe the new state and the score change
            game_over = not bbox.do_action(action)
            input_t = get_state()
            new_score = bbox.get_score()
            reward = new_score - score
            score = new_score

            # Store the transition
            exp_replay.remember([input_tm1, action, reward, input_t], game_over)

            # Several replay batches per environment step
            for _ in range(mini_epoch):
                inputs, targets = exp_replay.get_batch(model, batch_size=batch_size)
                loss += model.train_on_batch(inputs, targets)[0]

            if step % report_steps == 0:
                print('Step {:07d} | Loss {:.4f} | Score {}'.format(
                    step, loss / (report_steps * mini_epoch), score))
                loss = 0.

        print('Epoch {:03d}/{} | Score {}'.format(e, epoch - 1, score))

    # Save trained model weights
    model.save_weights('q_model.h5', overwrite=True)
def run_bbox(verbose=False):
    """
    Train the memory-based agent for ``epochs`` passes over the level.

    Each step a random action is played unless a random draw falls at or
    below ``trust`` (which grows by 0.01 every 10000 steps), in which case
    the network decides.  The network's maximum output is then trained
    towards the observed "consequence" value.  Final scores per epoch are
    printed and the parameters are saved to ``model_mem.npz``.

    Fixes over the previous version:
      * ``rand.random`` is now called; before, the function object itself
        was compared with ``trust``.
      * the agent's choice comes from a compiled Theano function instead of
        the non-existent ``lasagne.get_output``.
      * the level is no longer reset after the last epoch, just before
        ``bbox.finish``.

    :param verbose: unused by this function
    """
    prepare_bbox()

    # Symbolic (memtime, n_f+2) memory input
    input_var = T.matrix('memory')
    input_var = T.reshape(input_var, (memtime, n_f+2))
    # Observed value the network output is trained towards
    reality = T.scalar('consequence')

    # Network and its output
    agent = prepare_agent(input_var)
    output = lasagne.layers.get_output(agent)
    # Highest predicted value is what gets compared with reality
    attempt = T.max(output)
    reward = lasagne.objectives.squared_error(attempt, reality)

    # Parameter updates from the loss
    params = lasagne.layers.get_all_params(agent, trainable=True)
    teach = lasagne.updates.nesterov_momentum(reward, params, learning_rate=0.1, momentum=0.9)

    # Compiled forward pass (used to pick actions) and training step
    decide_fn = theano.function([input_var], output)
    train_fn = theano.function([input_var, reality], reward,
                               updates=teach, on_unused_input='ignore')

    memory = np.zeros(shape=(memtime, n_f+2))
    start = time.time()
    scores_per_epoch = np.zeros(epochs)
    for epoch in range(epochs):
        e_time = time.time()
        has_next = 1

        # Per-epoch tracking
        consequence = 0
        self_assessment = 0
        steps = 0
        trust = 0.00

        while has_next:
            # Make room for the new row, then store the new state in it
            memory = forget(memory)
            state = bbox.get_state()
            memory[0][:-2] = state

            if rand.random() > trust:
                # Trust still too low: random action
                action = rand.randint(0, n_a-1)
            else:
                # Let the agent decide
                choices = decide_fn(memory)
                action = np.argmax(choices)

            has_next = bbox.do_action(action)
            # NOTE(review): subtracts the previous consequence rather than
            # the previous score -- left unchanged, confirm the intent.
            consequence = bbox.get_score()-consequence

            # Complete the newest memory row and train on it
            memory[0][-2:] = [action, consequence]
            train_fn(memory, consequence)

            self_assessment += consequence
            steps += 1

            # Periodic progress report; trust grows here
            if steps % 10000 == 0:
                trust = trust+.01
                score = bbox.get_score()
                print("Epoch: {}".format(epoch))
                print("Steps: {}".format(steps))
                print(" self assessment: {}".format(self_assessment))
                print(" trust: {}".format(trust))
                print(" current score: {}".format(score))

        # End-of-epoch report
        score = bbox.get_score()
        print("Epoch: {}".format(epoch))
        print("Final Score: {}".format(score))
        print("Time to Run: {} minutes".format((time.time()-e_time)/60))
        scores_per_epoch[epoch] = score

        # Reset the level for every epoch except the last, so finish()
        # still sees the final epoch's session
        if epoch < epochs-1:
            bbox.reset_level()

    print("All scores per epoch: ")
    print(scores_per_epoch)
    print("Time to run: {} hours".format((time.time()-start)/3600))
    np.savez('model_mem.npz', *lasagne.layers.get_all_param_values(agent))
    bbox.finish(verbose=1)
def get_action_by_state(state):
    """Return the action for ``state``; this policy always plays action 0."""
    return 0


if __name__ == "__main__":
    # Play at most 100 steps, printing the absolute value of every state
    # feature (two decimals) on one line per step.
    has_next = 1
    prepare_bbox()
    prev_score = bbox.get_score()
    steps = 0
    states = []
    while has_next and steps < 100:
        state = bbox.get_state()
        states.append(state)
        v = map(lambda f: "%.2f" % abs(f), state)
        print(" ".join(v))
        action = get_action_by_state(state)
        has_next = bbox.do_action(action)
        score = bbox.get_score()
        prev_score = score
        steps += 1

    print("Total score: %f" % prev_score)
    print("Total steps: %d" % steps)

    # Stack the visited states into one array and subtract its overall mean
    img = np.stack(states)
    img -= img.mean()
def getSensors(self):
    """Read the current state from ``bbox``, print it and return it."""
    reading = bbox.get_state()
    print('state %s' % (reading,))
    return reading
def run_bbox(verbose=False, epsilon=0.1, gamma=0.99, action_repeat=4, update_frequency=4, batchSize=32, buffer=100000, load_weights=False, save_weights=False):
    """
    Play one level while training ``model_prim`` from an experience replay.

    Actions come from ``model`` (epsilon-greedy; half of the exploratory
    actions are random, half come from
    ``calc_best_action_using_checkpoint``).  Once the replay holds
    ``buffer`` transitions, every step overwrites one slot, trains
    ``model_prim`` on a random minibatch and periodically copies its
    weights into ``model``.

    :param verbose: unused by this function
    :param epsilon: probability of an exploratory action
    :param gamma: discount applied to the best next-state value
    :param action_repeat: number of times each chosen action is performed
    :param update_frequency: training steps between weight copies to ``model``
    :param batchSize: minibatch size sampled from the replay
    :param buffer: replay capacity
    :param load_weights: load 'my_model_weights.h5' into both networks first
    :param save_weights: save ``model_prim`` to 'my_model_weights.h5' at the end
    """
    has_next = 1
    # Load the game level
    prepare_bbox()

    update_frequency_cntr = 0
    replay = []  # tuples of (S, A, R, S')
    h = 0        # replay slot written last once the buffer is full

    if load_weights:
        model.load_weights('my_model_weights.h5')
        model_prim.load_weights('my_model_weights.h5')

    while has_next:
        # Current state and the score before acting
        state = copy.copy(bbox.get_state())
        prev_reward = copy.copy(bbox.get_score())

        # Predicted values of all actions in this state
        qval = model.predict(state.reshape(1, n_features), batch_size=1)

        if random.random() < epsilon:
            # Exploratory step: random action or checkpoint-based best action
            if random.random() < 0.5:
                action = np.random.randint(0, n_actions)
            else:
                action_range = 50
                action = calc_best_action_using_checkpoint(action_range=action_range)
        else:
            # Greedy step
            action = np.argmax(qval)

        # Repeat the action; do_action() returns False once the level ends
        for a in range(action_repeat):
            has_next = bbox.do_action(action)
        new_state = copy.copy(bbox.get_state())
        reward = copy.copy(bbox.get_score()) - prev_reward

        transition = (state, action, reward, new_state)
        if len(replay) < buffer:
            # Still filling the replay
            replay.append(transition)
        else:
            # Replay full: overwrite the next slot, wrapping around
            h = h + 1 if h < (buffer-1) else 0
            replay[h] = transition

            # Build training targets from a random minibatch
            minibatch = random.sample(replay, batchSize)
            X_train = []
            y_train = []
            for old_state, mem_action, mem_reward, mem_new_state in minibatch:
                old_qval = model.predict(old_state.reshape(1, n_features), batch_size=1)
                newQ = model.predict(mem_new_state.reshape(1, n_features), batch_size=1)
                maxQ = np.max(newQ)

                y = np.zeros((1, n_actions))
                y[:] = old_qval[:]
                if has_next == 1:
                    # Level still running
                    update = (mem_reward + (gamma * maxQ))
                else:
                    # Level finished
                    update = mem_reward
                y[0][mem_action] = update

                X_train.append(old_state)
                y_train.append(y.reshape(n_actions,))
            X_train = np.array(X_train)
            y_train = np.array(y_train)

            # Train the copy of the network
            model_prim.fit(X_train, y_train, batch_size=batchSize, nb_epoch=1, verbose=0)

            # Periodically push the trained weights into the acting network
            if update_frequency_cntr >= update_frequency:
                prim_weights = model_prim.get_weights()
                print('model update')
                model.set_weights(prim_weights)
                update_frequency_cntr = 0
            update_frequency_cntr += 1

        if bbox.get_time() % 500000 == 0:
            print("time = %d, score = %f" % (bbox.get_time(), bbox.get_score()))

    # Save weights if requested, then finish the simulation
    if save_weights:
        model_prim.save_weights('my_model_weights.h5', overwrite=True)
    bbox.finish(verbose=1)
def get_state(self):
    """Return the current state as reported by ``bbox.get_state()``."""
    current_state = bbox.get_state()
    return current_state
def learn_bbox(rnet_model, train_data, update_inc=5000, lookup_inc=250, seed_data=False):
    """
    Add training instances to train_data from a single run-through of a bbox
    session.

    Relies on the module-level settings ``rand_n``, ``sample_prob``,
    ``perturb_prob``, ``rand_min``, ``rand_max`` and ``update_nnet``, and on
    the module-level helpers ``prepare_bbox`` and ``action_lookup``.

    :param rnet_model: model object with get_action, get_lreg_action and
        run_training methods
    :param train_data: DataSet object used to buffer states and append new
        training instances
    :param update_inc: int, number of steps between each nnet model update
    :param lookup_inc: int, number of forward action lookup steps
    :param seed_data: boolean, if true the best action is the one returned
        by the model's lreg action instead of a forward lookup.
    :return: int, the number of action errors (differences between the
        action produced by rnet_model and the ideal or seed action) counted
        since the most recent model update.
    """
    has_next = 1
    error_count = 0
    rand_count = 0
    # Start past the end so the first iteration draws a batch of randoms
    rand_idx = rand_n

    prepare_bbox()

    # For each new state in the session, add it to the data set's state
    # buffer so that historical states are included in a commit event
    train_data.clear_buffer()
    current_state = bbox.get_state()
    train_data.update_buffer(current_state)

    while has_next:
        # Two random values are consumed per iteration (rand_idx and
        # rand_idx + 1); draw a new batch when fewer than two remain
        if rand_idx >= (rand_n - 1):
            rand_vals = numpy.random.random_sample(size=(rand_n))
            rand_idx = 0

        step_count = bbox.get_time()

        # Get the next action from the model based on the current set of
        # buffered states
        action = rnet_model.get_action(train_data.get_buffer())

        if step_count % update_inc == 0:
            # Every update_inc steps train the model's network with newly
            # acquired training data. The model passed in is trained, so
            # the network choosing actions is the one being updated.
            rnet_model.run_training(train_data, max_steps=update_nnet,
                                    restore=True)
            error_count = 0
            rand_count = 0
        elif rand_vals[rand_idx] <= sample_prob:
            # Sample the current session state and determine the best
            # action, adding it to the training set if the model disagreed
            if seed_data:
                best_action = rnet_model.get_lreg_action(current_state)
                score_delta = 0.1
            else:
                best_action, score_delta = action_lookup(
                    rnet_model, train_data, lookup_inc)
            if action != best_action:
                train_data.commit_buffer(best_action, score_delta)
                error_count += 1
            rand_count += 1

        # Add random variation to the session by performing a random action
        # for a random number of steps if within the perturb probability;
        # otherwise perform the model's action once
        if rand_vals[rand_idx + 1] <= perturb_prob:
            action = numpy.random.randint(0, 4)
            step_inc = numpy.random.randint(rand_min, rand_max)
        else:
            step_inc = 1

        for _ in range(step_inc):
            has_next = bbox.do_action(action)
            current_state = bbox.get_state()
            train_data.update_buffer(current_state)
            if not has_next:
                # Session is over: do not keep stepping a finished
                # environment or buffering states after its end
                break

        rand_idx += 2

        if step_count % 5000 == 0:
            print("time = %d, score = %f" % (step_count, bbox.get_score()))
            print("errors = %d, samples = %d" % (error_count, rand_count))

    bbox.finish(verbose=1)
    return error_count