def run_bbox(verbose=False):
    n_features = n_actions = max_time = -1

    if bbox.is_level_loaded():
        bbox.reset_level()
    else:
        bbox.load_level("../levels/train_level.data", verbose=1)
    n_features = bbox.get_num_of_features()
    n_actions = bbox.get_num_of_actions()
    max_time = bbox.get_max_time()

    # Tabular action-value function, optimistically initialized
    av_table = ActionValueTable(n_features, n_actions)
    av_table.initialize(0.2)
    print(av_table._params)

    # Q-learning with an epsilon-greedy explorer
    learner = Q(0.5, 0.1)
    learner._setExplorer(EpsilonGreedyExplorer(0.4))
    agent = LearningAgent(av_table, learner)

    environment = GameEnvironment()
    task = GameTask(environment)
    experiment = Experiment(task, agent)

    # finish_flag stays truthy until the level runs out of time steps
    while environment.finish_flag:
        experiment.doInteractions(1)
        agent.learn()

    bbox.finish(verbose=1)
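# The run loop above assumes GameEnvironment and GameTask wrappers that adapt
# the bbox API to PyBrain's Environment/Task interfaces. A minimal sketch of
# what they could look like, assuming the observation is the raw feature
# vector and the reward is the score delta (these class bodies are an
# illustration, not the original implementation):
from pybrain.rl.environments.environment import Environment
from pybrain.rl.environments.task import Task

class GameEnvironment(Environment):
    def __init__(self):
        self.finish_flag = True  # stays True while the level has steps left

    def getSensors(self):
        return bbox.get_state()

    def performAction(self, action):
        # do_action returns 0 once the level runs out of time steps
        self.finish_flag = bool(bbox.do_action(int(action[0])))

class GameTask(Task):
    def __init__(self, environment):
        Task.__init__(self, environment)
        self.last_score = 0.0

    def getReward(self):
        # Reward the change in score since the previous interaction
        score = bbox.get_score()
        reward = score - self.last_score
        self.last_score = score
        return reward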
def reset(self):
    #n = np.random.randint(0, self.grid_size-1, size=1)
    #m = np.random.randint(1, self.grid_size-2, size=1)
    if bbox.is_level_loaded():
        bbox.reset_level()
    else:
        bbox.load_level("../../../levels/train_level.data", verbose=1)
    self.state = bbox.get_state()  #np.asarray([0, n, m])[np.newaxis]
def prepare_bbox():
    global n_features, n_actions

    if bbox.is_level_loaded():
        bbox.reset_level()
    else:
        bbox.load_level("../levels/test_level.data", verbose=1)
    n_features = bbox.get_num_of_features()
    n_actions = bbox.get_num_of_actions()
def is_won(self):
    #fruit_row, fruit_col, basket = self.state[0]
    final_score = bbox.get_score()
    bbox.reset_level()
    # bbox.finish(verbose=1)
    self.last_score = 0
    self.action_count = 0
    return final_score > 0  #fruit_row == self.grid_size-1 and abs(fruit_col - basket) <= 1
def prepare_bbox():
    global n_f, n_a, max_time

    if bbox.is_level_loaded():
        bbox.reset_level()
    else:
        bbox.load_level("../levels/train_level.data", verbose=1)
    n_f = bbox.get_num_of_features()
    n_a = bbox.get_num_of_actions()
    max_time = bbox.get_max_time()
def prepare_bbox():
    global n_features, n_actions, max_time

    # Reset environment to the initial state, just in case
    if bbox.is_level_loaded():
        bbox.reset_level()
    else:
        # Load the game level
        bbox.load_level("../levels/train_level.data", verbose=1)
    n_features = bbox.get_num_of_features()
    n_actions = bbox.get_num_of_actions()
    max_time = bbox.get_max_time()
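# For reference, prepare_bbox() is typically driven by the challenge's
# standard loop shape; get_action_by_state() here stands in for whatever
# policy the bot implements (a hypothetical name, not from this source):
def run_bbox():
    prepare_bbox()
    has_next = 1
    while has_next:
        state = bbox.get_state()
        action = get_action_by_state(state)
        has_next = bbox.do_action(action)
    bbox.finish(verbose=1)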
def prepare_box():
    global n_features, n_actions, max_time

    # Reset the environment to the initial state, just in case
    if bbox.is_level_loaded():
        bbox.reset_level()
    else:
        # Load the game level
        bbox.load_level('levels/train_level.data', verbose=1)
    n_features = bbox.get_num_of_features()
    n_actions = bbox.get_num_of_actions()
    max_time = bbox.get_max_time()
def reset(self):
    if bbox.is_level_loaded():
        bbox.reset_level()
    else:
        bbox.load_level(self.level, verbose=1)
    self.n_features = bbox.get_num_of_features()
    self.n_actions = bbox.get_num_of_actions()
    self.max_time = bbox.get_max_time()

    self._steps = 0
    self._state = np.zeros((1, self.n_features))
    self._is_over = False
    self._prev_score = -float('inf')
    self._actions_log = []
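# The reset() above initializes bookkeeping (_steps, _state, _is_over,
# _prev_score, _actions_log) for a step method that is not shown. A sketch of
# a step() consistent with those attributes; the method name and the
# reward-as-score-delta convention are assumptions:
def step(self, action):
    has_next = bbox.do_action(action)
    score = bbox.get_score()
    # First step: no previous score to diff against
    reward = 0.0 if self._prev_score == -float('inf') else score - self._prev_score
    self._prev_score = score
    self._state = np.asarray(bbox.get_state()).reshape(1, self.n_features)
    self._steps += 1
    self._actions_log.append(action)
    self._is_over = not has_next
    return self._state, reward, self._is_over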
def prepare_bbox():
    global n_features, n_actions, max_time, vectors, pool, num_of_vectors

    if bbox.is_level_loaded():
        bbox.reset_level()
    else:
        bbox.load_level("../levels/train_level.data", verbose=1)
    n_features = bbox.get_num_of_features()
    n_actions = bbox.get_num_of_actions()
    max_time = bbox.get_max_time()

    vectors = np.zeros((num_of_vectors, n_features), np.float32)
    print("preparing")
    pool = multiprocessing.Pool(processes=processes)
def prepare_bbox():
    global n_features, n_actions, max_time

    ## TODO: Save the interactions with the environment as an output data frame
    global interaction_list
    interaction_list = []

    ## Reset the environment to the initial state, just in case
    if bbox.is_level_loaded():
        bbox.reset_level()
    else:
        ## Load the game level
        bbox.load_level("../levels/train_level.data", verbose=True)
    n_features = bbox.get_num_of_features()
    n_actions = bbox.get_num_of_actions()
    max_time = bbox.get_max_time()

    ## Header row of the output data frame: one column per state feature,
    ## then the reward and the chosen action
    state_list = ['state_' + str(i) for i in range(n_features)]
    header_list = state_list + ['reward', 'action']
    interaction_list.append(header_list)
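## A possible companion to the logging set up above: append one row per
## interaction in the same column order as header_list, and dump everything to
## CSV when the run ends (log_interaction and save_interactions are
## illustrative names, not from the original source):
import csv

def log_interaction(state, reward, action):
    interaction_list.append(list(state) + [reward, action])

def save_interactions(path='interactions.csv'):
    with open(path, 'w') as f:
        csv.writer(f).writerows(interaction_list)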
def prepare_bbox():
    '''
    Prepares the environment (learning/test data).
    '''
    global n_features
    global n_actions
    global max_time
    global q_function
    global epsilon
    global gamma
    global alpha
    global valid_actions
    global init_value

    if bbox.is_level_loaded():
        ## Reset the environment to the initial state
        bbox.reset_level()
    else:
        ## Load the training/test data
        bbox.load_level('../levels/train_level.data', verbose=True)
    n_features = bbox.get_num_of_features()
    n_actions = bbox.get_num_of_actions()
    max_time = bbox.get_max_time()
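# The globals above (q_function, epsilon, gamma, alpha, valid_actions,
# init_value) suggest tabular Q-learning over some discretized state key. A
# minimal sketch of the epsilon-greedy choice and one-step update those
# hyperparameters imply; the dict-of-lists layout of q_function is an
# assumption, not taken from this source:
import random

def choose_action(state_key):
    if state_key not in q_function:
        q_function[state_key] = [init_value] * n_actions
    if random.random() < epsilon:
        return random.choice(valid_actions)
    return max(valid_actions, key=lambda a: q_function[state_key][a])

def update_q(state_key, action, reward, next_key):
    if next_key not in q_function:
        q_function[next_key] = [init_value] * n_actions
    # One-step Q-learning target: r + gamma * max_a' Q(s', a')
    best_next = max(q_function[next_key])
    q_function[state_key][action] += alpha * (
        reward + gamma * best_next - q_function[state_key][action])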
def run_bbox(verbose=False):
    prepare_bbox()

    # Vector of the current state features; keep the root variable so it can
    # be passed to theano.function (a reshaped variable cannot be an input)
    memory_var = T.tensor3('memory')
    input_var = T.reshape(memory_var, (memtime, 1, n_f + 2))
    # Score diffs observed after the agent makes its choice
    reality = T.vector('score_diffs')

    # Load the net into the agent object
    agent = prepare_agent(input_var)
    # What the agent thinks its best choice is this event
    evaluation = lasagne.layers.get_output(agent)[0]
    # How much the agent should be rewarded/punished
    reward = lasagne.objectives.squared_error(evaluation, reality)
    reward = reward.mean()
    # Get the parameters for updating
    params = lasagne.layers.get_all_params(agent, trainable=True)
    # Update the net with the error
    teach = lasagne.updates.nesterov_momentum(reward, params,
                                              learning_rate=0.01, momentum=0.9)
    # A function to get the agent's choice of what to try this time
    decide_fn = theano.function([memory_var], evaluation)
    # A function to do all of the above in one training step
    train_fn = theano.function([memory_var, reality], reward,
                               updates=teach, on_unused_input='ignore')

    # Note the start time to check how long the run takes
    start = time.time()
    for epoch in range(epochs):
        memory = np.zeros(shape=(memtime, 1, n_f + 2))
        e_time = time.time()  # start time for this epoch
        has_next = 1  # looping variable: state of bbox

        # Initialize tracking variables
        prev_score = error = 0
        steps = 0
        trust = 0.00 + .02 * epoch
        good = 0

        while has_next:
            # Update the memory matrix: drop the oldest state to make room
            memory = forget(memory)
            state = bbox.get_state()
            # Best action based on the 100-step checkpoint method
            actuals = get_all_score_diffs(state)
            # Load the new state, with no score or action chosen yet
            memory[0][0][:-2] = state

            if rand.random() > trust:
                action = rand.randint(0, n_a - 1)  # trust still too low: random action
            else:
                choices = decide_fn(memory)   # otherwise let the agent decide
                action = np.argmax(choices)   # pick the action the agent rates best
            if action == np.argmax(actuals):
                good = good + 1

            # Do it, and find out the consequence (did the score improve or drop?)
            has_next = bbox.do_action(action)
            score = bbox.get_score()
            consequence = score - prev_score
            prev_score = score

            # Train on the choice just made and the memory
            memory[0][0][-2:] = [action, consequence]
            error += train_fn(memory, actuals)  # train against the checkpoint score diffs

            steps += 1

            # Occasionally check in on progress
            if steps % 10000 == 0:
                score = bbox.get_score()
                print("Epoch: {}".format(epoch))
                print("Steps: {}".format(steps))
                print("  current trust: {}".format(trust))
                print("  avg error: {}".format(error / steps))
                print("  bad choices: {}%".format(100 - float(good) / 100))
                print("  current score: {}".format(score))
                if trust < .95:
                    trust = trust + .02
                bbox.clear_all_checkpoints()
                good = 0

        # Report on model quality for this epoch
        score = bbox.get_score()
        with open("epoch_data.txt", "a") as f:
            f.write("Epoch: {} Final Score: {} Average Error: {} "
                    "Time to Run: {} min\n".format(epoch, score, error / steps,
                                                   (time.time() - e_time) / 60))

        # Save the model parameters
        np.savez('model_LSTM_cost.npz', *lasagne.layers.get_all_param_values(agent))

        # Reset the box for the next epoch
        if epoch < epochs - 1:
            bbox.reset_level()

    print("Time to run: {} hours".format((time.time() - start) / 3600))
    bbox.finish(verbose=1)
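# The loop above scores every action with a get_all_score_diffs() helper that
# is not shown; its comment calls this the 100-step checkpoint method. A
# sketch of that idea: snapshot the level, roll each action forward, and
# record the resulting score change. create_checkpoint()/load_from_checkpoint()
# are assumed from the challenge's checkpoint API (clear_all_checkpoints() is
# already used above); the exact names and signatures may differ:
def get_all_score_diffs(state, lookahead=100):
    # state is implicit in the bbox level itself; the argument mirrors the
    # call site above
    diffs = np.zeros(n_a, dtype=np.float32)
    base_score = bbox.get_score()
    for action in range(n_a):
        checkpoint_id = bbox.create_checkpoint()
        for _ in range(lookahead):
            if not bbox.do_action(action):
                break
        diffs[action] = bbox.get_score() - base_score
        bbox.load_from_checkpoint(checkpoint_id)
    return diffs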
def run_bbox(verbose=False):
    prepare_bbox()

    # Vector of the current state features; keep the root variable so it can
    # be passed to theano.function (a reshaped variable cannot be an input)
    memory_var = T.matrix('memory')
    input_var = T.reshape(memory_var, (memtime, n_f + 2))
    # Score change observed after the agent makes its choice
    reality = T.scalar('consequence')

    # Load the net into the agent object
    agent = prepare_agent(input_var)
    # What the agent thinks the best choice will be
    attempt = T.max(lasagne.layers.get_output(agent))
    # How much the agent should be rewarded/punished
    reward = lasagne.objectives.squared_error(attempt, reality)
    # Get the parameters for updating
    params = lasagne.layers.get_all_params(agent, trainable=True)
    # Update the net with the error
    teach = lasagne.updates.nesterov_momentum(reward, params,
                                              learning_rate=0.1, momentum=0.9)
    # A compiled function to get the agent's action values for a memory window
    decide_fn = theano.function([memory_var], lasagne.layers.get_output(agent))
    # A function to do all of the above in one training step (the observed
    # score change serves as the regression target)
    train_fn = theano.function([memory_var, reality], reward,
                               updates=teach, on_unused_input='ignore')

    # Note the start time to check how long the run takes
    memory = np.zeros(shape=(memtime, n_f + 2))
    start = time.time()
    scores_per_epoch = np.zeros(epochs)
    for epoch in range(epochs):
        e_time = time.time()  # start time for this epoch
        has_next = 1  # looping variable: state of bbox

        # Initialize tracking variables
        prev_score = 0
        self_assessment = 0
        steps = 0
        trust = 0.00

        while has_next:
            # Update the memory matrix: drop the oldest state to make room
            memory = forget(memory)
            state = bbox.get_state()
            # Load the new state, with no score or action chosen yet
            memory[0][:-2] = state

            if rand.random() > trust:
                action = rand.randint(0, n_a - 1)  # trust still too low: random action
            else:
                choices = decide_fn(memory)   # otherwise let the agent decide
                action = np.argmax(choices)   # pick the action the agent rates best

            # Do it, and find out the consequence (did the score improve or drop?)
            has_next = bbox.do_action(action)
            score = bbox.get_score()
            consequence = score - prev_score
            prev_score = score

            # Train on the choice just made and the memory
            memory[0][-2:] = [action, consequence]
            train_fn(memory, consequence)  # train based on the score change

            # Update for the next loop
            self_assessment += consequence
            steps += 1

            # Occasionally check in on progress
            if steps % 10000 == 0:
                trust = trust + .01
                score = bbox.get_score()
                print("Epoch: {}".format(epoch))
                print("Steps: {}".format(steps))
                print("  self assessment: {}".format(self_assessment))
                print("  trust: {}".format(trust))
                print("  current score: {}".format(score))

        # Report on model quality for this epoch
        score = bbox.get_score()
        print("Epoch: {}".format(epoch))
        print("Final Score: {}".format(score))
        print("Time to Run: {} minutes".format((time.time() - e_time) / 60))
        scores_per_epoch[epoch] = score

        # Reset the box for the next epoch
        bbox.reset_level()

    print("All scores per epoch: ")
    print(scores_per_epoch)
    print("Time to run: {} hours".format((time.time() - start) / 3600))
    np.savez('model_mem.npz', *lasagne.layers.get_all_param_values(agent))
    bbox.finish(verbose=1)
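# Both training loops above roll their memory window with a forget() helper
# that is not shown. Reading the call sites, it shifts every remembered state
# one slot deeper, drops the oldest, and frees slot 0 for the incoming state.
# A sketch consistent with that usage (an assumption, not the original):
def forget(memory):
    shifted = np.zeros_like(memory)
    shifted[1:] = memory[:-1]  # entry i moves to i + 1; the last entry is dropped
    return shifted             # slot 0 is zeroed, ready for the new state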
def main():
    epsilon = .1  # exploration
    num_actions = 4
    input_size = 36
    hidden_size = 24
    activation = 'relu'
    max_memory = 2000
    batch_size = 50
    mini_epoch = 5
    epoch = 10

    model = Sequential()
    model.add(Dense(hidden_size, input_shape=[input_size], activation=activation))
    model.add(Dense(hidden_size, activation=activation))
    model.add(Dense(num_actions))
    model.compile('adam', 'mse')
    # model.load_weights('model.h5')

    # Define environment/game
    bbox.load_level('../levels/train_level.data', verbose=True)

    # Initialize experience replay object
    exp_replay = ExperienceReplay(max_memory=max_memory)

    # FIXME
    #states = np.fromfile('run_random/states', dtype=np.float32)\
    #    .reshape([1214494, 36])
    #scaler = preprocessing.StandardScaler()
    #scaler.fit(states)
    #with open('scaler.pkl', 'wb') as f:
    #    scaler = pickle.dump(scaler, f, protocol=-1)
    with open('scaler.pkl', 'rb') as f:
        scaler = pickle.load(f)

    # Train
    for e in range(epoch):
        loss = 0.
        bbox.reset_level()
        game_over = False
        # Get initial input
        get_state = lambda: scaler.transform(np.array([bbox.get_state()]))[0]
        input_t = get_state()
        score = 0
        step = 0
        report_steps = 100

        while not game_over:
            step += 1
            input_tm1 = input_t

            # Get next action (epsilon-greedy)
            if np.random.rand() <= epsilon:
                action = np.random.randint(0, num_actions, size=1)
            else:
                q = model.predict(np.array([input_tm1]))[0]
                action = np.argmax(q)

            # Apply action, get reward and new state
            game_over = not bbox.do_action(action)
            input_t = get_state()
            new_score = bbox.get_score()
            reward = new_score - score
            score = new_score

            # Store experience
            exp_replay.remember([input_tm1, action, reward, input_t], game_over)

            # Adapt model
            for _ in range(mini_epoch):
                inputs, targets = exp_replay.get_batch(model, batch_size=batch_size)
                loss += model.train_on_batch(inputs, targets)[0]

            if step % report_steps == 0:
                print('Step {:07d} | Loss {:.4f} | Score {}'.format(
                    step, loss / (report_steps * mini_epoch), score))
                loss = 0.

        print('Epoch {:03d}/{} | Score {}'.format(e, epoch - 1, score))

    # Save trained model weights
    model.save_weights('q_model.h5', overwrite=True)
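# The Keras loop above relies on an ExperienceReplay class that is not shown:
# remember() stores a transition and get_batch() samples transitions and
# builds one-step Q-learning targets from the model. A minimal sketch matching
# those call sites; the discount factor is an assumption:
class ExperienceReplay(object):
    def __init__(self, max_memory=100, discount=.9):
        self.max_memory = max_memory
        self.memory = []
        self.discount = discount

    def remember(self, experience, game_over):
        # experience = [state_t, action_t, reward_t, state_t+1]
        self.memory.append([experience, game_over])
        if len(self.memory) > self.max_memory:
            del self.memory[0]

    def get_batch(self, model, batch_size=10):
        len_memory = len(self.memory)
        num_actions = model.output_shape[-1]
        env_dim = self.memory[0][0][0].shape[0]
        inputs = np.zeros((min(len_memory, batch_size), env_dim))
        targets = np.zeros((inputs.shape[0], num_actions))
        for i, idx in enumerate(np.random.randint(0, len_memory,
                                                  size=inputs.shape[0])):
            state_t, action_t, reward_t, state_tp1 = self.memory[idx][0]
            game_over = self.memory[idx][1]
            inputs[i] = state_t
            # Start from the model's current predictions so only the taken
            # action's target changes
            targets[i] = model.predict(state_t[np.newaxis])[0]
            q_next = np.max(model.predict(state_tp1[np.newaxis])[0])
            targets[i, action_t] = reward_t if game_over else reward_t + self.discount * q_next
        return inputs, targets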