def run_bbox(verbose=False): ''' Runs the Blackbox challenge. ''' has_next = True prepare_bbox() while has_next: ## Observe the current state variables state = bbox.get_state() state_tuple = get_state_tuple(state) ## Select the current action action = get_action(state_tuple, verbose=verbose, is_current=True) ## Get the current reward reward = bbox.get_score() print 'Reward = ' + str(reward) ## Retrieve the current Q-value current_q = q_function[state_tuple][action] print 'Current Q = ' + str(current_q) ## Observe the next state (assuming there always is) has_next = bbox.do_action(action) next_state = bbox.get_state() next_state_tuple = get_state_tuple(next_state) ## Get the best q_action in the new state next_action = get_action(next_state_tuple, verbose=verbose, is_current=False) ## Get the new Q_value next_q = q_function[next_state_tuple][next_action] ## Update the Q-function q_function[state_tuple][action] = (1 - alpha) * current_q + alpha * (reward + gamma * next_q) print 'Updated Q = ' + str(q_function[state_tuple][action]) bbox.finish(verbose=True)
def do_play(bot, params, levels, runs, prngs_seed, verbosity, **kwargs):
    """
    Evaluates the bot with params on some levels.

    Looks the bot class up in `available_bots`, loads its parameters and,
    for every entry of `levels`, loads the level and stores the result of
    `evaluate(runs)` under the loaded level's 'key'.

    Returns a dict holding the UTC date, the bot, the params key, the
    levels, the runs, the per-level scores, the elapsed `clock()` time and
    the seed returned by `seed_prngs`. Extra keyword arguments are accepted
    but not used here.
    """
    common_printoptions()
    prngs_seed = seed_prngs(prngs_seed)
    start = clock()
    bot_class = available_bots[bot]
    # Only the first two items returned by load_params are needed here.
    params_key, params = load_params(bot, params, verbosity)[:2]
    scores = odict()
    for level in levels:
        # `level` is rebound from the requested entry to the loaded level.
        level = load_level(level, verbosity)
        scores[level['key']] = bot_class(level, params).evaluate(runs)
        # NOTE(review): the original indentation was lost; finish() is
        # assumed to run once per level -- confirm.
        finish(verbose=verbosity > 3)
    end = clock()
    return {'date': datetime.utcnow(), 'bot': bot, 'params_key': params_key, 'levels': levels, 'runs': runs, 'scores': scores, 'time': end - start, 'prngs_seed': prngs_seed}
def run_bbox(verbose=False):
    """
    Play one level using the network returned by prepare_agent().

    At every step the state is reshaped to (1, n_features), pushed through
    the compiled network and the index of the largest output is used as the
    action. Progress is printed every 10000 steps; the network parameters
    are written to 'model.npz' once the level is over.
    """
    has_next = 1
    prepare_bbox()

    # Symbolic (1, n_features) view of the state vector
    state_in = T.dvector('in_state')
    state_in = T.reshape(state_in, (1, n_features))

    # Network plus a compiled function returning its output
    agent = prepare_agent(state_in)
    net_output = lasagne.layers.get_output(agent)
    eval_fn = theano.function([state_in], net_output,
                              on_unused_input='ignore')

    started = time.time()
    error = 0  # reported as "training loss" but never updated here
    steps = 0
    while has_next:
        features = np.reshape(bbox.get_state(), (1, n_features))
        action = np.argmax(eval_fn(features))
        steps += 1
        if steps % 10000 == 0:
            score = bbox.get_score()
            print ("Steps: {}".format(steps))
            print (" training loss: {}".format(error/steps))
            print (" current score: {}".format(score))
        has_next = bbox.do_action(action)

    print ("Time to run: {} seconds".format(time.time()-started))
    print ("{} steps total".format(steps))
    np.savez('model.npz', *lasagne.layers.get_all_param_values(agent))
    bbox.finish(verbose=1)
def run_bbox(verbose=False): n_features = n_actions = max_time = -1 if bbox.is_level_loaded(): bbox.reset_level() else: bbox.load_level("../levels/train_level.data", verbose=1) n_features = bbox.get_num_of_features() n_actions = bbox.get_num_of_actions() max_time = bbox.get_max_time() av_table = ActionValueTable(n_features, n_actions) av_table.initialize(0.2) print av_table._params learner = Q(0.5, 0.1) learner._setExplorer(EpsilonGreedyExplorer(0.4)) agent = LearningAgent(av_table, learner) environment = GameEnvironment() task = GameTask(environment) experiment = Experiment(task, agent) while environment.finish_flag: experiment.doInteractions(1) agent.learn() bbox.finish(verbose=1)
def do_play(bot, params, levels, runs, prngs_seed, verbosity, **kwargs):
    """
    Evaluates the bot with params on some levels.

    Looks the bot class up in `available_bots`, loads its parameters and,
    for every entry of `levels`, loads the level and stores the result of
    `evaluate(runs)` under the loaded level's 'key'.

    Returns a dict holding the UTC date, the bot, the params key, the
    levels, the runs, the per-level scores, the elapsed `clock()` time and
    the seed returned by `seed_prngs`. Extra keyword arguments are accepted
    but not used here.
    """
    common_printoptions()
    prngs_seed = seed_prngs(prngs_seed)
    start = clock()
    bot_class = available_bots[bot]
    # Only the first two items returned by load_params are needed here.
    params_key, params = load_params(bot, params, verbosity)[:2]
    scores = odict()
    for level in levels:
        # `level` is rebound from the requested entry to the loaded level.
        level = load_level(level, verbosity)
        scores[level['key']] = bot_class(level, params).evaluate(runs)
        # NOTE(review): the original indentation was lost; finish() is
        # assumed to run once per level -- confirm.
        finish(verbose=verbosity > 3)
    end = clock()
    return {
        'date': datetime.utcnow(),
        'bot': bot,
        'params_key': params_key,
        'levels': levels,
        'runs': runs,
        'scores': scores,
        'time': end - start,
        'prngs_seed': prngs_seed
    }
def run_bbox(verbose=False): ''' Runs the Blackbox challenge. ''' has_next = True ## Prepare the environment -- load the game level prepare_bbox() while has_next: ## Get the current environment state vector state = bbox.get_state() ## Choose an action to perform at the current state action = get_action_by_state(state, verbose=verbose) ## Function do_action(action) returns False if the level ## is finished; otherwise, it returns True has_next = bbox.do_action(action) ## Save the interactions as an output CSV file headers = interaction_list.pop(0) interaction_df = pd.DataFrame(interaction_list, columns=headers) datetime_int = int(calendar.timegm(time.gmtime())) out_filename = '../output/interaction_' + str(datetime_int) + '.csv' interaction_df.to_csv(out_filename, index=False) print 'Saved to file: ' + out_filename ## When submitting solution, make sure to call finish(), which returns the sum of points obtained ## during the entire simulation. This number is used as the public leader board score bbox.finish(verbose=True)
def run_bbox():
    """
    Play "levels/test_level.data" with a blend of two linear models.

    Per step, `lr_coefs_1` produces two predictions from all but the last
    state feature; these fill `cum_sum[1]` and `cum_sum[2]` depending on the
    sign of feature 35, and `cum_sum[0]` is their mean. A second linear
    model (`lr_coefs_0`) on the full state is mixed in with weight `w0`, and
    the argmax of the mix is the action.
    """
    f_35_penalty = 0.15; k = 0; w0 = 0.13
    bbox.load_level("levels/test_level.data", verbose=0)
    # NOTE(review): last_score is never read in this function.
    has_next = True; last_score = 0
    # act_len counts how many consecutive steps repeated the same action.
    act = -1; act_len = 0; crit_len = 150
    # cum_sum has four slots; slot 3 is never written here and stays 0.
    predict = np.zeros(2); cum_sum = np.zeros(4)
    while has_next:
        last_act = act
        state = bbox.get_state()
        predict[:2] = np.dot(lr_coefs_1,state[:-1]) + lr_free_coefs_1
        # Which prediction feeds slots 1 and 2 depends on the sign of state[35].
        if state[35] > 0:
            cum_sum[1] = predict[0] + k
            cum_sum[2] = -predict[0] + k
        elif state[35] < 0:
            cum_sum[1] = -predict[1] + k
            cum_sum[2] = predict[1] + k
        elif state[35] == 0:
            cum_sum[1] = predict[0] + k
            cum_sum[2] = predict[1] + k
        # NOTE(review): the original indentation was lost; the lines below are
        # assumed to run for every branch above -- confirm.
        cum_sum[0] = (cum_sum[1]+cum_sum[2])/2 + k
        # Shift slots 1 and 2 in opposite directions in proportion to state[35].
        cum_sum[1]-=f_35_penalty*state[35]
        cum_sum[2]+=f_35_penalty*state[35]
        # After more than crit_len repeats, slightly lower the repeated action.
        if act_len > crit_len:
            cum_sum[last_act]-=0.0078125
        act = (w0*(np.dot(lr_coefs_0,state) + lr_free_coefs_0)/6.366 + (1-w0)*cum_sum).argmax()
        has_next = bbox.do_action(act)
        if last_act==act:
            act_len+=1
        else:
            act_len = 0
    bbox.finish(verbose=1)
def test_bot(bot, level, make_features): env = BBox(level) while env.has_next: if env.get_time() % 10000 == 0: print str(env.get_time()) + "\t" + str(env.get_score()) action = bot.get_action(make_features(env)) env.do_action(action) bbox.finish() print bbox.get_score()
def run_bbox():
    """
    Play one level after loading coefficients from "reg_coefs.txt".

    Every step asks get_action_by_state() for the action belonging to the
    current state; the loop ends when do_action() returns a falsy value.
    """
    prepare_bbox()
    load_regression_coefs("reg_coefs.txt")

    while True:
        action = get_action_by_state(bbox.get_state())
        if not bbox.do_action(action):
            break

    bbox.finish(verbose=1)
def run_bbox(verbose=False):
    """
    Play one level, choosing each action with get_action_by_state().

    `verbose` is accepted but not used; finish() is always called with
    verbose=1.
    """
    prepare_bbox()

    running = 1
    while running:
        running = bbox.do_action(get_action_by_state(bbox.get_state()))

    bbox.finish(verbose=1)
def run_bbox():
    """
    Play one level with the global `ensamble` network ensemble.

    The ensemble is created after the level is prepared and its weights are
    read from "weights". Time and score are printed whenever the time is a
    multiple of 10000.
    """
    global ensamble

    prepare_bbox()

    layer_specs = [[36, 64, 4], [16, 4], [16, 4], [36, 64, 4]]
    ensamble = Ensemble.NN_Ensemble(n_features, 4, layer_specs, n_actions)
    ensamble.read_weights("weights")

    while True:
        action = get_action_by_state(bbox.get_state())
        more = bbox.do_action(action)
        if bbox.get_time() % 10000 == 0:
            print(" ".join((str(bbox.get_time()), str(bbox.get_score()))))
        if not more:
            break

    bbox.finish(verbose=1)
def run_bbox():
    """
    Play one level, repeating each checkpoint-selected action 100 times.

    The action returned by calc_best_action_using_checkpoint() is applied
    100 times in a row; only the result of the last application decides
    whether the outer loop continues.
    """
    prepare_bbox()

    while True:
        best_act = calc_best_action_using_checkpoint()
        for _ in range(100):
            more = bbox.do_action(best_act)
        if bbox.get_time() % 10000 == 0:
            print ("time = %d, score = %f" % (bbox.get_time(), bbox.get_score()))
        if not more:
            break

    bbox.finish(verbose=1)
def run_bbox():
    """
    Run a level driven by the module-level `ensamble` of networks.

    Builds the ensemble once the level is prepared, reads its weights from
    "weights" and reports time and score each time the time is a multiple
    of 10000.
    """
    global ensamble

    prepare_bbox()
    ensamble = Ensemble.NN_Ensemble(
        n_features,
        4,
        [[36, 64, 4], [16, 4], [16, 4], [36, 64, 4]],
        n_actions,
    )
    ensamble.read_weights("weights")

    level_open = 1
    while level_open:
        current_state = bbox.get_state()
        level_open = bbox.do_action(get_action_by_state(current_state))
        if bbox.get_time() % 10000 != 0:
            continue
        print(str(bbox.get_time()) + " " + str(bbox.get_score()))

    bbox.finish(verbose=1)
def run_bbox():
    """
    Run a level in bursts of 100 identical actions.

    Each burst uses the action from calc_best_action_using_checkpoint();
    the burst is not cut short, and the outcome of its last action decides
    whether another burst follows.
    """
    prepare_bbox()

    level_open = 1
    while level_open:
        chosen = calc_best_action_using_checkpoint()
        remaining = 100
        while remaining > 0:
            level_open = bbox.do_action(chosen)
            remaining -= 1
        if bbox.get_time() % 10000 == 0:
            print("time = %d, score = %f" % (bbox.get_time(), bbox.get_score()))

    bbox.finish(verbose=1)
def run_bbox(rnet_model, train_data, train_level=True, verbose=True):
    """
    Play one session of the black box environment and return its score.

    :param rnet_model: model with a get_action(state) method
    :param train_data: DataSet object whose buffer collects each state
    :param train_level: passed on to prepare_bbox()
    :param verbose: print time and score when the step count is a
        multiple of 5000
    :return: the value returned by bbox.finish()
    """
    prepare_bbox(train_level)
    train_data.clear_buffer()

    while True:
        step_count = bbox.get_time()

        # The model sees the buffered states, not just the latest one.
        train_data.update_buffer(bbox.get_state())
        action = rnet_model.get_action(train_data.get_buffer())

        more = bbox.do_action(action)
        if verbose and step_count % 5000 == 0:
            print ("time = %d, score = %f" % (step_count, bbox.get_score()))
        if not more:
            break

    return bbox.finish(verbose=1)
def run_bbox(rnet_model, train_data, train_level=True, verbose=True):
    """
    Run one black box session with `rnet_model` choosing every action.

    :param rnet_model: model with a get_action(state) method
    :param train_data: DataSet object used to buffer each state
    :param train_level: forwarded to prepare_bbox()
    :param verbose: when truthy, report time and score at step counts that
        are multiples of 5000
    :return: whatever bbox.finish() returns for the session
    """
    prepare_bbox(train_level)
    train_data.clear_buffer()

    session_open = 1
    while session_open:
        step_count = bbox.get_time()
        train_data.update_buffer(bbox.get_state())
        buffered = train_data.get_buffer()
        session_open = bbox.do_action(rnet_model.get_action(buffered))
        if step_count % 5000 != 0:
            continue
        if verbose:
            print("time = %d, score = %f" % (step_count, bbox.get_score()))

    final_score = bbox.finish(verbose=1)
    return final_score
def run_bbox():
    """
    Play one level with coefficients loaded from file and print the run time.

    The state vector passed to get_action_by_state() has one more entry
    than the environment state; that last entry is left at 1.
    """
    start_time = time.time()

    prepare_bbox()
    coefs = load_regression_coefs("star 13-best_coefs_score=2980.401123046875_sigma=0.0010000000474974513_level=train_level.txt")

    # All-ones buffer: the last slot is never overwritten below.
    state = np.ones(n_features + 1)

    level_open = 1
    while level_open:
        state[:-1] = bbox.get_state()
        level_open = bbox.do_action(get_action_by_state(state, coefs))

    bbox.finish(verbose=1)

    print(time.time() - start_time)
def run_bbox(verbose=False):
    """
    Train the network from prepare_agent() on the dataset in 'Full.txt'.

    For every index n below `loops`, a window of rows starting at n is
    passed to the compiled training function, and the argmax of the first
    score row of the window is played in the environment. The parameters
    are saved to 'model.npz' at the end. `verbose` is not used.
    """
    has_next = 1
    prepare_bbox()
    # symbolic matrix of state features, reshaped to 1000 rows
    input_var= T.matrix('in_state')
    input_var= T.reshape(input_var,(1000,n_features))
    # symbolic matrix of per-action scores, reshaped to 1000 rows
    target_var = T.matrix('scores')
    target_var = T.reshape(target_var,(1000,n_actions))
    # Load net into the agent object
    agent=prepare_agent(input_var)
    # network output for the given input
    attempt = lasagne.layers.get_output(agent)
    # mean squared error between network output and target scores
    punish = lasagne.objectives.squared_error(attempt,target_var)
    punish = punish.mean()
    # get the parameters for updating
    params = lasagne.layers.get_all_params(agent,trainable=True)
    # Nesterov-momentum updates driven by the loss
    teach = lasagne.updates.nesterov_momentum(punish,params,learning_rate=0.001,momentum=0.9)
    # compiled function: returns the loss and applies the updates
    train_fn = theano.function([input_var, target_var], punish, updates=teach,on_unused_input='ignore')
    # time to check how long it takes to run
    start = time.time()
    states, scores, loops = load_dataset('Full.txt')
    for n in range(loops):
        # NOTE(review): error and steps are reset on every iteration and steps
        # is never incremented, so "Steps" below always prints 0 -- confirm.
        error=0
        steps=0
        # NOTE(review): these windows hold at most 15 rows while the symbolic
        # inputs are reshaped to 1000 rows -- confirm the intended batch size.
        ins = states[n:n+15]
        out = scores[n:n+15]
        action = np.argmax(out[0])
        error = train_fn(ins,out)
        if n%10000==0:
            score = bbox.get_score()
            print ("Steps: {}".format(steps))
            print (" training loss: {}".format(error))
            print (" current score: {}".format(score))
        # NOTE(review): has_next is assigned but the loop never checks it.
        has_next = bbox.do_action(action)
    print ("Time to run: {} seconds".format(time.time()-start))
    np.savez('model.npz', *lasagne.layers.get_all_param_values(agent))
    bbox.finish(verbose=1)
def run_bbox(verbose=False):
    """
    Play one level, asking get_action_by_state() for every action.

    `verbose` is accepted but not used; the closing finish() call always
    passes verbose=1.
    """
    # Load the game level
    prepare_bbox()

    # do_action() returns False once the level is finished
    while True:
        current_state = bbox.get_state()
        chosen = get_action_by_state(current_state)
        if not bbox.do_action(chosen):
            break

    # finish() prints the earned reward and must be called when submitting
    bbox.finish(verbose=1)
def run_bbox(verbose=False):
    """
    Play the training level with pickled utility models and save the trace.

    One model per action is loaded from 'utility_models.pkl'; each step
    plays the argmax of the models' predictions. Scores, actions and states
    are written as raw arrays into the first 'run_<i>' directory that does
    not exist yet.
    """
    bbox.load_level("../levels/train_level.data", verbose=True)

    states, actions, scores, rewards = [], [], [], []

    # pickle.load executes arbitrary code; only use trusted model files.
    with open('utility_models.pkl', 'rb') as f:
        utility_models = pickle.load(f)

    step = 0
    has_next = 1
    while has_next:
        step += 1
        state = bbox.get_state()

        # This random draw is overwritten by the argmax right below.
        action = np.random.choice(n_actions)
        utilities = []
        for model in utility_models:
            utilities.append(model.predict([state]))
        action = np.argmax(utilities)

        # Do action and bookkeeping
        has_next = bbox.do_action(action)
        states.append(np.array(state))
        actions.append(action)
        score = bbox.get_score()
        if scores:
            rewards.append(score - scores[-1])
        else:
            rewards.append(score)
        scores.append(score)

        if verbose and step % 10000 == 0:
            print(step, score)

    # First free output directory: run_1, run_2, ...
    run_no = 1
    outdir = 'run_{}'.format(run_no)
    while os.path.exists(outdir):
        run_no += 1
        outdir = 'run_{}'.format(run_no)
    os.mkdir(outdir)
    print('saving to {}'.format(outdir))

    scores = np.array(scores, dtype=np.float32)
    scores.tofile(os.path.join(outdir, 'scores'))
    actions = np.array(actions, dtype=np.int8)
    actions.tofile(os.path.join(outdir, 'actions'))
    states = np.array(states, dtype=np.float32)
    states.tofile(os.path.join(outdir, 'states'))

    bbox.finish(verbose=True)
def run_bbox(verbose=False):
    """
    Play one level with actions chosen by get_action_by_state().

    `verbose` is accepted but not used.
    """
    # Load the game level.
    # NOTE(review): this calls prepare_box(), not prepare_bbox() -- confirm
    # that a function with this name exists.
    prepare_box()

    level_open = 1
    while level_open:
        # do_action() returns False if the level is finished
        chosen = get_action_by_state(bbox.get_state())
        level_open = bbox.do_action(chosen)

    # Print the earned reward; finish() must be called when submitting
    bbox.finish(verbose=1)
def run_bbox(verbose=False):
    """
    Play one level with a network that reads a rolling memory matrix.

    The memory has `memtime` rows of `n_f + 2` values: a state followed by
    the action taken and a score-derived value. Row 0 is rewritten on each
    step after forget() has been applied. `verbose` is not used.
    """
    has_next = 1
    prepare_bbox()

    # Symbolic (memtime, n_f+2) input and the first row of the net's output
    mem_in = T.dvector('in_state')
    mem_in = T.reshape(mem_in, (memtime, n_f+2))
    agent = prepare_agent(mem_in)
    first_output = lasagne.layers.get_output(agent)[0]
    test_fn = theano.function([mem_in], first_output)

    memory = np.zeros(shape=(memtime, n_f+2))
    started = time.time()
    consequence = 0
    steps = 0
    while has_next:
        # Make room for the new state, then store it without action/score
        memory = forget(memory)
        memory[0][:-2] = bbox.get_state()

        action = np.argmax(test_fn(memory))
        has_next = bbox.do_action(action)

        score = bbox.get_score()
        # NOTE(review): this subtracts the previous `consequence`, not the
        # previous score -- confirm this is the intended quantity.
        consequence = score - consequence
        memory[0][-2:] = [action, consequence]

        steps += 1
        if steps % 10000 == 0:
            score = bbox.get_score()
            print ("Steps: {}".format(steps))
            print (" current score: {}".format(score))

    print ("Final Score: {}".format(score))
    print ("Time to run: {} seconds".format(time.time()-started))
    bbox.finish(verbose=1)
def run_bbox(verbose=False):
    """
    Train the network online while playing one level.

    For every state, get_all_scores() supplies one score per action. The
    network is trained towards those scores with mean squared error and
    Nesterov momentum, and the action with the highest score is played.
    The training loss is printed on every step. `verbose` is not used.
    """
    has_next = 1
    prepare_bbox()

    # Symbolic (1, n_features) input holding the current state features
    input_var = T.dvector('in_state')
    input_var = T.reshape(input_var, (1, n_features))

    # Symbolic (1, n_actions) target holding the score of each action
    target_var = T.dvector('scores')
    target_var = T.reshape(target_var, (1, n_actions))

    # Load net into the agent object
    agent = prepare_agent(input_var)

    # Network output and its mean squared error against the target
    attempt = lasagne.layers.get_output(agent)
    punish = lasagne.objectives.squared_error(attempt, target_var)
    punish = punish.mean()

    # Parameter updates driven by the loss
    params = lasagne.layers.get_all_params(agent, trainable=True)
    teach = lasagne.updates.nesterov_momentum(punish, params,
                                              learning_rate=.1, momentum=.9)

    # Compiled function: returns the loss and applies the updates
    train_fn = theano.function([input_var, target_var], punish,
                               updates=teach, on_unused_input='ignore')

    # time to check how long it takes to run
    start = time.time()
    while has_next:
        state = bbox.get_state()
        r_state = np.reshape(state, (1, n_features))
        scores = get_all_scores(state)
        r_scores = np.reshape(scores, (1, n_actions))

        # The scores are numeric here, so the action index must come from
        # NumPy; T.argmax would only build a symbolic expression.
        action = np.argmax(scores)

        error = train_fn(r_state, r_scores)
        print (error)
        has_next = bbox.do_action(action)

    print ("Time to run: {} seconds".format(time.time()-start))
    bbox.finish(verbose=1)
def learn_bbox(rnet_model, train_data, update_inc=5000, lookup_inc=250, seed_data=False):
    """
    Add training instances to train_data from a single run-through of a
    bbox session.

    :param rnet_model: model object with get_lreg_action and get_action
        methods
    :param train_data: DataSet object used to buffer states and append new
        training instances
    :param update_inc: int, number of steps between each nnet model update
    :param lookup_inc: int, number of forward action lookup steps
    :param seed_data: boolean, when True best_action is the action returned
        by the lreg model and score_delta is fixed at 0.1
    :return: int, the number of action errors, or differences between
        actions produced by the rnet_model and the ideal or seed model,
        counted since the last model update
    """
    has_next = 1
    error_count = 0
    rand_count = 0
    # Starting at rand_n forces a fresh batch of random values on step one.
    rand_idx = rand_n
    prepare_bbox()
    # For each new state in the session, add it to the data set's state
    # buffer so that historical states are included in a commit event
    train_data.clear_buffer()
    current_state = bbox.get_state()
    train_data.update_buffer(current_state)
    while has_next:
        # If all random values have been used, generate a new batch
        # (two values are consumed per iteration, hence rand_n-1)
        if rand_idx >= (rand_n-1):
            rand_vals = numpy.random.random_sample(size=(rand_n))
            rand_idx = 0
        step_count = bbox.get_time()
        # Get the next action from the model based on the current set of
        # buffered states
        action = rnet_model.get_action(train_data.get_buffer())
        # Every update_inc steps train the model's network with newly
        # acquired training data
        if step_count % update_inc == 0:
            # NOTE(review): rn_model and update_nnet are not parameters of this
            # function (the parameter is rnet_model) -- confirm they are
            # intended module-level names.
            rn_model.run_training(train_data, max_steps=update_nnet, restore=True)
            error_count = 0
            rand_count = 0
        # If the random value is less than or equal to the sample
        # probability, sample the current session state and determine the
        # best action, adding it to the training set if necessary
        elif rand_vals[rand_idx] <= sample_prob:
            if seed_data:
                best_action = rnet_model.get_lreg_action(current_state)
                score_delta = 0.1
            else:
                best_action, score_delta = action_lookup(rnet_model, train_data, lookup_inc)
            if action != best_action:
                train_data.commit_buffer(best_action, score_delta)
                error_count += 1
            # NOTE(review): the original indentation was lost; rand_count is
            # assumed to count every sampled state -- confirm.
            rand_count += 1
        # Add random variation to the session by performing a random action
        # if less than or equal to perturb probability
        if rand_vals[rand_idx+1] <= perturb_prob:
            action = numpy.random.randint(0,4)
            step_inc = numpy.random.randint(rand_min, rand_max)
            # The random action is repeated step_inc times.
            for _ in xrange(step_inc):
                has_next = bbox.do_action(action)
                current_state = bbox.get_state()
                train_data.update_buffer(current_state)
        else:
            has_next = bbox.do_action(action)
            current_state = bbox.get_state()
            train_data.update_buffer(current_state)
        rand_idx += 2
        if step_count % 5000 == 0:
            print ("time = %d, score = %f" % (step_count, bbox.get_score()))
            print ("errors = %d, samples = %d" % (error_count, rand_count))
            #rn_model.print_stats()
    bbox.finish(verbose=1)
    return error_count
def finish():
    """Call `ci.finish()` and then exit the interpreter via `exit()`."""
    ci.finish()
    exit()
#!/usr/bin/env python3 """ A minimal bot player. Loads the level and params and lets the bot act. """ from interface import (get_max_time, get_num_of_actions, get_num_of_features, finish, load_level) from numpy import get_include, load from pyximport import install install(setup_args={'include_dirs': get_include()}, reload_support=True) from bot_wrapper import do_act if __name__ == '__main__': load_level('../levels/train_level.data', verbose=1) level = { 'steps': get_max_time(), 'actions': get_num_of_actions(), 'features': get_num_of_features() } params = dict(load('params.npz')) do_act(level, params) finish(verbose=1)
def run_bbox(verbose=False):
    """
    Train the memory-based network over `epochs` passes of the level.

    The memory is a (memtime, n_f+2) matrix whose row 0 holds the current
    state followed by the chosen action and a score-derived value. On each
    step the action is random with a probability that shrinks as `trust`
    grows (raised by 0.01 every 10000 steps); otherwise the network picks
    the action. The network is then trained so that its largest output
    approaches the score-derived value.

    Per-epoch scores are printed and the parameters are saved to
    'model_mem.npz' at the end. `verbose` is not used.
    """
    prepare_bbox()

    # Symbolic (memtime, n_f+2) memory matrix
    input_var = T.matrix('memory')
    input_var = T.reshape(input_var, (memtime, n_f+2))

    # Score-derived value observed after the agent makes its choice
    reality = T.scalar('consequence')

    # Load net into the agent object
    agent = prepare_agent(input_var)

    # Network output; its maximum is what gets trained against `reality`
    prediction = lasagne.layers.get_output(agent)
    attempt = T.max(prediction)
    reward = lasagne.objectives.squared_error(attempt, reality)

    # Parameter updates driven by the error
    params = lasagne.layers.get_all_params(agent, trainable=True)
    teach = lasagne.updates.nesterov_momentum(reward, params,
                                              learning_rate=0.1, momentum=0.9)

    # Compiled training step: returns the error and applies the updates
    train_fn = theano.function([input_var, reality], reward,
                               updates=teach, on_unused_input='ignore')

    # Compiled forward pass: numeric network outputs for action selection.
    # A symbolic get_output() call cannot be fed to np.argmax directly.
    decide_fn = theano.function([input_var], prediction)

    memory = np.zeros(shape=(memtime, n_f+2))
    start = time.time()
    scores_per_epoch = np.zeros(epochs)

    for epoch in range(epochs):
        e_time = time.time()  # time for this epoch
        has_next = 1          # looping variable, state of bbox

        # initialize tracking variables
        consequence = 0
        self_assessment = 0
        steps = 0
        trust = 0.00

        while has_next:
            # Updating memory matrix, forgetting a state, making room
            memory = forget(memory)
            state = bbox.get_state()

            # upload new state, with no score or action chosen
            memory[0][:-2] = state

            # rand.random must be called: comparing the function object
            # itself never yields a meaningful probability test.
            if rand.random() > trust:
                # trust is still too low: random action
                action = rand.randint(0, n_a-1)
            else:
                # otherwise let the agent decide
                choices = decide_fn(memory)
                action = np.argmax(choices)

            # do it, and find out the consequences
            has_next = bbox.do_action(action)
            # NOTE(review): this subtracts the previous `consequence`, not the
            # previous score -- confirm this is the intended quantity.
            consequence = bbox.get_score()-consequence

            # train on choices just made and memory
            memory[0][-2:] = [action, consequence]
            train_fn(memory, consequence)

            # updating for next loop
            self_assessment += consequence
            steps += 1

            # occasionally check in on progress
            if steps % 10000 == 0:
                trust = trust+.01
                score = bbox.get_score()
                print ("Epoch: {}".format(epoch))
                print ("Steps: {}".format(steps))
                print (" self assessment: {}".format(self_assessment))
                print (" trust: {}".format(trust))
                print (" current score: {}".format(score))

        # report on model quality on previous epoch
        score = bbox.get_score()
        print ("Epoch: {}".format(epoch))
        print ("Final Score: {}".format(score))
        print ("Time to Run: {} minutes".format((time.time()-e_time)/60))
        scores_per_epoch[epoch] = score

        # reset box for next epoch
        bbox.reset_level()

    print ("All scores per epoch: ")
    print (scores_per_epoch)
    print ("Time to run: {} hours".format((time.time()-start)/3600))
    np.savez('model_mem.npz', *lasagne.layers.get_all_param_values(agent))
    bbox.finish(verbose=1)
def finish(self, verbose):
    """Forward to `bbox.finish`, passing `verbose` through as a keyword."""
    bbox.finish(verbose=verbose)
# NOTE(review): this excerpt starts mid-script; keras_model, agent_env,
# memory_len, epsilon, epochs, batch_size and gamma are defined above it.

# final layers: dropout followed by a softmax over the available actions
keras_model.add(Dropout(0.5))
keras_model.add(Dense(agent_env.n_actions, activation="softmax"))
agent_model = KerasModel(keras_model)
# experience memory
agent_mem = ExperienceReplay(memory_length=memory_len)
# compile agent; the lambda ignores its arguments and returns `epsilon`
agent = DiscreteAgent(agent_model, agent_mem, epsilon=lambda *args: epsilon)
# SGD optimizer + MSE cost + MAX policy = Q-learning as we know it
#agent.compile(optimizer=RMSprop(lr=0.001), loss='mse', policy_rule='max')
agent.compile(optimizer=RMSprop(lr=0.001), loss='categorical_crossentropy', policy_rule='max')
# train agent
agent.learn(agent_env, epoch=epochs, batch_size=batch_size, gamma=gamma)
# save trained model (as JSON) and weights (as HDF5) under a common prefix
pre = "model-04-slow"
with open(pre + ".json", 'w') as f:
    json.dump(keras_model.to_json(), f)
keras_model.save_weights(pre + ".h5", overwrite=True)
# test agent
#agent.play(agent_env, epoch=100)
bbox.finish(verbose=1)
def finish():
    """Call `interface.finish()` with no arguments."""
    interface.finish()
if np.random.rand() <= epsilon: action = np.random.randint(0, num_actions, size=1)[0] else: q = model.predict(input_tm1) action = np.argmax(q[0]) # apply action, get rewards and new state input_t, reward, game_over = env.act(action) if reward >= 0.: win_cnt += 1 # store experience exp_replay.remember([input_tm1, action, reward, input_t], game_over) # adapt model inputs, targets = exp_replay.get_batch(model, batch_size=batch_size) loss += model.train_on_batch(inputs, targets)[0] print("Epoch {:03d}/999 | Loss {:.4f} | Win count {}".format(e, loss, win_cnt)) # save trained model and weights with open("model2.json", 'w') as f: json.dump(keras_model.to_json(), f) keras_model.save_weights("model2.h5", overwrite=True) # test agent #agent.play(agent_env, epoch=100) bbox.finish(verbose=1)
def learn_bbox(rnet_model, train_data, update_inc=5000, lookup_inc=250, seed_data=False):
    """
    Add training instances to train_data from a single run-through of a
    bbox session.

    :param rnet_model: model object with get_lreg_action and get_action
        methods
    :param train_data: DataSet object used to buffer states and append new
        training instances
    :param update_inc: int, number of steps between each nnet model update
    :param lookup_inc: int, number of forward action lookup steps
    :param seed_data: boolean, when True best_action is the action returned
        by the lreg model and score_delta is fixed at 0.1
    :return: int, the number of action errors, or differences between
        actions produced by the rnet_model and the ideal or seed model,
        counted since the last model update
    """
    has_next = 1
    error_count = 0
    rand_count = 0
    # Starting at rand_n forces a fresh batch of random values on step one.
    rand_idx = rand_n
    prepare_bbox()
    # For each new state in the session, add it to the data set's state
    # buffer so that historical states are included in a commit event
    train_data.clear_buffer()
    current_state = bbox.get_state()
    train_data.update_buffer(current_state)
    while has_next:
        # If all random values have been used, generate a new batch
        # (two values are consumed per iteration, hence rand_n - 1)
        if rand_idx >= (rand_n - 1):
            rand_vals = numpy.random.random_sample(size=(rand_n))
            rand_idx = 0
        step_count = bbox.get_time()
        # Get the next action from the model based on the current set of
        # buffered states
        action = rnet_model.get_action(train_data.get_buffer())
        # Every update_inc steps train the model's network with newly
        # acquired training data
        if step_count % update_inc == 0:
            # NOTE(review): rn_model and update_nnet are not parameters of this
            # function (the parameter is rnet_model) -- confirm they are
            # intended module-level names.
            rn_model.run_training(train_data, max_steps=update_nnet, restore=True)
            error_count = 0
            rand_count = 0
        # If the random value is less than or equal to the sample
        # probability, sample the current session state and determine the
        # best action, adding it to the training set if necessary
        elif rand_vals[rand_idx] <= sample_prob:
            if seed_data:
                best_action = rnet_model.get_lreg_action(current_state)
                score_delta = 0.1
            else:
                best_action, score_delta = action_lookup(
                    rnet_model, train_data, lookup_inc)
            if action != best_action:
                train_data.commit_buffer(best_action, score_delta)
                error_count += 1
            # NOTE(review): the original indentation was lost; rand_count is
            # assumed to count every sampled state -- confirm.
            rand_count += 1
        # Add random variation to the session by performing a random action
        # if less than or equal to perturb probability
        if rand_vals[rand_idx + 1] <= perturb_prob:
            action = numpy.random.randint(0, 4)
            step_inc = numpy.random.randint(rand_min, rand_max)
            # The random action is repeated step_inc times.
            for _ in xrange(step_inc):
                has_next = bbox.do_action(action)
                current_state = bbox.get_state()
                train_data.update_buffer(current_state)
        else:
            has_next = bbox.do_action(action)
            current_state = bbox.get_state()
            train_data.update_buffer(current_state)
        rand_idx += 2
        if step_count % 5000 == 0:
            print("time = %d, score = %f" % (step_count, bbox.get_score()))
            print("errors = %d, samples = %d" % (error_count, rand_count))
            #rn_model.print_stats()
    bbox.finish(verbose=1)
    return error_count
def run_bbox(verbose=False): prepare_bbox() # vector of the current state features input_var= T.tensor3('memory') input_var= T.reshape(input_var,(memtime,1,n_f+2)) #Score after the agent makes it's choice reality = T.vector('score_diffs') #Load net into the agent object agent=prepare_agent(input_var) #What the agent thinks their best choice is this event evaluation = lasagne.layers.get_output(agent)[0] #how much the agent should be rewarded/punished reward = lasagne.objectives.squared_error(evaluation,reality) reward = reward.mean() #get the parameters for updating params = lasagne.layers.get_all_params(agent,trainable=True) #update the net with the error teach = lasagne.updates.nesterov_momentum(reward,params,learning_rate=0.01,momentum=0.9) #A function to get the agent's choice of what to try this time decide_fn = theano.function([input_var],evaluation) #function to do all of the stuff above train_fn = theano.function([input_var,reality], reward, updates=teach,on_unused_input='ignore') # time to check how long it takes to run start = time.time() for epoch in range(epochs): memory = np.zeros(shape=(memtime,1,n_f+2)) e_time = time.time() #time for this epoch has_next = 1 #looping variable, state of bbox #initialize tracking variables consequence=error=0 steps=0 trust=0.00+.02*epoch good=0 while has_next: #Updating memory matrix, forgetting a state, making room memory = forget(memory) state = bbox.get_state() #get best action based on 100 step checkpoint method actuals = get_all_score_diffs(state) #upload new state, with no score or action chosen memory[0][0][:-2] = state if rand.random()>trust: action = rand.randint(0,n_a-1) #if trust is too low still, random action else: choices = decide_fn(memory) #Otherwise, let the agent decide. 
action = np.argmax(choices) #pick action agent thinks is best if action == np.argmax(actuals): good = good+1 #do it, and find out the consequences (if the score improved or went down) has_next = bbox.do_action(action) #find consequenquence score = bbox.get_score() consequence=score-consequence #train on choices just made and memory memory[0][0][-2:]=[action,consequence] error += train_fn(memory,actuals) #train based on the score change #updating for next loop steps += 1 #occasionally check in on progress if steps%10000==0: score = bbox.get_score() print ("Epoch: {}".format(epoch)) print ("Steps: {}".format(steps)) print (" current trust: {}".format(trust)) print (" avg error: {}".format(error/steps)) print (" bad choices: {}%".format(100-float(good)/100)) print (" current score: {}".format(score)) if trust<.95: trust = trust+.02 bbox.clear_all_checkpoints() ch=ra=good=0 #report on model quality on previous epoch score = bbox.get_score() with open("epoch_data.txt","a") as f: f.write("Epoch: {} Final Score: {} Average Error: {} Time to Run: {} min\n".format(epoch,score,error/steps,(time.time()-e_time)/60)) #save model parameters np.savez('model_LSTM_cost.npz', *lasagne.layers.get_all_param_values(agent)) #reset box for next epoch if(epoch<epochs-1): bbox.reset_level() print ("Time to run: {} hours".format((time.time()-start)/3600)) bbox.finish(verbose=1)
def run_bbox(verbose=False, epsilon=0.1, gamma=0.99, action_repeat=4, update_frequency=4, batchSize=32, buffer=100000, load_weights=False, save_weights=False):
    """
    Play one level while training a Q-network with experience replay.

    `model` supplies the Q-values used for acting and for the training
    targets; `model_prim` is the copy being fitted, and its weights are
    copied into `model` once the update counter reaches `update_frequency`.

    :param verbose: not used in this function
    :param epsilon: probability of not taking the greedy action
    :param gamma: discount applied to the best next-state Q-value
    :param action_repeat: number of times each chosen action is applied
    :param update_frequency: counter threshold for syncing `model`
    :param batchSize: number of replay entries sampled per training step
    :param buffer: capacity of the replay list
    :param load_weights: load 'my_model_weights.h5' into both models first
    :param save_weights: save `model_prim` weights to that file at the end
    """
    has_next = 1
    # Prepare environment - load the game level
    prepare_bbox()
    update_frequency_cntr = 0
    #stores tuples of (S, A, R, S')
    replay = []
    # index of the replay slot overwritten next once the buffer is full
    h=0
    if load_weights:
        model.load_weights('my_model_weights.h5')
        model_prim.load_weights('my_model_weights.h5')
    while has_next:
        # Get current environment state
        state = copy.copy(bbox.get_state())
        prev_reward = copy.copy(bbox.get_score())
        #Run the Q function on S to get predicted reward values on all the possible actions
        qval = model.predict(state.reshape(1,n_features), batch_size=1)
        # Choose an action to perform at current step
        if random.random() < epsilon: #choose random action or best action
            if random.random() < 0.5:
                action = np.random.randint(0,n_actions) #assumes 4 different actions
            else:
                # Use checkpoints to prime network with good actions
                action_range=50 #random.randint(1,200)
                action = calc_best_action_using_checkpoint(action_range=action_range)
                #for _ in range(action_range):
                # has_next = bbox.do_action(action)
        else: #choose best action from Q(s,a) values
            action = (np.argmax(qval))
        # Perform chosen action, observe new state S'
        # Function do_action(action) returns False if level is finished, otherwise returns True.
        for a in range(action_repeat):
            has_next = bbox.do_action(action)
        new_state = copy.copy(bbox.get_state())
        # reward is the score gained over the repeated action
        reward = copy.copy(bbox.get_score()) - prev_reward
        #reward = 1.0 if reward > 0.0 else -1.0 #this gives better than random when combined with a small network
        #Experience replay storage
        if (len(replay) < buffer): #if buffer not filled, add to it
            replay.append((state, action, reward, new_state))
        else: #if buffer full, overwrite old values
            if (h < (buffer-1)):
                h += 1
            else:
                h = 0
            replay[h] = (state, action, reward, new_state)
            # NOTE(review): the original indentation was lost; training is
            # assumed to happen only once the buffer is full, since sampling
            # batchSize entries from a shorter list would fail -- confirm.
            #randomly sample our experience replay memory
            minibatch = random.sample(replay, batchSize)
            X_train = []
            y_train = []
            for memory in minibatch:
                #Get max_Q(S',a)
                old_state, action, reward, new_state = memory
                old_qval = model.predict(old_state.reshape(1,n_features), batch_size=1)
                newQ = model.predict(new_state.reshape(1,n_features), batch_size=1)
                maxQ = np.max(newQ)
                # the target equals the old Q-values except for the taken action
                y = np.zeros((1,n_actions))
                y[:] = old_qval[:]
                # NOTE(review): has_next describes the current step, not the
                # sampled transition -- confirm this terminal test is intended.
                if has_next == 1: #non-terminal state
                    update = (reward + (gamma * maxQ))
                else: #terminal state
                    update = reward
                y[0][action] = update
                X_train.append(old_state)
                y_train.append(y.reshape(n_actions,))
            X_train = np.array(X_train)
            y_train = np.array(y_train)
            # update the weights of a copy of the network
            model_prim.fit(X_train, y_train, batch_size=batchSize, nb_epoch=1, verbose=0)
            # NOTE(review): the counter handling below is assumed to sit in the
            # same branch as the fit call -- confirm.
            if update_frequency_cntr >= update_frequency:
                prim_weights = model_prim.get_weights()
                print('model update')
                model.set_weights(prim_weights)
                update_frequency_cntr = 0
            update_frequency_cntr += 1
        if bbox.get_time() % 500000 == 0:
            print ("time = %d, score = %f" % (bbox.get_time(), bbox.get_score()))
    # Finish the game simulation, print earned reward and save weights
    if save_weights:
        model_prim.save_weights('my_model_weights.h5', overwrite=True)
    bbox.finish(verbose=1)
def run_bbox(verbose=False):
    """
    Play the training level while fitting one SGDRegressor per action.

    The first `random_steps` steps are fully random; afterwards the action
    with the highest predicted utility is played, except for a 10% chance
    of a random action. After each step the model of the action taken
    `n_past_act` steps ago is updated with the discounted reward sum.
    Scores, actions and states are written as raw arrays into the first
    'run_<i>' directory that does not exist yet.
    """
    bbox.load_level("../levels/train_level.data", verbose=True)
    states, actions, scores, rewards = [], [], [], []
    # one regressor per action
    utility_models = [
        SGDRegressor(learning_rate='constant',
                     #penalty='elasticnet',
                     )
        for _ in range(n_actions)
    ]
    zero_utilities = np.zeros([n_actions])
    n_past_act = 1
    n_past_st = 0 # in addition to current
    discount = 0.9
    random_steps = 10000
    step = 0
    has_next = 1
    while has_next:
        step += 1
        state = bbox.get_state()
        # default used while acting randomly or before the models are fitted
        utilities = zero_utilities
        # Choose action using current utility_models
        if step > random_steps:
            # the model input is the current state, preceded by n_past_st
            # earlier states when that is non-zero
            clf_state = np.concatenate(states[-n_past_st:] + [state]) \
                if n_past_st else state
            try:
                utilities = np.array(
                    [m.predict([clf_state])[0] for m in utility_models])
            except NotFittedError:
                # a model that has not seen data yet: keep the zero utilities
                pass
            #utilities -= utilities.min()
            #p = None if np.isclose(utilities, 0).all() else \
            #    utilities / utilities.sum()
        if np.random.rand() < 0.1 or step <= random_steps:
            action = np.random.choice(n_actions)
        else:
            action = np.argmax(utilities)
        # Do action and bookkeeping
        has_next = bbox.do_action(action)
        states.append(np.array(state))
        actions.append(action)
        score = bbox.get_score()
        # reward is the score change since the previous step
        rewards.append(score if not scores else (score - scores[-1]))
        scores.append(score)
        # Train the regressors
        if len(rewards) >= n_past_act + n_past_st:
            # discounted sum over the last n_past_act rewards
            total_reward = sum(r * np.power(discount, i) for i, r in enumerate(rewards[-n_past_act:]))
            # the n_past_act == 1 case needs an open-ended slice, because
            # -n_past_act + 1 would be 0 and give an empty slice
            if n_past_act == 1:
                clf_state = np.concatenate(states[-(n_past_act + n_past_st):])
            else:
                clf_state = np.concatenate(
                    states[-(n_past_act + n_past_st):-n_past_act + 1])
            utility_models[actions[-n_past_act]].partial_fit([clf_state], [total_reward])
        if verbose and step % 1000 == 0:
            print(step, score)
    # pick the first run_<i> directory that does not exist yet
    i = 1
    get_outdir = 'run_{}'.format
    outdir = get_outdir(i)
    while os.path.exists(outdir):
        i += 1
        outdir = get_outdir(i)
    os.mkdir(outdir)
    print('saving to {}'.format(outdir))
    scores = np.array(scores, dtype=np.float32)
    scores.tofile(os.path.join(outdir, 'scores'))
    actions = np.array(actions, dtype=np.int8)
    actions.tofile(os.path.join(outdir, 'actions'))
    states = np.array(states, dtype=np.float32)
    states.tofile(os.path.join(outdir, 'states'))
    bbox.finish(verbose=True)