def run_bbox(verbose=False):
    if bbox.is_level_loaded():
        bbox.reset_level()
    else:
        bbox.load_level("../levels/train_level.data", verbose=1)
    # Query the level metadata once the level is available
    n_features = bbox.get_num_of_features()
    n_actions = bbox.get_num_of_actions()
    max_time = bbox.get_max_time()

    av_table = ActionValueTable(n_features, n_actions)
    av_table.initialize(0.2)
    print(av_table._params)

    learner = Q(0.5, 0.1)
    learner._setExplorer(EpsilonGreedyExplorer(0.4))
    agent = LearningAgent(av_table, learner)

    environment = GameEnvironment()
    task = GameTask(environment)
    experiment = Experiment(task, agent)

    # finish_flag stays truthy while the level still has steps left
    while environment.finish_flag:
        experiment.doInteractions(1)
        agent.learn()

    bbox.finish(verbose=1)
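# GameEnvironment and GameTask are custom glue, not part of PyBrain, and this
# fragment does not show them. Below is a minimal hypothetical sketch of what
# run_bbox() assumes: finish_flag stays truthy while bbox still accepts
# actions, and the task forwards state and score.
from pybrain.rl.environments import Environment, Task


class GameEnvironment(Environment):
    def __init__(self):
        self.finish_flag = True  # cleared once bbox.do_action() returns 0

    def getSensors(self):
        return bbox.get_state()

    def performAction(self, action):
        self.finish_flag = bbox.do_action(int(action[0]))


class GameTask(Task):
    def getReward(self):
        # Assumption: reward is the raw bbox score; a per-step score delta
        # would also be a reasonable choice here.
        return bbox.get_score()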
def run_bbox():
    f_35_penalty = 0.15
    k = 0
    w0 = 0.13

    bbox.load_level("levels/test_level.data", verbose=0)

    has_next = True
    last_score = 0
    act = -1
    act_len = 0
    crit_len = 150
    predict = np.zeros(2)
    cum_sum = np.zeros(4)

    while has_next:
        last_act = act
        state = bbox.get_state()
        # Two linear models score the directional actions
        predict[:2] = np.dot(lr_coefs_1, state[:-1]) + lr_free_coefs_1

        if state[35] > 0:
            cum_sum[1] = predict[0] + k
            cum_sum[2] = -predict[0] + k
        elif state[35] < 0:
            cum_sum[1] = -predict[1] + k
            cum_sum[2] = predict[1] + k
        elif state[35] == 0:
            cum_sum[1] = predict[0] + k
            cum_sum[2] = predict[1] + k
        cum_sum[0] = (cum_sum[1] + cum_sum[2]) / 2 + k

        # Bias actions 1 and 2 proportionally to feature 35
        cum_sum[1] -= f_35_penalty * state[35]
        cum_sum[2] += f_35_penalty * state[35]

        # Discourage repeating the same action for too long
        if act_len > crit_len:
            cum_sum[last_act] -= 0.0078125

        # Blend the second linear model with the hand-tuned scores
        act = (w0 * (np.dot(lr_coefs_0, state) + lr_free_coefs_0) / 6.366
               + (1 - w0) * cum_sum).argmax()
        has_next = bbox.do_action(act)

        if last_act == act:
            act_len += 1
        else:
            act_len = 0

    bbox.finish(verbose=1)
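# The linear-model weights (lr_coefs_0, lr_free_coefs_0, lr_coefs_1,
# lr_free_coefs_1) are module-level globals that this fragment does not
# define. A minimal sketch of how they might be restored before calling
# run_bbox(); the file names and shapes are placeholders, not from the
# original code.
import numpy as np
import interface as bbox

lr_coefs_0 = np.load('lr_coefs_0.npy')            # placeholder: (n_actions, n_features)
lr_free_coefs_0 = np.load('lr_free_coefs_0.npy')  # placeholder: (n_actions,)
lr_coefs_1 = np.load('lr_coefs_1.npy')            # placeholder: (2, n_features - 1)
lr_free_coefs_1 = np.load('lr_free_coefs_1.npy')  # placeholder: (2,)

if __name__ == '__main__':
    run_bbox()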
def reset(self):
    # n = np.random.randint(0, self.grid_size - 1, size=1)
    # m = np.random.randint(1, self.grid_size - 2, size=1)
    if bbox.is_level_loaded():
        bbox.reset_level()
    else:
        bbox.load_level("../../../levels/train_level.data", verbose=1)
    self.state = bbox.get_state()  # np.asarray([0, n, m])[np.newaxis]
def prepare_bbox():
    global n_features, n_actions

    if bbox.is_level_loaded():
        bbox.reset_level()
    else:
        bbox.load_level("../levels/test_level.data", verbose=1)
        n_features = bbox.get_num_of_features()
        n_actions = bbox.get_num_of_actions()
def prepare_bbox():
    global n_f, n_a, max_time

    if bbox.is_level_loaded():
        bbox.reset_level()
    else:
        bbox.load_level("../levels/train_level.data", verbose=1)
        n_f = bbox.get_num_of_features()
        n_a = bbox.get_num_of_actions()
        max_time = bbox.get_max_time()
def prepare_bbox(train_level=True):
    """
    Load the training level by default; use the test level as a means of
    validating the bot's generalization performance.

    :param train_level: boolean, load the training level if True
    :return: None
    """
    if train_level:
        bbox.load_level("../levels/train_level.data", verbose=1)
    else:
        bbox.load_level("../levels/test_level.data", verbose=1)
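# For a validation run the same helper is called with the flag flipped, e.g.:
prepare_bbox(train_level=False)  # loads ../levels/test_level.data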
def prepare_bbox():
    global n_features, n_actions, max_time

    # Reset the environment to the initial state, just in case
    if bbox.is_level_loaded():
        bbox.reset_level()
    else:
        # Load the game level
        bbox.load_level('levels/train_level.data', verbose=1)
        n_features = bbox.get_num_of_features()
        n_actions = bbox.get_num_of_actions()
        max_time = bbox.get_max_time()
def prepare_bbox():
    global n_features, n_actions, max_time

    # Reset environment to the initial state, just in case
    if bbox.is_level_loaded():
        bbox.reset_level()
    else:
        # Load the game level
        bbox.load_level("../levels/train_level.data", verbose=1)
        n_features = bbox.get_num_of_features()
        n_actions = bbox.get_num_of_actions()
        max_time = bbox.get_max_time()
def reset(self):
    if bbox.is_level_loaded():
        bbox.reset_level()
    else:
        bbox.load_level(self.level, verbose=1)
    self.n_features = bbox.get_num_of_features()
    self.n_actions = bbox.get_num_of_actions()
    self.max_time = bbox.get_max_time()
    self._steps = 0
    self._state = np.zeros((1, self.n_features))
    self._is_over = False
    self._prev_score = -float('inf')
    self._actions_log = []
def prepare_bbox():
    global n_features, n_actions, max_time, vectors, pool, num_of_vectors

    if bbox.is_level_loaded():
        bbox.reset_level()
    else:
        bbox.load_level("../levels/train_level.data", verbose=1)
        n_features = bbox.get_num_of_features()
        n_actions = bbox.get_num_of_actions()
        max_time = bbox.get_max_time()

    vectors = np.zeros((num_of_vectors, n_features), np.float32)
    print("preparing")
    # 'num_of_vectors' and 'processes' are module-level settings defined elsewhere
    pool = multiprocessing.Pool(processes=processes)
def run_bbox(verbose=False):
    bbox.load_level("../levels/train_level.data", verbose=True)
    states, actions, scores, rewards = [], [], [], []

    with open('utility_models.pkl', 'rb') as f:
        utility_models = pickle.load(f)

    step = 0
    has_next = 1
    while has_next:
        step += 1
        state = bbox.get_state()
        # Greedily pick the action whose utility model predicts the highest reward
        utilities = [m.predict([state]) for m in utility_models]
        action = np.argmax(utilities)

        # Do action and bookkeeping
        has_next = bbox.do_action(action)
        states.append(np.array(state))
        actions.append(action)
        score = bbox.get_score()
        rewards.append(score if not scores else (score - scores[-1]))
        scores.append(score)

        if verbose and step % 10000 == 0:
            print(step, score)

    # Find the first unused run_<i> directory and dump the trajectory there
    i = 1
    get_outdir = 'run_{}'.format
    outdir = get_outdir(i)
    while os.path.exists(outdir):
        i += 1
        outdir = get_outdir(i)
    os.mkdir(outdir)
    print('saving to {}'.format(outdir))
    scores = np.array(scores, dtype=np.float32)
    scores.tofile(os.path.join(outdir, 'scores'))
    actions = np.array(actions, dtype=np.int8)
    actions.tofile(os.path.join(outdir, 'actions'))
    states = np.array(states, dtype=np.float32)
    states.tofile(os.path.join(outdir, 'states'))

    bbox.finish(verbose=True)
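# run_bbox() above expects a pre-trained 'utility_models.pkl', whose creation
# this fragment does not show. A plausible sketch, assuming the models are the
# per-action SGDRegressor utility models trained elsewhere in this collection,
# is simply to pickle the list after training:
import pickle

with open('utility_models.pkl', 'wb') as f:
    pickle.dump(utility_models, f, protocol=-1)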
def prepare_bbox():
    global n_features, n_actions, max_time
    ## TODO: Save the interactions with the environment as an output data frame
    global interaction_list
    interaction_list = []

    ## Reset the environment to initial state, just in case
    if bbox.is_level_loaded():
        bbox.reset_level()
    else:
        ## Load the game level
        bbox.load_level("../levels/train_level.data", verbose=True)
        n_features = bbox.get_num_of_features()
        n_actions = bbox.get_num_of_actions()
        max_time = bbox.get_max_time()

    ## Header row of the output data frame: one column per state feature,
    ## plus the reward and the chosen action
    state_list = ['state_' + str(i) for i in range(n_features)]
    header_list = state_list + ['reward', 'action']
    interaction_list.append(header_list)
def prepare_bbox():
    '''
    Prepares the environment (learning/test data).
    '''
    global n_features, n_actions, max_time
    global q_function, epsilon, gamma, alpha, valid_actions, init_value

    if bbox.is_level_loaded():
        ## Reset the environment to initial state
        bbox.reset_level()
    else:
        ## Load the training/test data
        bbox.load_level('../levels/train_level.data', verbose=True)
        n_features = bbox.get_num_of_features()
        n_actions = bbox.get_num_of_actions()
        max_time = bbox.get_max_time()
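# The globals q_function, epsilon, gamma, alpha, valid_actions and init_value
# suggest a tabular Q-learner, but the update itself is not part of this
# fragment. A hypothetical sketch of the standard update these hyperparameters
# imply, assuming q_function is a dict keyed by (state_key, action):
def q_update(state_key, action, reward, next_state_key):
    best_next = max(q_function.get((next_state_key, a), init_value)
                    for a in valid_actions)
    old = q_function.get((state_key, action), init_value)
    q_function[(state_key, action)] = old + alpha * (reward + gamma * best_next - old)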
def load_level(self):
    bbox.load_level(self.level, verbose=0)
#!/usr/bin/env python3
"""
A minimal bot player. Loads the level and params and lets the bot act.
"""
from interface import (get_max_time, get_num_of_actions, get_num_of_features,
                       finish, load_level)
from numpy import get_include, load
from pyximport import install

# Compile the Cython bot on import, with the numpy headers on the include path
install(setup_args={'include_dirs': get_include()}, reload_support=True)
from bot_wrapper import do_act

if __name__ == '__main__':
    load_level('../levels/train_level.data', verbose=1)
    level = {
        'steps': get_max_time(),
        'actions': get_num_of_actions(),
        'features': get_num_of_features(),
    }
    params = dict(load('params.npz'))
    do_act(level, params)
    finish(verbose=1)
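# The 'params.npz' consumed above can be produced with numpy's savez. The key
# names are whatever bot_wrapper.do_act expects, which this script does not
# show, so the names and shapes below are placeholders.
import numpy as np

np.savez('params.npz',
         weights=np.zeros((36, 4), dtype=np.float32),  # placeholder key/shape
         bias=np.zeros(4, dtype=np.float32))           # placeholder key/shape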
def main():
    epsilon = .1  # exploration
    num_actions = 4
    input_size = 36
    hidden_size = 24
    activation = 'relu'
    max_memory = 2000
    batch_size = 50
    mini_epoch = 5
    epoch = 10

    model = Sequential()
    model.add(Dense(hidden_size, input_shape=(input_size,), activation=activation))
    model.add(Dense(hidden_size, activation=activation))
    model.add(Dense(num_actions))
    model.compile('adam', 'mse')
    # model.load_weights('model.h5')

    # Define environment/game
    bbox.load_level('../levels/train_level.data', verbose=True)

    # Initialize experience replay object
    exp_replay = ExperienceReplay(max_memory=max_memory)

    # FIXME
    #states = np.fromfile('run_random/states', dtype=np.float32)\
    #    .reshape([1214494, 36])
    #scaler = preprocessing.StandardScaler()
    #scaler.fit(states)
    #with open('scaler.pkl', 'wb') as f:
    #    pickle.dump(scaler, f, protocol=-1)
    with open('scaler.pkl', 'rb') as f:
        scaler = pickle.load(f)

    # Train
    for e in range(epoch):
        loss = 0.
        bbox.reset_level()
        game_over = False

        # get initial input, standardized with the pre-fit scaler
        def get_state():
            return scaler.transform(np.array([bbox.get_state()]))[0]

        input_t = get_state()
        score = 0
        step = 0
        report_steps = 100

        while not game_over:
            step += 1
            input_tm1 = input_t

            # get next action (epsilon-greedy)
            if np.random.rand() <= epsilon:
                action = np.random.randint(num_actions)
            else:
                q = model.predict(np.array([input_tm1]))[0]
                action = np.argmax(q)

            # apply action, get rewards and new state
            game_over = not bbox.do_action(action)
            input_t = get_state()
            new_score = bbox.get_score()
            reward = new_score - score
            score = new_score

            # store experience
            exp_replay.remember([input_tm1, action, reward, input_t], game_over)

            # adapt model
            for _ in range(mini_epoch):
                inputs, targets = exp_replay.get_batch(model, batch_size=batch_size)
                loss += model.train_on_batch(inputs, targets)[0]

            if step % report_steps == 0:
                print('Step {:07d} | Loss {:.4f} | Score {}'.format(
                    step, loss / (report_steps * mini_epoch), score))
                loss = 0.

        print('Epoch {:03d}/{} | Score {}'.format(e, epoch - 1, score))

    # Save trained model weights
    model.save_weights('q_model.h5', overwrite=True)
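# ExperienceReplay is imported from elsewhere and not shown in this fragment.
# A minimal sketch of the standard replay-memory pattern main() relies on:
# remember() stores (s, a, r, s') transitions, get_batch() samples them and
# builds Q-learning targets with the current model. The discount value is an
# assumption.
import numpy as np


class ExperienceReplay(object):
    def __init__(self, max_memory=100, discount=0.9):
        self.max_memory = max_memory
        self.discount = discount
        self.memory = []

    def remember(self, experience, game_over):
        # experience = [state_tm1, action, reward, state_t]
        self.memory.append([experience, game_over])
        if len(self.memory) > self.max_memory:
            del self.memory[0]

    def get_batch(self, model, batch_size=10):
        len_memory = len(self.memory)
        num_actions = model.output_shape[-1]
        state_dim = self.memory[0][0][0].shape[0]
        n = min(len_memory, batch_size)
        inputs = np.zeros((n, state_dim))
        targets = np.zeros((n, num_actions))
        for i, idx in enumerate(np.random.randint(0, len_memory, size=n)):
            state_tm1, action, reward, state_t = self.memory[idx][0]
            game_over = self.memory[idx][1]
            inputs[i] = state_tm1
            # Keep the model's own predictions for the actions not taken, so
            # only the taken action's target moves
            targets[i] = model.predict(state_tm1[np.newaxis])[0]
            q_next = np.max(model.predict(state_t[np.newaxis])[0])
            if game_over:
                targets[i, action] = reward
            else:
                targets[i, action] = reward + self.discount * q_next
        return inputs, targets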
def run_bbox(verbose=False):
    bbox.load_level("../levels/train_level.data", verbose=True)
    states, actions, scores, rewards = [], [], [], []

    # One online regressor per action estimates the discounted reward of
    # taking that action in the current (possibly history-extended) state
    utility_models = [
        SGDRegressor(learning_rate='constant',
                     #penalty='elasticnet',
                     )
        for _ in range(n_actions)
    ]
    zero_utilities = np.zeros([n_actions])
    n_past_act = 1
    n_past_st = 0  # in addition to current
    discount = 0.9
    random_steps = 10000

    step = 0
    has_next = 1
    while has_next:
        step += 1
        state = bbox.get_state()
        utilities = zero_utilities

        # Choose action using current utility_models
        if step > random_steps:
            clf_state = np.concatenate(states[-n_past_st:] + [state]) \
                if n_past_st else state
            try:
                utilities = np.array(
                    [m.predict([clf_state])[0] for m in utility_models])
            except NotFittedError:
                pass
        #utilities -= utilities.min()
        #p = None if np.isclose(utilities, 0).all() else \
        #    utilities / utilities.sum()
        if np.random.rand() < 0.1 or step <= random_steps:
            action = np.random.choice(n_actions)
        else:
            action = np.argmax(utilities)

        # Do action and bookkeeping
        has_next = bbox.do_action(action)
        states.append(np.array(state))
        actions.append(action)
        score = bbox.get_score()
        rewards.append(score if not scores else (score - scores[-1]))
        scores.append(score)

        # Train the model for the action taken n_past_act steps ago on the
        # discounted reward observed since
        if len(rewards) >= n_past_act + n_past_st:
            total_reward = sum(r * np.power(discount, i)
                               for i, r in enumerate(rewards[-n_past_act:]))
            if n_past_act == 1:
                clf_state = np.concatenate(states[-(n_past_act + n_past_st):])
            else:
                clf_state = np.concatenate(
                    states[-(n_past_act + n_past_st):-n_past_act + 1])
            utility_models[actions[-n_past_act]].partial_fit([clf_state],
                                                             [total_reward])

        if verbose and step % 1000 == 0:
            print(step, score)

    # Find the first unused run_<i> directory and dump the trajectory there
    i = 1
    get_outdir = 'run_{}'.format
    outdir = get_outdir(i)
    while os.path.exists(outdir):
        i += 1
        outdir = get_outdir(i)
    os.mkdir(outdir)
    print('saving to {}'.format(outdir))
    scores = np.array(scores, dtype=np.float32)
    scores.tofile(os.path.join(outdir, 'scores'))
    actions = np.array(actions, dtype=np.int8)
    actions.tofile(os.path.join(outdir, 'actions'))
    states = np.array(states, dtype=np.float32)
    states.tofile(os.path.join(outdir, 'states'))

    bbox.finish(verbose=True)
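# The arrays dumped above are raw binary (numpy .tofile), so the dtypes must
# be repeated when reading them back. The file names and dtypes are taken from
# run_bbox(); the feature count of 36 matches the input_size used by the Keras
# bot above.
import os
import numpy as np


def load_run(outdir, n_features=36):
    scores = np.fromfile(os.path.join(outdir, 'scores'), dtype=np.float32)
    actions = np.fromfile(os.path.join(outdir, 'actions'), dtype=np.int8)
    states = np.fromfile(os.path.join(outdir, 'states'), dtype=np.float32)
    return states.reshape([-1, n_features]), actions, scores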