Code example #1
def run_bbox(verbose=False):
    '''
    Runs the Blackbox challenge.
    '''
    
    has_next = True
    prepare_bbox()
    
    while has_next:
        ## Observe the current state variables
        state = bbox.get_state()
        state_tuple = get_state_tuple(state)
        ## Select the current action
        action = get_action(state_tuple, verbose=verbose, is_current=True)
        ## Get the current reward
        reward = bbox.get_score()
        print 'Reward = ' + str(reward)
        
        ## Retrieve the current Q-value
        current_q = q_function[state_tuple][action]
        print 'Current Q = ' + str(current_q)
        
        ## Observe the next state (assuming there always is)
        has_next = bbox.do_action(action)
        next_state = bbox.get_state()
        next_state_tuple = get_state_tuple(next_state)
        ## Get the best q_action in the new state
        next_action = get_action(next_state_tuple, verbose=verbose, is_current=False)    
        ## Get the new Q_value
        next_q = q_function[next_state_tuple][next_action]
        ## Update the Q-function
        q_function[state_tuple][action] = (1 - alpha) * current_q + alpha * (reward + gamma * next_q)
        print 'Updated Q = ' + str(q_function[state_tuple][action])
    
    bbox.finish(verbose=True)
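
This snippet leans on helpers that are not shown: `get_state_tuple`, `get_action`, the table `q_function`, and the constants `alpha` and `gamma`. A minimal sketch of what they could look like, assuming a coarse discretization of the continuous features and an epsilon-greedy policy (all names, bin counts, and hyperparameters below are hypothetical, not the project's actual code):

import numpy as np
from collections import defaultdict

n_actions = 4                      # assumption: four discrete actions
alpha, gamma, epsilon = 0.1, 0.99, 0.05
# Q-table: one row of action values per discretized state tuple
q_function = defaultdict(lambda: np.zeros(n_actions))

def get_state_tuple(state, n_bins=10):
    # Bucket each continuous feature so the state becomes hashable
    return tuple(np.digitize(state, np.linspace(-1.0, 1.0, n_bins)))

def get_action(state_tuple, verbose=False, is_current=True):
    # Explore only on the "current" step; the lookahead step stays greedy
    if is_current and np.random.rand() < epsilon:
        return np.random.randint(n_actions)
    return int(np.argmax(q_function[state_tuple]))
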
Code example #2
 def get_state(self):
     #im_size = (self.grid_size,) * 2
     #state = self.state[0]
     #canvas = np.zeros(im_size)
     #canvas[state[0], state[1]] = 1
     #canvas[-1, state[2]-1:state[2] + 2] = 1
     return bbox.get_state()  #canvas
Code example #3
def eval_game(game: Game, dqn: DQN, action, q_vals, queue, root_index, root=True):
    """
    Called by look_ahead function. Used to evaluate a state, update Q value,
    enumerate and enqueue possible child actions.
    By default, this treats the root actions first.
    Args:
        game, A Game object to be evaluated
        dqn, A deep Q learning network object to evaluate state
        action, A tuple representing an action and its optional target
        q_vals, A shared mem array for the global Q values
        queue, A Queue to store child actions
        root_index, The index of the root action in q_vals
        root(=True), Whether or not these are the root actions
    Returns:
        self
    """

    #   (local) copy game object, perform action, get state feature vector, evaluate
    perform_action(action, game.current_player, game)
    state = get_state(game)
    
    #Pass to Tensorflow here to evaluate
    s_val = dqn.get_q_value(state, "dqn")

    print("Action:", action)
    print("Q value: %f", s_val)

    """
Code example #4
def run_bbox():
	f_35_penalty = 0.15; k = 0; w0 = 0.13
	bbox.load_level("levels/test_level.data", verbose=0)
	has_next = True; last_score = 0
	act = -1; act_len = 0; crit_len = 150
	predict = np.zeros(2); cum_sum = np.zeros(4)
	while has_next:
		last_act = act
		state = bbox.get_state()
		predict[:2] = np.dot(lr_coefs_1,state[:-1]) + lr_free_coefs_1

		if state[35] > 0:
			cum_sum[1] = predict[0] + k
			cum_sum[2] = -predict[0] + k
		elif state[35] < 0:
			cum_sum[1] = -predict[1] + k
			cum_sum[2] = predict[1] + k
		elif state[35] == 0:
			cum_sum[1] = predict[0] + k
			cum_sum[2] = predict[1] + k

		cum_sum[0] = (cum_sum[1]+cum_sum[2])/2 + k
		cum_sum[1]-=f_35_penalty*state[35]
		cum_sum[2]+=f_35_penalty*state[35]
		if act_len > crit_len: cum_sum[last_act]-=0.0078125
		act = (w0*(np.dot(lr_coefs_0,state) + lr_free_coefs_0)/6.366 + (1-w0)*cum_sum).argmax()

		has_next = bbox.do_action(act)
		if last_act==act: act_len+=1
		else: act_len = 0

	bbox.finish(verbose=1)
Code example #5
File: bot_learn.py Project: kotulc/blackbox
def run_bbox(rnet_model, train_data, train_level=True, verbose=True):
    """
  Run a single session of the black box training or test environments
  :param rnet_model: model with a get_action(state) method
  :param train_data: a DataSet object used to buffer each state
  :param train_level: boolean, run the training level if True
  :param verbose: boolean, display additional information if True
  :return: float, the final session score
  """
    has_next = 1
    prepare_bbox(train_level)
    train_data.clear_buffer()

    while has_next:
        step_count = bbox.get_time()
        train_data.update_buffer(bbox.get_state())
        state = train_data.get_buffer()
        action = rnet_model.get_action(state)
        has_next = bbox.do_action(action)

        if step_count % 5000 == 0 and verbose:
            print("time = %d, score = %f" % (step_count, bbox.get_score()))

    final_score = bbox.finish(verbose=1)
    return final_score
Code example #6
File: bot_learn.py Project: kotulc/blackbox
def run_bbox(rnet_model, train_data,
             train_level=True, verbose=True):
  """
  Run a single session of the black box training or test environments
  :param rnet_model: model with a get_action(state) method
  :param train_data: a DataSet object used to buffer each state
  :param train_level: boolean, run the training level if True
  :param verbose: boolean, display additional information if True
  :return: float, the final session score
  """
  has_next = 1
  prepare_bbox(train_level)
  train_data.clear_buffer()

  while has_next:
    step_count = bbox.get_time()
    train_data.update_buffer(bbox.get_state())
    state = train_data.get_buffer()
    action = rnet_model.get_action(state)
    has_next = bbox.do_action(action)

    if step_count % 5000 == 0 and verbose:
      print ("time = %d, score = %f" % (step_count, bbox.get_score()))

  final_score = bbox.finish(verbose=1)
  return final_score
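
Examples #5, #6, #14 and #15 all assume a `DataSet` object with `clear_buffer`, `update_buffer`, `get_buffer`, and `backup_buffer`/`restore_buffer` methods. A minimal sketch of such a rolling state buffer, assuming it simply stacks the last few raw bbox states into one feature vector (the class internals and window size are guesses, not the project's implementation):

import numpy as np
from collections import deque

class DataSet(object):
    """Keep the most recent `window` states so the model sees short history."""
    def __init__(self, n_features=36, window=4):
        self.n_features = n_features
        self.window = window
        self.clear_buffer()

    def clear_buffer(self):
        self.buffer = deque([np.zeros(self.n_features)] * self.window,
                            maxlen=self.window)

    def update_buffer(self, state):
        self.buffer.append(np.asarray(state, dtype=np.float32))

    def get_buffer(self):
        # Newest state first, flattened into a single feature vector
        return np.concatenate(list(self.buffer)[::-1])

    def backup_buffer(self):
        self._backup = list(self.buffer)

    def restore_buffer(self):
        self.buffer = deque(self._backup, maxlen=self.window)
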
Code example #7
File: bboxgame.py Project: carlyboy76/my_bot
	def get_state(self):
		#im_size = (self.grid_size,) * 2
		#state = self.state[0]
		#canvas = np.zeros(im_size)
		#canvas[state[0], state[1]] = 1
		#canvas[-1, state[2]-1:state[2] + 2] = 1
		return bbox.get_state() #canvas
Code example #8
def run_bbox(verbose=False):
    has_next = 1
    prepare_bbox()
    #vector of the current state features
    input_var= T.dvector('in_state')
    input_var= T.reshape(input_var,(1,n_features))
    #Load net into the agent object
    agent=prepare_agent(input_var)
    attempt = lasagne.layers.get_output(agent)
    #function to do all of the stuff above
    eval_fn = theano.function([input_var], attempt,on_unused_input='ignore')
    #time to check how long it takes to run
    start = time.time()
    error=0
    steps=0
    while has_next:
        state = bbox.get_state()
        r_state= np.reshape(state,(1,n_features))
        attempt = eval_fn(r_state)
        action = np.argmax(attempt)
        steps+=1
        if steps%10000==0:
            score = bbox.get_score()
            print ("Steps: {}".format(steps))
            print ("   training loss: {}".format(error/steps))
            print ("   current score: {}".format(score))
        has_next = bbox.do_action(action)
 
    print ("Time to run: {} seconds".format(time.time()-start))
    print ("{} steps total".format(steps))
    np.savez('model.npz', *lasagne.layers.get_all_param_values(agent))
    bbox.finish(verbose=1)
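
`prepare_agent` is not shown in any of the Lasagne-based snippets (#8, #26, #27, #29, #33). A plausible sketch, assuming it builds a small dense network over `input_var` (with the (1, n_features) shape used in example #8) and can restore weights saved with `np.savez` as that example does on exit; the layer sizes and the `weights_file` parameter are made up for illustration:

import numpy as np
import lasagne
from lasagne.layers import InputLayer, DenseLayer

n_features, n_actions = 36, 4

def prepare_agent(input_var, weights_file=None):
    # Input -> hidden ReLU layer -> linear output, one unit per action
    network = InputLayer(shape=(1, n_features), input_var=input_var)
    network = DenseLayer(network, num_units=64,
                         nonlinearity=lasagne.nonlinearities.rectify)
    network = DenseLayer(network, num_units=n_actions,
                         nonlinearity=lasagne.nonlinearities.linear)
    if weights_file is not None:
        # Restore parameters saved via np.savez('model.npz', *param_values)
        with np.load(weights_file) as f:
            params = [f['arr_%d' % i] for i in range(len(f.files))]
        lasagne.layers.set_all_param_values(network, params)
    return network
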
Code example #9
File: bot.py Project: zbxzc35/reinforcement-learning
def run_bbox(verbose=False):
    '''
    Runs the Blackbox challenge.
    '''
    
    has_next = True
    
    ## Prepare the environment -- load the game level
    prepare_bbox()
    
    while has_next:
        ## Get the current environment state vector
        state = bbox.get_state()
        ## Choose an action to perform at the current state
        action = get_action_by_state(state, verbose=verbose)
        ## Function do_action(action) returns False if the level
        ## is finished; otherwise, it returns True
        has_next = bbox.do_action(action)
    
    ## Save the interactions as an output CSV file
    headers = interaction_list.pop(0)
    interaction_df = pd.DataFrame(interaction_list, columns=headers)
    datetime_int = int(calendar.timegm(time.gmtime()))
    out_filename = '../output/interaction_' + str(datetime_int) + '.csv'
    interaction_df.to_csv(out_filename, index=False)
    print 'Saved to file: ' + out_filename
    
    ## When submitting solution, make sure to call finish(), which returns the sum of points obtained
    ## during the entire simulation. This number is used as the public leader board score
    bbox.finish(verbose=True)
Code example #10
 def reset(self):
     #n = np.random.randint(0, self.grid_size-1, size=1)
     #m = np.random.randint(1, self.grid_size-2, size=1)
     if bbox.is_level_loaded():
         bbox.reset_level()
     else:
         bbox.load_level("../../../levels/train_level.data", verbose=1)
     self.state = bbox.get_state()  #np.asarray([0, n, m])[np.newaxis]
Code example #11
File: bboxgame.py Project: carlyboy76/my_bot
	def reset(self):
		#n = np.random.randint(0, self.grid_size-1, size=1)
		#m = np.random.randint(1, self.grid_size-2, size=1)
		if bbox.is_level_loaded():
			bbox.reset_level()
		else:
			bbox.load_level("../../../levels/train_level.data", verbose=1)
		self.state = bbox.get_state() #np.asarray([0, n, m])[np.newaxis]
Code example #12
File: bot06.py Project: gw0/blackbox-q-learning
 def update(self, action):
     self._actions_log.append(action[0])
     self._steps += 1
     self._prev_score = bbox.get_score()
     self._is_over = not bbox.do_action(action[0])
     self._state = bbox.get_state().reshape(self._state_shape)
     #print "\nupdate", self._prev_score, action, bbox.get_score(), self._is_over
     return self.state, self.reward(), self.is_over
Code example #13
File: bot2.py Project: gw0/blackbox-q-learning
 def act(self, action):
     self._actions_log.append(action)
     self._steps += 1
     self._prev_score = bbox.get_score()
     self._is_over = not bbox.do_action(action)
     self._state = bbox.get_state().reshape((1, self.n_features))
     #print "\nupdate", self._prev_score, action, bbox.get_score(), self._is_over
     return self.state, self.reward(), self.is_over
Code example #14
File: bot_learn.py Project: kotulc/blackbox
def action_lookup(model, train_data, step_inc):
    """
  At any given point, use action_lookup to determine the ideal action
  from the current state. Use the behavior of the model following each
  possible action to determine that which brings the greatest reward.
  :param model: object with a get_action method for action inference
  :param train_data: DataSet object used for bbox state buffering
  :param step_inc: int, the number of state steps to increment for each
    possible action of action_n total actions
  :return: (int, float), the tuple representing the highest scoring
    action
  """

    # Create a checkpoint to revert to after each action lookup
    start_checkpoint = bbox.create_checkpoint()
    # Similarly, create a backup of the DataSet object state buffer
    train_data.backup_buffer()
    best_score = -1e9
    best_action = -1

    # Perform the forward lookup for all valid actions
    for action_idx in xrange(action_n):
        start_score = bbox.get_score()
        bbox.do_action(action_idx)
        train_data.update_buffer(bbox.get_state())

        # After the initial action selection, use the model inference to
        # continue step_inc states into the future
        for _ in xrange(step_inc):
            action = model.get_action(train_data.get_buffer())
            bbox.do_action(action)
            train_data.update_buffer(bbox.get_state())

        # Check the score delta step_inc steps after the initial action
        end_score = bbox.get_score()
        score_delta = end_score - start_score
        if score_delta > best_score:
            best_score = score_delta
            best_action = action_idx
        bbox.load_from_checkpoint(start_checkpoint)
        train_data.restore_buffer()

    return best_action, best_score
Code example #15
File: bot_learn.py Project: kotulc/blackbox
def action_lookup(model, train_data, step_inc):
  """
  At any given point, use action_lookup to determine the ideal action
  from the current state. Use the behavior of the model following each
  possible action to determine that which brings the greatest reward.
  :param model: object with a get_action method for action inference
  :param train_data: DataSet object used for bbox state buffering
  :param step_inc: int, the number of state steps to increment for each
    possible action of action_n total actions
  :return: (int, float), the tuple representing the highest scoring
    action
  """

  # Create a checkpoint to revert to after each action lookup
  start_checkpoint = bbox.create_checkpoint()
  # Similarly, create a backup of the DataSet object state buffer
  train_data.backup_buffer()
  best_score = -1e9
  best_action = -1

  # Perform the forward lookup for all valid actions
  for action_idx in xrange(action_n):
    start_score = bbox.get_score()
    bbox.do_action(action_idx)
    train_data.update_buffer(bbox.get_state())

    # After the initial action selection, use the model inference to
    # continue step_inc states into the future
    for _ in xrange(step_inc):
      action = model.get_action(train_data.get_buffer())
      bbox.do_action(action)
      train_data.update_buffer(bbox.get_state())

    # Check the score delta step_inc steps after the initial action
    end_score = bbox.get_score()
    score_delta = end_score - start_score
    if score_delta > best_score:
      best_score = score_delta
      best_action = action_idx
    bbox.load_from_checkpoint(start_checkpoint)
    train_data.restore_buffer()

  return best_action, best_score
Code example #16
def run_bbox(verbose=False):
    has_next = 1
    
    prepare_bbox()

    while has_next:
        state = bbox.get_state()
        action = get_action_by_state(state)
        has_next = bbox.do_action(action)
 
    bbox.finish(verbose=1)
Code example #17
File: bot.py Project: zbxzc35/reinforcement-learning
def run_bbox():
	has_next = 1	
	prepare_bbox()
	load_regression_coefs("reg_coefs.txt")
	
	while has_next:
		state = bbox.get_state()
		action = get_action_by_state(state)
		has_next = bbox.do_action(action)
	
	bbox.finish(verbose=1)
Code example #18
def run_bbox(verbose=False):
    has_next = 1
    
    prepare_bbox()
 
    while has_next:
        state = bbox.get_state()
        action = get_action_by_state(state)
        has_next = bbox.do_action(action)
 
    bbox.finish(verbose=1)
Code example #19
def run_bbox():
    has_next = 1

    prepare_bbox()
    load_regression_coefs("reg_coefs.txt")

    while has_next:
        state = bbox.get_state()
        action = get_action_by_state(state)
        has_next = bbox.do_action(action)

    bbox.finish(verbose=1)
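
Examples #17 and #19 call `load_regression_coefs` and `get_action_by_state` without showing them. A sketch consistent with example #22, where the chosen action is the argmax of a linear model over the state plus a free term; the file layout and matrix shape here are assumptions:

import numpy as np

n_features, n_actions = 36, 4
reg_coefs = None

def load_regression_coefs(filename):
    # Assumed layout: n_actions rows of n_features + 1 coefficients,
    # with the last column acting as the bias (free) term
    global reg_coefs
    reg_coefs = np.loadtxt(filename).reshape(n_actions, n_features + 1)

def get_action_by_state(state):
    state_ext = np.append(state, 1.0)   # append constant for the bias column
    return int(np.argmax(np.dot(reg_coefs, state_ext)))
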
Code example #20
File: bot.py Project: gncgroup/BlackBox-Solver
def run_bbox():
	global ensamble
	has_next = 1 
	prepare_bbox()  
	ensamble=Ensemble.NN_Ensemble(n_features,4,[[36,64,4],[16,4],[16,4],[36,64,4]],n_actions)  
	ensamble.read_weights("weights")
	
	while has_next: 
		state = bbox.get_state() 
		action = get_action_by_state(state)
		has_next = bbox.do_action(action)   
		if(bbox.get_time()%10000==0): 
			print(str(bbox.get_time())+" "+str(bbox.get_score()))
	bbox.finish(verbose=1)
Code example #21
File: bot.py Project: zshell31/BlackBox-Solver
def run_bbox():
    global ensamble
    has_next = 1
    prepare_bbox()
    ensamble = Ensemble.NN_Ensemble(
        n_features, 4, [[36, 64, 4], [16, 4], [16, 4], [36, 64, 4]], n_actions)
    ensamble.read_weights("weights")

    while has_next:
        state = bbox.get_state()
        action = get_action_by_state(state)
        has_next = bbox.do_action(action)
        if (bbox.get_time() % 10000 == 0):
            print(str(bbox.get_time()) + " " + str(bbox.get_score()))
    bbox.finish(verbose=1)
Code example #22
File: bot.py Project: amorgun/blackbox-2016
def run_bbox():
    
    start_time = time.time()
    
    has_next = 1
    
    prepare_bbox()
    coefs = load_regression_coefs("star 13-best_coefs_score=2980.401123046875_sigma=0.0010000000474974513_level=train_level.txt")
    state = np.ones(n_features + 1)
 
    while has_next:
        state[:-1] = bbox.get_state()
        action = get_action_by_state(state, coefs)
        has_next = bbox.do_action(action)
 
    bbox.finish(verbose=1)

    end_time = time.time()
    print(end_time - start_time)
Code example #23
File: bot.py Project: Shmuma/blackboxchallenge
def run_bbox(verbose=False):
    has_next = 1

    # Prepare environment - load the game level
    prepare_bbox()

    while has_next:
        # Get current environment state
        state = bbox.get_state()

        # Choose an action to perform at current step
        action = get_action_by_state(state)

        # Perform chosen action
        # Function do_action(action) returns False if level is finished, otherwise returns True.
        has_next = bbox.do_action(action)

    # Finish the game simulation, print earned reward
    # While submitting solutions, make sure that you do call finish()
    bbox.finish(verbose=1)
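
`prepare_bbox()` appears throughout these examples but is never defined in any of them. A plausible version is a thin wrapper that loads a level the first time and resets it afterwards, in line with the `is_level_loaded`/`load_level`/`reset_level` calls visible in examples #10 and #11; the level path and the `get_num_of_*` getters below are assumptions:

import interface as bbox   # the challenge's environment module

n_features = n_actions = max_time = -1

def prepare_bbox():
    global n_features, n_actions, max_time
    if bbox.is_level_loaded():
        bbox.reset_level()
    else:
        bbox.load_level("../levels/train_level.data", verbose=1)
        n_features = bbox.get_num_of_features()
        n_actions = bbox.get_num_of_actions()
        max_time = bbox.get_max_time()
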
Code example #24
def run_bbox(verbose=False):
    bbox.load_level("../levels/train_level.data", verbose=True)

    states, actions, scores, rewards = [], [], [], []
    with open('utility_models.pkl', 'rb') as f:
        utility_models = pickle.load(f)

    step = 0
    has_next = 1
    while has_next:
        step += 1
        state = bbox.get_state()
        action = np.random.choice(n_actions)
        utilities = [m.predict([state]) for m in utility_models]
        action = np.argmax(utilities)
        # Do action and bookkeeping
        has_next = bbox.do_action(action)
        states.append(np.array(state))
        actions.append(action)
        score = bbox.get_score()
        rewards.append(score if not scores else (score - scores[-1]))
        scores.append(score)
        if verbose and step % 10000 == 0:
            print(step, score)

    i = 1
    get_outdir = 'run_{}'.format
    outdir = get_outdir(i)
    while os.path.exists(outdir):
        i += 1
        outdir = get_outdir(i)
    os.mkdir(outdir)
    print('saving to {}'.format(outdir))
    scores = np.array(scores, dtype=np.float32)
    scores.tofile(os.path.join(outdir, 'scores'))
    actions = np.array(actions, dtype=np.int8)
    actions.tofile(os.path.join(outdir, 'actions'))
    states = np.array(states, dtype=np.float32)
    states.tofile(os.path.join(outdir, 'states'))

    bbox.finish(verbose=True)
Code example #25
File: bot.py Project: chiteri/BlackBox_Challenge
def run_bbox(verbose=False):
    has_next = 1

    # Prepare environment - Load the game level
    prepare_box()

    while has_next:
        # Get current environment state
        state = bbox.get_state()

        # Choose an action to perform at current step
        action = get_action_by_state(state)

        # Perform chosen action
        # Function do_action(action) returns False if level is finished,
        # Otherwise returns True
        has_next = bbox.do_action(action)

    # Finish the game simulation, print earned reward
    # While submitting solutions make sure you do call finish()
    bbox.finish(verbose=1)
Code example #26
def run_bbox(verbose=False):
    has_next = 1
    prepare_bbox()
    # vector of the current state features

    input_var= T.dvector('in_state')
    input_var= T.reshape(input_var,(memtime,n_f+2))

    #Load net into the agent object
    agent=prepare_agent(input_var)

    #What the agent thinks the best choice will be
    attempt = lasagne.layers.get_output(agent)[0]

    #function to do all of the stuff above
    test_fn = theano.function([input_var], attempt)
    # time to check how long it takes to run
    memory = np.zeros(shape=(memtime,n_f+2))
    start = time.time()
    consequence=0
    steps=0
    while has_next:
        memory = forget(memory)
        state = bbox.get_state()
        memory[0][:-2]=state
        choices = test_fn(memory)
        action = np.argmax(choices)
        has_next = bbox.do_action(action)
        score = bbox.get_score()
        consequence=score-consequence
        memory[0][-2:] = [action,consequence]
        steps+=1
        if steps%10000==0:
            score = bbox.get_score()
            print ("Steps: {}".format(steps))
            print ("   current score: {}".format(score))

    print ("Final Score: {}".format(score))
    print ("Time to run: {} seconds".format(time.time()-start))
    bbox.finish(verbose=1)
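
Examples #26, #29 and #33 call a `forget(memory)` helper that is not shown; judging by how the result is used (the newest state is written into row 0 right after the call), it most likely shifts the rolling memory window by one step. A minimal sketch of that behaviour, purely a guess at the author's helper:

import numpy as np

def forget(memory):
    # Shift every remembered step one row down; the oldest row falls off
    # the end and row 0 is cleared to receive the newest state.
    shifted = np.roll(memory, 1, axis=0)
    shifted[0] = 0.0
    return shifted
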
Code example #27
def run_bbox(verbose=False):
    has_next = 1

    prepare_bbox()
    #vector of the current state features
    input_var= T.dvector('in_state')
    input_var= T.reshape(input_var,(1,n_features))
    #vector of the scores for 100 of the same action
    target_var = T.dvector('scores')
    target_var = T.reshape(target_var,(1,n_actions))
    #Load net into the agent object
    agent=prepare_agent(input_var)
    #what the agent thinks will happen if it does each action 100 times
    attempt = lasagne.layers.get_output(agent)
    #how much the agent was wrong, and should be punished
    punish = lasagne.objectives.squared_error(attempt,target_var)
    punish = punish.mean()
    #get the parameters for updating
    params = lasagne.layers.get_all_params(agent,trainable=True)
    #update the net with the error
    teach = lasagne.updates.nesterov_momentum(punish,params,learning_rate=.1,momentum=.9)
    #function to do all of the stuff above
    train_fn = theano.function([input_var, target_var], punish, updates=teach,on_unused_input='ignore')
    #time to check how long it takes to run
    start = time.time()
    while has_next:
        state = bbox.get_state()
        r_state= np.reshape(state,(1,n_features))
        scores = get_all_scores(state)
        r_scores = np.reshape(scores,(1,n_actions))
        action = T.argmax(scores)
        error = train_fn(r_state,r_scores)
        print (error)
        has_next = bbox.do_action(action)
 
    print ("Time to run: {} seconds".format(time.time()-start))
    bbox.finish(verbose=1)
Code example #28
File: bot_learn.py Project: kotulc/blackbox
def learn_bbox(rnet_model, train_data, update_inc=5000,
               lookup_inc=250, seed_data=False):
  """
  Add training instances to train_data from a single run-through of a
  bbox session.
  :param rnet_model: model object with get_lreg_action and get_action
    methods
  :param train_data: DataSet object used to buffer states and append
    new training instances
  :param update_inc: int, number of steps between each nnet model update
  :param lookup_inc: int, number of forward action lookup steps
  :param seed_data: boolean, if True best_action is the action returned
    by the lreg model.
  :return: int, the number of action errors, or differences between
    actions produced by the rnet_model and the ideal or seed model.
  """
  has_next = 1
  error_count = 0
  rand_count = 0
  rand_idx = rand_n

  prepare_bbox()
  # For each new state in the session, add it to the data set's state
  # buffer so that historical states are included in a commit event
  train_data.clear_buffer()
  current_state = bbox.get_state()
  train_data.update_buffer(current_state)

  while has_next:
    # If all random values have been used, generate a new batch
    if rand_idx >= (rand_n-1):
      rand_vals = numpy.random.random_sample(size=(rand_n))
      rand_idx = 0

    step_count = bbox.get_time()
    # Get the next action from the model based on the current set of
    # buffered states
    action = rnet_model.get_action(train_data.get_buffer())

    # Every update_inc steps train the model's network with newly
    # acquired training data
    if step_count % update_inc == 0:
      rn_model.run_training(train_data, max_steps=update_nnet, restore=True)
      error_count = 0
      rand_count = 0
    # If the random value is less than or equal to the sample
    # probability, sample the current session state and determine the
    # best action, adding it to the training set if necessary
    elif rand_vals[rand_idx] <= sample_prob:
      if seed_data:
        best_action = rnet_model.get_lreg_action(current_state)
        score_delta = 0.1
      else:
        best_action, score_delta = action_lookup(rnet_model,
                                                 train_data, lookup_inc)
      if action != best_action:
        train_data.commit_buffer(best_action, score_delta)
        error_count += 1
      rand_count += 1
    # Add random variation to the session by performing a random action
    # if less than or equal to perturb probability
    if rand_vals[rand_idx+1] <= perturb_prob:
      action = numpy.random.randint(0,4)
      step_inc = numpy.random.randint(rand_min, rand_max)
      for _ in xrange(step_inc):
        has_next = bbox.do_action(action)
        current_state = bbox.get_state()
        train_data.update_buffer(current_state)
    else:
      has_next = bbox.do_action(action)
      current_state = bbox.get_state()
      train_data.update_buffer(current_state)

    rand_idx += 2
    if step_count % 5000 == 0:
      print ("time = %d, score = %f" % (step_count, bbox.get_score()))
      print ("errors = %d, samples = %d" % (error_count, rand_count))
      #rn_model.print_stats()

  bbox.finish(verbose=1)
  return error_count
Code example #29
def run_bbox(verbose=False):
    prepare_bbox()
    # vector of the current state features
    input_var= T.tensor3('memory')
    input_var= T.reshape(input_var,(memtime,1,n_f+2))

    #Score after the agent makes its choice
    reality = T.vector('score_diffs')

    #Load net into the agent object
    agent=prepare_agent(input_var)

    #What the agent thinks their best choice is this event
    evaluation = lasagne.layers.get_output(agent)[0]

    #how much the agent should be rewarded/punished
    reward = lasagne.objectives.squared_error(evaluation,reality)
    reward = reward.mean()

    #get the parameters for updating
    params = lasagne.layers.get_all_params(agent,trainable=True)

    #update the net with the error
    teach = lasagne.updates.nesterov_momentum(reward,params,learning_rate=0.01,momentum=0.9)

    #A function to get the agent's choice of what to try this time
    decide_fn = theano.function([input_var],evaluation)

    #function to do all of the stuff above
    train_fn = theano.function([input_var,reality], reward, updates=teach,on_unused_input='ignore')

    # time to check how long it takes to run
    start = time.time()
    for epoch in range(epochs):
        memory = np.zeros(shape=(memtime,1,n_f+2))
        e_time = time.time() #time for this epoch
        has_next = 1 #looping variable, state of bbox
        #initialize tracking variables
        consequence=error=0
        steps=0
        trust=0.00+.02*epoch
        good=0
        while has_next:
            #Updating memory matrix, forgetting a state, making room
            memory = forget(memory) 
            state = bbox.get_state()
            #get best action based on 100 step checkpoint method
            actuals = get_all_score_diffs(state)
            #upload new state, with no score or action chosen
            memory[0][0][:-2] = state
            if rand.random()>trust:
                action = rand.randint(0,n_a-1) #if trust is too low still, random action
            else:
                choices = decide_fn(memory) #Otherwise, let the agent decide. 
                action = np.argmax(choices) #pick action agent thinks is best


            if action == np.argmax(actuals):
                good = good+1
            #do it, and find out the consequences (if the score improved or went down)
            has_next = bbox.do_action(action)
            #find consequenquence
            score = bbox.get_score()
            consequence=score-consequence
            #train on choices just made and memory
            memory[0][0][-2:]=[action,consequence]

            error += train_fn(memory,actuals) #train based on the score change

            #updating for next loop
            steps += 1

            #occasionally check in on progress
            if steps%10000==0:
                score = bbox.get_score()
                print ("Epoch: {}".format(epoch))
                print ("Steps: {}".format(steps))
                print ("   current trust: {}".format(trust))
                print ("   avg error: {}".format(error/steps))
                print ("   bad choices: {}%".format(100-float(good)/100))
                print ("   current score: {}".format(score))
                if trust<.95:
                    trust = trust+.02
                bbox.clear_all_checkpoints()
                ch=ra=good=0

        #report on model quality on previous epoch
        score = bbox.get_score()
        with open("epoch_data.txt","a") as f:
        	f.write("Epoch: {}    Final Score: {}    Average Error: {}    Time to Run: {} min\n".format(epoch,score,error/steps,(time.time()-e_time)/60))
        #save model parameters
        np.savez('model_LSTM_cost.npz', *lasagne.layers.get_all_param_values(agent))
        #reset box for next epoch
        if(epoch<epochs-1):
            bbox.reset_level()

    print ("Time to run: {} hours".format((time.time()-start)/3600))
    bbox.finish(verbose=1)
Code example #30
    load_weights = True
    
    if training:
        for i in range(exploration_epochs):
            print(i, epsilon, gamma, action_repeat, update_frequency, batchSize, buffer)
            run_bbox(verbose=0, epsilon=epsilon, gamma=gamma, action_repeat=action_repeat, update_frequency=update_frequency, batchSize=batchSize, buffer=buffer, load_weights=False, save_weights=True)
            if epsilon > 0.1:
                epsilon -= (1.0/exploration_epochs)

        for i in range(learning_epochs):
            epsilon = 0.1
            print(i, epsilon, gamma, action_repeat, update_frequency, batchSize, buffer)
            run_bbox(verbose=0, epsilon=epsilon, gamma=gamma, action_repeat=action_repeat, update_frequency=update_frequency, batchSize=batchSize, buffer=buffer, load_weights=load_weights, save_weights=True)
            load_weights = False

    else:
        has_next = 1
        # Prepare environment - load the game level
        prepare_bbox()
        model.load_weights('_my_model_weights.h5')
        while has_next:
            # Get current environment state
            state = copy.copy(bbox.get_state())
            #Run the Q function on S to get predicted reward values on all the possible actions
            qval = model.predict(state.reshape(1,n_features), batch_size=1)
            # Choose an action to perform at current step
            action = (np.argmax(qval))
            has_next = bbox.do_action(action)
        # Finish the game simulation
        bbox.finish(verbose=1)
Code example #31
File: naive_bot.py Project: lopuhin/bbot
def run_bbox(verbose=False):
    bbox.load_level("../levels/train_level.data", verbose=True)

    states, actions, scores, rewards = [], [], [], []
    utility_models = [
        SGDRegressor(learning_rate='constant',
                     #penalty='elasticnet',
                     ) for _ in range(n_actions)
    ]
    zero_utilities = np.zeros([n_actions])

    n_past_act = 1
    n_past_st = 0  # in addition to current
    discount = 0.9
    random_steps = 10000

    step = 0
    has_next = 1
    while has_next:
        step += 1
        state = bbox.get_state()
        utilities = zero_utilities
        # Choose action using current utility_models
        if step > random_steps:
            clf_state = np.concatenate(states[-n_past_st:] + [state]) \
                        if n_past_st else state
            try:
                utilities = np.array(
                    [m.predict([clf_state])[0] for m in utility_models])
            except NotFittedError:
                pass
    #utilities -= utilities.min()
    #p = None if np.isclose(utilities, 0).all() else \
    #    utilities / utilities.sum()
        if np.random.rand() < 0.1 or step <= random_steps:
            action = np.random.choice(n_actions)
        else:
            action = np.argmax(utilities)
        # Do action and bookkeeping
        has_next = bbox.do_action(action)
        states.append(np.array(state))
        actions.append(action)
        score = bbox.get_score()
        rewards.append(score if not scores else (score - scores[-1]))
        scores.append(score)
        # Train classifiers
        if len(rewards) >= n_past_act + n_past_st:
            total_reward = sum(r * np.power(discount, i)
                               for i, r in enumerate(rewards[-n_past_act:]))
            if n_past_act == 1:
                clf_state = np.concatenate(states[-(n_past_act + n_past_st):])
            else:
                clf_state = np.concatenate(
                    states[-(n_past_act + n_past_st):-n_past_act + 1])
            utility_models[actions[-n_past_act]].partial_fit([clf_state],
                                                             [total_reward])
        if verbose and step % 1000 == 0:
            print(step, score)

    i = 1
    get_outdir = 'run_{}'.format
    outdir = get_outdir(i)
    while os.path.exists(outdir):
        i += 1
        outdir = get_outdir(i)
    os.mkdir(outdir)
    print('saving to {}'.format(outdir))
    scores = np.array(scores, dtype=np.float32)
    scores.tofile(os.path.join(outdir, 'scores'))
    actions = np.array(actions, dtype=np.int8)
    actions.tofile(os.path.join(outdir, 'actions'))
    states = np.array(states, dtype=np.float32)
    states.tofile(os.path.join(outdir, 'states'))

    bbox.finish(verbose=True)
Code example #32
File: q_learning.py Project: lopuhin/bbot
def main():
    epsilon = .1  # exploration
    num_actions = 4
    input_size = 36
    hidden_size = 24
    activation = 'relu'
    max_memory = 2000
    batch_size = 50
    mini_epoch = 5
    epoch = 10

    model = Sequential()
    model.add(
        Dense(hidden_size, input_shape=[input_size], activation=activation))
    model.add(Dense(hidden_size, activation=activation))
    model.add(Dense(num_actions))
    model.compile('adam', 'mse')

    # model.load_weights('model.h5')

    # Define environment/game
    bbox.load_level('../levels/train_level.data', verbose=True)

    # Initialize experience replay object
    exp_replay = ExperienceReplay(max_memory=max_memory)

    # FIXME
    #states = np.fromfile('run_random/states', dtype=np.float32)\
    #    .reshape([1214494, 36])
    #scaler = preprocessing.StandardScaler()
    #scaler.fit(states)
    #with open('scaler.pkl', 'wb') as f:
    #    scaler = pickle.dump(scaler, f, protocol=-1)
    with open('scaler.pkl', 'rb') as f:
        scaler = pickle.load(f)

    # Train
    for e in range(epoch):
        loss = 0.
        bbox.reset_level()
        game_over = False
        # get initial input
        get_state = lambda: scaler.transform(np.array([bbox.get_state()]))[0]
        input_t = get_state()
        score = 0
        step = 0
        report_steps = 100

        while not game_over:
            step += 1
            input_tm1 = input_t
            # get next action
            if np.random.rand() <= epsilon:
                action = np.random.randint(0, num_actions, size=1)
            else:
                q = model.predict(np.array([input_tm1]))[0]
                action = np.argmax(q)

            # apply action, get rewards and new state
            game_over = not bbox.do_action(action)
            input_t = get_state()
            new_score = bbox.get_score()
            reward = new_score - score
            score = new_score

            # store experience
            exp_replay.remember([input_tm1, action, reward, input_t],
                                game_over)

            # adapt model
            for _ in range(mini_epoch):
                inputs, targets = exp_replay.get_batch(model,
                                                       batch_size=batch_size)
                loss += model.train_on_batch(inputs, targets)[0]

            if step % report_steps == 0:
                print('Step {:07d} | Loss {:.4f} | Score {}'.format(
                    step, loss / (report_steps * mini_epoch), score))
                loss = 0.

        print('Epoch {:03d}/{} | Score {}'.format(e, epoch - 1, score))

    # Save trained model weights
    model.save_weights('q_model.h5', overwrite=True)
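
The `ExperienceReplay` class used above (`remember`, `get_batch`) is not included in the snippet. A compact sketch of the usual pattern it names, where sampled transitions become one-step Q-learning targets built with the same model; the internals below are an assumption, not the project's actual class:

import numpy as np

class ExperienceReplay(object):
    def __init__(self, max_memory=2000, discount=0.99):
        self.max_memory = max_memory
        self.discount = discount
        self.memory = []

    def remember(self, transition, game_over):
        # transition = [state_t, action, reward, state_t1]
        self.memory.append([transition, game_over])
        if len(self.memory) > self.max_memory:
            del self.memory[0]

    def get_batch(self, model, batch_size=50):
        n = min(len(self.memory), batch_size)
        num_actions = model.output_shape[-1]
        state_dim = self.memory[0][0][0].shape[0]
        inputs = np.zeros((n, state_dim))
        targets = np.zeros((n, num_actions))
        idxs = np.random.randint(0, len(self.memory), size=n)
        for i, idx in enumerate(idxs):
            (state_t, action, reward, state_t1), game_over = self.memory[idx]
            inputs[i] = state_t
            targets[i] = model.predict(state_t[np.newaxis])[0]
            q_next = np.max(model.predict(state_t1[np.newaxis])[0])
            # Terminal transitions keep the raw reward; otherwise bootstrap
            targets[i, action] = reward if game_over else reward + self.discount * q_next
        return inputs, targets
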
Code example #33
def run_bbox(verbose=False):
    prepare_bbox()

    # vector of the current state features
    input_var= T.matrix('memory')
    input_var= T.reshape(input_var,(memtime,n_f+2))

    #Score after the agent makes its choice
    reality = T.scalar('consequence')

    #Load net into the agent object
    agent=prepare_agent(input_var)

    #What the agent thinks the best choice will be
    attempt = T.max(lasagne.layers.get_output(agent))

    #how much the agent should be rewarded/punished
    reward = lasagne.objectives.squared_error(attempt,reality)

    #get the parameters for updating
    params = lasagne.layers.get_all_params(agent,trainable=True)

    #update the net with the error
    teach = lasagne.updates.nesterov_momentum(reward,params,learning_rate=0.1,momentum=0.9)

    #function to do all of the stuff above I DON'T HAVE A TARGET??
    train_fn = theano.function([input_var,reality], reward, updates=teach,on_unused_input='ignore')

    # time to check how long it takes to run
    memory = np.zeros(shape=(memtime,n_f+2))
    start = time.time()
    scores_per_epoch = np.zeros(epochs)
    for epoch in range(epochs):
        e_time = time.time() #time for this epoch
        has_next = 1 #looping variable, state of bbox

        #initialize tracking variables
        consequence=0
        self_assessment=0
        steps=0
        trust=0.00
        while has_next:

            #Updating memory matrix, forgetting a state, making room
            memory = forget(memory) 
            state = bbox.get_state()
            #upload new state, with no score or action chosen
            memory[0][:-2] = state
            if rand.random()>trust:
                action = rand.randint(0,n_a-1) #if trust is too low still, random action
            else:
                choices = lasagne.layers.get_output(agent,memory).eval() #Otherwise, let the agent decide.
                action = np.argmax(choices) #pick action agent thinks is best
            
            #do it, and find out the consequences (if the score improved or went down)
            has_next = bbox.do_action(action)
            consequence = bbox.get_score()-consequence 
            
            #train on choices just made and memory
            memory[0][-2:]=[action,consequence]
            train_fn(memory,consequence) #train based on the score change
            
            #updating for next loop
            self_assessment += consequence
            steps += 1

            #occasionally check in on progress
            if steps%10000==0:
                trust = trust+.01
                score = bbox.get_score()
                print ("Epoch: {}".format(epoch))
                print ("Steps: {}".format(steps))
                print ("   self assessment: {}".format(self_assessment))
                print ("   trust: {}".format(trust))
                print ("   current score: {}".format(score))
        #report on model quality on previous epoch
        score = bbox.get_score()
        print ("Epoch: {}".format(epoch))
        print ("Final Score: {}".format(score))
        print ("Time to Run: {} minutes".format((time.time()-e_time)/60))
        scores_per_epoch[epoch] = score

        #reset box for next epoch
        bbox.reset_level()

    print ("All scores per epoch: ")
    print (scores_per_epoch)
    print ("Time to run: {} hours".format((time.time()-start)/3600))
    np.savez('model_mem.npz', *lasagne.layers.get_all_param_values(agent))
    bbox.finish(verbose=1)
Code example #34
File: show.py Project: Shmuma/blackboxchallenge

def get_action_by_state(state):
#    return np.random.randint(0, 4)
    return 0

if __name__ == "__main__":
    has_next = 1
    prepare_bbox()
    prev_score = bbox.get_score()
    steps = 0

    states = []

    while has_next and steps < 100:
        state = bbox.get_state()
        states.append(state)
        v = map(lambda f: "%.2f" % abs(f), state)
        print " ".join(v)
        action = get_action_by_state(state)
        has_next = bbox.do_action(action)
        score = bbox.get_score()
        prev_score = score
        steps += 1

#    bbox.finish(verbose=1)
    print "Total score: %f" % prev_score
    print "Total steps: %d" % steps

    img = np.stack(states)
    img -= img.mean()
Code example #35
 def getSensors(self):
     state = bbox.get_state()
     print 'state', state
     return state
Code example #36
def run_bbox(verbose=False, epsilon=0.1, gamma=0.99, action_repeat=4, update_frequency=4, batchSize=32, buffer=100000, load_weights=False, save_weights=False):
    has_next = 1
    
    # Prepare environment - load the game level
    prepare_bbox()
    
    update_frequency_cntr = 0
    replay = []
    h=0
    if load_weights:
        model.load_weights('my_model_weights.h5')
        model_prim.load_weights('my_model_weights.h5')
    #stores tuples of (S, A, R, S')
 
    while has_next:
        # Get current environment state
        state = copy.copy(bbox.get_state())
        prev_reward = copy.copy(bbox.get_score())
        
        #Run the Q function on S to get predicted reward values on all the possible actions
        qval = model.predict(state.reshape(1,n_features), batch_size=1)
 
        # Choose an action to perform at current step
        if random.random() < epsilon: #choose random action or best action
            if random.random() < 0.5:
                action = np.random.randint(0,n_actions) #assumes 4 different actions
            else: # Use checkpoints to prime network with good actions
                action_range=50 #random.randint(1,200)
                action = calc_best_action_using_checkpoint(action_range=action_range)
                #for _ in range(action_range):
                #    has_next = bbox.do_action(action)
        else: #choose best action from Q(s,a) values
            action = (np.argmax(qval))


        # Perform chosen action, observe new state S'
        # Function do_action(action) returns False if level is finished, otherwise returns True.
        for a in range(action_repeat):
            has_next = bbox.do_action(action)
        new_state = copy.copy(bbox.get_state())
        reward = copy.copy(bbox.get_score()) - prev_reward
        #reward = 1.0 if reward > 0.0 else -1.0 #this gives better than random when combined with a small network

        #Experience replay storage
        if (len(replay) < buffer): #if buffer not filled, add to it
            replay.append((state, action, reward, new_state))
        else: #if buffer full, overwrite old values
            if (h < (buffer-1)):
                h += 1
            else:
                h = 0
            replay[h] = (state, action, reward, new_state)

            #randomly sample our experience replay memory
            minibatch = random.sample(replay, batchSize)
            X_train = []
            y_train = []
            for memory in minibatch:
                #Get max_Q(S',a)
                old_state, action, reward, new_state = memory
                old_qval = model.predict(old_state.reshape(1,n_features), batch_size=1)
                newQ = model.predict(new_state.reshape(1,n_features), batch_size=1)
                maxQ = np.max(newQ)
                y = np.zeros((1,n_actions))
                y[:] = old_qval[:]
                if has_next == 1: #non-terminal state
                    update = (reward + (gamma * maxQ))
                else: #terminal state
                    update = reward
                y[0][action] = update
                X_train.append(old_state)
                y_train.append(y.reshape(n_actions,))

            X_train = np.array(X_train)
            y_train = np.array(y_train)
            # update the weights of a copy of the network
            model_prim.fit(X_train, y_train, batch_size=batchSize, nb_epoch=1, verbose=0)
            if update_frequency_cntr >= update_frequency:
                prim_weights = model_prim.get_weights()
                print('model update')
                model.set_weights(prim_weights)
                update_frequency_cntr = 0
            update_frequency_cntr += 1

        if bbox.get_time() % 500000 == 0:
            print ("time = %d, score = %f" % (bbox.get_time(), bbox.get_score()))


    # Finish the game simulation, print earned reward and save weights
    if save_weights:
        model_prim.save_weights('my_model_weights.h5', overwrite=True)
    bbox.finish(verbose=1)
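
`calc_best_action_using_checkpoint` is referenced above but not defined. Given that the environment exposes `create_checkpoint`/`load_from_checkpoint` (see examples #14 and #15), a plausible reading is a brute-force rollout: repeat each candidate action for `action_range` steps from a checkpoint and keep the one with the best score delta. A sketch under that assumption, relying on the same module-level `bbox` and `n_actions` as the snippet above:

def calc_best_action_using_checkpoint(action_range=50):
    checkpoint_id = bbox.create_checkpoint()
    best_action, best_delta = 0, float("-inf")
    for candidate in range(n_actions):
        start_score = bbox.get_score()
        for _ in range(action_range):
            bbox.do_action(candidate)
        delta = bbox.get_score() - start_score
        if delta > best_delta:
            best_delta, best_action = delta, candidate
        # Rewind the environment before evaluating the next candidate
        bbox.load_from_checkpoint(checkpoint_id)
    return best_action
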
Code example #37
File: bot.py Project: etrushkin/bb
 def get_state(self):
     return bbox.get_state()
Code example #38
 def get_state(self):
     return bbox.get_state()
Code example #39
File: bot_learn.py Project: kotulc/blackbox
def learn_bbox(rnet_model,
               train_data,
               update_inc=5000,
               lookup_inc=250,
               seed_data=False):
    """
  Add training instances to train_data from a single run-through of a
  bbox session.
  :param rnet_model: model object with get_lreg_action and get_action
    methods
  :param train_data: DataSet object used to buffer states and append
    new training instances
  :param update_inc: int, number of steps between each nnet model update
  :param lookup_inc: int, number of forward action lookup steps
  :param seed_data: boolean, if True best_action is the action returned
    by the lreg model.
  :return: int, the number of action errors, or differences between
    actions produced by the rnet_model and the ideal or seed model.
  """
    has_next = 1
    error_count = 0
    rand_count = 0
    rand_idx = rand_n

    prepare_bbox()
    # For each new state in the session, add it to the data set's state
    # buffer so that historical states are included in a commit event
    train_data.clear_buffer()
    current_state = bbox.get_state()
    train_data.update_buffer(current_state)

    while has_next:
        # If all random values have been used, generate a new batch
        if rand_idx >= (rand_n - 1):
            rand_vals = numpy.random.random_sample(size=(rand_n))
            rand_idx = 0

        step_count = bbox.get_time()
        # Get the next action from the model based on the current set of
        # buffered states
        action = rnet_model.get_action(train_data.get_buffer())

        # Every update_inc steps train the model's network with newly
        # acquired training data
        if step_count % update_inc == 0:
            rn_model.run_training(train_data,
                                  max_steps=update_nnet,
                                  restore=True)
            error_count = 0
            rand_count = 0
        # If the random value is less than or equal to the sample
        # probability, sample the current session state and determine the
        # best action, adding it to the training set if necessary
        elif rand_vals[rand_idx] <= sample_prob:
            if seed_data:
                best_action = rnet_model.get_lreg_action(current_state)
                score_delta = 0.1
            else:
                best_action, score_delta = action_lookup(
                    rnet_model, train_data, lookup_inc)
            if action != best_action:
                train_data.commit_buffer(best_action, score_delta)
                error_count += 1
            rand_count += 1
        # Add random variation to the session by performing a random action
        # if less than or equal to perturb probability
        if rand_vals[rand_idx + 1] <= perturb_prob:
            action = numpy.random.randint(0, 4)
            step_inc = numpy.random.randint(rand_min, rand_max)
            for _ in xrange(step_inc):
                has_next = bbox.do_action(action)
                current_state = bbox.get_state()
                train_data.update_buffer(current_state)
        else:
            has_next = bbox.do_action(action)
            current_state = bbox.get_state()
            train_data.update_buffer(current_state)

        rand_idx += 2
        if step_count % 5000 == 0:
            print("time = %d, score = %f" % (step_count, bbox.get_score()))
            print("errors = %d, samples = %d" % (error_count, rand_count))
            #rn_model.print_stats()

    bbox.finish(verbose=1)
    return error_count