Example #1
def get_action_by_state(state, verbose=1):

    global k, q, act, rew, r, c, old_r, old_c, last_score, only_kohonen, start

    if k is None:
        k = Kohonen(100, 100, n_features, 10, 0.5)
        # k = Kohonen.from_file('kohonen1', 10, 0.5)
        # q = QLearning(50, 50, n_actions, 0.3, 0.3, 1)
        q = QLearning(50, 50, n_actions, 0, 0, 1)
        start = time.time()
        r, c = k.find_winner(state)
        act = q.get_action(r, c)
    else:
        if only_kohonen:
            parallel_kohonen_learning(state)
            # k.find_winner(state)
            act = random.randint(0, 3)
        else:
            rew = bbox.get_score() - rew
            old_r = r
            old_c = c
            r, c = k.find_winner(state)
            q.update_qvalue(old_r, old_c, act, rew, r, c)
            act = q.get_action(r, c)

    if verbose:
        # print(bbox.get_score(), bbox.get_score() - last_score, act)
        # last_score = bbox.get_score()
        if bbox.get_time() % 1000 == 0:
            print(bbox.get_time(), bbox.get_score(), time.time() - start)
            # k.save_to_file("k_iteration_normal_" + str(bbox.get_time()))

    return act
Example #2
def test_bot(bot, level, make_features):
    env = BBox(level)
    while env.has_next:
        if env.get_time() % 10000 == 0:
            print(str(env.get_time()) + "\t" + str(env.get_score()))
        action = bot.get_action(make_features(env))
        env.do_action(action)
    bbox.finish()
    print(bbox.get_score())
Example #3
def test_bot(bot, level, make_features):
    env = BBox(level)
    while env.has_next:
        if env.get_time() % 10000 == 0:
            print(str(env.get_time()) + "\t" + str(env.get_score()))
        action = bot.get_action(make_features(env))
        env.do_action(action)
    bbox.finish()
    print(bbox.get_score())
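make_features is left to the caller in these test_bot examples. As a hypothetical pass-through sketch (assuming the BBox wrapper exposes the underlying get_state, which is not shown above), it could be as simple as:

def make_features(env):
    # Hypothetical helper: hand the raw state vector straight to the bot.
    return env.get_state()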
def get_all_score_diffs(state=None,verbose=0):
    initial = bbox.get_score()
    checkpoint_id = bbox.create_checkpoint()
    all_scores = np.zeros(shape=n_a)
    for a in range(n_a):
        for _ in range(100):
            bbox.do_action(a)
        all_scores[a]=bbox.get_score()-initial
        bbox.load_from_checkpoint(checkpoint_id)
    return all_scores
Example #5
	def get_score(self):
		#fruit_row, fruit_col, basket = self.state[0]
		#if fruit_row == self.grid_size-1:
		#	if abs(fruit_col - basket) <= 1:
		#		self.won = True
		#		return 1
		#	else:
		#		return -1
		#else:
		#	return 0
		self.action_score = bbox.get_score() - self.last_score
		self.last_score = bbox.get_score()
		return self.action_score #-1 if self.action_score < 0 else (1 if self.action_score > 0 else 0) # min(1, max(0,self.action_score))
Example #6
 def get_score(self):
     #fruit_row, fruit_col, basket = self.state[0]
     #if fruit_row == self.grid_size-1:
     #	if abs(fruit_col - basket) <= 1:
     #		self.won = True
     #		return 1
     #	else:
     #		return -1
     #else:
     #	return 0
     self.action_score = bbox.get_score() - self.last_score
     self.last_score = bbox.get_score()
     return self.action_score  #-1 if self.action_score < 0 else (1 if self.action_score > 0 else 0) # min(1, max(0,self.action_score))
Example #7
def run_bbox(rnet_model, train_data, train_level=True, verbose=True):
    """
    Run a single session of the black box training or test environments.
    :param rnet_model: model with a get_action(state) method
    :param train_data: a DataSet object used to buffer each state
    :param train_level: boolean, run the training level if True
    :param verbose: boolean, display additional information if True
    :return: float, the final session score
    """
    has_next = 1
    prepare_bbox(train_level)
    train_data.clear_buffer()

    while has_next:
        step_count = bbox.get_time()
        train_data.update_buffer(bbox.get_state())
        state = train_data.get_buffer()
        action = rnet_model.get_action(state)
        has_next = bbox.do_action(action)

        if step_count % 5000 == 0 and verbose:
            print("time = %d, score = %f" % (step_count, bbox.get_score()))

    final_score = bbox.finish(verbose=1)
    return final_score
def run_bbox(verbose=False):
    has_next = 1
    prepare_bbox()
    #vector of the current state features
    input_var= T.dvector('in_state')
    input_var= T.reshape(input_var,(1,n_features))
    #Load net into the agent object
    agent=prepare_agent(input_var)
    attempt = lasagne.layers.get_output(agent)
    #function to do all of the stuff above
    eval_fn = theano.function([input_var], attempt,on_unused_input='ignore')
    #time to check how long it takes to run
    start = time.time()
    error=0
    steps=0
    while has_next:
        state = bbox.get_state()
        r_state= np.reshape(state,(1,n_features))
        attempt = eval_fn(r_state)
        action = np.argmax(attempt)
        steps+=1
        if steps%10000==0:
            score = bbox.get_score()
            print ("Steps: {}".format(steps))
            print ("   training loss: {}".format(error/steps))
            print ("   current score: {}".format(score))
        has_next = bbox.do_action(action)
 
    print ("Time to run: {} seconds".format(time.time()-start))
    print ("{} steps total".format(steps))
    np.savez('model.npz', *lasagne.layers.get_all_param_values(agent))
    bbox.finish(verbose=1)
def run_bbox(verbose=False):
    '''
    Runs the Blackbox challenge.
    '''
    
    has_next = True
    prepare_bbox()
    
    while has_next:
        ## Observe the current state variables
        state = bbox.get_state()
        state_tuple = get_state_tuple(state)
        ## Select the current action
        action = get_action(state_tuple, verbose=verbose, is_current=True)
        ## Get the current reward
        reward = bbox.get_score()
        print('Reward = ' + str(reward))
        
        ## Retrieve the current Q-value
        current_q = q_function[state_tuple][action]
        print('Current Q = ' + str(current_q))
        
        ## Observe the next state (assuming there always is)
        has_next = bbox.do_action(action)
        next_state = bbox.get_state()
        next_state_tuple = get_state_tuple(next_state)
        ## Get the best q_action in the new state
        next_action = get_action(next_state_tuple, verbose=verbose, is_current=False)    
        ## Get the new Q_value
        next_q = q_function[next_state_tuple][next_action]
        ## Update the Q-function
        q_function[state_tuple][action] = (1 - alpha) * current_q + alpha * (reward + gamma * next_q)
        print('Updated Q = ' + str(q_function[state_tuple][action]))
    
    bbox.finish(verbose=True)
Example #10
def run_bbox(rnet_model, train_data,
             train_level=True, verbose=True):
  """
  Run a single session of the black box training or test environments
  :param rnet_model: model with a get_action(state) method
  :param train_data: a DataSet object used to buffer each state
  :param train_level: boolean, run the training level if True
  :param verbose: boolean, display additional information if True
  :return: float, the final session score
  """
  has_next = 1
  prepare_bbox(train_level)
  train_data.clear_buffer()

  while has_next:
    step_count = bbox.get_time()
    train_data.update_buffer(bbox.get_state())
    state = train_data.get_buffer()
    action = rnet_model.get_action(state)
    has_next = bbox.do_action(action)

    if step_count % 5000 == 0 and verbose:
      print ("time = %d, score = %f" % (step_count, bbox.get_score()))

  final_score = bbox.finish(verbose=1)
  return final_score
Example #11
    def is_won(self):
        #fruit_row, fruit_col, basket = self.state[0]
        final_score = bbox.get_score()
        bbox.reset_level()  # bbox.finish(verbose=1)

        self.last_score = 0
        self.action_count = 0
        return final_score > 0  #fruit_row == self.grid_size-1 and abs(fruit_col - basket) <= 1
Example #12
def calc_best_action_using_checkpoint():
    checkpoint_id = bbox.create_checkpoint()

    best_action = -1
    best_score = -1e9

    for action in range(n_actions):
        for _ in range(100):
            bbox.do_action(action)

        if bbox.get_score() > best_score:
            best_score = bbox.get_score()
            best_action = action

        bbox.load_from_checkpoint(checkpoint_id)

    return best_action
Example #13
 def update(self, action):
     self._actions_log.append(action[0])
     self._steps += 1
     self._prev_score = bbox.get_score()
     self._is_over = not bbox.do_action(action[0])
     self._state = bbox.get_state().reshape(self._state_shape)
     #print "\nupdate", self._prev_score, action, bbox.get_score(), self._is_over
     return self.state, self.reward(), self.is_over
Example #14
	def is_won(self):
		#fruit_row, fruit_col, basket = self.state[0]
		final_score = bbox.get_score()
		bbox.reset_level() # bbox.finish(verbose=1)

		self.last_score = 0
		self.action_count = 0
		return final_score > 0 #fruit_row == self.grid_size-1 and abs(fruit_col - basket) <= 1
Example #15
def get_all_scores(state,verbose=0):
    checkpoint_id = bbox.create_checkpoint()
    all_scores = np.zeros(n_actions)
    for a in range(n_actions):
        bbox.do_action(a)
        all_scores[a]=bbox.get_score()
        bbox.load_from_checkpoint(checkpoint_id)
    return all_scores
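A caller would typically act greedily on the vector returned above; a minimal sketch (hypothetical helper, assuming the same bbox module and n_actions as in the surrounding examples):

def get_greedy_action(state):
    # Probe every action from the checkpoint, then pick the best-scoring one.
    return int(np.argmax(get_all_scores(state)))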
Example #16
 def act(self, action):
     self._actions_log.append(action)
     self._steps += 1
     self._prev_score = bbox.get_score()
     self._is_over = not bbox.do_action(action)
     self._state = bbox.get_state().reshape((1, self.n_features))
     #print "\nupdate", self._prev_score, action, bbox.get_score(), self._is_over
     return self.state, self.reward(), self.is_over
def calc_best_action_using_checkpoint():
	checkpoint_id = bbox.create_checkpoint()

	best_action = -1
	best_score = -1e9

	for action in range(n_actions):
		for _ in range(100):
			bbox.do_action(action)
		
		if bbox.get_score() > best_score:
			best_score = bbox.get_score()
			best_action = action

		bbox.load_from_checkpoint(checkpoint_id)

	return best_action
Example #18
def get_action_by_state(state, verbose=0):
    if verbose:
        for i in range(n_features):
            print ("state[%d] = %f" %  (i, state[i]))

        print ("score = {}, time = {}".format(bbox.get_score(), bbox.get_time()))

    action_to_do = 0
    return action_to_do
Example #19
def get_action_by_state(state, verbose=0):
    if verbose:
        for i in range(n_features):
            print ("state[%d] = %f" %  (i, state[i]))

        print ("score = {}, time = {}".format(bbox.get_score(), bbox.get_time()))

    action_to_do = 0
    return action_to_do
Example #20
def action_lookup(model, train_data, step_inc):
    """
    At any given point, use action_lookup to determine the ideal action
    from the current state. Use the behavior of the model following each
    possible action to determine that which brings the greatest reward.
    :param model: object with a get_action method for action inference
    :param train_data: DataSet object used for bbox state buffering
    :param step_inc: int, the number of state steps to increment for each
        possible action of action_n total actions
    :return: (int, float), the tuple representing the highest scoring
        action
    """

    # Create a checkpoint to revert to after each action lookup
    start_checkpoint = bbox.create_checkpoint()
    # Similarly, create a backup of the DataSet object state buffer
    train_data.backup_buffer()
    best_score = -1e9
    best_action = -1

    # Perform the forward lookup for all valid actions
    for action_idx in xrange(action_n):
        start_score = bbox.get_score()
        bbox.do_action(action_idx)
        train_data.update_buffer(bbox.get_state())

        # After the initial action selection, use the model inference to
        # continue step_inc states into the future
        for _ in xrange(step_inc):
            action = model.get_action(train_data.get_buffer())
            bbox.do_action(action)
            train_data.update_buffer(bbox.get_state())

        # Check the score delta step_inc steps after the initial action
        end_score = bbox.get_score()
        score_delta = end_score - start_score
        if score_delta > best_score:
            best_score = score_delta
            best_action = action_idx
        bbox.load_from_checkpoint(start_checkpoint)
        train_data.restore_buffer()

    return best_action, best_score
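A hedged usage sketch: the learn_bbox example further down calls this lookup while sampling and commits a training instance only when the model's own choice disagrees (names follow that example):

best_action, score_delta = action_lookup(rnet_model, train_data, step_inc=250)
if rnet_model.get_action(train_data.get_buffer()) != best_action:
    train_data.commit_buffer(best_action, score_delta)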
Example #21
def calc_best_action_using_checkpoint(action_range=50):
	
	# Pretty straightforward: we create a checkpoint and get its ID
	checkpoint_id = bbox.create_checkpoint()
 
	best_action = -1
	best_score = -1e9
 
	for action in range(n_actions):
		for _ in range(action_range): #random.randint(1,100)
			bbox.do_action(action)
		
		if bbox.get_score() > best_score:
			best_score = bbox.get_score()
			best_action = action
 
		bbox.load_from_checkpoint(checkpoint_id)
 
	return best_action
Example #22
def action_lookup(model, train_data, step_inc):
  """
  At any given point, use action_lookup to determine the ideal action
  from the current state. Use the behavior of the model following each
  possible action to determine that which brings the greatest reward.
  :param model: object with a get_action method for action inference
  :param train_data: DataSet object used for bbox state buffering
  :param step_inc: int, the number of state steps to increment for each
    possible action of action_n total actions
  :return: (int, float), the tuple representing the highest scoring
    action
  """

  # Create a checkpoint to revert to after each action lookup
  start_checkpoint = bbox.create_checkpoint()
  # Similarly, create a backup of the DataSet object state buffer
  train_data.backup_buffer()
  best_score = -1e9
  best_action = -1

  # Perform the forward lookup for all valid actions
  for action_idx in xrange(action_n):
    start_score = bbox.get_score()
    bbox.do_action(action_idx)
    train_data.update_buffer(bbox.get_state())

    # After the initial action selection, use the model inference to
    # continue step_inc states into the future
    for _ in xrange(step_inc):
      action = model.get_action(train_data.get_buffer())
      bbox.do_action(action)
      train_data.update_buffer(bbox.get_state())

    # Check the score delta step_inc steps after the initial action
    end_score = bbox.get_score()
    score_delta = end_score - start_score
    if score_delta > best_score:
      best_score = score_delta
      best_action = action_idx
    bbox.load_from_checkpoint(start_checkpoint)
    train_data.restore_buffer()

  return best_action, best_score
Example #23
def get_action_by_state(state, verbose=1):
    global action_to_do
    if verbose:
        #for i in range(n_features):
        #    print ("state[%d] = %f" %  (i, state[i]))
        if bbox.get_time() % 1000 == 0:
            print ("score = {}, time = {}".format(bbox.get_score(), bbox.get_time()))

    action_to_do = action_to_do + 1
    if action_to_do == 4:
        action_to_do = 0
    return action_to_do
def run_bbox(verbose=False):
    has_next = 1
    prepare_bbox()
    # vector of the current state features

    input_var= T.dvector('in_state')
    input_var= T.reshape(input_var,(memtime,n_f+2))

    #Load net into the agent object
    agent=prepare_agent(input_var)

    #What the agent thinks the best choice will be
    attempt = lasagne.layers.get_output(agent)[0]

    #function to do all of the stuff above
    test_fn = theano.function([input_var], attempt)
    # time to check how long it takes to run
    memory = np.zeros(shape=(memtime,n_f+2))
    start = time.time()
    consequence=0
    steps=0
    while has_next:
        memory = forget(memory)
        state = bbox.get_state()
        memory[0][:-2]=state
        choices = test_fn(memory)
        action = np.argmax(choices)
        has_next = bbox.do_action(action)
        score = bbox.get_score()
        consequence=score-consequence
        memory[0][-2:] = [action,consequence]
        steps+=1
        if steps%10000==0:
            score = bbox.get_score()
            print ("Steps: {}".format(steps))
            print ("   current score: {}".format(score))

    print ("Final Score: {}".format(score))
    print ("Time to run: {} seconds".format(time.time()-start))
    bbox.finish(verbose=1)
Example #25
def run_bbox():
	global ensamble
	has_next = 1 
	prepare_bbox()  
	ensamble=Ensemble.NN_Ensemble(n_features,4,[[36,64,4],[16,4],[16,4],[36,64,4]],n_actions)  
	ensamble.read_weights("weights")
	
	while has_next: 
		state = bbox.get_state() 
		action = get_action_by_state(state)
		has_next = bbox.do_action(action)   
		if(bbox.get_time()%10000==0): 
			print(str(bbox.get_time())+" "+str(bbox.get_score()))
	bbox.finish(verbose=1)
def run_bbox():
	has_next = 1
	
	prepare_bbox()

	while has_next:
		best_act = calc_best_action_using_checkpoint()	
		for _ in range(100):
			has_next = bbox.do_action(best_act)

		if bbox.get_time() % 10000 == 0:
			print ("time = %d, score = %f" % (bbox.get_time(), bbox.get_score()))
 
	bbox.finish(verbose=1)
Example #27
    def play(self, action, report_action=False):
        #state = self.state

        printing = False  # SET PRINTING HERE *********************************************************************

        self.action_count = self.action_count + 1
        if report_action and printing:
            print()
            print()
            print("PRE ACTN#%d: time=%fs   total score=%f" %
                  (self.action_count,
                   (dt.datetime.now() - self.time).seconds, bbox.get_score()))
            self.time = dt.datetime.now()
        self.has_next = bbox.do_action(action)
Example #28
def get_action_by_state(state, verbose=0): 
    
    # If verbose = True enable detailed logging to console 
    if verbose: 
        # Print environment state vector 
        for i in range (n_features): 
            print("state[%d] = %f" % (i, state[i]))
            
        # Print current score and time (number of current game step)
        print("score = {}, time={}".format(bbox.get_score(), bbox.get_time()))
        
    # This simple bot always performs action number 0. Not so smart :) 
    # action_to_do = 0
    action_to_do = random.randint(0, 3) # Choose a random integer with value between 0 and 3
    return action_to_do
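For completeness, a minimal sketch of the run loop such a policy function plugs into, using only bbox calls that appear in the other examples (the import name is assumed from the challenge starter code):

import random
import interface as bbox  # assumed module name for the Black Box Challenge API

def run_simple_bot():
    # Load the training level and step the policy until the level ends.
    bbox.load_level("../levels/train_level.data", verbose=1)
    has_next = 1
    while has_next:
        action = get_action_by_state(bbox.get_state())
        has_next = bbox.do_action(action)
    bbox.finish(verbose=1)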
Example #29
def run_bbox():
    global ensamble
    has_next = 1
    prepare_bbox()
    ensamble = Ensemble.NN_Ensemble(
        n_features, 4, [[36, 64, 4], [16, 4], [16, 4], [36, 64, 4]], n_actions)
    ensamble.read_weights("weights")

    while has_next:
        state = bbox.get_state()
        action = get_action_by_state(state)
        has_next = bbox.do_action(action)
        if (bbox.get_time() % 10000 == 0):
            print(str(bbox.get_time()) + " " + str(bbox.get_score()))
    bbox.finish(verbose=1)
Example #30
def run_bbox():
    has_next = 1

    prepare_bbox()

    while has_next:
        best_act = calc_best_action_using_checkpoint()
        for _ in range(100):
            has_next = bbox.do_action(best_act)

        if bbox.get_time() % 10000 == 0:
            print("time = %d, score = %f" %
                  (bbox.get_time(), bbox.get_score()))

    bbox.finish(verbose=1)
Example #31
def get_action_by_state(state, verbose=0):

    # If verbose = True enable detailed logging to console
    if verbose:
        # Print environment state vector
        for i in range(n_features):
            print("state[%d] = %f" % (i, state[i]))

        # Print current score and time (number of current game step)
        print("score = {}, time={}".format(bbox.get_score(), bbox.get_time()))

    # This simple bot always performs action number 0. Not so smart :)
    # action_to_do = 0
    action_to_do = random.randint(
        0, 3)  # Choose a random integer with value between 0 and 3
    return action_to_do
def run_bbox(verbose=False):
    has_next = 1
    prepare_bbox()
    # vector of the current state features
    input_var= T.matrix('in_state')
    input_var= T.reshape(input_var,(1000,n_features))
    #vector of the scores for 100 of the same action
    target_var = T.matrix('scores')
    target_var = T.reshape(target_var,(1000,n_actions))
    #Load net into the agent object
    agent=prepare_agent(input_var)
    #what the agent thinks will happen if it does each action 100 times
    attempt = lasagne.layers.get_output(agent)
    #how much the agent was wrong, and should be punished
    punish = lasagne.objectives.squared_error(attempt,target_var)
    punish = punish.mean()
    #get the parameters for updating
    params = lasagne.layers.get_all_params(agent,trainable=True)
    #update the net with the error
    teach = lasagne.updates.nesterov_momentum(punish,params,learning_rate=0.001,momentum=0.9)
    #function to do all of the stuff above
    train_fn = theano.function([input_var, target_var], punish, updates=teach,on_unused_input='ignore')
    # time to check how long it takes to run
    start = time.time()

    states, scores, loops = load_dataset('Full.txt')
    for n in range(loops):
        error=0
        steps=0
        ins = states[n:n+15]
        out = scores[n:n+15]
        action = np.argmax(out[0])
        error = train_fn(ins,out)
        if n%10000==0:
            score = bbox.get_score()
            print ("Steps: {}".format(steps))
            print ("   training loss: {}".format(error))
            print ("   current score: {}".format(score))
        has_next = bbox.do_action(action)
 
    print ("Time to run: {} seconds".format(time.time()-start))
    np.savez('model.npz', *lasagne.layers.get_all_param_values(agent))
    bbox.finish(verbose=1)
Example #33
    def is_over(self):
        score = bbox.get_score()
        if score > self._epoch_max:
            self._epoch_max = score  # remember max score

        if self._steps >= self.train_steps:
            print("\nover (steps: {}/{}, score: {:.5}/{:.5})".format(self._steps, self.train_steps, score, self._epoch_max))
            print(self._actions_log)

            #if score == self._epoch_prev or score == self._epoch_max:
            self.train_steps += 0.1  # increase steps after a while
            self._epoch_prev = score
            return True

        if score < -1. and score < -self._epoch_max / 2:
            print("\ndead (steps: {}/{}, score: {:.5}/{:.5})".format(self._steps, self.train_steps, score, self._epoch_max))
            print(self._actions_log)
            return True
        return self._is_over
Example #34
    def is_over(self):
        score = bbox.get_score()
        if score > self._epoch_max:
            self._epoch_max = score  # remember max score

        if self._steps >= self.train_steps:
            print("\nover (steps: {}/{}, score: {:.5}/{:.5})".format(self._steps, self.train_steps, score, self._epoch_max))
            print(self._actions_log)

            if score == self._epoch_prev or score == self._epoch_max:
                self.train_steps += 1
            #self.train_steps += 0.1  # slowly increase steps
            self._epoch_prev = score
            return True

        if score < -1. and score < -self._epoch_max / 2:
            print("\ndead (steps: {}/{}, score: {:.5}/{:.5})".format(self._steps, self.train_steps, score, self._epoch_max))
            print(self._actions_log)
            return True
        return self._is_over
Example #35
def run_bbox(verbose=False):
    bbox.load_level("../levels/train_level.data", verbose=True)

    states, actions, scores, rewards = [], [], [], []
    with open('utility_models.pkl', 'rb') as f:
        utility_models = pickle.load(f)

    step = 0
    has_next = 1
    while has_next:
        step += 1
        state = bbox.get_state()
        action = np.random.choice(n_actions)
        utilities = [m.predict([state]) for m in utility_models]
        action = np.argmax(utilities)
        # Do action and bookkeeping
        has_next = bbox.do_action(action)
        states.append(np.array(state))
        actions.append(action)
        score = bbox.get_score()
        rewards.append(score if not scores else (score - scores[-1]))
        scores.append(score)
        if verbose and step % 10000 == 0:
            print(step, score)

    i = 1
    get_outdir = 'run_{}'.format
    outdir = get_outdir(i)
    while os.path.exists(outdir):
        i += 1
        outdir = get_outdir(i)
    os.mkdir(outdir)
    print('saving to {}'.format(outdir))
    scores = np.array(scores, dtype=np.float32)
    scores.tofile(os.path.join(outdir, 'scores'))
    actions = np.array(actions, dtype=np.int8)
    actions.tofile(os.path.join(outdir, 'actions'))
    states = np.array(states, dtype=np.float32)
    states.tofile(os.path.join(outdir, 'states'))

    bbox.finish(verbose=True)
Example #36
def get_action_by_state(state, verbose=False):
    '''
    This is the policy function. It takes the environment state vector and returns an
    action that the agent performs. It suffices to only modify this function to create
    a proper learning agent.
    '''
    
    an_interaction = []
    if verbose: # enables detailed logging
        for i in range(n_features):
            ## Print the environment state vector
#             print ("state[%d] = %f" % (i, state[i]))
            an_interaction.append(state[i])
        ## Print the current score and time (number of current game steps)
        reward = bbox.get_score()
        an_interaction.append(reward)
#         print ("score = {}, time = {}".format(reward, bbox.get_time()))
        
    ## TODO: Change this action
    action_to_do = random.random()
    an_interaction.append(action_to_do)
    interaction_list.append(an_interaction)
    return action_to_do
def run_bbox(verbose=False):
    prepare_bbox()
    # vector of the current state features
    input_var= T.tensor3('memory')
    input_var= T.reshape(input_var,(memtime,1,n_f+2))

    #Score after the agent makes its choice
    reality = T.vector('score_diffs')

    #Load net into the agent object
    agent=prepare_agent(input_var)

    #What the agent thinks its best choice is for this event
    evaluation = lasagne.layers.get_output(agent)[0]

    #how much the agent should be rewarded/punished
    reward = lasagne.objectives.squared_error(evaluation,reality)
    reward = reward.mean()

    #get the parameters for updating
    params = lasagne.layers.get_all_params(agent,trainable=True)

    #update the net with the error
    teach = lasagne.updates.nesterov_momentum(reward,params,learning_rate=0.01,momentum=0.9)

    #A function to get the agent's choice of what to try this time
    decide_fn = theano.function([input_var],evaluation)

    #function to do all of the stuff above
    train_fn = theano.function([input_var,reality], reward, updates=teach,on_unused_input='ignore')

    # time to check how long it takes to run
    start = time.time()
    for epoch in range(epochs):
        memory = np.zeros(shape=(memtime,1,n_f+2))
        e_time = time.time() #time for this epoch
        has_next = 1 #looping variable, state of bbox
        #initialize tracking variables
        consequence=error=0
        steps=0
        trust=0.00+.02*epoch
        good=0
        while has_next:
            #Updating memory matrix, forgetting a state, making room
            memory = forget(memory) 
            state = bbox.get_state()
            #get best action based on 100 step checkpoint method
            actuals = get_all_score_diffs(state)
            #upload new state, with no score or action chosen
            memory[0][0][:-2] = state
            if rand.random()>trust:
                action = rand.randint(0,n_a-1) #if trust is too low still, random action
            else:
                choices = decide_fn(memory) #Otherwise, let the agent decide. 
                action = np.argmax(choices) #pick action agent thinks is best


            if action == np.argmax(actuals):
                good = good+1
            #do it, and find out the consequences (if the score improved or went down)
            has_next = bbox.do_action(action)
            #find the consequence (score change)
            score = bbox.get_score()
            consequence=score-consequence
            #train on choices just made and memory
            memory[0][0][-2:]=[action,consequence]

            error += train_fn(memory,actuals) #train based on the score change

            #updating for next loop
            steps += 1

            #occasionally check in on progress
            if steps%10000==0:
                score = bbox.get_score()
                print ("Epoch: {}".format(epoch))
                print ("Steps: {}".format(steps))
                print ("   current trust: {}".format(trust))
                print ("   avg error: {}".format(error/steps))
                print ("   bad choices: {}%".format(100-float(good)/100))
                print ("   current score: {}".format(score))
                if trust<.95:
                    trust = trust+.02
                bbox.clear_all_checkpoints()
                ch=ra=good=0

        #report on model quality on previous epoch
        score = bbox.get_score()
        with open("epoch_data.txt","a") as f:
            f.write("Epoch: {}    Final Score: {}    Average Error: {}    Time to Run: {} min\n".format(epoch, score, error/steps, (time.time()-e_time)/60))
        #save model parameters
        np.savez('model_LSTM_cost.npz', *lasagne.layers.get_all_param_values(agent))
        #reset box for next epoch
        if(epoch<epochs-1):
            bbox.reset_level()

    print ("Time to run: {} hours".format((time.time()-start)/3600))
    bbox.finish(verbose=1)
Example #38
def run_bbox(verbose=False, epsilon=0.1, gamma=0.99, action_repeat=4, update_frequency=4, batchSize=32, buffer=100000, load_weights=False, save_weights=False):
    has_next = 1
    
    # Prepare environment - load the game level
    prepare_bbox()
    
    update_frequency_cntr = 0
    replay = []
    h=0
    if load_weights:
        model.load_weights('my_model_weights.h5')
        model_prim.load_weights('my_model_weights.h5')
    #stores tuples of (S, A, R, S')
 
    while has_next:
        # Get current environment state
        state = copy.copy(bbox.get_state())
        prev_reward = copy.copy(bbox.get_score())
        
        #Run the Q function on S to get predicted reward values on all the possible actions
        qval = model.predict(state.reshape(1,n_features), batch_size=1)
 
        # Choose an action to perform at current step
        if random.random() < epsilon: #choose random action or best action
            if random.random() < 0.5:
                action = np.random.randint(0,n_actions) #assumes 4 different actions
            else: # Use checkpoints to prime network with good actions
                action_range=50 #random.randint(1,200)
                action = calc_best_action_using_checkpoint(action_range=action_range)
                #for _ in range(action_range):
                #    has_next = bbox.do_action(action)
        else: #choose best action from Q(s,a) values
            action = (np.argmax(qval))


        # Perform chosen action, observe new state S'
        # Function do_action(action) returns False if level is finished, otherwise returns True.
        for a in range(action_repeat):
            has_next = bbox.do_action(action)
        new_state = copy.copy(bbox.get_state())
        reward = copy.copy(bbox.get_score()) - prev_reward
        #reward = 1.0 if reward > 0.0 else -1.0 #this gives better than random when combined with a small network

        #Experience replay storage
        if (len(replay) < buffer): #if buffer not filled, add to it
            replay.append((state, action, reward, new_state))
        else: #if buffer full, overwrite old values
            if (h < (buffer-1)):
                h += 1
            else:
                h = 0
            replay[h] = (state, action, reward, new_state)

            #randomly sample our experience replay memory
            minibatch = random.sample(replay, batchSize)
            X_train = []
            y_train = []
            for memory in minibatch:
                #Get max_Q(S',a)
                old_state, action, reward, new_state = memory
                old_qval = model.predict(old_state.reshape(1,n_features), batch_size=1)
                newQ = model.predict(new_state.reshape(1,n_features), batch_size=1)
                maxQ = np.max(newQ)
                y = np.zeros((1,n_actions))
                y[:] = old_qval[:]
                if has_next == 1: #non-terminal state
                    update = (reward + (gamma * maxQ))
                else: #terminal state
                    update = reward
                y[0][action] = update
                X_train.append(old_state)
                y_train.append(y.reshape(n_actions,))

            X_train = np.array(X_train)
            y_train = np.array(y_train)
            # update the weights of a copy of the network
            model_prim.fit(X_train, y_train, batch_size=batchSize, nb_epoch=1, verbose=0)
            if update_frequency_cntr >= update_frequency:
                prim_weights = model_prim.get_weights()
                print('model update')
                model.set_weights(prim_weights)
                update_frequency_cntr = 0
            update_frequency_cntr += 1

        if bbox.get_time() % 500000 == 0:
            print ("time = %d, score = %f" % (bbox.get_time(), bbox.get_score()))


    # Finish the game simulation, print earned reward and save weights
    if save_weights:
        model_prim.save_weights('my_model_weights.h5', overwrite=True)
    bbox.finish(verbose=1)
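A hedged usage note for the trainer above: a single call runs one full pass over the level and can persist the target-network weights for the next run.

# Sketch only; the hyperparameters shown are the defaults from the signature above.
run_bbox(verbose=True, epsilon=0.1, gamma=0.99, save_weights=True)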
def run_bbox(verbose=False):
    prepare_bbox()

    # vector of the current state features
    input_var= T.matrix('memory')
    input_var= T.reshape(input_var,(memtime,n_f+2))

    #Score after the agent makes its choice
    reality = T.scalar('consequence')

    #Load net into the agent object
    agent=prepare_agent(input_var)

    #What the agent thinks the best choice will be
    attempt = T.max(lasagne.layers.get_output(agent))

    #how much the agent should be rewarded/punished
    reward = lasagne.objectives.squared_error(attempt,reality)

    #get the parameters for updating
    params = lasagne.layers.get_all_params(agent,trainable=True)

    #update the net with the error
    teach = lasagne.updates.nesterov_momentum(reward,params,learning_rate=0.1,momentum=0.9)

    #function to do all of the stuff above I DON'T HAVE A TARGET??
    train_fn = theano.function([input_var,reality], reward, updates=teach,on_unused_input='ignore')

    # time to check how long it takes to run
    memory = np.zeros(shape=(memtime,n_f+2))
    start = time.time()
    scores_per_epoch = np.zeros(epochs)
    for epoch in range(epochs):
        e_time = time.time() #time for this epoch
        has_next = 1 #looping variable, state of bbox

        #initialize tracking variables
        consequence=0
        self_assessment=0
        steps=0
        trust=0.00
        while has_next:

            #Updating memory matrix, forgetting a state, making room
            memory = forget(memory) 
            state = bbox.get_state()
            #upload new state, with no score or action chosen
            memory[0][:-2] = state
            if rand.random() > trust:
                action = rand.randint(0,n_a-1) #if trust is too low still, random action
            else:
                choices = lasagne.layers.get_output(agent, memory) #Otherwise, let the agent decide.
                action = np.argmax(choices) #pick action agent thinks is best
            
            #do it, and find out the consequences (if the score improved or went down)
            has_next = bbox.do_action(action)
            consequence = bbox.get_score()-consequence 
            
            #train on choices just made and memory
            memory[0][-2:]=[action,consequence]
            train_fn(memory,consequence) #train based on the score change
            
            #updating for next loop
            self_assessment += consequence
            steps += 1

            #occasionally check in on progress
            if steps%10000==0:
                trust = trust+.01
                score = bbox.get_score()
                print ("Epoch: {}".format(epoch))
                print ("Steps: {}".format(steps))
                print ("   self assessment: {}".format(self_assessment))
                print ("   trust: {}".format(trust))
                print ("   current score: {}".format(score))
        #report on model quality on previous epoch
        score = bbox.get_score()
        print ("Epoch: {}".format(epoch))
        print ("Final Score: {}".format(score))
        print ("Time to Run: {} minutes".format((time.time()-e_time)/60))
        scores_per_epoch[epoch] = score

        #reset box for next epoch
        bbox.reset_level()

    print ("All scores per epoch: ")
    print (scores_per_epoch)
    print ("Time to run: {} hours".format((time.time()-start)/3600))
    np.savez('model_mem.npz', *lasagne.layers.get_all_param_values(agent))
    bbox.finish(verbose=1)
Example #40
def prepare_bbox():
    # Reconstructed def line: the original snippet begins mid-function, and the
    # __main__ block below calls prepare_bbox().
    if bbox.is_level_loaded():
        bbox.reset_level()
    else:
        bbox.load_level("../levels/train_level.data", verbose=1)
        n_features = bbox.get_num_of_features()
        n_actions = bbox.get_num_of_actions()


def get_action_by_state(state):
#    return np.random.randint(0, 4)
    return 0

if __name__ == "__main__":
    has_next = 1
    prepare_bbox()
    prev_score = bbox.get_score()
    steps = 0

    states = []

    while has_next and steps < 100:
        state = bbox.get_state()
        states.append(state)
        v = map(lambda f: "%.2f" % abs(f), state)
        print(" ".join(v))
        action = get_action_by_state(state)
        has_next = bbox.do_action(action)
        score = bbox.get_score()
        prev_score = score
        steps += 1
Example #41
 def get_score(self):
     return bbox.get_score()
Example #42
def run_bbox(verbose=False):
    bbox.load_level("../levels/train_level.data", verbose=True)

    states, actions, scores, rewards = [], [], [], []
    utility_models = [
        SGDRegressor(learning_rate='constant',
                     #penalty='elasticnet',
                     ) for _ in range(n_actions)
    ]
    zero_utilities = np.zeros([n_actions])

    n_past_act = 1
    n_past_st = 0  # in addition to current
    discount = 0.9
    random_steps = 10000

    step = 0
    has_next = 1
    while has_next:
        step += 1
        state = bbox.get_state()
        utilities = zero_utilities
        # Choose action using current utility_models
        if step > random_steps:
            clf_state = np.concatenate(states[-n_past_st:] + [state]) \
                        if n_past_st else state
            try:
                utilities = np.array(
                    [m.predict([clf_state])[0] for m in utility_models])
            except NotFittedError:
                pass
        #utilities -= utilities.min()
        #p = None if np.isclose(utilities, 0).all() else \
        #    utilities / utilities.sum()
        if np.random.rand() < 0.1 or step <= random_steps:
            action = np.random.choice(n_actions)
        else:
            action = np.argmax(utilities)
        # Do action and bookkeeping
        has_next = bbox.do_action(action)
        states.append(np.array(state))
        actions.append(action)
        score = bbox.get_score()
        rewards.append(score if not scores else (score - scores[-1]))
        scores.append(score)
        # Train classifiers
        if len(rewards) >= n_past_act + n_past_st:
            total_reward = sum(r * np.power(discount, i)
                               for i, r in enumerate(rewards[-n_past_act:]))
            if n_past_act == 1:
                clf_state = np.concatenate(states[-(n_past_act + n_past_st):])
            else:
                clf_state = np.concatenate(
                    states[-(n_past_act + n_past_st):-n_past_act + 1])
            utility_models[actions[-n_past_act]].partial_fit([clf_state],
                                                             [total_reward])
        if verbose and step % 1000 == 0:
            print(step, score)

    i = 1
    get_outdir = 'run_{}'.format
    outdir = get_outdir(i)
    while os.path.exists(outdir):
        i += 1
        outdir = get_outdir(i)
    os.mkdir(outdir)
    print('saving to {}'.format(outdir))
    scores = np.array(scores, dtype=np.float32)
    scores.tofile(os.path.join(outdir, 'scores'))
    actions = np.array(actions, dtype=np.int8)
    actions.tofile(os.path.join(outdir, 'actions'))
    states = np.array(states, dtype=np.float32)
    states.tofile(os.path.join(outdir, 'states'))

    bbox.finish(verbose=True)
Example #43
 def getReward(self):
     cur_reward = self.lastreward
     self.lastreward = bbox.get_score()
     print('lastreward', self.lastreward)
     return cur_reward
Example #44
 def get_score(self):
     return bbox.get_score()
Example #45
 def reward(self):
     reward = bbox.get_score() - self._prev_score
     return reward
Example #46
 def reward(self):
     reward = bbox.get_score() - self._prev_score
     return reward
Example #47
def learn_bbox(rnet_model,
               train_data,
               update_inc=5000,
               lookup_inc=250,
               seed_data=False):
    """
    Add training instances to train_data from a single run-through of a
    bbox session.
    :param rnet_model: model object with get_lreg_action and get_action
        methods
    :param train_data: DataSet object used to buffer states and append
        new training instances
    :param update_inc: int, number of steps between each nnet model update
    :param lookup_inc: int, number of forward action lookup steps
    :param seed_data: boolean, if True, best_action is taken from the lreg
        model instead of the forward action lookup
    :return: int, the number of action errors, i.e. differences between
        actions produced by the rnet_model and the ideal or seed model
    """
    has_next = 1
    error_count = 0
    rand_count = 0
    rand_idx = rand_n

    prepare_bbox()
    # For each new state in the session, add it to the data set's state
    # buffer so that historical states are included in a commit event
    train_data.clear_buffer()
    current_state = bbox.get_state()
    train_data.update_buffer(current_state)

    while has_next:
        # If all random values have been used, generate a new batch
        if rand_idx >= (rand_n - 1):
            rand_vals = numpy.random.random_sample(size=(rand_n))
            rand_idx = 0

        step_count = bbox.get_time()
        # Get the next action from the model based on the current set of
        # buffered states
        action = rnet_model.get_action(train_data.get_buffer())

        # Every update_inc steps train the model's network with newly
        # acquired training data
        if step_count % update_inc == 0:
            rn_model.run_training(train_data,
                                  max_steps=update_nnet,
                                  restore=True)
            error_count = 0
            rand_count = 0
        # If the random value is less than or equal to the sample
        # probability, sample the current session state and determine the
        # best action, adding it to the training set if necessary
        elif rand_vals[rand_idx] <= sample_prob:
            if seed_data:
                best_action = rnet_model.get_lreg_action(current_state)
                score_delta = 0.1
            else:
                best_action, score_delta = action_lookup(
                    rnet_model, train_data, lookup_inc)
            if action != best_action:
                train_data.commit_buffer(best_action, score_delta)
                error_count += 1
            rand_count += 1
        # Add random variation to the session by performing a random action
        # if less than or equal to perturb probability
        if rand_vals[rand_idx + 1] <= perturb_prob:
            action = numpy.random.randint(0, 4)
            step_inc = numpy.random.randint(rand_min, rand_max)
            for _ in xrange(step_inc):
                has_next = bbox.do_action(action)
                current_state = bbox.get_state()
                train_data.update_buffer(current_state)
        else:
            has_next = bbox.do_action(action)
            current_state = bbox.get_state()
            train_data.update_buffer(current_state)

        rand_idx += 2
        if step_count % 5000 == 0:
            print("time = %d, score = %f" % (step_count, bbox.get_score()))
            print("errors = %d, samples = %d" % (error_count, rand_count))
            #rn_model.print_stats()

    bbox.finish(verbose=1)
    return error_count
Example #48
def get_action_by_state(state):
    action = seq[bbox.get_time() % len(seq)] #random.randint(0, n_actions-1)
    if bbox.get_time() % 1000 == 0:
        print(bbox.get_time(), bbox.get_score())
    print(state)
    return action
Example #49
def main():
    epsilon = .1  # exploration
    num_actions = 4
    input_size = 36
    hidden_size = 24
    activation = 'relu'
    max_memory = 2000
    batch_size = 50
    mini_epoch = 5
    epoch = 10

    model = Sequential()
    model.add(
        Dense(hidden_size, input_shape=[input_size], activation=activation))
    model.add(Dense(hidden_size, activation=activation))
    model.add(Dense(num_actions))
    model.compile('adam', 'mse')

    # model.load_weights('model.h5')

    # Define environment/game
    bbox.load_level('../levels/train_level.data', verbose=True)

    # Initialize experience replay object
    exp_replay = ExperienceReplay(max_memory=max_memory)

    # FIXME
    #states = np.fromfile('run_random/states', dtype=np.float32)\
    #    .reshape([1214494, 36])
    #scaler = preprocessing.StandardScaler()
    #scaler.fit(states)
    #with open('scaler.pkl', 'wb') as f:
    #    scaler = pickle.dump(scaler, f, protocol=-1)
    with open('scaler.pkl', 'rb') as f:
        scaler = pickle.load(f)

    # Train
    for e in range(epoch):
        loss = 0.
        bbox.reset_level()
        game_over = False
        # get initial input
        get_state = lambda: scaler.transform(np.array([bbox.get_state()]))[0]
        input_t = get_state()
        score = 0
        step = 0
        report_steps = 100

        while not game_over:
            step += 1
            input_tm1 = input_t
            # get next action
            if np.random.rand() <= epsilon:
                action = np.random.randint(0, num_actions)
            else:
                q = model.predict(np.array([input_tm1]))[0]
                action = np.argmax(q)

            # apply action, get rewards and new state
            game_over = not bbox.do_action(action)
            input_t = get_state()
            new_score = bbox.get_score()
            reward = new_score - score
            score = new_score

            # store experience
            exp_replay.remember([input_tm1, action, reward, input_t],
                                game_over)

            # adapt model
            for _ in range(mini_epoch):
                inputs, targets = exp_replay.get_batch(model,
                                                       batch_size=batch_size)
                loss += model.train_on_batch(inputs, targets)[0]

            if step % report_steps == 0:
                print('Step {:07d} | Loss {:.4f} | Score {}'.format(
                    step, loss / (report_steps * mini_epoch), score))
                loss = 0.

        print('Epoch {:03d}/{} | Score {}'.format(e, epoch - 1, score))

    # Save trained model weights
    model.save_weights('q_model.h5', overwrite=True)
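    # Hedged follow-up sketch (not part of the original example): reload the
    # saved weights and replay the level greedily, with no exploration, to
    # sanity-check the final score, reusing only calls that appear above.
    model.load_weights('q_model.h5')
    bbox.reset_level()
    input_t = get_state()
    game_over = False
    while not game_over:
        q = model.predict(np.array([input_t]))[0]
        game_over = not bbox.do_action(np.argmax(q))
        input_t = get_state()
    print('Greedy evaluation score: {}'.format(bbox.get_score()))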