Example #1
import pickle
from pcca import PCCA

# Load the pickled transition matrix
f = open('tmatrixperfect.dat','r')
a = pickle.Unpickler(f)
tm = a.load()
f.close()

# Run PCCA to obtain the chi (membership) matrix
pccaobj = PCCA(True)
chi_mat = pccaobj.pcca(tm)

print tm.shape
print chi_mat.shape

# Save the chi matrix for later use
outfile = open('chi_mat.dat','w')
pickle.dump(chi_mat,outfile)
outfile.close()
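
The pcca() call returns a soft-membership (chi) matrix; the agent examples below turn it into a hard clustering by assigning each state to its highest-membership column. A minimal sketch of that step, continuing from the snippet above and assuming chi_mat comes back as an (n_states, n_clusters) NumPy array:

import numpy as np

# Illustrative only: hard-assign each state to the abstract state (cluster)
# with the largest membership, as the agent examples below do.
chi = np.asarray(chi_mat)
abs_state_membership = chi.argmax(axis=1)
states_in_abs_state = [np.where(abs_state_membership == j)[0].tolist()
                       for j in range(chi.shape[1])]
print abs_state_membership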
Example #2
class MarioAgent(Agent):

    # related to options
    q_stepsize = 0.1
    q_epsilon = 0.1
    q_gamma = 0.9
    value_function = None
    optionCurrentlyOn = False
    normalizationC = 0.5
    currentOptionTime = 0
    currentOptionReward = 0.0
    numActions = 12
    randGenerator=random.Random()


    def agent_init(self,taskSpecString):
        self.policy_frozen = False
        self.total_steps = 0
        self.trial_start = 0.0
        self.total_steps = 0
        self.step_number = 0
        self.exp = 0.75 #Exploration factor
        self.substrate_mode = False
        if (self.substrate_mode):
            self.state_dim_x = 33
            self.state_dim_y = 7
        else:
            self.state_dim_x = 20
            self.state_dim_y = 12

        self.last_state = None
        self.last_action = None
        random.seed(0)
        
        self.Q = Deep_QNN(nactions=12, input_size=(self.state_dim_x*self.state_dim_y), max_experiences=500, gamma=0.6, alpha=0.2)
        self.T = TNN(input_size= 4, max_experiences=500, alpha=0.2)

        self.discretization_done = False
        self.n_bins = 10
        self.n_disc_states = (self.n_bins + 3)*(self.n_bins + 3)
        self.phi_mat = np.zeros((self.n_disc_states,12,self.n_disc_states),dtype=int)
        self.U_mat = np.zeros((self.n_disc_states,12,self.n_disc_states),dtype=float)

        self.regularization_constant = 0.4 #For rewards incorporated into transition structure
        self.episodesRun = 2000 #This is just used to generate the file names of the stored data

        #The (probably) incorrect U_mat that Peeyush writes in his algo
        self.Peey_U_mat = np.zeros((self.n_disc_states,12,self.n_disc_states),dtype=float)

        self.last_enc_state = None
        self.last_enc_action = None

        self.seen_expanded_states = {} #A dictionary of all the non-approximated (original Mario input) states that were seen.

        #####################################################################

        # Obtain T & P matrices

        self.using_compressed_mats = True #Set true if we're ignoring discretized states that aren't visited
        if self.using_compressed_mats:
            tmatfile = open('comp_noreward_t_mat' + str(self.episodesRun) + '.dat','r')
        else:
            tmatfile = open('noreward_t_mat' + str(self.episodesRun) + '.dat','r')
        unpickler = pickle.Unpickler(tmatfile)
        self.t_mat = unpickler.load()

        if self.using_compressed_mats:
            pmatfile = open('comp_noreward_p_mat' + str(self.episodesRun) + '.dat','r')
        else:
            pmatfile = open('noreward_p_mat' + str(self.episodesRun) + '.dat','r')
        unpickler = pickle.Unpickler(pmatfile)
        self.p_mat = unpickler.load()

        validstatesfile = open('comp_valid_states'+str(self.episodesRun)+'.dat','r') 
        unpickler = pickle.Unpickler(validstatesfile)
        self.valid_states = unpickler.load()

        # Run PCCA to obtain Chi matrix
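        # chi_mat has one row per discretized state and one column per abstract state
        # (cluster); entry (s, j) is the degree of membership of state s in cluster j.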
        self.clusterer = PCCA(True)
        self.chi_mat = np.mat(self.clusterer.pcca(self.t_mat))

        #Converting to np.mat format for consistency with 3 room code. Doing it only here because pcca needs it as ndarray
        self.t_mat = np.mat(self.t_mat)

        # 0,1,2,..11 - primitive actions, 12... - options
        self.value_function=[(self.chi_mat.shape[1]+self.numActions)*[0.0] for i in range(self.chi_mat.shape[0])]

        self.absStateMembership = []
        self.statesInAbsState = [[] for i in xrange(self.chi_mat.shape[1])]
        for (row_i,row) in enumerate(self.chi_mat):
            self.absStateMembership.append(row.argmax())
            self.statesInAbsState[row.argmax()].append(row_i)

#        self.valid_states =  NOT NECESSARY IN MARIO

        # Coarse-grained connectivity between abstract states: chi^T * T * chi
        self.connect_mat = self.chi_mat.T*self.t_mat*self.chi_mat

    def egreedy(self, state):
        # Epsilon-greedy over the 12 primitive actions plus one option per abstract state
        if self.randGenerator.random() < self.q_epsilon:
            return self.randGenerator.randint(0,self.chi_mat.shape[1]+self.numActions-1)

        # Greedy choice; shuffle first so that ties are broken at random
        temp = [(i,v) for i,v in enumerate(self.value_function[state])]
        random.shuffle(temp)
        a = max(temp,key=itemgetter(1))[0]

        return a

    def agent_start(self,observation):
        self.step_number = 0
        self.trial_start = time.clock()

        # When learning to play with options
        if self.option_learning_frozen == False:
            self.optionCurrentlyOn = False
            theState=self.getDiscretizedState(tuple(self.Q.getHiddenLayerRepresentation(self.stateEncoder(observation))))
            s = theState #valid_states is not needed in Mario
#            s = self.valid_states.index(theState) # row index

            # Choose either a primitive action or option
            a = self.egreedy(s)

            if a<self.numActions:
                # Primitive action
                thisIntAction = a
                self.optionCurrentlyOn = False
                print 'Primitive action chosen'

            else:    
                # Composing an option from S_i to S_j
                self.optionCurrentlyOn = True
                self.currentOptionTime = 0
                self.curentOptionStartState = s
                self.currentOptionReward = 0.0

                # 1. Find the abstract state you belong to & going to
                self.option_S_i = self.absStateMembership[s] # initiation step
                self.option_S_j = a-self.numActions # actually, we will have to choose S_j based on SMDP

                #print 'Shape of first term: ',self.p_mat[s][0].shape
                #print self.option_S_j
                #print 'Shape of second term: ', (self.chi_mat.T[self.option_S_j]).T.shape

                #print 'Debug:'
                #print self.chi_mat[0,0]

                # 2. Choose action based on membership ascent
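                # Preference for primitive action a: expected membership of the next state
                # in the target abstract state S_j (p_mat[s][a] dotted with chi[:, S_j])
                # minus the current membership chi[s, S_j], scaled by normalizationC and
                # clipped at 0; the action with the largest expected gain is taken.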
                thisIntAction=1
                maxVal = 0
                for a in xrange(self.numActions):
                    action_pref = max(self.normalizationC*(np.sum(np.dot(np.array(self.p_mat[s][a]),np.array(self.chi_mat.T[self.option_S_j].T))) - self.chi_mat[s,self.option_S_j]),0)
                    print 'Action: ',a,' ',action_pref
                    if action_pref > maxVal:
                        thisIntAction = a
                        maxVal = action_pref
                print 'Option chosen'

                self.currentOptionTime += 1

            print 'Action chosen: ',thisIntAction

            act=self.actionDecoder(thisIntAction)

        # When learning transition probabilities
        if self.transition_learning_frozen == False:
            act = self.getAction(observation)
            if self.discretization_done:
                enc_state = tuple(self.Q.getHiddenLayerRepresentation(self.stateEncoder(observation)))
                self.last_disc_state = self.getDiscretizedState(enc_state)
                self.last_enc_action = self.actionEncoder(act)

        # When using QNN
        if self.policy_frozen == False:
            self.seen_expanded_states[tuple(self.stateEncoder(observation))] = True
            act = self.getAction(observation)

        self.last_action = copy.deepcopy(act)
        self.last_state  = copy.deepcopy(observation)    

        return act
 
    def agent_step(self,reward, observation):
        self.step_number += 1
        self.total_steps += 1

        # When learning to play with options
        if self.option_learning_frozen == False:        
            newState = self.getDiscretizedState(tuple(self.Q.getHiddenLayerRepresentation(self.stateEncoder(observation))))
            lastState = self.getDiscretizedState(tuple(self.Q.getHiddenLayerRepresentation(self.stateEncoder(self.last_state))))
            lastAction = self.last_action.intArray[0]

            s = newState
#            s = self.valid_states.index(newState) # row index

            # Check if an option is going on
            if self.optionCurrentlyOn:

                # add reward to ongoing option
                self.currentOptionReward += reward

                # Decide whether to terminate option
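                # Membership-ratio termination: as chi[s, S_i] shrinks (leaving the source
                # abstract state) and chi[s, S_j] grows (entering the target), the log ratio
                # grows, so the option terminates with higher probability (capped at 1).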
                beta = min(math.log(self.chi_mat[s,self.option_S_i])/math.log(self.chi_mat[s,self.option_S_j]),1)
                if self.randGenerator.random() < beta:
                    self.optionCurrentlyOn = False
                    print 'Terminated option'

                    # Update Q value of terminated option
                    Q_sa=self.value_function[self.curentOptionStartState][self.numActions+self.option_S_j] # options are indexed after the 12 primitive actions
                    max_Q_sprime_a = max(self.value_function[newState])     
                    new_Q_sa=Q_sa + self.q_stepsize  * (self.currentOptionReward + math.pow(self.q_gamma,self.currentOptionTime) * max_Q_sprime_a - Q_sa)
                    self.value_function[self.curentOptionStartState][self.numActions+self.option_S_j]=new_Q_sa                

                    # Choose either a primitive action or option
                    a = self.egreedy(s)

                    if a<self.numActions:
                        # Primitive action
                        newIntAction = a
                        self.optionCurrentlyOn = False
                        print 'Primitive action chosen'

                    else:    
                        # Composing an option from S_i to S_j
                        self.optionCurrentlyOn = True
                        self.currentOptionTime = 0
                        self.curentOptionStartState = s
                        self.currentOptionReward = 0.0

                        # 1. Find the abstract state you belong to & going to
                        self.option_S_i = self.absStateMembership[s] # initiation step
                        self.option_S_j = a-self.numActions # actually, we will have to choose S_j based on SMDP

                        # 2. Choose action based on membership ascent
                        newIntAction=1
                        maxVal = 0
                        for a in xrange(self.numActions):
                            action_pref = max(self.normalizationC*(np.sum(np.dot(np.array(self.p_mat[s][a]),np.array(self.chi_mat.T[self.option_S_j].T))) - self.chi_mat[s,self.option_S_j]),0)
                            print 'Action: ',a,' ',action_pref
                            if action_pref > maxVal:
                                newIntAction = a
                                maxVal = action_pref
                        print 'Option chosen'
                        self.currentOptionTime += 1

                else:
                    # If not terminated, choose action based on membership ascent
                    newIntAction=1
                    maxVal = 0
                    for a in xrange(self.numActions):
                        action_pref = max(self.normalizationC*(np.sum(np.dot(np.array(self.p_mat[s][a]),np.array(self.chi_mat.T[self.option_S_j].T))) - self.chi_mat[s,self.option_S_j]),0)
                        print 'Action: ',a,' ',action_pref
                        if action_pref > maxVal:
                            newIntAction = a
                            maxVal = action_pref
                    self.currentOptionTime += 1

            # No option currently running
            else:

                # update Q-value of last primitive action
                Q_sa=self.value_function[lastState][lastAction]
                max_Q_sprime_a = max(self.value_function[newState])     
                new_Q_sa=Q_sa + self.q_stepsize  * (reward + self.q_gamma * max_Q_sprime_a - Q_sa)
                self.value_function[lastState][lastAction]=new_Q_sa

                # Choose either a primitive action or option
                a = self.egreedy(s)

                if a<self.numActions:
                    # Primitive action
                    newIntAction = a
                    self.optionCurrentlyOn = False
                    print 'Primitive action chosen'

                else:    
                    # Composing an option from S_i to S_j
                    self.optionCurrentlyOn = True
                    self.currentOptionTime = 0
                    self.curentOptionStartState = s
                    self.currentOptionReward = 0.0

                    # 1. Find the abstract state you belong to & going to
                    self.option_S_i = self.absStateMembership[s] # initiation step
                    self.option_S_j = a-self.numActions # actually, we will have to choose S_j based on SMDP

                    # 2. Choose action based on membership ascent
                    newIntAction=1
                    maxVal = 0
                    for a in xrange(self.numActions):
                        action_pref = max(self.normalizationC*(np.sum(np.dot(np.array(self.p_mat[s][a]),np.array(self.chi_mat.T[self.option_S_j].T))) - self.chi_mat[s,self.option_S_j]),0)
                        print 'Action: ',a,' ',action_pref
                        if action_pref > maxVal:
                            newIntAction = a
                            maxVal = action_pref
                    print 'Option chosen'
                    self.currentOptionTime += 1
            
            print 'Action chosen: ',newIntAction

            act = self.actionDecoder(newIntAction)

        # When learning transition probabilities
        if self.transition_learning_frozen == False:
            act = self.getAction(observation)
            enc_state = tuple(self.Q.getHiddenLayerRepresentation(self.stateEncoder(observation)))
            disc_state = self.getDiscretizedState(enc_state)
            enc_action = self.actionEncoder(act)        
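            # phi_mat accumulates raw (s, a, s') transition counts; U_mat accumulates the
            # reward-regularized weight exp(-regularization_constant * |reward|), so
            # high-|reward| transitions contribute less to the learned transition structure.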
            self.phi_mat[self.last_disc_state][self.last_enc_action][disc_state] += 1
            self.U_mat[self.last_disc_state][self.last_enc_action][disc_state] += math.exp(-self.regularization_constant*abs(reward))
            self.Peey_U_mat[self.last_disc_state][self.last_enc_action][disc_state] += self.phi_mat[self.last_disc_state][self.last_enc_action][disc_state]*math.exp(-self.regularization_constant*abs(reward))
            """
            if disc_state not in self.:
                self.transition_matrix[disc_state] = {}
            if self.disc_enc_action:
                if self.last_enc_action not in self.transition_matrix[self.last_enc_state]:
                    self.transition_matrix[self.last_enc_state][self.last_enc_action] = {enc_state:1}
                elif enc_state not in self.transition_matrix[self.last_enc_state][self.last_enc_action]:
                    self.transition_matrix[self.last_enc_state][self.last_enc_action][enc_state] = 1
                else:
                    self.transition_matrix[self.last_enc_state][self.last_enc_action][enc_state] += 1
            """
            self.last_enc_state = enc_state
            self.last_enc_action = enc_action
            self.last_disc_state = disc_state

        # When using QNN
        if self.policy_frozen == False:
            act = self.getAction(observation)    
            self.seen_expanded_states[tuple(self.stateEncoder(observation))] = True
            self.update(observation, act, reward)

        self.last_action = copy.deepcopy(act)
        self.last_state  = copy.deepcopy(observation)    

        return act
    
    def agent_end(self,reward):
        time_passed = time.clock() - self.trial_start
        print "ended after " + str(self.total_steps) + " total steps"
        print "average " + str(self.step_number/time_passed) + " steps per second"
    
        # When learning to play using options
        if self.option_learning_frozen == False:
            lastState = self.getDiscretizedState(tuple(self.Q.getHiddenLayerRepresentation(self.stateEncoder(self.last_state))))
            lastAction = self.last_action.intArray[0]

            if self.optionCurrentlyOn:
                self.currentOptionReward += reward
                # Update Q value of terminated option
                Q_sa=self.value_function[self.curentOptionStartState][self.numActions+self.option_S_j] # options are indexed after the 12 primitive actions
                new_Q_sa=Q_sa + self.q_stepsize  * (self.currentOptionReward - Q_sa)
                self.value_function[self.curentOptionStartState][self.numActions+self.option_S_j]=new_Q_sa                
            else:
                # update Q-value of last primitive action
                Q_sa=self.value_function[lastState][lastAction]    
                new_Q_sa=Q_sa + self.q_stepsize  * (reward - Q_sa)
                self.value_function[lastState][lastAction]=new_Q_sa


    def agent_cleanup(self):
        pass
    
    def agent_freeze(self):
        pass
    
    def agent_message(self,inMessage):
        if inMessage.startswith("freeze_learning"):
            self.policy_frozen=True
            return "message understood, policy frozen"
        if inMessage.startswith("unfreeze_learning"):
            self.policy_frozen=False
            return "message understood, policy unfrozen"
        if inMessage.startswith("set_exploring"):
            splitString=inMessage.split(" ")
            self.exp = float(splitString[1])
            return "message understood, setting exploration factor"
        if inMessage.startswith("save_policy"):
            splitString=inMessage.split(" ")
            self.saveQFun(splitString[1])
            print "Q function saved."
            return "message understood, saving policy"
        if inMessage.startswith("load_policy"):
            splitString=inMessage.split(" ")
            self.loadQFun(splitString[1])
            print "Q function loaded."
            return "message understood, loading policy"
        if inMessage.startswith("use_impactful_experiences"):
            self.Q.use_impactful = True
            return "message understood, using impactful experiences"
        if inMessage.startswith("use_all_experiences"):
            self.Q.use_impactful = False
            return "message understood, using all experiences"
        if inMessage.startswith("reset_q"):
            self.Q = QNN(nactions=12, input_size=(self.state_dim_x*self.state_dim_y), max_experiences=500, gamma=0.6, alpha=0.2)
            return "message understood, reseting q-function"

        # Added now
        if inMessage.startswith("freeze_transition_learning"):
            self.transition_learning_frozen=True
            return "message understood, transition learning frozen"    
        if inMessage.startswith("unfreeze_transition_learning"):
            self.transition_learning_frozen=False
            return "message understood, transition learning unfrozen"        
        if inMessage.startswith("freeze_option_learning"):
            self.option_learning_frozen=True
            return "message understood, option learning frozen"    
        if inMessage.startswith("unfreeze_option_learning"):
            self.option_learning_frozen=False
            self.loadBinsFromDumpFiles()
            return "message understood, option learning unfrozen"   
        if inMessage.startswith("train_TNN"):
            print 'Training TNN'
            self.train_TNN()
        if inMessage.startswith("savetransmatrix"):
            splitString=inMessage.split(" ")
            self.saveTprobs(splitString[1])
            print "Saved.";
            return "message understood, saving Tprobs"
        if inMessage.startswith("loadtransmatrix"):
            splitString=inMessage.split(" ")
            self.loadTprobs(splitString[1])
            print "Loaded.";
            return "message understood, loading Tprobs"
        if inMessage.startswith("save_state_reps"): #Save the tuples corresponding to each state.
            enc_states = []
            for state in self.seen_expanded_states.keys():
                enc_states.append(tuple(self.Q.getHiddenLayerRepresentation(list(state))))
            splitstring = inMessage.split()
            outfile = open(splitstring[1],'w')
            pickle.dump(enc_states,outfile)

        #Once we have frozen our state reps, this function discretizes them and populates the self.secondColBins and self.thirdColBins arrays
        if inMessage.startswith("get_bins_from_state_reps"):
            enc_states = []
            for state in self.seen_expanded_states.keys():
                curr_rep = tuple(self.Q.getHiddenLayerRepresentation(list(state)))
                enc_states.append([curr_rep[0][0], curr_rep[1][0], curr_rep[2][0]])
            enc_states = np.array(enc_states)
            second_col = enc_states[:,1]
            self.secondColBins = self.getBins(second_col)
            third_col = enc_states[:,2]
            self.thirdColBins = self.getBins(third_col)

            splitstring = inMessage.split()
            outfile2 = open(splitstring[1],'w')
            pickle.dump(self.secondColBins,outfile2)

            outfile3 = open(splitstring[2],'w')
            pickle.dump(self.thirdColBins,outfile3)

            self.discretization_done = True

        if inMessage.startswith("save_phi"):
            splitstring = inMessage.split()
            phioutfile = open(splitstring[1],'w')
            pickle.dump(self.phi_mat,phioutfile)

            uoutfile = open(splitstring[2],'w')
            pickle.dump(self.U_mat,uoutfile)

            puoutfile = open(splitstring[3],'w')
            pickle.dump(self.Peey_U_mat,puoutfile)
        return None


    def loadBinsFromDumpFiles(self):
        secondcolfile = open('secondcolbins' + str(self.episodesRun) + '.dat','r')
        unpickler = pickle.Unpickler(secondcolfile)
        self.secondColBins = unpickler.load()

        thirdcolfile = open('thirdcolbins' + str(self.episodesRun) + '.dat','r')
        unpickler = pickle.Unpickler(thirdcolfile)
        self.thirdColBins = unpickler.load()


    #Get the discretized states. Assumes that the bins are available.
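    # np.digitize over a bin array with about n_bins + 2 edges can return any index
    # from 0 up to n_bins + 2, which appears to be why the flattened state space is
    # sized (n_bins + 3)*(n_bins + 3) in agent_init.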
    def getDiscretizedState(self,enc_state):
        
        second_elem = enc_state[1][0]
        third_elem = enc_state[2][0]
        disc_tuple = (np.digitize([second_elem],self.secondColBins)[0],np.digitize([third_elem],self.thirdColBins)[0])
        
        flat_disc_state = (self.n_bins+3)*disc_tuple[0]+disc_tuple[1]
        if self.using_compressed_mats:
            return self.valid_states.index(flat_disc_state) 
        else:
            return flat_disc_state

    def getBins(self,i_array):
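        # Bin edges are (roughly) evenly spaced order statistics of the observed
        # hidden-layer outputs, bracketed by 0.0 and 1.0.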
        bins = [0.0]
        i_array.sort()

        i = 0
        while i < len(i_array):
            bins.append(i_array[i])
            i += len(i_array)/self.n_bins
        bins.append(1.0)
        return np.array(bins)
                
    def getMonsters(self, observation):
        monsters = []
        i = 0
        while (1+2*i < len(observation.intArray)):
            m = Monster()
            m.type = observation.intArray[1+2*i]
            m.winged = (observation.intArray[2+2*i]) != 0
            m.x = observation.doubleArray[4*i]
            m.y = observation.doubleArray[4*i+1]
            m.sx = observation.doubleArray[4*i+2]
            m.sy = observation.doubleArray[4*i+3]
            m.typeName = monsterNames[m.type]
            monsters.append(m)
            i += 1
        return monsters

    def getMario(self, observation):
        monsters = self.getMonsters(observation)
        for i in xrange(len(monsters)):
            if (monsters[i].type == 0 or monsters[i].type == 10 or monsters[i].type == 11):
                return (monsters[i].x-observation.intArray[0], monsters[i].y)
        return None

    def getMarioFromTiles(self, observation):
        for xi in xrange(22):
            for yi in xrange(16):
                if (self.getTileAt(xi, yi, observation) == 'M'):
                    return (xi, yi)
        return None

    def getTileAt(self, x, y, observation):
        if (x < 0):
            return '7'
        y = 16 - y
        x -= observation.intArray[0]
        if (x<0 or x>21 or y<0 or y>15):
            return '\0'
        index = y*22+x
        return observation.charArray[index]

    #Handy little function for debugging by printing out the char array
    def printFullState(self, observation):
        print "-----------------"
        for yi in xrange(16):
            out_string = ""
            for xi in xrange(22):
                out_string += observation.charArray[yi*22+xi]
            print out_string

    #Handy little function for debugging by printing out Mario's (x,y) position
    def printMarioState(self, observation):
        mar = self.getMario(observation)
        print "Mario X: " + str(mar[0]) + " Mario Y: " + str(mar[1])

    #Handy little function for debugging by printing out the encoded state that is being sent to the NN
    def printEncodedState(self, s):
        print "-----------------"
        for yi in xrange(self.state_dim_y):
            out_string = ""
            for xi in xrange(self.state_dim_x):
                out_string += (str(s[yi*self.state_dim_x+xi]) + " ")
            print out_string

    def getAction(self, observation):
        act = self.qnnAction(observation)
        return act

    def update(self, observation, action, reward):
        self.qnnUpdate(observation, action, reward)
        self.Q.ExperienceReplay()

    def stateEncoder(self, observation):
        if (not self.substrate_mode):
            return self.stateEncoderSingle(observation)
        else:
            return self.stateEncoderMultiple(observation)

    def stateEncoderSingle(self, observation):
        s = []
        #Determine Mario's current position. Everything is relative to Mario
        mar = self.getMario(observation)
        mx = int(mar[0])
        my = 15 - int(mar[1])
        #Update based on the environment
        for yi in xrange(self.state_dim_y):
            for xi in xrange(self.state_dim_x):
                x = mx + xi - int(self.state_dim_x/2.0)
                y = my + yi - int(self.state_dim_y/2.0)
                if (x < 0 or x > 21 or y < 0 or y > 15):
                    s.append(-1)
                    continue
                s.append(tileEncoder[observation.charArray[y*22+x]]) # shouldn't this be 20
        #Add monsters
        monsters = self.getMonsters(observation)
        for mi in xrange(len(monsters)):
            if (monsters[mi].type == 0 or monsters[mi].type == 10 or monsters[mi].type == 11): #skip mario
                continue
            monx = int(monsters[mi].x)
            mony = 15 - int(monsters[mi].y)
            x = monx - mx + int(self.state_dim_x/2.0) - observation.intArray[0]
            y = mony - my + int(self.state_dim_y/2.0)
            if (x < 0 or x >= self.state_dim_x or y < 0 or y >= self.state_dim_y): #skip monsters farther away
                continue
            s[y*self.state_dim_x + x] = -2
        return s

    def stateEncoderMultiple(self, observation):
        s1 = []
        s2 = []
        s3 = []
        #Determine Mario's current position. Everything is relative to Mario
        mar = self.getMario(observation)
        mx = int(mar[0])
        my = 15 - int(mar[1])
        #Add background and reward layers with a placeholder monster layer
        for yi in xrange(self.state_dim_y):
            for xi in xrange(self.state_dim_x/3):
                x = mx + xi - int(self.state_dim_x/6.0)
                y = my + yi - int(self.state_dim_y/2.0)
                if (x < 0 or x > 21 or y < 0 or y > 15):
                    s1.append(0)
                    s2.append(0)
                    s3.append(0)
                    continue
                s1.append(backgroundLayer[observation.charArray[y*22+x]])
                s2.append(rewardLayer[observation.charArray[y*22+x]])
                s3.append(0)
        #Add monsters to the monster layer
        monsters = self.getMonsters(observation)
        for mi in xrange(len(monsters)):
            if (monsters[mi].type == 0 or monsters[mi].type == 10 or monsters[mi].type == 11):#skip mario
                continue
            monx = int(monsters[mi].x)
            mony = 15 - int(monsters[mi].y)
            x = monx - mx + int(self.state_dim_x/6.0) - observation.intArray[0]
            y = mony - my + int(self.state_dim_y/2.0)
            if (x < 0 or x >= self.state_dim_x/3 or y < 0 or y >= self.state_dim_y): #skip monsters farther away
                continue
            s3[y*self.state_dim_x/3 + x] = 1
        return s1 + s2 + s3

    def actionEncoder(self, act):
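        # Pack (direction, jump, speed) into one integer in [0, 11]:
        # direction in {-1, 0, 1} contributes 0/4/8, jump adds 2, speed adds 1.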
        a = 1*act.intArray[2] + 2*act.intArray[1] + 4*(act.intArray[0]+1)
        return a

    def actionDecoder(self, a):
        act = Action()
        act.intArray.append(int(a/4.0) - 1)
        act.intArray.append(int(a/2.0) % 2)
        act.intArray.append(a % 2)
        return act

    def randomAction(self, forwardBias):
        act = Action()
        #The first control input is -1 for left, 0 for nothing, 1 for right
        if (not forwardBias or random.random() < 0.1):
            act.intArray.append(random.randint(-1,1))
        else:
            act.intArray.append(random.randint(0,1))
        #The second control input is 0 for nothing, 1 for jump
        act.intArray.append(random.randint(0,1))
        #The third control input is 0 for nothing, 1 for speed increase
        act.intArray.append(random.randint(0,1))        
        return act

    def qnnAction(self, observation):
        s = self.stateEncoder(observation)
        #self.printEncodedState(s)
        if (random.random()>self.exp):
            a = np.argmax(self.Q(s))
            act = self.actionDecoder(a)
        else:
            act = self.randomAction(True)
        return act

    def qnnUpdate(self, observation, action, reward):
        if (self.last_state is None or self.last_action is None):
            return
        s = self.stateEncoder(observation)
        ls = self.stateEncoder(self.last_state)
        a = self.actionEncoder(action)
        la = self.actionEncoder(self.last_action)
        self.Q.RememberExperience(ls, la, reward, s, a)

    def saveQFun(self, fileName):
        theFile = open(fileName, "w")
        pickle.dump(self.Q, theFile)
        theFile.close()

    def loadQFun(self, fileName):
        theFile = open(fileName, "r")
        self.Q = pickle.load(theFile)
        theFile.close()

    # Added now
    def train_TNN(self):
        for s1 in self.transition_probs:
            for a in self.transition_probs[s1]:
                for s2 in self.transition_probs[s1][a]:
                    self.T.Update(np.asarray(s1),np.asarray(s2),self.transition_probs[s1][a][s2])

    # TODO - take the current transition_matrix and compute transition_probs (&save it)
    def saveTprobs(self, fileName):
        transition_probs = {}
        for s1 in self.transition_matrix.keys():
            transition_probs[s1] = {}
            for a in self.transition_matrix[s1].keys():
                transition_probs[s1][a] = {}
                Z = 0 #Normalisation constant
                for s2 in self.transition_matrix[s1][a].keys():
                    Z += self.transition_matrix[s1][a][s2]
                for s2 in self.transition_matrix[s1][a].keys():
                    transition_probs[s1][a][s2] = float(self.transition_matrix[s1][a][s2])/float(Z)
        theFile = open(fileName,'w')
        pickle.dump(transition_probs,theFile)

    def loadTprobs(self, fileName):
        theFile = open(fileName, "r")
        self.transition_probs = pickle.load(theFile)
        theFile.close()
Example #3
    def agent_init(self,taskSpecString):
        self.policy_frozen = False
        self.total_steps = 0
        self.trial_start = 0.0
        self.total_steps = 0
        self.step_number = 0
        self.exp = 0.75 # Exploration factor, reset from param
        self.substrate_mode = False
        if (self.substrate_mode):
            self.state_dim_x = 33
            self.state_dim_y = 7
        else:
            self.state_dim_x = 20
            self.state_dim_y = 12

        self.last_state = None
        self.last_action = None
        random.seed(0)
        
        self.Q = Deep_QNN(nactions=12, input_size=(self.state_dim_x*self.state_dim_y), max_experiences=500, gamma=0.6, alpha=0.2)

        #####
        self.regularization_constant = 0.4 # For rewards incorporated into transition structure
        self.episodesRun = 100000 # This is just used to generate the file names of the stored data to be read in
        ######

        self.n_bins = 1000 # filled in from experiment.py
        self.n_hidden_layer_outputs = 1 # filled in from experiment.py
        self.option_learning_frozen = True # filled in from experiment.py

        self.discretization_done = False
        self.n_disc_states = math.pow((self.n_bins + 3),self.n_hidden_layer_outputs)
        self.phi_mat = {}
        self.U_mat = {}
        self.Peey_U_mat = {} # The (probably) incorrect U_mat that Peeyush writes in his algo
        self.seen_expanded_states = {} # A dictionary of all the non-approximated (original Mario input) states that were seen.
        self.last_enc_state = None
        self.last_enc_action = None

        #####################################################################

        # When learning to play with options
        if self.option_learning_frozen==False:
            # Obtain T & P matrices
            tmatfile = open('t_mat' + str(self.episodesRun) + '.dat','r')
            unpickler = pickle.Unpickler(tmatfile)
            self.t_mat = unpickler.load()

            pmatfile = open('p_mat' + str(self.episodesRun) + '.dat','r')
            unpickler = pickle.Unpickler(pmatfile)
            self.p_mat = unpickler.load()

            validstatesfile = open('valid_states'+str(self.episodesRun)+'.dat','r')
            unpickler = pickle.Unpickler(validstatesfile)
            self.valid_states = unpickler.load()

            # Run PCCA to obtain Chi matrix
            self.clusterer = PCCA(True)
            self.chi_mat = np.mat(self.clusterer.pcca(self.t_mat))

            #Converting to np.mat format for consistency with 3 room code. Doing it only here because pcca needs it as ndarray
            self.t_mat = np.mat(self.t_mat)

            # 0,1,2,..11 - primitive actions, 12... - options
            self.value_function=[(self.chi_mat.shape[1]+self.numActions)*[0.0] for i in range(self.chi_mat.shape[0])]

            self.absStateMembership = []
            self.statesInAbsState = [[] for i in xrange(self.chi_mat.shape[1])]
            for (row_i,row) in enumerate(self.chi_mat):
                self.absStateMembership.append(row.argmax())
                self.statesInAbsState[row.argmax()].append(row_i)

            self.connect_mat = self.chi_mat.T*self.t_mat*self.chi_mat
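
The agent examples load t_mat and p_mat from pickle files rather than showing how they are built. One plausible way to produce a row-stochastic state-to-state transition matrix from the (state, action, next_state) counts accumulated in the ndarray phi_mat of Example #2 is to sum over actions and normalize each row, in the same spirit as saveTprobs above; the helper below is an illustrative sketch, not part of the original code:

import numpy as np

def counts_to_transition_matrix(phi_mat):
    # Collapse (s, a, s') counts into an (S, S) matrix of state-to-state counts,
    # then normalize each row so it sums to 1 (rows with no visits stay all zero).
    counts = np.asarray(phi_mat, dtype=float).sum(axis=1)
    row_sums = counts.sum(axis=1, keepdims=True)
    row_sums[row_sums == 0] = 1.0
    return counts / row_sums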