Example #1
class StateValueColl(object):
    def __init__(self, environment, init_val=0.0):
        """
        A collection of state values, V(s), one for each state in the environment.
        
        Each update to V(s) is done with a learning rate (alpha).
        
        To get a value use:
        sv.get_Vs( s_hash )
        
        To update value use:
        sv.delta_update( s_hash, delta)
        sv.mc_update( s_hash, alpha, G)
        sv.td0_update( s_hash, alpha, gamma, sn_hash, reward)
        etc.
        
        (Terminal States have V(s) = 0.0)
        
        Transition probabilities from (s,a) to (sn,reward) are collected
        as updates happen.
        """

        self.environment = environment

        # check to see if the environment already contains transition probabilities
        if hasattr(environment, 'iter_next_state_prob_reward'):
            self.env_has_transition_prob = True
        else:
            self.env_has_transition_prob = False

        self.VsD = {}  # index=state_hash, value=state value, Vs (a float)

        self.define_statesD = {}  # index=s_hash: value=ModelStateData object for s_hash

        # (used in error estimate)
        # Monte Carlo = Gt, discounted return
        # TD(0) = Rt+1 + gamma*V(st+1) (estimated discounted return)

        self.last_delta_VsD = {}  # index=s_hash value=last change to s_hash
        self.chgTracker = ChangeTracker()

        self.init_Vs_to_val(init_val)
        self.init_val = init_val

        self.min_target = None  # initialize when 1st target is submitted
        self.max_target = None

    def num_Vs(self):
        return len(self.VsD)

    def init_Vs_to_val(self, init_val):
        # initialize to init_val for all states, terminal = 0.0
        for s_hash in self.environment.iter_all_states():
            self.last_delta_VsD[s_hash] = 0.0  # record last change as 0.0

            if s_hash in self.environment.terminal_set:
                self.VsD[s_hash] = 0.0
            else:
                self.VsD[s_hash] = init_val

    def get_best_env_action(self, s_hash, a_descL):
        """Given env_has_transition_prob == True, find best action from given list."""

        VsD = {}  # will hold: index=a_desc, value=V(s) for all transitions of a from s

        # iterate over every legal action from s, even actions the current policy would never select
        for a_desc in a_descL:
            calcd_v = 0.0
            # iterate over the transition probabilities to each next state, sn, when action a is taken
            for sn_hash, t_prob, reward in \
                self.environment.iter_next_state_prob_reward(s_hash, a_desc, incl_zero_prob=False):

                # use probability-averaged V(sn) values from state_value_coll
                calcd_v += t_prob * (reward + self.VsD[sn_hash])

            VsD[a_desc] = calcd_v

        best_a, best_a_val = argmax_vmax_dict(VsD)
        return best_a, best_a_val

    def get_best_blackbox_action(self, s_hash, a_descL):
        """Given env_has_transition_prob == False, find best action from given list."""

        if s_hash in self.define_statesD:  # index=s_hash: value=ModelStateData object for s_hash

            VsD = {}  # will hold: index=a_desc, value=V(s) for all transitions of a from s
            PD = self.define_statesD[s_hash]

            for a_desc in a_descL:
                # select any action not yet taken so transition data starts being collected for it
                if a_desc not in PD.action_sn_rD:
                    return a_desc, self.init_val  # return immediately for untried actions

                # if the action is deterministic (so far), just look up the current V(s)
                if PD.is_deterministic_action(a_desc):
                    snD = PD.action_sn_rD[a_desc]
                    sn_hash = tuple(snD.keys())[0]
                    rwd_ave_obj = snD[sn_hash]
                    VsD[a_desc] = rwd_ave_obj.get_ave() + self.VsD[sn_hash]
                else:
                    # for stochastic actions, do a transition probability weighted calc of V(s)
                    calcd_v = 0.0
                    a_count = PD.action_countD.get(
                        a_desc,
                        0)  # index=a_desc: value=count of (s,a) occurances
                    if a_count > 0:

                        if a_desc in PD.action_sn_rD:
                            snD = PD.action_sn_rD[
                                a_desc]  # snD...  index=sn_hash: value=rwd_ave_obj
                            for sn_hash, rwd_ave_obj in snD.items():

                                # fraction of times using a_desc in s_hash resulted in sn_hash
                                t_prob = float(
                                    rwd_ave_obj.num_val) / float(a_count)
                                calcd_v += t_prob * (rwd_ave_obj.get_ave() +
                                                     self.VsD[sn_hash])

                    VsD[a_desc] = calcd_v

            best_a, best_a_val = argmax_vmax_dict(VsD)
            return best_a, best_a_val

        else:
            # this state has not been seen yet, so initialize transition tracking for it.
            for a_desc in a_descL:
                self.add_action(s_hash, a_desc)

            # return the last action in the list (an arbitrary, as-yet-untried action)
            return a_desc, self.init_val

    def get_best_eps_greedy_action(self, s_hash, epsgreedy_obj=None):
        """
        Pick the best action for state "s_hash" based on max V(s')
        If epsgreedy_obj is given, apply Epsilon Greedy logic to choice.
        """
        a_descL = self.environment.get_state_legal_action_list(s_hash)
        if a_descL:
            if self.env_has_transition_prob:
                best_a_desc, best_a_val = self.get_best_env_action(
                    s_hash, a_descL)
            else:
                best_a_desc, best_a_val = self.get_best_blackbox_action(
                    s_hash, a_descL)

            if epsgreedy_obj is not None:
                best_a_desc = epsgreedy_obj(best_a_desc, a_descL)

            return best_a_desc
        return None

    def record_changes(self, s_hash, delta):
        """Keep track of changes made to V(s) values"""

        delta = abs(delta)  # make sure that only absolute values are saved.

        # remove any record of last change to [s_hash]
        self.chgTracker.dec_change(self.last_delta_VsD[s_hash])

        # add delta to tracking record
        self.chgTracker.inc_change(delta)

        # remember that delta was last change to  [s_hash]
        self.last_delta_VsD[s_hash] = delta

    def get_snapshot(self):
        """
        return a deep copy of the value dictionary.
        index=state_hash, value=state value, Vs (a float)
        """
        return copy.deepcopy(self.VsD)

    def delta_update(self, s_hash='', delta=0.0):
        """Add delta to current value of s_hash"""
        self.VsD[s_hash] += delta

        self.record_changes(s_hash, delta)

    def add_action(self, s_hash, a_desc):
        """
        Add an action to transition data with call as follows.
        self.add_action( s_hash, a_desc )
        """
        if s_hash not in self.define_statesD:
            self.define_statesD[s_hash] = ModelStateData(s_hash)
        self.define_statesD[s_hash].add_action(a_desc)

    def save_action_results(self, s_hash, a_desc, sn_hash, reward_val):
        """Add sn_hash to possible next states and add to its RunningAve"""

        self.add_action(s_hash, a_desc)
        self.define_statesD[s_hash].save_action_results(
            a_desc, sn_hash, reward_val)

    def mc_update(self, s_hash='', alpha=0.1, G=0.0):
        """
        Do a Monte-Carlo-style learning rate update.
        V(st) = V(st) + alpha * [Gt - V(st)]
        """
        delta = alpha * (G - self.VsD[s_hash])  # allow key error
        self.VsD[s_hash] += delta

        self.record_changes(s_hash, delta)

        return abs(delta)  # return the absolute value of change

    def td0_update(self,
                   s_hash='',
                   a_desc='',
                   alpha=0.1,
                   gamma=1.0,
                   sn_hash='',
                   reward=0.0):
        """
        Do a TD(0), Temporal-Difference-style learning rate update.
        V(st) = V(st) + alpha * [R + gamma*V(st+1) - V(st)]
        
        Note: the a_desc input is provided in order to collect transition probability data.
        """
        Vstp1 = self.VsD[sn_hash]
        target_val = reward + gamma * Vstp1
        delta = alpha * (target_val - self.VsD[s_hash])  # allow key error

        self.VsD[s_hash] += delta

        self.record_changes(s_hash, delta)

        self.save_action_results(s_hash, a_desc, sn_hash, reward)

        return abs(delta)  # return the absolute value of change

    def get_Vs(self, s_hash):
        """Return the current State-Value for s_hash"""
        return self.VsD[s_hash]  # Allow key error

    def set_Vs(self, s_hash, Vs):
        """Set the current State-Value for s_hash"""
        self.VsD[s_hash] = Vs

    def calc_rms_error(self, true_valueD):
        """Using the dictionary, true_valueD as reference, calc RMS error."""
        diff_sqL = []
        for s_hash, true_val in true_valueD.items():
            diff_sqL.append((true_val - self.VsD[s_hash])**2)
        rms = sqrt(sum(diff_sqL) / len(diff_sqL))
        return rms

    def get_biggest_action_state_err(self):
        """Estimate the biggest error in all the state values."""
        #print('self.chgTracker.get_biggest_change()', self.chgTracker.get_biggest_change())
        return self.chgTracker.get_biggest_change()

    def make_pickle_filename(self, fname):
        """Make a file name ending with .vs2_pickle """
        if fname is None:
            fname = self.environment.name.replace(' ', '_') + '.vs2_pickle'

        else:
            fname = fname.replace(' ', '_').replace('.', '_') + '.vs2_pickle'

        return fname

    def save_to_pickle_file(self, fname=None):  # pragma: no cover
        """Saves data to pickle file."""
        # build name for pickle
        fname = self.make_pickle_filename(fname)

        saveD = {}
        saveD['VsD'] = self.VsD
        saveD['define_statesD'] = self.define_statesD

        fileObject = open(fname, 'wb')
        pickle.dump(saveD, fileObject,
                    protocol=2)  # protocol=2 is python 2&3 compatible.
        fileObject.close()
        print('Saved StateValueColl to file:', fname)

    def read_pickle_file(self, fname=None):  # pragma: no cover
        """Reads data from pickle file."""

        fname = self.make_pickle_filename(fname)
        if not os.path.isfile(fname):
            print('Pickle File NOT found:', fname)
            return None, None

        fileObject = open(fname, 'rb')
        readD = pickle.load(fileObject)

        VsD = readD['VsD']
        define_statesD = readD['define_statesD']

        fileObject.close()
        print('Read StateValueColl from file:', fname)

        return VsD, define_statesD

    def init_from_pickle_file(self, fname=None):  # pragma: no cover
        """Initialize StateValueColl from policy pickle file."""
        VsD, define_statesD = self.read_pickle_file(fname=fname)
        if VsD:
            self.VsD = VsD
            self.define_statesD = define_statesD

        self.chgTracker.clear()

    def get_policy(self):

        policy = Policy(environment=self.environment)
        for s_hash in self.environment.iter_all_action_states():
            a_desc = self.get_best_eps_greedy_action(s_hash,
                                                     epsgreedy_obj=None)
            policy.set_sole_action(s_hash, a_desc)
        return policy

    def summ_print(self,
                   fmt_V='%g',
                   none_str='*',
                   show_states=True,
                   show_last_change=True,
                   show_policy=True):
        print()
        print('___ "%s" Alpha-Based State-Value Summary ___' %
              self.environment.name)

        if self.environment.layout is not None:
            # make summ_print using environment.layout
            if show_states:
                self.environment.layout.s_hash_print(none_str='*')

            row_tickL = self.environment.layout.row_tickL
            col_tickL = self.environment.layout.col_tickL
            x_axis_label = self.environment.layout.x_axis_label
            y_axis_label = self.environment.layout.y_axis_label

            rows_outL = []
            last_delta_rows_outL = []  # if show_last_change == True
            for row in self.environment.layout.s_hash_rowL:
                outL = []
                ld_outL = []
                for s_hash in row:
                    if not self.environment.is_legal_state(s_hash):
                        if is_literal_str(s_hash):
                            outL.append(s_hash[1:-1])
                            ld_outL.append(s_hash[1:-1])
                        else:
                            outL.append(none_str)
                            ld_outL.append(none_str)
                    else:
                        outL.append(fmt_V % self.VsD[s_hash])
                        delta = self.last_delta_VsD.get(s_hash, None)
                        if delta is None:
                            ld_outL.append('None')
                        else:
                            ld_outL.append(fmt_V % delta)

                rows_outL.append(outL)
                last_delta_rows_outL.append(ld_outL)

            print_string_rows(rows_outL,
                              row_tickL=row_tickL,
                              const_col_w=True,
                              line_chr='_',
                              left_pad='    ',
                              col_tickL=col_tickL,
                              header=self.environment.name +
                              ' State-Value Summary, V(s)',
                              x_axis_label=x_axis_label,
                              y_axis_label=y_axis_label,
                              justify='right')
            if show_last_change:
                print_string_rows(last_delta_rows_outL,
                                  row_tickL=row_tickL,
                                  const_col_w=True,
                                  line_chr='_',
                                  left_pad='    ',
                                  col_tickL=col_tickL,
                                  header=self.environment.name +
                                  ' Last Change to V(s) Summary',
                                  x_axis_label=x_axis_label,
                                  y_axis_label=y_axis_label,
                                  justify='right')

            if show_policy:
                policy = self.get_policy()
                policy.summ_print(verbosity=0, environment=self.environment)

        # ------------------------- simple output w/o a layout ------------
        else:
            lmax_hash = 6
            lmax_V = 6

            outL = []  # list of tuples = (s_hash, V)
            for s_hash, V in self.VsD.items():
                outL.append((s_hash, V))

                lmax_hash = max(lmax_hash, len(str(s_hash)))
                lmax_V = max(lmax_V, len(fmt_V % V))

            fmt_hash = '%' + '%is' % lmax_hash
            fmt_strV = '%' + '%is' % lmax_V

            outL.sort()  # sort in-place
            for (s_hash, V) in outL:
                V = fmt_V % V
                print('    ', fmt_hash % str(s_hash), fmt_strV % V, end='')
                if show_last_change:
                    print(' Last Delta = %s' %
                          self.last_delta_VsD.get(s_hash, None))
                else:
                    print()
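
A minimal usage sketch for StateValueColl follows. It is not part of the original listing: GridworldEnv, env.start_state_hash and env.step() are assumed stand-ins for whatever environment object supplies iter_all_states(), terminal_set and get_state_legal_action_list().

# Hypothetical usage sketch: tabular TD(0) prediction with StateValueColl.
# GridworldEnv, env.start_state_hash and env.step() are assumptions, not
# part of the class above.
env = GridworldEnv()
sv = StateValueColl(env, init_val=0.0)

for _episode in range(1000):
    s_hash = env.start_state_hash                   # assumed start-state attribute
    while s_hash not in env.terminal_set:
        # greedy action choice (pass an epsilon-greedy object to explore)
        a_desc = sv.get_best_eps_greedy_action(s_hash, epsgreedy_obj=None)
        sn_hash, reward = env.step(s_hash, a_desc)  # assumed one-step helper
        sv.td0_update(s_hash=s_hash, a_desc=a_desc, alpha=0.1,
                      gamma=0.9, sn_hash=sn_hash, reward=reward)
        s_hash = sn_hash

sv.summ_print(fmt_V='%.2f')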
Example #2
class Baseline_V_Func(object):
    """
    Create a linear function for an environment that simply one-hot encodes
    all of the states.
    
    OVERRIDE THIS for more interesting linear functions.
    
    This is only interesting for debugging linear function solution routines.
    (i.e. each term in the one-hot encoding should move to near the actual 
    value function)
    """

    # ======================== OVERRIDE STARTING HERE ==========================
    def init_w_vector(self):
        """Initialize the weights vector and the number of entries, N."""

        # initialize a weights numpy array with random values.
        N = len(self.sD)
        self.w_vector = np.random.randn(N) / np.sqrt(N)
        self.N = len(self.w_vector)

    def get_x_vector(self, s_hash):
        """
        Return the x vector that represents the state, s_hash.
        NOTE: the index into x_vector for s_hash = self.sD[ s_hash ]
        """
        x_vector = np.zeros(self.N, dtype=float)
        x_vector[self.sD[s_hash]] = 1.0
        return x_vector

    # ======================== OVERRIDE ENDING HERE ==========================

    def VsEst(self, s_hash):
        """Return the current estimate for V(s) from linear function eval."""
        x_vector = self.get_x_vector(s_hash)
        return self.w_vector.dot(x_vector)

    def __init__(self, environment):

        self.environment = environment

        self.chgTracker = ChangeTracker()
        self.init_tracking()

        # initialize a weights numpy array with random values.
        self.init_w_vector()
        # e.g.  self.w_vector = np.random.randn(self.N) / np.sqrt(self.N)
        self.N = len(self.w_vector)

    def init_tracking(self):
        # initialize known states.
        self.sD = {}
        self.last_delta_VsD = {}  # index=s_hash value=last change to s_hash

        # initialize to init_val for all states, terminal = 0.0
        for s_hash in self.environment.iter_all_states():
            # set dict value to index of numpy array
            self.sD[s_hash] = len(self.sD)
            self.last_delta_VsD[s_hash] = 0.0

    def get_number_of_changes(self):
        return self.chgTracker.get_number_of_changes()

    def num_Vs(self):
        return len(self.sD)

    def record_changes(self, s_hash, delta):
        """Keep track of changes made to V(s) values"""

        delta = abs(delta)  # make sure that only absolute values are saved.

        # remove any record of last change to [s_hash]
        self.chgTracker.dec_change(self.last_delta_VsD[s_hash])

        # add delta to tracking record
        self.chgTracker.inc_change(delta)

        # remember that delta was last change to  [s_hash]
        self.last_delta_VsD[s_hash] = delta

    def get_biggest_action_state_err(self):
        """Estimate the biggest error in all the action values."""
        #print('self.chgTracker.get_biggest_change()', self.chgTracker.get_biggest_change())
        return self.chgTracker.get_biggest_change()

    def get_max_last_delta_overall(self):
        """Get the biggest entry in self.last_delta_VsD  # index=s_hash value=last change (float)"""
        d_max = 0.0
        for val in self.last_delta_VsD.values():
            d_max = max(d_max, abs(val))
        return d_max

    def get_gradient(self, s_hash):
        """
        Return the gradient of value function with respect to w_vector.
        Since the function is linear in w, the gradient is = x_vector.
        """
        return self.get_x_vector(s_hash)

    def mc_update(self, s_hash='', alpha=0.1, G=0.0):
        """
        Do a Monte-Carlo-style learning rate update.
        w = w + alpha * [Gt - Vhat(st)] * grad(st)
        """
        Vs = self.VsEst(s_hash)
        delta = alpha * (G - Vs)

        delta_vector = delta * self.get_gradient(s_hash)
        self.w_vector += delta_vector

        delta = np.max(np.absolute(delta_vector))
        self.record_changes(s_hash, delta)

        return abs(delta)  # return the absolute value of change

    def td0_update(self,
                   s_hash='',
                   alpha=0.1,
                   gamma=1.0,
                   sn_hash='',
                   reward=0.0):
        """
        Do a TD(0), Temporal-Difference-style learning rate update.
        w = w + alpha * [R + gamma*VEst(s',w) - V(s,w)] * grad(s)
        """
        Vs = self.VsEst(s_hash)

        if sn_hash in self.environment.terminal_set:
            target_val = reward
        else:
            Vstp1 = self.VsEst(sn_hash)
            target_val = reward + gamma * Vstp1

        delta = alpha * (target_val - Vs)

        delta_vector = delta * self.get_gradient(s_hash)
        self.w_vector += delta_vector

        delta = np.max(np.absolute(delta_vector))
        self.record_changes(s_hash, delta)

        return abs(delta)  # return the absolute value of change

    # ========================== pickle routines ===============================

    def make_pickle_filename(self, fname):
        """Make a file name ending with .vlf_pickle """
        if fname is None:
            fname = self.environment.name.replace(' ', '_') + '.vlf_pickle'

        else:
            fname = fname.replace(' ', '_').replace('.', '_') + '.vlf_pickle'

        return fname

    def save_to_pickle_file(self, fname=None):  # pragma: no cover
        """Saves data to pickle file."""
        # build name for pickle
        fname = self.make_pickle_filename(fname)

        saveD = {}
        saveD['sD'] = self.sD
        saveD['last_delta_VsD'] = self.last_delta_VsD
        saveD['w_vector'] = self.w_vector

        fileObject = open(fname, 'wb')
        pickle.dump(saveD, fileObject,
                    protocol=2)  # protocol=2 is python 2&3 compatible.
        fileObject.close()
        print('Saved Baseline_V_Func to file:', fname)

    def read_pickle_file(self, fname=None):  # pragma: no cover
        """Reads data from pickle file."""

        fname = self.make_pickle_filename(fname)
        if not os.path.isfile(fname):
            print('Pickle File NOT found:', fname)
            return None, None, None

        fileObject = open(fname, 'rb')
        readD = pickle.load(fileObject)

        sD = readD['sD']
        last_delta_VsD = readD['last_delta_VsD']
        w_vector = readD['w_vector']

        fileObject.close()
        print('Read Baseline_V_Func from file:', fname)

        return sD, last_delta_VsD, w_vector

    def init_from_pickle_file(self, fname=None):  # pragma: no cover
        """Initialize ActionValueColl from policy pickle file."""
        sD, last_delta_VsD, w_vector = self.read_pickle_file(fname=fname)
        if sD:
            self.sD = sD
            self.w_vector = w_vector
            self.last_delta_VsD = last_delta_VsD
            self.N = len(self.w_vector)
            self.chgTracker = ChangeTracker()
            self.init_tracking()
        else:
            print('ERROR... Failed to read file:', fname)

    # ========================== summ_print ===============================

    def summ_print(self,
                   fmt_V='%g',
                   none_str='*',
                   show_states=True,
                   show_last_change=True):
        print()
        print('___ "%s" Alpha-Based State-Value Summary ___' %
              self.environment.name)

        if self.environment.layout is not None:
            # make summ_print using environment.layout
            if show_states:
                self.environment.layout.s_hash_print(none_str='*')

            row_tickL = self.environment.layout.row_tickL
            col_tickL = self.environment.layout.col_tickL
            x_axis_label = self.environment.layout.x_axis_label
            y_axis_label = self.environment.layout.y_axis_label

            rows_outL = []
            last_delta_rows_outL = []  # if show_last_change == True
            for row in self.environment.layout.s_hash_rowL:
                outL = []
                ld_outL = []
                for s_hash in row:
                    if not self.environment.is_legal_state(s_hash):
                        if is_literal_str(s_hash):
                            outL.append(s_hash[1:-1])
                            ld_outL.append(s_hash[1:-1])
                        else:
                            outL.append(none_str)
                            ld_outL.append(none_str)
                    else:
                        outL.append(fmt_V % self.VsEst(s_hash))
                        delta = self.last_delta_VsD.get(s_hash, None)
                        if delta is None:
                            ld_outL.append('None')
                        else:
                            ld_outL.append(fmt_V % delta)

                rows_outL.append(outL)
                last_delta_rows_outL.append(ld_outL)

            print_string_rows(rows_outL,
                              row_tickL=row_tickL,
                              const_col_w=True,
                              line_chr='_',
                              left_pad='    ',
                              col_tickL=col_tickL,
                              header=self.environment.name +
                              ' State-Value Summary, V(s)',
                              x_axis_label=x_axis_label,
                              y_axis_label=y_axis_label,
                              justify='right')
            if show_last_change:
                print_string_rows(last_delta_rows_outL,
                                  row_tickL=row_tickL,
                                  const_col_w=True,
                                  line_chr='_',
                                  left_pad='    ',
                                  col_tickL=col_tickL,
                                  header=self.environment.name +
                                  ' Last Change to V(s) Summary',
                                  x_axis_label=x_axis_label,
                                  y_axis_label=y_axis_label,
                                  justify='right')

        # ------------------------- simple output w/o a layout ------------
        else:
            lmax_hash = 6
            lmax_V = 6

            outL = []  # list of tuples = (s_hash, V)
            for s_hash in self.sD:
                V = self.VsEst(s_hash)
                outL.append((s_hash, V))

                lmax_hash = max(lmax_hash, len(str(s_hash)))
                lmax_V = max(lmax_V, len(fmt_V % V))

            fmt_hash = '%' + '%is' % lmax_hash
            fmt_strV = '%' + '%is' % lmax_V

            outL.sort()  # sort in-place
            for (s_hash, V) in outL:
                V = fmt_V % V
                print('    ', fmt_hash % str(s_hash), fmt_strV % V, end='')
                if show_last_change:
                    print(' Last Delta = %s' %
                          self.last_delta_VsD.get(s_hash, None))
                else:
                    print()
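
A short, hedged sketch of how Baseline_V_Func might be exercised; env, env.start_state_hash and env.step() are assumptions, and a random behavior policy stands in for a real one.

# Hypothetical usage sketch: semi-gradient TD(0) with the one-hot V baseline.
# env, env.start_state_hash and env.step() are assumed stand-ins.
import random

vfunc = Baseline_V_Func(env)

for _episode in range(500):
    s_hash = env.start_state_hash
    while s_hash not in env.terminal_set:
        # random behavior policy just to generate transitions
        a_desc = random.choice(env.get_state_legal_action_list(s_hash))
        sn_hash, reward = env.step(s_hash, a_desc)
        vfunc.td0_update(s_hash=s_hash, alpha=0.05, gamma=1.0,
                         sn_hash=sn_hash, reward=reward)
        s_hash = sn_hash

print('Estimated V(start) =', vfunc.VsEst(env.start_state_hash))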
Example #3
class Baseline_Q_Func( object ):
    """
    Create a linear function for an environment that simply one-hot encodes
    all of the state-action pairs.
    
    OVERRIDE THIS for more interesting linear functions.
    
    This is only interesting for debugging linear function solution routines.
    (i.e. each term in the one-hot encoding should move to near the actual 
    value function)
    """
    
    # ======================== OVERRIDE STARTING HERE ==========================
    def init_w_vector(self):
        """Initialize the weights vector and the number of entries, N."""
        
        # initialize a weights numpy array with random values.
        N = len(self.saD)
        self.w_vector = np.random.randn(N) / np.sqrt(N)
        self.N = len( self.w_vector )
                
    def get_sa_x_vector(self, s_hash, a_desc):
        """
        Return the x vector that represents the (s,a) pair.
        
        NOTE: the index into x_vector for (s,a) = self.saD[ (s_hash, a_desc) ]
        """
        x_vector = np.zeros(self.N, dtype=float)
        x_vector[ self.saD[(s_hash, a_desc)] ] = 1.0
        return x_vector
    # ======================== OVERRIDE ENDING HERE ==========================

    def QsaEst(self, s_hash, a_desc):
        """Return the current estimate for Q(s,a) from linear function eval."""
        
        x_vector = self.get_sa_x_vector( s_hash, a_desc )
        return self.w_vector.dot( x_vector )
    
    def __init__(self, environment):
        
        self.environment = environment
        
        # initialize known (s,a) pairs.
        self.saD = {}
        for s_hash in self.environment.iter_all_states():
            for a_desc in self.environment.get_state_legal_action_list( s_hash ):
                # set dict value to index of numpy array
                self.saD[ (s_hash, a_desc) ] = len(self.saD)
        
        # aD index=a_desc, value=last change to Q(s,a) value, float
        self.last_delta_QsaD = {} # index=s_hash value=aD (dict)
        
        self.chgTracker = ChangeTracker()
        self.init_tracking()
        
        # initialize a weights numpy array with random values.
        self.init_w_vector()
        # e.g.  self.w_vector = np.random.randn(self.N) / np.sqrt(self.N)
        self.N = len(self.w_vector)

    def init_tracking(self):
        # initialize the last-change tracking for all states.
        for s_hash in self.environment.iter_all_states():
            if s_hash not in self.last_delta_QsaD:
                self.last_delta_QsaD[s_hash] = {}

            # there may not be any actions in a terminal state, so track a None action.
            if s_hash in self.environment.terminal_set:
                self.last_delta_QsaD[s_hash][ None ] = 0.0

            aL = self.environment.get_state_legal_action_list( s_hash )
            for a_desc in aL:
                self.last_delta_QsaD[s_hash][ a_desc ] = 0.0
                

    def get_number_of_changes(self):
        return self.chgTracker.get_number_of_changes()

    def num_Qsa(self):
        return len( self.saD )

    def get_best_eps_greedy_action(self, s_hash, epsgreedy_obj=None ):
        """
        Pick the best action for state "s_hash" based on max Q(s,a)
        If epsgreedy_obj is given, apply Epsilon Greedy logic to choice.
        """
        a_descL = self.environment.get_state_legal_action_list( s_hash )
        if a_descL:
            best_a_desc, best_a_val = a_descL[0], float('-inf')
            bestL = [best_a_desc]
            for a in a_descL:
                q = self.QsaEst( s_hash, a )
                if q > best_a_val:
                    best_a_desc, best_a_val = a, q
                    bestL = [ a ]
                elif q == best_a_val:
                    bestL.append( a )
            
            best_a_desc = random.choice( bestL )
            if epsgreedy_obj is not None:
                best_a_desc = epsgreedy_obj( best_a_desc, a_descL )
                    
            return best_a_desc
        return None

    def get_best_greedy_action(self, s_hash):
        return self.get_best_eps_greedy_action( s_hash )

    def get_max_Qsa(self, s_hash):
        """return the maximum Q(s,a) for state, s_hash."""
        a_best = self.get_best_greedy_action( s_hash )
        if a_best is None:
            return None
        return self.QsaEst( s_hash, a_best )

    def record_changes(self, s_hash, a_desc, delta ):
        """Keep track of changes made to Q(s,a) values"""
        
        delta = abs(delta) # make sure that only absolute values are saved.
        
        # remove any record of last change to [s_hash][a_desc]
        self.chgTracker.dec_change( self.last_delta_QsaD[s_hash][ a_desc ] )
        
        # add delta to tracking record
        self.chgTracker.inc_change( delta )
        
        # remember that delta was last change to  [s_hash][a_desc]
        self.last_delta_QsaD[s_hash][ a_desc ] = delta
    
    def get_biggest_action_state_err(self):
        """Estimate the biggest error in all the action values."""
        #print('self.chgTracker.get_biggest_change()', self.chgTracker.get_biggest_change())
        return self.chgTracker.get_biggest_change()

    def get_max_last_delta_overall(self):
        """ get biggest entry in self.last_delta_QsaD # index=s_hash value=aD (dict)"""
        d_max = 0.0
        for aD in self.last_delta_QsaD.values():
            for val in aD.values():
                d_max = max(d_max, abs(val))
        return d_max

    def get_policy(self):
        policy = Policy( environment=self.environment )
        for s_hash in self.environment.iter_all_action_states():
            a_desc = self.get_best_greedy_action( s_hash )
            policy.set_sole_action( s_hash, a_desc)
        return policy
    
    def get_gradient(self, s_hash, a_desc):
        """
        Return the gradient of value function with respect to w_vector.
        Since the function is linear in w, the gradient is = x_vector.
        """
        return self.get_sa_x_vector( s_hash, a_desc )

    def sarsa_update(self, s_hash='', a_desc='', alpha=0.1, gamma=1.0,
                     sn_hash='', an_desc='', reward=0.0):
        """
        Do a SARSA, Temporal-Difference-style learning rate update.
        Use estimated Q(s,a) values by evaluating linear function approximation.
        w = w + alpha * [R + gamma*QEst(s',a') - QEst(s,a)] * grad(s,a)
        """
        Qsat = self.QsaEst( s_hash, a_desc )
        
        if sn_hash in self.environment.terminal_set:
            delta = alpha * (reward - Qsat)
        else:
            Qsatp1 = self.QsaEst( sn_hash, an_desc )
            target_val = reward + gamma*Qsatp1

            delta = alpha * (target_val - Qsat)
        
        delta_vector = delta * self.get_gradient( s_hash, a_desc )
        self.w_vector += delta_vector

        # remember max amount of change due to [s_hash][a_desc]
        delta = np.max( np.absolute( delta_vector ) )
        self.record_changes( s_hash, a_desc, delta )

        return abs(delta) # return the absolute value of change

    def qlearning_update(self, s_hash='', a_desc='', sn_hash='',
                         alpha=0.1, gamma=1.0, reward=0.0):
        """
        Do a Q-Learning, Temporal-Difference-style learning rate update.
        Use estimated Q(s,a) values by evaluating linear function approximation.
        w = w + alpha * [R + gamma* max(QEst(s',a')) - QEst(s,a)] * grad(s,a)
        """
        Qsat = self.QsaEst( s_hash, a_desc )

        if sn_hash in self.environment.terminal_set:
            delta = alpha * (reward - Qsat)
        else:
            # find best Q(s',a')
            an_descL = self.environment.get_state_legal_action_list( sn_hash )
            
            if an_descL:
                best_a_desc, best_a_val = an_descL[0], float('-inf')
                for a in an_descL:
                    q = self.QsaEst( sn_hash, a )
                    if q > best_a_val:
                        best_a_desc, best_a_val = a, q
            else:
                best_a_val = 0.0
            
            # use best Q(s',a') to update Q(s,a)
            target_val = reward + gamma * best_a_val
            delta = alpha * (target_val - Qsat)

        delta_vector = delta * self.get_gradient( s_hash, a_desc )
        self.w_vector += delta_vector

        # remember max amount of change due to [s_hash][a_desc]
        delta = np.max( np.absolute( delta_vector ) )
        self.record_changes( s_hash, a_desc, delta )

        return abs(delta) # return the absolute value of change

    # ========================== pickle routines ===============================

    def make_pickle_filename(self, fname):
        """Make a file name ending with .qlf_pickle """
        if fname is None:
            fname = self.environment.name.replace(' ','_') + '.qlf_pickle'

        else:
            fname = fname.replace(' ','_').replace('.','_') + '.qlf_pickle'

        return fname

    def save_to_pickle_file(self, fname=None): # pragma: no cover
        """Saves data to pickle file."""
        # build name for pickle
        fname = self.make_pickle_filename( fname )

        saveD = {}
        saveD['saD'] = self.saD
        saveD['last_delta_QsaD'] = self.last_delta_QsaD
        saveD['w_vector'] = self.w_vector

        fileObject = open(fname,'wb')
        pickle.dump(saveD,fileObject, protocol=2)# protocol=2 is python 2&3 compatible.
        fileObject.close()
        print('Saved Baseline_Q_Func to file:',fname)

    def read_pickle_file(self, fname=None): # pragma: no cover
        """Reads data from pickle file."""

        fname = self.make_pickle_filename( fname )
        if not os.path.isfile( fname ):
            print('Pickle File NOT found:', fname)
            return None, None, None

        fileObject = open(fname,'rb')
        readD = pickle.load(fileObject)

        saD = readD['saD']
        last_delta_QsaD = readD['last_delta_QsaD']
        w_vector = readD['w_vector']

        fileObject.close()
        print('Read Baseline_Q_Func from file:',fname)

        return saD, last_delta_QsaD, w_vector

    def init_from_pickle_file(self, fname=None): # pragma: no cover
        """Initialize ActionValueColl from policy pickle file."""
        saD, last_delta_QsaD, w_vector = self.read_pickle_file( fname=fname )
        if saD:
            self.saD = saD
            self.w_vector = w_vector
            self.last_delta_QsaD = last_delta_QsaD
            self.N = len(self.w_vector)
            self.chgTracker = ChangeTracker()
            self.init_tracking()
        else:
            print('ERROR... Failed to read file:', fname)


    # ========================== summ_print ===============================

    def summ_print(self, fmt_Q='%.3f', none_str='*', show_states=True, 
                   show_last_change=True, show_policy=True):
        print()
        print('___ "%s" Action-Value Summary ___'%self.environment.name  )

        if self.environment.layout is not None:
            # make summ_print using environment.layout
            if show_states:
                self.environment.layout.s_hash_print( none_str='*' )

            row_tickL = self.environment.layout.row_tickL
            col_tickL = self.environment.layout.col_tickL
            x_axis_label = self.environment.layout.x_axis_label
            y_axis_label = self.environment.layout.y_axis_label

            d_max = self.get_max_last_delta_overall()
            if d_max==0.0:
                d_max = 1.0E-10

            rows_outL = []
            last_delta_rows_outL = [] # if show_last_change == True
            for row in self.environment.layout.s_hash_rowL:
                outL = []
                ld_outL = []
                for s_hash in row:
                    if not self.environment.is_legal_state( s_hash ):
                        if is_literal_str( s_hash ):
                            outL.append( s_hash[1:-1] )
                            ld_outL.append( s_hash[1:-1] )
                        else:
                            outL.append( none_str )
                            ld_outL.append( none_str )
                    else: # s_hash is a legal state hash
                        aL = self.environment.get_state_legal_action_list( s_hash )
                        sL = [str(s_hash)]
                        ld_sL = [str(s_hash)]
                        for a_desc in aL:
                            qsa = self.QsaEst( s_hash, a_desc )
                            s = fmt_Q%qsa
                            sL.append( '%s='%str(a_desc) + s.strip()  )
                            try:
                                d_val = int(100.0*self.last_delta_QsaD[s_hash].get( a_desc )/d_max)
                                if d_val > 0:
                                    lds = '%i%%'%d_val
                                    ld_sL.append( '%s='%str(a_desc) + lds.strip()  )
                                else:
                                    ld_sL.append( '%s~0'%str(a_desc) )
                            except:
                                ld_sL.append( '%s=None'%str(a_desc) )
                                
                        outL.append(  '\n'.join(sL).strip()  )
                        ld_outL.append(  '\n'.join(ld_sL).strip()  )
                rows_outL.append( outL )
                last_delta_rows_outL.append( ld_outL )

            print_string_rows( rows_outL, row_tickL=row_tickL, const_col_w=True,
                               line_chr='_', left_pad='    ', col_tickL=col_tickL,
                               header=self.environment.name + ' Action-Value Summary, Q(s,a)',
                               x_axis_label=x_axis_label, y_axis_label=y_axis_label,
                               justify='right')

            if show_last_change:
                print_string_rows( last_delta_rows_outL, row_tickL=row_tickL, const_col_w=True,
                                   line_chr='_', left_pad='    ', col_tickL=col_tickL,
                                   header=self.environment.name + ' Last %% of Max Change to Q(s,a) Summary, (max change=%g)'%d_max,
                                   x_axis_label=x_axis_label, y_axis_label=y_axis_label,
                                   justify='right')

            if show_policy:
                policy = self.get_policy()
                policy.summ_print(verbosity=0, environment=self.environment)

        # ------------------------- simple output w/o a layout ------------
        else:
            lmax_hash = 6

            outL = [] # list of strings "(s_hash,a_desc)=Q"
            for s_hash in self.environment.iter_all_states():
                aL = self.environment.get_state_legal_action_list( s_hash )
                for a_desc in aL:
                    qsa = self.QsaEst( s_hash, a_desc )
                
                    q = fmt_Q%qsa
                    s = '(%s, %s)='%(str(s_hash),str(a_desc)) + q.strip()
                    if show_last_change:
                        s = s + ' Last Delta = %s'%self.last_delta_QsaD[s_hash].get( a_desc, None)
                    
                    outL.append( s )
                    lmax_hash = max(lmax_hash, len(s))
            outL.sort() # sort in-place
            for s in outL:
                print('    ', s )
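
The sketch below shows one hedged way to drive Baseline_Q_Func with SARSA; env, env.start_state_hash and env.step() are assumptions, not part of the listing above.

# Hypothetical usage sketch: semi-gradient SARSA with the one-hot Q baseline.
# env, env.start_state_hash and env.step() are assumed stand-ins.
qfunc = Baseline_Q_Func(env)

for _episode in range(500):
    s_hash = env.start_state_hash
    a_desc = qfunc.get_best_eps_greedy_action(s_hash, epsgreedy_obj=None)
    while s_hash not in env.terminal_set:
        sn_hash, reward = env.step(s_hash, a_desc)
        an_desc = qfunc.get_best_eps_greedy_action(sn_hash, epsgreedy_obj=None)
        qfunc.sarsa_update(s_hash=s_hash, a_desc=a_desc, alpha=0.05, gamma=1.0,
                           sn_hash=sn_hash, an_desc=an_desc, reward=reward)
        s_hash, a_desc = sn_hash, an_desc

qfunc.summ_print(fmt_Q='%.2f')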
Example #4
class ActionValueColl(object):
    def __init__(self, environment, init_val=0.0):
        """
        A collection of action values, Q(s,a), one floating point value
        for each state-action pair in the environment.

        Each value can be updated with a learning rate (alpha).

        To get a value use:
        qsa.get_val( s_hash, a_desc )

        To update a value use:
        qsa.delta_update( s_hash, a_desc, delta)
        qsa.sarsa_update( s_hash, a_desc, alpha, gamma,
                          sn_hash, an_desc, reward)

        (Terminal States have Q(s,a) = 0.0)
        """

        self.environment = environment

        self.QsaD = {}  # index=s_hash value=aD (dict),  aD index=a_desc, value=Q(s,a) value, float

        # aD index=a_desc, value=last change to Q(s,a) value, float
        self.last_delta_QsaD = {}  # index=s_hash value=aD (dict)

        self.chgTracker = ChangeTracker()

        self.init_Qsa_to_val(init_val)

        self.init_val = init_val

    def get_number_of_changes(self):
        return self.chgTracker.get_number_of_changes()

    def merge_active_value_coll(self, av_coll_2):
        """Merge self and av_coll_2 into a single ActionValueColl object"""
        av_result = copy.deepcopy(self)
        for s_hash, aD in self.QsaD.items():
            for a_desc, Q in aD.items():
                av_result.QsaD[s_hash][a_desc] = (self.QsaD[s_hash][a_desc] +\
                                             av_coll_2.QsaD[s_hash][a_desc]) / 2.0
        return av_result

    def build_sv_from_av(self):
        """
        Build a StateValueColl from this ActionValueColl
        NOTE: Any policy derived directly from the resulting StateValueColl will 
        LIKELY BE DIFFERENT from a policy derived directly from this ActionValueColl.
        """

        sv = StateValueColl(self.environment)
        for s_hash, aD in self.QsaD.items():
            best_val = float('-inf')
            for a_desc, Q in aD.items():
                if self.QsaD[s_hash][a_desc] > best_val:
                    best_val = self.QsaD[s_hash][a_desc]
            sv.VsD[s_hash] = best_val

        return sv

    def num_Qsa(self):
        return len(self.QsaD)

    def init_Qsa_to_val(self, init_val):
        # initialize to init_val for all states, terminal = 0.0
        for s_hash in self.environment.iter_all_states():
            if s_hash not in self.QsaD:
                self.QsaD[s_hash] = {}
                self.last_delta_QsaD[s_hash] = {}

            # may not be any actions in terminal state, so set None action.
            if s_hash in self.environment.terminal_set:
                self.QsaD[s_hash][None] = 0.0
                self.last_delta_QsaD[s_hash][None] = 0.0

            aL = self.environment.get_state_legal_action_list(s_hash)
            for a_desc in aL:
                self.last_delta_QsaD[s_hash][a_desc] = 0.0

                # some terminal states have actions to themselves.
                if s_hash in self.environment.terminal_set:
                    self.QsaD[s_hash][a_desc] = 0.0
                else:
                    self.QsaD[s_hash][a_desc] = init_val

    def get_best_eps_greedy_action(self, s_hash, epsgreedy_obj=None):
        """
        Pick the best action for state "s_hash" based on max Q(s,a)
        If epsgreedy_obj is given, apply Epsilon Greedy logic to choice.
        """
        a_descL = self.environment.get_state_legal_action_list(s_hash)
        if a_descL:
            best_a_desc, best_a_val = a_descL[0], float('-inf')
            bestL = [best_a_desc]
            for a in a_descL:
                q = self.QsaD[s_hash][a]
                if q > best_a_val:
                    best_a_desc, best_a_val = a, q
                    bestL = [a]
                elif q == best_a_val:
                    bestL.append(a)

            best_a_desc = random.choice(bestL)
            if epsgreedy_obj is not None:
                best_a_desc = epsgreedy_obj(best_a_desc, a_descL)

            return best_a_desc
        return None

    def get_best_greedy_action(self, s_hash):
        return self.get_best_eps_greedy_action(s_hash)

    def get_max_Qsa(self, s_hash):
        """return the maximum Q(s,a) for state, s_hash."""
        a_best = self.get_best_greedy_action(s_hash)
        if a_best is None:
            return None
        return self.get_val(s_hash, a_best)

    def get_best_dbl_eps_greedy_action(self,
                                       av_coll_2,
                                       s_hash,
                                       epsgreedy_obj=None):
        """
        Pick the best action for state "s_hash" based on COMBINED max Q(s,a)
        If epsgreedy_obj is given, apply Epsilon Greedy logic to choice.
        """
        a_descL = self.environment.get_state_legal_action_list(s_hash)
        if a_descL:
            best_a_desc, best_a_val = a_descL[0], float('-inf')
            bestL = [best_a_desc]
            for a in a_descL:
                q1 = self.QsaD[s_hash][a]
                q2 = av_coll_2.QsaD[s_hash][a]
                q = q1 + q2

                if q > best_a_val:
                    best_a_desc, best_a_val = a, q
                    bestL = [a]
                elif q == best_a_val:
                    bestL.append(a)

            best_a_desc = random.choice(bestL)
            if epsgreedy_obj is not None:
                best_a_desc = epsgreedy_obj(best_a_desc, a_descL)

            return best_a_desc
        return None

    def record_changes(self, s_hash, a_desc, delta):
        """Keep track of changes made to Q(s,a) values"""

        delta = abs(delta)  # make sure that only absolute values are saved.

        # remove any record of last change to [s_hash][a_desc]
        self.chgTracker.dec_change(self.last_delta_QsaD[s_hash][a_desc])

        # add delta to tracking record
        self.chgTracker.inc_change(delta)

        # remember that delta was last change to  [s_hash][a_desc]
        self.last_delta_QsaD[s_hash][a_desc] = delta

    def get_biggest_action_state_err(self):
        """Estimate the biggest error in all the action values."""
        #print('self.chgTracker.get_biggest_change()', self.chgTracker.get_biggest_change())
        return self.chgTracker.get_biggest_change()

    def get_val(self, s_hash, a_desc):
        """Return the  Action-Value for (s_hash, a_desc)"""
        return self.QsaD[s_hash][a_desc]  # Allow key error

    def delta_update(self, s_hash='', a_desc='', delta=0.0):
        """
        Add delta to current value of Q(s,a) for s_hash, a_desc
        """
        self.QsaD[s_hash][a_desc] += delta

        # remember amount of change to [s_hash][a_desc]
        self.record_changes(s_hash, a_desc, delta)

    def qlearning_update(self,
                         s_hash='',
                         a_desc='',
                         sn_hash='',
                         alpha=0.1,
                         gamma=1.0,
                         reward=0.0):
        """
        Do a Q-Learning, Temporal-Difference-style learning rate update.
        Q(s,a) = Q(s,a) + alpha * [R + gamma* max(Q(s',a')) - Q(s,a)]
        """
        Qsat = self.QsaD[s_hash][a_desc]  # allow key error

        # find best Q(s',a')
        an_descL = self.environment.get_state_legal_action_list(sn_hash)

        if an_descL:
            best_a_desc, best_a_val = an_descL[0], float('-inf')
            for a in an_descL:
                q = self.QsaD[sn_hash][a]
                if q > best_a_val:
                    best_a_desc, best_a_val = a, q
        else:
            best_a_val = 0.0

        # use best Q(s',a') to update Q(s,a)
        target_val = reward + gamma * best_a_val
        delta = alpha * (target_val - Qsat)
        self.QsaD[s_hash][a_desc] += delta

        # remember amount of change to [s_hash][a_desc]
        self.record_changes(s_hash, a_desc, delta)

        return abs(delta)  # return the absolute value of change

    def dbl_qlearning_update(self,
                             av_coll_2,
                             s_hash='',
                             a_desc='',
                             sn_hash='',
                             alpha=0.1,
                             gamma=1.0,
                             reward=0.0):
        """
        Do a Double Q-Learning, Temporal-Difference-style learning rate update.
        Given a 2nd ActionValueColl, av_coll_2, update EITHER self, or av_coll_2.
        
        Q(s,a) = Q(s,a) + alpha * [R + gamma* max(Q(s',a')) - Q(s,a)]
        """

        # randomly decide which Q(s,a) to update, self or av_coll_2
        if random.random() < 0.5:
            # use best Q(s',a') to update "self" Q(s,a)
            Qsat = self.QsaD[s_hash][a_desc]  # allow key error
            best_a_desc = self.get_best_greedy_action(sn_hash)

            q = av_coll_2.QsaD[sn_hash][best_a_desc]
            target_val = reward + gamma * q
            delta = alpha * (target_val - Qsat)
            self.QsaD[s_hash][a_desc] += delta

            # remember amount of change to [s_hash][a_desc]
            self.record_changes(s_hash, a_desc, delta)
        else:
            # use best Q(s',a') to update "av_coll_2" Q(s,a)
            Qsat = av_coll_2.QsaD[s_hash][a_desc]  # allow key error
            best_a_desc = av_coll_2.get_best_greedy_action(sn_hash)

            q = self.QsaD[sn_hash][best_a_desc]
            target_val = reward + gamma * q
            delta = alpha * (target_val - Qsat)
            av_coll_2.QsaD[s_hash][a_desc] += delta

            # remember amount of change to [s_hash][a_desc]
            av_coll_2.record_changes(s_hash, a_desc, delta)

        return abs(delta)  # return the absolute value of change

    def sarsa_update(self,
                     s_hash='',
                     a_desc='',
                     alpha=0.1,
                     gamma=1.0,
                     sn_hash='',
                     an_desc='',
                     reward=0.0):
        """
        Do a SARSA, Temporal-Difference-style learning rate update.
        Q(s,a) = Q(s,a) + alpha * [R + gamma*Q(s',a') - Q(s,a)]
        """
        Qsat = self.QsaD[s_hash][a_desc]  # allow key error
        Qsatp1 = self.QsaD[sn_hash][an_desc]
        target_val = reward + gamma * Qsatp1

        delta = alpha * (target_val - Qsat)
        self.QsaD[s_hash][a_desc] += delta

        # remember amount of change to [s_hash][a_desc]
        self.record_changes(s_hash, a_desc, delta)

        return abs(delta)  # return the absolute value of change

    def expected_sarsa_update(self,
                              s_hash='',
                              a_desc='',
                              alpha=0.1,
                              gamma=1.0,
                              epsilon=0.1,
                              sn_hash='',
                              reward=0.0):
        """
        Do an Expected SARSA, Temporal-Difference-style learning rate update.
        Q(s,a) = Q(s,a) + alpha * [R + gamma * Expected[Q(s',a')] - Q(s,a)]
        """

        an_best = self.get_best_greedy_action(sn_hash)
        expected_val = (1.0 - epsilon) * self.QsaD[sn_hash][an_best]

        an_descL = self.environment.get_state_legal_action_list(sn_hash)
        if an_descL:
            frac = epsilon / len(an_descL)
            for an_desc in an_descL:
                expected_val += frac * self.QsaD[sn_hash][an_desc]

        target_val = reward + gamma * expected_val

        delta = alpha * (target_val - self.QsaD[s_hash][a_desc])
        self.QsaD[s_hash][a_desc] += delta

        # remember amount of change to [s_hash][a_desc]
        self.record_changes(s_hash, a_desc, delta)

        return abs(delta)  # return the absolute value of change

    def make_pickle_filename(self, fname):
        """Make a file name ending with .qsa_pickle """
        if fname is None:
            fname = self.environment.name.replace(' ', '_') + '.qsa_pickle'

        else:
            fname = fname.replace(' ', '_').replace('.', '_') + '.qsa_pickle'

        return fname

    def save_to_pickle_file(self, fname=None):  # pragma: no cover
        """Saves data to pickle file."""
        # build name for pickle
        fname = self.make_pickle_filename(fname)

        saveD = {}
        saveD['QsaD'] = self.QsaD

        fileObject = open(fname, 'wb')
        pickle.dump(saveD, fileObject,
                    protocol=2)  # protocol=2 is python 2&3 compatible.
        fileObject.close()
        print('Saved ActionValueColl to file:', fname)

    def read_pickle_file(self, fname=None):  # pragma: no cover
        """Reads data from pickle file."""

        fname = self.make_pickle_filename(fname)
        if not os.path.isfile(fname):
            print('Pickle File NOT found:', fname)
            return False

        fileObject = open(fname, 'rb')
        readD = pickle.load(fileObject)

        QsaD = readD['QsaD']

        fileObject.close()
        print('Read ActionValueColl from file:', fname)

        return QsaD

    def init_from_pickle_file(self, fname=None):  # pragma: no cover
        """Initialize ActionValueColl from policy pickle file."""
        QsaD = self.read_pickle_file(fname=fname)
        if QsaD:
            self.QsaD = QsaD

    def get_max_last_delta_overall(self):
        """ get biggest entry in self.last_delta_QsaD # index=s_hash value=aD (dict)"""
        d_max = 0.0
        for aD in self.last_delta_QsaD.values():
            for val in aD.values():
                d_max = max(d_max, abs(val))
        return d_max

    def get_policy(self):

        policy = Policy(environment=self.environment)
        for s_hash in self.environment.iter_all_action_states():
            a_desc = self.get_best_greedy_action(s_hash)
            policy.set_sole_action(s_hash, a_desc)
        return policy

    def summ_print(self,
                   fmt_Q='%.3f',
                   none_str='*',
                   show_states=True,
                   show_last_change=True,
                   show_policy=True):
        print()
        print('___ "%s" Action-Value Summary ___' % self.environment.name)

        if self.environment.layout is not None:
            # make summ_print using environment.layout
            if show_states:
                self.environment.layout.s_hash_print(none_str='*')

            row_tickL = self.environment.layout.row_tickL
            col_tickL = self.environment.layout.col_tickL
            x_axis_label = self.environment.layout.x_axis_label
            y_axis_label = self.environment.layout.y_axis_label

            d_max = self.get_max_last_delta_overall()
            if d_max == 0.0:
                d_max = 1.0E-10

            rows_outL = []
            last_delta_rows_outL = []  # if show_last_change == True
            for row in self.environment.layout.s_hash_rowL:
                outL = []
                ld_outL = []
                for s_hash in row:
                    if not self.environment.is_legal_state(s_hash):
                        if is_literal_str(s_hash):
                            outL.append(s_hash[1:-1])
                            ld_outL.append(s_hash[1:-1])
                        else:
                            outL.append(none_str)
                            ld_outL.append(none_str)
                    else:
                        aD = self.QsaD[s_hash]
                        sL = [str(s_hash)]
                        ld_sL = [str(s_hash)]
                        for a_desc, qsa in aD.items():
                            s = fmt_Q % qsa
                            sL.append('%s=' % str(a_desc) + s.strip())
                            try:
                                d_val = int(
                                    100.0 *
                                    self.last_delta_QsaD[s_hash].get(a_desc) /
                                    d_max)
                                if d_val > 0:
                                    lds = '%i%%' % d_val
                                    ld_sL.append('%s=' % str(a_desc) +
                                                 lds.strip())
                                else:
                                    ld_sL.append('%s~0' % str(a_desc))
                            except:
                                ld_sL.append('%s=None' % str(a_desc))

                        outL.append('\n'.join(sL).strip())
                        ld_outL.append('\n'.join(ld_sL).strip())
                rows_outL.append(outL)
                last_delta_rows_outL.append(ld_outL)

            print_string_rows(rows_outL,
                              row_tickL=row_tickL,
                              const_col_w=True,
                              line_chr='_',
                              left_pad='    ',
                              col_tickL=col_tickL,
                              header=self.environment.name +
                              ' Action-Value Summary, Q(s,a)',
                              x_axis_label=x_axis_label,
                              y_axis_label=y_axis_label,
                              justify='right')

            if show_last_change:
                print_string_rows(
                    last_delta_rows_outL,
                    row_tickL=row_tickL,
                    const_col_w=True,
                    line_chr='_',
                    left_pad='    ',
                    col_tickL=col_tickL,
                    header=self.environment.name +
                    ' Last %% of Max Change to Q(s,a) Summary, (max change=%g)'
                    % d_max,
                    x_axis_label=x_axis_label,
                    y_axis_label=y_axis_label,
                    justify='right')

            if show_policy:
                policy = self.get_policy()
                policy.summ_print(verbosity=0, environment=self.environment)

        # ------------------------- simple output w/o a layout ------------
        else:
            lmax_hash = 6

            outL = []  # list of strings "(s_hash,a_desc)=Q"
            for s_hash in self.QsaD.keys():
                for a_desc, qsa in self.QsaD[s_hash].items():
                    q = fmt_Q % self.QsaD[s_hash][a_desc]
                    s = '(%s, %s)=' % (str(s_hash), str(a_desc)) + q.strip()
                    if show_last_change:
                        s = s + ' Last Delta = %s' % self.last_delta_QsaD[
                            s_hash].get(a_desc, None)

                    outL.append(s)
                    lmax_hash = max(lmax_hash, len(s))
            outL.sort()  # sort in-place
            for s in outL:
                print('    ', s)
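
Finally, a hedged sketch of tabular Q-learning with ActionValueColl; as before, env, env.start_state_hash and env.step() are assumed stand-ins for a real environment object.

# Hypothetical usage sketch: tabular Q-learning with ActionValueColl.
# env, env.start_state_hash and env.step() are assumed stand-ins.
qsa = ActionValueColl(env, init_val=0.0)

for _episode in range(1000):
    s_hash = env.start_state_hash
    while s_hash not in env.terminal_set:
        a_desc = qsa.get_best_eps_greedy_action(s_hash, epsgreedy_obj=None)
        sn_hash, reward = env.step(s_hash, a_desc)
        qsa.qlearning_update(s_hash=s_hash, a_desc=a_desc, sn_hash=sn_hash,
                             alpha=0.1, gamma=0.9, reward=reward)
        s_hash = sn_hash

policy = qsa.get_policy()
policy.summ_print(verbosity=0, environment=env)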