def makeCPTpure(self, setCPT=True):
    """Convert a mixed CPT to a pure CPT.

    :arg setCPT: if True (default), the CPT attribute is converted to a pure
       CPT in place. Otherwise, the pure CPT is returned.
    :type setCPT: bool

    .. note::

       Whenever there are multiple argmaxes, each gets equal probability in
       the resulting "pure" CPT.

    """
    if setCPT:
        self.CPT = convert_2_pureCPT(self.CPT)
    else:
        return convert_2_pureCPT(copy.copy(self.CPT))
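# A minimal usage sketch for makeCPTpure (the node `dn` is an illustrative
# assumption, i.e. an existing DecisionNode whose CPT rows are mixed):
#
#     dn.makeCPTpure()                      # in place: each row becomes a unit
#                                           # mass on its argmax (ties split evenly)
#     pure = dn.makeCPTpure(setCPT=False)   # or return the pure CPT without
#                                           # modifying dn.CPT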
def train_node(self, nodename, level, logit=False, setCPT=False,
               verbose=False):
    """Compute the level-k best response at the given decision node.

    :arg nodename: the name of the decision node where MCEUs are estimated.
    :type nodename: str
    :arg level: the level at which to train that player.
    :type level: int
    :arg logit: if True, use a logit (quantal) response weighted by the node's
       beta attribute instead of a pure best response. Default is False.
    :type logit: bool
    :arg setCPT: if True, the trained CPT is also set as the node's current
       CPT. Otherwise, it can be accessed through node.LevelCPT. Default is
       False.
    :type setCPT: bool
    :arg verbose: passed through to mceu for progress output. Default is False.
    :type verbose: bool

    Notes
    -----
    When training a player at level k, the other players' CPTs are taken from
    node.LevelCPT[k-1] for each of their decision nodes.

    """
    print 'Training ' + nodename + ' at level ' + str(level)
    Game = copy.deepcopy(self.Game)  # copy in order to maintain the original CPTs
    ps = self.specs
    for node in Game.node_dict.values():  # Game changes, self.Game doesn't
        if type(node) is pynfg.DecisionNode:
            try:
                node.CPT = node.LevelCPT[level - 1]
            except KeyError:
                raise KeyError('Need to train other players at level %s'
                               % str(level - 1))
    EUtable = mceu(Game, nodename, Game.node_dict[nodename].N,
                   Game.node_dict[nodename].tol,
                   Game.node_dict[nodename].delta, verbose=verbose)
    if not logit:
        self.Game.node_dict[nodename].LevelCPT[level] = \
            convert_2_pureCPT(EUtable)
        if setCPT:
            self.Game.node_dict[nodename].CPT = convert_2_pureCPT(EUtable)
    else:
        weight = np.exp(Game.node_dict[nodename].beta * EUtable)
        norm = np.sum(weight, axis=-1)
        self.Game.node_dict[nodename].LevelCPT[level] = \
            weight / norm[..., np.newaxis]
        if setCPT:
            self.Game.node_dict[nodename].CPT = weight / norm[..., np.newaxis]
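# A hedged usage sketch for level-k training (the solver instance `brsolver`
# and the node name 'D1' are illustrative assumptions, not from the source):
#
#     brsolver.train_node('D1', level=1)               # pure level-1 best response
#     brsolver.train_node('D1', level=2, logit=True)   # logit response using beta
#     cpt = brsolver.Game.node_dict['D1'].LevelCPT[2]  # retrieve the trained CPT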
def bestresponse_node(Game, dn, N, delta=1, tol=30, verbose=False):
    """Compute the best response at the decision node of the given Game.

    :arg Game: the Network Form Game of interest.
    :type Game: SemiNFG or iterSemiNFG
    :arg dn: the name of the decision node where MCEUs are estimated.
    :type dn: str
    :arg N: the max number of iterations for the estimation.
    :type N: int
    :arg delta: the discount factor used by mceu. Default is 1.
    :type delta: float
    :arg tol: the minimum number of samples per parent value.
    :type tol: int

    """
    G = copy.deepcopy(Game)
    EUtable = mceu(G, dn, N, tol, delta, verbose)
    G.node_dict[dn].CPT = convert_2_pureCPT(EUtable)
    return G
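# A minimal sketch of calling bestresponse_node (the game object `G` and the
# node name 'D1' are illustrative assumptions):
#
#     G_br = bestresponse_node(G, 'D1', N=100, tol=30, verbose=True)
#     print G_br.node_dict['D1'].CPT  # the pure best-response CPT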
def train_node(self, bn, level, setCPT=False):
    """Solve for the optimal policy using Optimistic Q-learning.

    Optimistic Q-learning is an off-policy TD control RL algorithm.

    :arg bn: the basename of the node with the CPT to be trained.
    :type bn: str
    :arg level: the level at which to train the basename.
    :type level: int
    :arg setCPT: if True, the trained CPT is set as the current CPT for every
       time step of the basename in self.Game. Default is False.
    :type setCPT: bool

    """
    print 'Training ' + bn + ' at level ' + str(level)
    Game = copy.deepcopy(self.Game)
    ps = self.specs
    player = Game.bn_part[bn][0].player
    w, d, N, r_max = ps[player]['w'], ps[player]['delta'], \
        ps[player][bn]['N'], ps[player][bn]['r_max']
    # Set the other players' CPTs to level-1. Works even if CPTs aren't pointers.
    for o_player in Game.players:
        bn_list = list(set(map(lambda x: x.basename, Game.partition[o_player])))
        for base in bn_list:
            if base != bn:
                for dn in Game.bn_part[base]:
                    try:
                        dn.CPT = self.trained_CPTs[o_player][base][level - 1]
                    except KeyError:
                        raise KeyError('Need to train other players at level %s'
                                       % str(level - 1))
    T0 = Game.starttime  # get the start time
    T = Game.endtime + 1  # get the end time
    shape = Game.bn_part[bn][T0].CPT.shape  # the shape of the CPT
    if d < 1:
        Q0 = r_max * ((1 - d ** (T - T0)) / (1 - d))  # the optimistic initial Q value
    else:
        Q0 = r_max * (T - T0)
    Q = Q0 * np.ones(shape)  # the initial Q table
    visit = np.zeros(shape)  # the number of times each (m, a) pair has been visited
    r_av = 0  # the dynamic (discounted) average reward
    rseries = []  # a series of average rewards
    for ep in xrange(N):
        print ep
        # convert the Q table to a CPT
        Game.bn_part[bn][T0].CPT = convert_2_pureCPT(Q)
        Game.sample_timesteps(T0, T0)  # sample the start time step
        malist = Game.bn_part[bn][T0].dict2list_vals(
            valueinput=Game.bn_part[bn][T0].value)
        # get the (m, a) pair from the iterated semi-NFG
        mapair = Game.bn_part[bn][T0].get_CPTindex(malist)  # get the CPT index
        r = Game.reward(player, T0)  # get the (discounted) reward
        if ep != 0:  # to avoid a "divided by 0" error
            r_av_new = r_av + (r - r_av) / ((T - 1) * ep)  # update the dynamic reward
        Qmax = Q[mapair]  # get the maximum Q value
        for t in xrange(T0 + 1, T):
            Game.bn_part[bn][t].CPT = convert_2_pureCPT(Q)  # convert the Q table to a CPT
            Game.sample_timesteps(t, t)  # sample the current time step
            if t != (T - 1):  # required by Q-learning
                r = d ** t * Game.reward(player, t)  # get the (discounted) reward
                r_av_new = r_av + (r - r_av) / ((T - 1) * ep + t)  # update the reward
            malist_new = Game.bn_part[bn][t].dict2list_vals(
                valueinput=Game.bn_part[bn][t].value)
            mapair_new = Game.bn_part[bn][t].get_CPTindex(malist_new)
            visit[mapair] = visit[mapair] + 1  # update the number of visits
            alpha = (1 / (1 + visit[mapair])) ** w  # the learning rate
            Qmax_new = Q[mapair_new]  # the new maximum Q value
            Q[mapair] = Qmax + alpha * (r + d * Qmax_new - Qmax)  # update the Q table
            mapair = mapair_new
            Qmax = Qmax_new
            r_av = r_av_new
        rseries.append(r_av)
    self.trained_CPTs[player][bn][level] = Game.bn_part[bn][0].CPT
    plt.figure()
    plt.plot(rseries, label=str(bn + ' Level ' + str(level)))
    # plot rseries to gauge convergence
    plt.legend()
    fig = plt.gcf()
    self.figs[bn][str(level)] = fig
    if setCPT:
        map(lambda x: _setallCPTs(self.Game, bn, x, Game.bn_part[bn][0].CPT),
            np.arange(T0, T))
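# A hedged usage sketch for the Q-learning trainer (the solver instance `ql`,
# the player name 'p1', and the basename 'D1' are illustrative assumptions;
# the specs dict is assumed to provide 'w', 'delta', 'N', and 'r_max' as the
# method above requires):
#
#     ql.train_node('D1', level=1)               # train level 1 vs. level-0 others
#     cpt = ql.trained_CPTs['p1']['D1'][1]       # the trained level-1 CPT
#     ql.figs['D1']['1'].savefig('D1_lvl1.png')  # save the convergence plot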