def evaluate(index):
    """Return a heuristic score for playing ``cards[index]``.

    Reads ``cards``, ``stacks``, ``NUM_CARDS``, ``STACK_VOL`` and
    ``count_nimmts`` from the enclosing scope.

    Three cases are scored:

    * no stack ends in a value below the card: scaled minimum of
      ``count_nimmts`` over all stacks;
    * the chosen stack already holds ``STACK_VOL`` cards: scaled
      ``count_nimmts`` of that stack;
    * otherwise: the gap to the stack top, divided by the remaining room,
      plus a look-ahead term based on the following entry of ``cards``.

    NOTE(review): the original indentation was lost; the look-ahead term is
    assumed to belong to the third case only -- confirm against history.
    """
    # Force float division so the ratios are not truncated to integers
    # when the operands are ints under Python 2 division semantics.
    total = float(NUM_CARDS)
    card = cards[index]

    # Boolean-mask indexing: ``stacks`` has to support NumPy mask selection.
    choices = stacks[np.array([s[-1] < card for s in stacks])]
    if len(choices) == 0:
        return (1.0 - card / total) * min(count_nimmts(s) for s in stacks)

    # Among the stacks whose top is below the card, take the highest top.
    stack = choices[np.argmax([s[-1] for s in choices])]
    if len(stack) == STACK_VOL:
        return (1.0 - (card - stack[-1]) / total) * count_nimmts(stack)

    e = ((-0.1 + card - stack[-1]) / total) / (STACK_VOL - len(stack))
    # Following entry of ``cards``, or NUM_CARDS when this is the last one.
    following = cards[index + 1] if index < len(cards) - 1 else NUM_CARDS
    # +1 when the stack is one short of STACK_VOL, otherwise -1.
    sign = (len(stack) == STACK_VOL - 1) * 2 - 1
    e += sign * (1.0 - (following - card)) / total
    return e
def evaluate(index):
    """Return a heuristic score for playing ``cards[index]``.

    Reads ``cards``, ``stacks``, ``NUM_CARDS``, ``STACK_VOL`` and
    ``count_nimmts`` from the enclosing scope.

    Three cases are scored:

    * no stack ends in a value below the card: scaled minimum of
      ``count_nimmts`` over all stacks;
    * the chosen stack already holds ``STACK_VOL`` cards: scaled
      ``count_nimmts`` of that stack;
    * otherwise: a gap term weighted by ``count_nimmts`` of the stack,
      adjusted by a look-ahead term weighted by ``count_nimmts`` of the
      stack with the card appended.

    NOTE(review): the original indentation was lost; the look-ahead
    adjustment is assumed to belong to the third case only -- confirm.
    """
    # Force float division so the ratios are not truncated to integers
    # when the operands are ints under Python 2 division semantics.
    total = float(NUM_CARDS)
    card = cards[index]

    # Boolean-mask indexing: ``stacks`` has to support NumPy mask selection.
    choices = stacks[np.array([s[-1] < card for s in stacks])]
    if len(choices) == 0:
        return (1.0 - card / total) * min(count_nimmts(s) for s in stacks)

    # Among the stacks whose top is below the card, take the highest top.
    stack = choices[np.argmax([s[-1] for s in choices])]
    if len(stack) == STACK_VOL:
        return (1.0 - (card - stack[-1]) / total) * count_nimmts(stack)

    e = (((-0.1 + card - stack[-1]) / total)
         / (STACK_VOL - len(stack)) * count_nimmts(stack))

    # Smallest value above ``card`` among the stack tops, capped at
    # NUM_CARDS, and also the following entry of ``cards`` unless this
    # card equals the last entry.
    nearest = NUM_CARDS
    for s in stacks:
        if s[-1] > card:
            nearest = min(nearest, s[-1])
    if card != cards[-1]:
        nearest = min(nearest, cards[index + 1])

    adjustment = ((nearest - card - 1.0) / total
                  * count_nimmts(stack + [card]))
    # NOTE(review): the literal 4 presumably stands for STACK_VOL - 1;
    # kept as written -- confirm before parameterizing.
    if len(stack) == 4:
        e -= adjustment
    else:
        e += adjustment
    return e
def normalize(dat):
    """Flatten one sample into a list of numeric features.

    ``dat`` must provide ``dat['action']`` and ``dat['state']`` with the
    keys ``'card_stacks'``, ``'card_status'``, ``'agent_id'`` and
    ``'hand_cards'``. Only the first four stacks are inspected.

    Features are appended in this order:

    1. for each of the 16 subsets of the four stacks, an AND and an OR of
       "action is below that stack's top";
    2. the action and the four stack tops, each divided by
       ``pyrl.common.num_agent_init_card``;
    3. the first ``num_base_ind`` base indicators, then pairwise
       conjunctions of indicators 0..19 with indicators 20 and up;
    4. 50 conjunctions each over the index tables ``ind_2`` and ``ind_3``
       (the latter used twice, see the note below);
    5. 3 x 13 counts of card status per block of 8 card indices;
    6. three ``count_nimmts`` features for the target stack (or the minimum
       over all stacks when no stack can take the action);
    7. ``count_nimmts`` of the agent's hand;
    8. 3 x 7 counts of card status per ``num_nimmt`` value.

    Returns:
        list: the feature values (ints and floats).
    """
    action = dat['action']
    state = dat['state']
    card_stacks = state['card_stacks']
    card_status = state['card_status']
    agent_id = state['agent_id']
    hand_card = state['hand_cards'][agent_id]

    scale = pyrl.common.num_agent_init_card
    tops = [card_stacks[j][-1] for j in range(4)]
    below = [action < top for top in tops]
    above = [action > top for top in tops]

    bits = []

    # Bit j of ``mask`` selects stack j. The empty subset yields AND = 1
    # and OR = 0.
    for mask in range(16):
        members = [below[j] for j in range(4) if (1 << j) & mask]
        bits.append(int(all(members)))
        bits.append(int(any(members)))

    bits.append(1.0 * action / scale)
    for top in tops:
        bits.append(1.0 * top / scale)

    # Base indicators: 20 stack-length flags, 8 placement flags, 1 "no
    # target" flag (29 entries are built here).
    # NOTE(review): j runs over 0..4, so a stack of length 5 sets no length
    # flag -- confirm whether j + 1 was intended.
    tmp = [j == len(card_stacks[i]) for i in range(4) for j in range(5)]

    # Target stack: the one with the highest top that is still below the
    # action, or -1 when there is none.
    target_stack = -1
    for j in range(4):
        if above[j] and (target_stack == -1
                         or tops[j] > tops[target_stack]):
            target_stack = j

    for i in range(4):
        tmp.append(above[i])
        tmp.append(target_stack == i)
    tmp.append(target_stack == -1)

    for i in range(num_base_ind):
        bits.append(int(tmp[i]))
    for i in range(20):
        for j in range(20, num_base_ind):
            bits.append(int(tmp[i] and tmp[j]))

    pairs = ind_2
    triples = ind_3
    # NOTE(review): the 4-way conjunction reads from ind_3 as well, which
    # looks like a typo for a separate 4-tuple table. Kept unchanged
    # because no such table is visible here -- confirm.
    quads = ind_3
    for i in range(50):
        # Explicit ``and`` chains keep short-circuit evaluation: later
        # indices are only looked up when the earlier flags are true.
        bits.append(int(tmp[pairs[i][0]] and tmp[pairs[i][1]]))
        bits.append(int(tmp[triples[i][0]] and tmp[triples[i][1]]
                        and tmp[triples[i][2]]))
        bits.append(int(tmp[quads[i][0]] and tmp[quads[i][1]]
                        and tmp[quads[i][2]] and tmp[quads[i][3]]))

    # 13 buckets of 8 consecutive card indices each, one row per status.
    # Floor division keeps the bucket index an int under true division.
    status_bucket = [[0] * 13 for _ in range(3)]
    for i in range(num_cards):
        status_bucket[card_status[i]][i // 8] += 1
    bits = bits + status_bucket[0] + status_bucket[1] + status_bucket[2]

    if target_stack == -1:
        # No stack accepts the action: the same minimum fills all three
        # slots.
        cheapest = min([count_nimmts(s) for s in card_stacks])
        bits.extend([cheapest, cheapest, cheapest])
    else:
        target = card_stacks[target_stack]
        cost = count_nimmts(target)
        bits.append(cost)
        bits.append(cost * int(len(target) >= 5))
        bits.append(cost * int(len(target) >= 4))

    bits.append(count_nimmts(hand_card))

    # One row per status, one column per num_nimmt value (1..7).
    punish_bucket = [[0] * 7 for _ in range(3)]
    for i in range(num_cards):
        punish_bucket[card_status[i]][num_nimmt(i) - 1] += 1
    bits = bits + punish_bucket[0] + punish_bucket[1] + punish_bucket[2]

    # Original author's note: 459 features in total (not re-verified).
    return bits
def policy_min(self, agentEnv):
    """Pick the stack with the smallest ``count_nimmts`` value.

    Args:
        agentEnv: mapping that provides the stacks under ``'card_stacks'``.

    Returns:
        The ``np.argmin`` index over the per-stack ``count_nimmts`` values.
    """
    penalties = [count_nimmts(pile) for pile in agentEnv['card_stacks']]
    return np.argmin(penalties)