def step(self):
    if len(self.clauses) == 0:
        return torch.tensor(0, dtype=torch.float32)
    binding_dict, new_state = self.interp.state_query(
        self.state, self.clauses[-1])
    self.state = new_state
    # update the success and possible fields
    self.check_possible(binding_dict)
    if not self.possible:
        self.obj_poss_left.append(0)
    elif "var_0" not in binding_dict.keys():
        self.obj_poss_left.append(self.obj_nums)
        self.update_data(binding_dict)
    else:
        self.obj_poss_left.append(len(binding_dict["var_0"]))
        self.update_data(binding_dict)
    self.check_success(binding_dict)
    # TODO: the reward function needs to be updated
    reward = get_reward(self)
    # print(reward)
    return torch.tensor(reward, dtype=torch.float32)
def step(self):
    if len(self.clauses) == 0:
        return torch.tensor(0, dtype=torch.float32)
    # binding_dict = query(self.graph.scene, self.clauses, self.config)
    next_clause = self.clauses[-1]
    useful = self.interp.useful_check(self.interp_state, next_clause)
    logging.info(f"useful: {useful}")
    binding_dict, new_state = self.interp.state_query(
        self.interp_state, next_clause)
    self.interp_state = new_state
    # update the success and possible fields
    self.check_possible(binding_dict)
    if not self.possible:
        self.obj_poss_left.append(0)
    elif "var_0" not in binding_dict.keys():
        self.obj_poss_left.append(self.obj_nums)
    else:
        self.obj_poss_left.append(len(binding_dict["var_0"]))
    self.check_success(binding_dict)
    done = self.success or (not self.possible)
    if not useful:
        reward = torch.tensor(-1, dtype=torch.float32)
    else:
        reward = torch.tensor(get_reward(self), dtype=torch.float32)
    return reward
def step(self, action_idx):
    is_uncertain = self.is_uncertain
    next_clause = self.actions[action_idx]
    if self.ref_flag:
        for element in next_clause:
            if type(element) == int:
                if element not in self.ref:
                    self.ref.append(element)
        self.unreachable = self.unreachable_dict[str(sorted(self.ref))]
    # if 'red' in next_clause or 'blue' in next_clause:
    #     print('here')
    self.idx_selected.append(action_idx)
    self.clauses.append(next_clause)
    useful = self.interp.useful_check(self.state, next_clause)
    logging.info(f"useful: {useful}")
    binding_dict, new_state = self.interp.state_query(
        self.state, next_clause)
    self.state = new_state
    # update the success and possible fields
    self.check_possible(binding_dict)
    if not self.possible:
        self.obj_poss_left.append(0)
    elif "var_0" not in binding_dict.keys():
        self.obj_poss_left.append(self.obj_nums)
        self.update_data(binding_dict)
    else:
        self.obj_poss_left.append(len(binding_dict["var_0"]))
        self.update_data(binding_dict)
    # update whether done or not
    self.check_success(binding_dict)
    done = self.success or (not self.possible)
    if not useful:
        reward = torch.tensor(-1, dtype=torch.float32)
    else:
        reward = torch.tensor(get_reward(self), dtype=torch.float32)
    self.update_data(binding_dict)
    info = {}
    logging.info(f"selected: {self.idx_selected}")
    logging.info(f"done: {done}")
    logging.info(f"success: {self.success}")
    if self.ref_flag:
        state = self.get_state(self.unreachable)
    else:
        state = self.get_state()
    return state, reward, done, info
def compute_q(MDP, V, s, a):
    S, A, R, P, gamma = MDP
    # sum over the states that may follow after taking action a in state s
    q_sa = 0.0
    for s_prime in S:
        q_sa += get_prob(P, s, a, s_prime) * get_value(V, s_prime)
    q_sa = get_reward(R, s, a) + gamma * q_sa
    return q_sa
def _expansion_simulation(self, leaf_id, win_index):
    leaf_board = self.tree[leaf_id]['board']
    current_player = self.tree[leaf_id]['player']

    if win_index == 0:
        # expansion
        actions = utils.valid_actions(leaf_board)
        for action in actions:
            action_index = action[1]
            child_id = leaf_id + (action_index, )
            child_board = utils.get_board(child_id, self.board_size)
            next_turn = utils.get_turn(child_id)
            self.tree[child_id] = {'board': child_board,
                                   'player': next_turn,
                                   'parent': leaf_id,
                                   'child': [],
                                   'n': 0.,
                                   'w': 0.,
                                   'q': 0.}
            self.tree[leaf_id]['child'].append(action_index)

        if self.tree[leaf_id]['parent']:
            # simulation
            board_sim = leaf_board.copy()
            turn_sim = current_player
            while True:
                actions_sim = utils.valid_actions(board_sim)
                action_sim = actions_sim[np.random.choice(len(actions_sim))]
                coord_sim = action_sim[0]
                if turn_sim == 0:
                    board_sim[coord_sim] = 1
                else:
                    board_sim[coord_sim] = -1
                win_idx_sim = utils.check_win(board_sim, self.win_mark)
                if win_idx_sim == 0:
                    turn_sim = abs(turn_sim - 1)
                else:
                    reward = utils.get_reward(win_idx_sim, leaf_id)
                    return reward
        else:
            # root node: skip simulation
            reward = 0.
            return reward
    else:
        # terminal node: skip expansion
        reward = 1.
        return reward
def get_her_transitions(self, stnd_replay):
    '''
    Params:
        @ stnd_replay : base transitions that actually occurred
    '''
    new_transitions = []
    for i in range(len(stnd_replay)):
        try:
            samples = random.sample(stnd_replay[i + 1:], self.k)
        except ValueError:
            # use everything remaining if the sample population is smaller than k
            samples = stnd_replay[i + 1:]
        for sample in samples:
            new_goal = np.asarray(sample[3][:3])
            for transition in stnd_replay[i:]:
                # If the current transition reaches the new goal state, set the
                # reward to 0.0 and break out of the loop. Otherwise create a
                # new transition with the new goal and add it to the trajectory.
                new_state = transition[0][:-3]
                new_next_state = transition[3][:-3]
                if np.all(new_next_state[:3] == new_goal):
                    new_reward = 0.0
                    break
                else:
                    new_reward = get_reward(self.env, new_next_state[:3], new_goal)
                # normalize values before concatenating
                new_goal = normalizer(new_goal)
                new_state = normalizer(new_state, 5.0)
                new_next_state = normalizer(new_next_state, 5.0)
                action = normalizer(transition[1], 5.0)
                new_state = np.concatenate((new_state, new_goal), axis=0)
                new_next_state = np.concatenate((new_next_state, new_goal), axis=0)
                new_transition = [new_state, action, new_reward, new_next_state]
                new_transitions.append(new_transition)
    return new_transitions
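The HER relabeling above calls a goal-conditioned get_reward(env, achieved, goal) helper that is not shown; a minimal sketch, assuming the usual sparse hindsight reward (0 at the goal, -1 otherwise, with a hypothetical distance threshold), could look like this:

import numpy as np

def get_reward(env, achieved_pos, goal, threshold=0.05):
    # Hypothetical sparse, goal-conditioned reward: 0.0 once the achieved
    # position is within `threshold` of the goal, -1.0 otherwise. The real
    # helper may use env-specific distances or a dense penalty instead.
    distance = np.linalg.norm(np.asarray(achieved_pos) - np.asarray(goal))
    return 0.0 if distance < threshold else -1.0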
def compute_q(MDP, V, s, a):
    '''Given the MDP and the value function V, compute the value q(s, a)
    of the state-action pair (s, a). Equation 2.16.
    '''
    S, A, R, P, gamma = MDP
    q_sa = 0
    for s_prime in S:
        q_sa += get_prob(P, s, a, s_prime) * get_value(V, s_prime)
    q_sa = get_reward(R, s, a) + gamma * q_sa
    return q_sa
def compute_q(MDP, V, s, a):
    r'''Given the MDP and the value function V, compute the value q(s, a)
    of the state-action pair (s, a) using the formula

    $$q_{\pi}(s, a) = R^a_s + \gamma \sum_{s' \in S} P^a_{ss'} v_{\pi}(s')$$
    '''
    S, A, R, P, gamma = MDP
    q_sa = 0
    for s_prime in S:
        q_sa += get_prob(P, s, a, s_prime) * get_value(V, s_prime)
    q_sa = get_reward(R, s, a) + gamma * q_sa
    return q_sa
def compute_q(MDP, V, s, a):
    '''Given the MDP and the value function V, compute the value q(s, a)
    of the state-action pair (s, a).
    '''
    S, A, R, P, gamma = MDP
    q_sa = 0
    print('For action ' + str(a) + ', compute its action value by summing over the values of all successor states')
    for s_prime in S:
        print('State ' + str(s) + ' via action ' + str(a) + ': transition probability ' + str(get_prob(P, s, a, s_prime)) + ' and successor value ' + str(get_value(V, s_prime)))
        q_sa += get_prob(P, s, a, s_prime) * get_value(V, s_prime)
        print('State ' + str(s) + ' via action ' + str(a) + ': running expectation ' + str(q_sa))
    q_sa = get_reward(R, s, a) + gamma * q_sa
    print('The resulting action value is ' + str(q_sa))
    return q_sa
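The compute_q variants above rely on get_prob, get_value and get_reward helpers whose definitions are not shown; a minimal, self-contained sketch of those assumptions (the actual MDP encoding in the source may differ) is:

# Assumed helpers for the compute_q variants above. Here P, R and V are plain
# dicts keyed by (state, action, next_state), (state, action) and state; the
# original project may encode them differently.

def get_prob(P, s, a, s_prime):
    # transition probability P(s' | s, a), defaulting to 0 for missing entries
    return P.get((s, a, s_prime), 0.0)

def get_value(V, s):
    # state value v(s)
    return V.get(s, 0.0)

def get_reward(R, s, a):
    # immediate reward R(s, a)
    return R.get((s, a), 0.0)

# tiny two-state example (hypothetical data)
S = ['s0', 's1']
A = ['a']
P = {('s0', 'a', 's1'): 1.0}
R = {('s0', 'a'): 1.0}
V = {'s0': 0.0, 's1': 2.0}
MDP = (S, A, R, P, 1.0)
# compute_q(MDP, V, 's0', 'a') -> 1.0 + 1.0 * (1.0 * 2.0) = 3.0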
def main():
    np.random.seed(seed)

    # Initialize the model architecture
    model = QMap((dx, dy, n_features), n_actions, seed=seed, load_model=save_path)

    game = pm.Game(dx=dx, dy=dy,
                   number_of_turns=episode_length,
                   default_capacity=25,
                   servable_distance=3.0,
                   initial_cost=50.0,
                   operating_cost=25.0,
                   profit_margin=5.0,
                   unserviced_penalty=1.0)
    display = pm.GameDisplay(game, box_size, pop_scale)

    # The initial game state
    total_return = 0
    prev_serviced = [0]  # List for easy mutable argument

    for step in range(episode_length):
        s = get_state(game)
        assert (s.shape[1] == dx and s.shape[2] == dy and s.shape[3] == n_features)
        display.update()

        # predicted action-value and action taken
        q, a = [np.squeeze(item) for item in model.predict_q(s)]
        take_action(game, a)  # actually take the selected action
        r = get_reward(game, prev_serviced)  # See what the reward is
        total_return += r
        print("Q-Value: %f, Action selected: %s, Reward: %d" % (q, str(a), r))
def main(lamb, R):
    # read in files
    tmp = []
    with open("../session.txt", "r") as f:
        for line in f:
            line = line.strip().rstrip(',').split(',')
            line = list(map(float, line))
            tmp.append(line)
    f.close()
    data = np.asarray(tmp)

    tmp = []
    with open("../user_feature.txt", "r") as f:
        for line in f:
            line = line.strip().rstrip(',').split(',')
            line = list(map(float, line))
            tmp.append(line)
    f.close()
    user = np.asarray(tmp)

    tmp = []
    with open("../app_feature.txt", "r") as f:
        for line in f:
            line = line.strip().rstrip(',').split(',')
            line = list(map(float, line))
            tmp.append(line)
    f.close()
    app = np.asarray(tmp)

    # initialization
    pool_size = app.shape[0]
    # user_row_n = user.shape[1]
    # feature length
    app_row_n = app.shape[1]
    d = app_row_n - 1
    session_n = int(max(data[:, 0]))
    train_ratio = 0.5
    gamma = 1
    # lamb = 0.1
    theta = np.zeros(d)
    beta = 1
    delta = 0.9
    V = lamb * np.eye(d)
    X = np.zeros((1, d), dtype=np.float)
    Y = np.zeros(1)
    x_feature = np.zeros((pool_size, d), dtype=np.float)
    UCB = np.zeros(pool_size, dtype=np.float)
    session = np.zeros(session_n, dtype=np.float)
    K = 5
    click_n = 0
    idex = np.random.permutation(session_n)
    tr_idx = idex[:int(round(session_n * train_ratio))]
    ts_idx = idex[int(round(session_n * train_ratio)):]

    # open file for storing log info
    cur_time = strftime("%Y%m%d_", localtime())
    logFileName = '../LogFile/main_newtr_nouser' + cur_time + '.txt'
    logFile = open(logFileName, 'a+')
    print >> logFile, '\n\n'
    print >> logFile, '=' * 50
    print >> logFile, 'experiment parameters: lambda=%f, R=%f' % (lamb, R)
    print >> logFile, '\n\n'

    # app = app[:,1:]
    expl = np.zeros(pool_size)

    # train
    for i in range(tr_idx.shape[0]):
        record = np.zeros(1)
        # user_feature = np.zeros(1)
        try:
            record = data[np.where(data[:, 0] == tr_idx[i])]
            # user_feature = user[np.where(user[:, 0] == record[0, 0])][0,1:]
        except IndexError:
            continue
        else:
            # app_dict = {}
            # val = []
            # for r in range(record.shape[0]):
            #     if record[r,1] not in app_dict:
            #         app_dict[record[r,1]] = record[r,2]
            #     else if record[r,1] == 11 or record[r,1] == 10:
            #         app_dict[record[r,1]] = record[r,2]
            # for a in app_dict:
            #     app_feature = app[np.where(app[:,0] == a)][0,1:]
            #     x_feature[record[r,1]] = np.outer(user_feature, app_feature).reshape(1, d)
            #     x_feature[a] = np.divide(x_feature[a], np.linalg.norm(x_feature[a]))
            #     if app_dict[a]==61:
            #         val.append(0)
            #     else:
            #         val.append(1)
            # val.np.asarray(val)
            # x_t = x_feature
            # w = np.array(val.reshape(val.shape[1],1))
            # [V, X, Y, theta, beta] = utils.update_stat(V, x_t, X, Y, w, lamb, delta, R)
            val = []
            idx = []
            for a in range(pool_size):
                # get all apps from the session data of one user
                if app[a, 0] in record[:, 1]:
                    idx.append(a)
                    x_feature[a] = app[a, 1:]
                    x_feature[a] = np.divide(x_feature[a], np.linalg.norm(x_feature[a]))
                    # get the user's feedback to one app
                    feedback = record[np.where(record[:, 1] == app[a, 0])][:, 2]
                    if 10 in feedback or 11 in feedback:
                        val.append(1)
                    else:
                        val.append(0)
            val = np.asarray(val)
            print val
            idx = np.asarray(idx)
            x_t = x_feature[idx, :]
            w = np.array(val.reshape(val.shape[0], 1))
            # print w
            [V, X, Y, theta, beta] = utils.update_stat(V, x_t, X, Y, w, lamb, delta, R)

    # test
    cnt = 0
    result = [0]
    while cnt < 4000:
        i = random.randint(0, ts_idx.shape[0])
        # for i in range(ts_idx.shape[0]):
        record = np.zeros(1)
        # u = np.zeros(1)
        try:
            record = data[np.where(data[:, 0] == ts_idx[i])]
            # u = user[np.where(user[:, 0] == record[0, 0])][0,1:]
        except IndexError:
            continue
        else:
            for a in range(pool_size):
                x_feature[a] = app[a, 1:]
                x_feature[a] = np.divide(x_feature[a], np.linalg.norm(x_feature[a]))
                UCB[a] = utils.getUCB(theta, x_feature[a], beta, V)
                # print "this is app "+str(a)+" UCB is " + str(UCB[a])
            action = UCB.argsort()[-K:][::-1]
            for ii in range(K):
                if expl[action[ii]] == 0:
                    expl[action[ii]] = 1
            # print expl
            reward = match_app.match(record, action)
            idx = []
            val = []
            if reward is not None:
                for j in reward:
                    if reward[j] == 1 or reward[j] == 0:
                        cnt = cnt + 1
                        idx.append(j)
                        val.append(reward[j])
                idx = np.asarray(idx)
                val = np.asarray(val)
                x_t = x_feature[idx, :]
                w = np.array(val.reshape(x_t.shape[0], 1))
                # print w
                [V, X, Y, theta, beta] = utils.update_stat(V, x_t, X, Y, w, lamb, delta, R)
                click_n = click_n + utils.get_reward(w)
                result.append((float(click_n) / cnt))
            else:
                continue

    print >> logFile, "the reward is: %s" % result
    print >> logFile, "cnt is: %s" % cnt
    expl_n = sum(expl)
    expl_n_rate = float(expl_n) / pool_size
    print >> logFile, "expl_n is %s" % str(expl_n)
    print >> logFile, "expl_rate is %s" % str(expl_n_rate)
    logFile.close()
def main(lamb, R):
    # read in files
    tmp = []
    with open("../session.txt", "r") as f:
        for line in f:
            line = line.strip().rstrip(',').split(',')
            line = list(map(float, line))
            tmp.append(line)
    f.close()
    data = np.asarray(tmp)

    tmp = []
    with open("../user_feature.txt", "r") as f:
        for line in f:
            line = line.strip().rstrip(',').split(',')
            line = list(map(float, line))
            tmp.append(line)
    f.close()
    user = np.asarray(tmp)

    tmp = []
    with open("../app_feature.txt", "r") as f:
        for line in f:
            line = line.strip().rstrip(',').split(',')
            line = list(map(float, line))
            tmp.append(line)
    f.close()
    app = np.asarray(tmp)

    # initialization
    pool_size = app.shape[0]
    user_row_n = user.shape[1]
    app_row_n = app.shape[1]
    d = int((user_row_n - 1) * (app_row_n - 1))
    session_n = int(max(data[:, 0]))
    train_ratio = 0.7
    gamma = 1
    # lamb = 0.1
    theta = np.zeros(d)
    beta = 1
    delta = 0.9
    V = lamb * np.eye(d)
    X = np.zeros((1, d), dtype=np.float)
    Y = np.zeros(1)
    x_feature = np.zeros((pool_size, d), dtype=np.float)
    UCB = np.zeros(pool_size, dtype=np.float)
    session = np.zeros(session_n, dtype=np.float)
    K = 5
    click_n = 0
    idex = np.random.permutation(session_n)
    tr_idx = idex[:int(round(session_n * train_ratio))]
    ts_idx = idex[int(round(session_n * train_ratio)):]

    # open file for storing log info
    cur_time = strftime("%Y%m%d_", localtime())
    logFileName = '../LogFile/main_' + cur_time + '.txt'
    logFile = open(logFileName, 'a+')
    print >> logFile, '\n\n'
    print >> logFile, '=' * 50
    print >> logFile, 'experiment parameters: lambda=%f, R=%f' % (lamb, R)
    print >> logFile, '\n\n'

    # train
    app = app[:, 1:]
    expl = np.zeros(pool_size)
    for i in range(tr_idx.shape[0]):
        record = np.zeros(1)
        u = np.zeros(1)
        try:
            record = data[np.where(data[:, 0] == tr_idx[i])]
            u = user[np.where(user[:, 0] == record[0, 0])][0, 1:]
        except IndexError:
            continue
        else:
            for a in range(pool_size):
                x_feature[a] = np.outer(u, app[a, :]).reshape(1, d)
                x_feature[a] = np.divide(x_feature[a], np.linalg.norm(x_feature[a]))
                UCB[a] = utils.getUCB(theta, x_feature[a], beta, V)
                # print "this is app "+str(a)+" UCB is " + str(UCB[a])
            action = UCB.argsort()[-K:][::-1]
            for ii in range(K):
                if expl[action[ii]] == 0:
                    expl[action[ii]] = 1
            # print expl
            reward = match_app.match(record, action)
            idx = []
            val = []
            if reward is not None:
                for j in reward:
                    if reward[j] == 1 or reward[j] == 0:
                        idx.append(j)
                        val.append(reward[j])
                idx = np.asarray(idx)
                val = np.asarray(val)
                x_t = x_feature[idx, :]
                w = np.array(val.reshape(x_t.shape[0], 1))
                # print w
                [V, X, Y, theta, beta] = utils.update_stat(V, x_t, X, Y, w, lamb, delta, R)
            else:
                continue

    # test
    cnt = 0
    result = [0]
    for i in range(ts_idx.shape[0]):
        record = np.zeros(1)
        u = np.zeros(1)
        try:
            record = data[np.where(data[:, 0] == ts_idx[i])]
            u = user[np.where(user[:, 0] == record[0, 0])][0, 1:]
        except IndexError:
            continue
        else:
            for a in range(pool_size):
                x_feature[a] = np.outer(u, app[a, :]).reshape(1, d)
                x_feature[a] = np.divide(x_feature[a], np.linalg.norm(x_feature[a]))
                UCB[a] = utils.getUCB(theta, x_feature[a], beta, V)
                # print "this is app "+str(a)+" UCB is " + str(UCB[a])
            action = UCB.argsort()[-K:][::-1]
            for ii in range(K):
                if expl[action[ii]] == 0:
                    expl[action[ii]] = 1
            # print expl
            reward = match_app.match(record, action)
            idx = []
            val = []
            if reward is not None:
                for j in reward:
                    if reward[j] == 1 or reward[j] == 0:
                        cnt = cnt + 1
                        idx.append(j)
                        val.append(reward[j])
                idx = np.asarray(idx)
                val = np.asarray(val)
                x_t = x_feature[idx, :]
                w = np.array(val.reshape(x_t.shape[0], 1))
                # print w
                [V, X, Y, theta, beta] = utils.update_stat(V, x_t, X, Y, w, lamb, delta, R)
                click_n = click_n + utils.get_reward(w)
                result.append((float(click_n) / cnt))
            else:
                continue

    print >> logFile, "the reward is: %s" % result
    print >> logFile, "cnt is: %s" % cnt
    expl_n = sum(expl)
    expl_n_rate = float(expl_n) / pool_size
    print >> logFile, "expl_n is %s" % str(expl_n)
    print >> logFile, "expl_rate is %s" % str(expl_n_rate)
    logFile.close()
            continue
        else:
            for a in range(pool_size):
                x_feature[a] = np.outer(u, app[a, :]).reshape(1, d)
                x_feature[a] = np.divide(x_feature[a], np.linalg.norm(x_feature[a]))
                UCB[a] = utils.getUCB(theta, x_feature[a], beta, V)
            action = UCB.argsort()[-K:][::-1]
            reward = match_app.match(record, action)
            idx = []
            val = []
            if reward is not None:
                for j in reward:
                    if reward[j] == 1 or reward[j] == 0:
                        cnt = cnt + 1
                        idx.append(j)
                        val.append(reward[j])
                idx = np.asarray(idx)
                val = np.asarray(val)
                x_t = x_feature[idx, :]
                w = np.array(val.reshape(x_t.shape[0], 1))
                [V, X, Y, theta, beta] = utils.update_stat(V, x_t, X, Y, w, lamb, delta)
                click_n = click_n + utils.get_reward(w)
                result.append((float(click_n) / cnt))
            else:
                continue

    print "the reward is: %s" % result
    print "cnt is: %s" % cnt
def train(train_batch, train_label, hidden_size, test_length=5):
    '''
    ##############################
    # modified from the original code by Nurakhmetov (2019)
    # references:
    # Nurakhmetov, D. (2019). Reinforcement Learning Applied to Adaptive
    # Classification Testing. In Theoretical and Practical Advances in
    # Computer-based Educational Measurement (pp. 325-336). Springer, Cham.
    ###############################
    '''
    batch_size = 10
    tests = []
    tests_train = []
    policy = Policy(n_tests=18, n_scores=1401, hidden_size=hidden_size)
    optimizer = optim.Adam(policy.parameters())
    criterion = nn.CrossEntropyLoss(reduce=False)

    score, test = None, None
    tests, scores = [], []
    rewards = []
    hidden = Variable(torch.zeros(batch_size, hidden_size), volatile=True)
    for t in range(test_length):
        logits, value, hidden, _ = policy(test, score, hidden, batch_size)
        probs = nn.functional.softmax(logits)
        # sample next item
        next_test = torch.multinomial(probs, 1)
        test = next_test.data.squeeze(1)
        score, test = utils.test_score(train_batch, test)

        # penalize re-selecting an item that was already administered
        masks = []
        for prev_test in tests:
            mask = prev_test.squeeze(1).eq(test).unsqueeze(1)
            masks.append(mask)
        if len(masks) > 0:
            masks = torch.cat(masks, 1)
            masks = masks.sum(1).gt(0)
            masks = -1 * masks.float()
            rewards.append(masks.unsqueeze(1))

        tests.append(test.unsqueeze(1))
        scores.append(score.unsqueeze(1))
        score = Variable(score.unsqueeze(1), volatile=True)
        test = Variable(test.unsqueeze(1), volatile=True)
    tests_train.append(tests)

    saved_log_probs = []
    saved_values = []
    hidden = Variable(torch.zeros(batch_size, hidden_size))
    logits, value, hidden, _ = policy(None, None, hidden, batch_size)
    log_probs = nn.functional.log_softmax(logits)
    for test, score in zip(tests, scores):
        log_prob = log_probs.gather(1, Variable(test))
        saved_log_probs.append(log_prob)
        saved_values.append(value)
        logits, value, hidden, clf_logits = policy(Variable(test), Variable(score), hidden, batch_size)
        log_probs = nn.functional.log_softmax(logits)

    loss = nn.functional.cross_entropy(clf_logits, Variable(train_label))

    clf_rewards = []
    for clf_logit, targ in zip(clf_logits.data, train_label):
        reward = -criterion(Variable(clf_logit.unsqueeze(0)),
                            Variable(torch.LongTensor([targ]))).data
        clf_rewards.append(reward.unsqueeze(0))
    clf_rewards = torch.cat(clf_rewards, 0).unsqueeze(-1)
    rewards.append(clf_rewards)

    returns = utils.get_reward(rewards)
    saved_log_probs = torch.cat(saved_log_probs, 1)
    saved_values = torch.cat(saved_values, 1)
    advantages = Variable(returns) - saved_values

    critic_loss = advantages.pow(2).mean()
    actor_loss = -(saved_log_probs * Variable(advantages.data)).mean()
    optimizer.zero_grad()
    (critic_loss + actor_loss + loss).backward()
    optimizer.step()
    return tests_train
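The call returns = utils.get_reward(rewards) above is not shown; a minimal sketch, assuming it simply folds the per-step reward tensors into discounted returns aligned with saved_log_probs (the actual helper may differ), is:

import torch

def get_reward(rewards, gamma=0.99):
    # Hypothetical helper: turn a list of per-step reward tensors of shape
    # (batch_size, 1) into a (batch_size, n_steps) tensor of discounted
    # returns, accumulated backwards from the final classification reward.
    returns = []
    running = torch.zeros_like(rewards[-1])
    for r in reversed(rewards):
        running = r + gamma * running
        returns.insert(0, running)
    return torch.cat(returns, dim=1)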
def main(lamb, R):
    tmp = []
    with open("../user_feature.txt", "r") as f:
        for line in f:
            line = line.strip().rstrip(',').split(',')
            line = list(map(float, line))
            tmp.append(line)
    f.close()
    user = np.asarray(tmp)

    tmp = []
    with open("../app_feature.txt", "r") as f:
        for line in f:
            line = line.strip().rstrip(',').split(',')
            line = list(map(float, line))
            tmp.append(line)
    f.close()
    app = np.asarray(tmp)

    # initialization
    B = 3
    reward_acc = 0
    cnt_acc = 0
    cur_time = strftime("%Y%m%d_", localtime())
    logFileName = '../LogFile/main_BT_nouser_' + cur_time + '.txt'
    logFile = open(logFileName, 'a+')
    print >> logFile, '\n\n'
    print >> logFile, '=' * 3
    print >> logFile, 'experiment parameters: lambda=%f, R=%f' % (lamb, R)
    print >> logFile, '\n\n'

    for t in range(0, B):
        print "Round " + str(t + 1) + " starts!"
        data = readSess(t)
        sids = readDict(t)
        start = time.clock()
        pool_size = app.shape[0]
        user_row_n = user.shape[1]
        app_row_n = app.shape[1]
        d = app_row_n - 1
        session_n = sids.shape[0]
        train_ratio = 0.7
        gamma = 1
        # lamb = 0.5
        theta = np.zeros(d)
        beta = 1
        delta = 0.9
        V = lamb * np.eye(d)
        X = np.zeros((1, d), dtype=np.float)
        Y = np.zeros(1)
        x_feature = np.zeros((pool_size, d), dtype=np.float)
        UCB = np.zeros(pool_size, dtype=np.float)
        K = 5
        click_n = 0
        sids = np.random.permutation(sids)
        tr_idx = sids[:int(round(session_n * train_ratio))]
        ts_idx = sids[int(round(session_n * train_ratio)):]

        # train
        app = app[:, 1:]
        expl = np.zeros(pool_size)
        for i in range(tr_idx.shape[0]):
            record = np.zeros(1)
            # u = np.zeros(1)
            try:
                record = data[np.where(data[:, 0] == float(tr_idx[i]))]
                # u = user[np.where(user[:, 0] == record[0, 0])][0,1:]
            except IndexError:
                continue
            else:
                for a in range(pool_size):
                    x_feature[a] = app[a, :]
                    x_feature[a] = np.divide(x_feature[a], np.linalg.norm(x_feature[a]))
                    UCB[a] = utils.getUCB(theta, x_feature[a], beta, V)
                    # print "this is app "+str(a)+" UCB is " + str(UCB[a])
                action = UCB.argsort()[-K:][::-1]
                for ii in range(K):
                    if expl[action[ii]] == 0:
                        expl[action[ii]] = 1
                # print expl
                reward = match_app.match(record, action)
                idx = []
                val = []
                if reward is not None:
                    for j in reward:
                        if reward[j] == 1 or reward[j] == 0:
                            idx.append(j)
                            val.append(reward[j])
                    idx = np.asarray(idx)
                    val = np.asarray(val)
                    x_t = x_feature[idx, :]
                    w = np.array(val.reshape(x_t.shape[0], 1))
                    print w
                    [V, X, Y, theta, beta] = utils.update_stat(V, x_t, X, Y, w, lamb, delta, R)
                else:
                    continue

        # test
        cnt = 0
        result = [0]
        for i in range(ts_idx.shape[0]):
            record = np.zeros(1)
            # u = np.zeros(1)
            try:
                record = data[np.where(data[:, 0] == float(ts_idx[i]))]
                # u = user[np.where(user[:, 0] == record[0, 0])][0,1:]
            except IndexError:
                continue
            else:
                for a in range(pool_size):
                    x_feature[a] = app[a, :]
                    x_feature[a] = np.divide(x_feature[a], np.linalg.norm(x_feature[a]))
                    UCB[a] = utils.getUCB(theta, x_feature[a], beta, V)
                    # print "this is app "+str(a)+" UCB is " + str(UCB[a])
                action = UCB.argsort()[-K:][::-1]
                for ii in range(K):
                    if expl[action[ii]] == 0:
                        expl[action[ii]] = 1
                print expl
                reward = match_app.match(record, action)
                idx = []
                val = []
                if reward is not None:
                    for j in reward:
                        if reward[j] == 1 or reward[j] == 0:
                            cnt = cnt + 1
                            idx.append(j)
                            val.append(reward[j])
                    idx = np.asarray(idx)
                    val = np.asarray(val)
                    x_t = x_feature[idx, :]
                    w = np.array(val.reshape(x_t.shape[0], 1))
                    print w
                    [V, X, Y, theta, beta] = utils.update_stat(V, x_t, X, Y, w, lamb, delta, R)
                    click_n = click_n + utils.get_reward(w)
                    result.append((float(click_n) / cnt))
                else:
                    continue

        print >> logFile, "the reward is: %s" % result[-1]
        print >> logFile, "cnt is: %s" % cnt
        expl_n = sum(expl)
        expl_n_rate = float(expl_n / pool_size)
        print >> logFile, "expl_n is %s" % str(expl_n)
        print >> logFile, "expl_rate is %s" % str(expl_n_rate)
        reward_acc += result[-1]
        cnt_acc += cnt

    reward_avg = reward_acc / B
    cnt_avg = cnt_acc / B
    print >> logFile, "the average cnt is: %s" % cnt_avg
    print >> logFile, "the average reward is: %s" % reward_avg
    logFile.close()
state_counts = defaultdict(lambda: 0)

# The initial game state
s = get_state(game, state_counts)
assert(s.shape[1] == dx and s.shape[2] == dy and s.shape[3] == n_features)

loss_avg = 0
total_return = 0
prev_serviced = [0]  # List for easy mutable argument

for step in range(episode_length):
    q, a = compute_action_value(model, s)  # action-value and action taken
    take_action(game, a)  # actually take the selected action
    s_prime = get_state(game, state_counts)  # See what the new state is
    r = get_reward(game, prev_serviced)  # See what the reward is
    total_return += r

    # Retroactively predict the value of the previous state using the reward.
    # The terminal state should have a value of 0, so we add a check for that.
    if step < episode_length - 1:
        q_prime, _ = model.predict_q(s_prime)
        q_prime *= gamma
        q_prime += r
    else:
        q_prime = np.array(r, dtype=np.float32, ndmin=1)

    mu = get_mu(state_counts, s)  # Weight for importance of `s`

    # Gradient update for model towards q_prime
    loss = model.update(s, a, q_prime, mu, lr=lr)
def train_controller(self):
    """Fixes the shared parameters and updates the controller parameters.

    The controller is updated with a score function gradient estimator
    (i.e., REINFORCE), with the reward being c/valid_ppl, where valid_ppl
    is computed on a minibatch of validation data.

    A moving average baseline is used.

    The controller is trained for 2000 steps per epoch (i.e., first
    (Train Shared) phase -> second (Train Controller) phase).
    """
    self.controller.train()
    # TODO(brendan): Why can't we call shared.eval() here? Leads to loss
    # being uniformly zero for the controller.
    # self.shared.eval()
    avg_reward_base = None
    baseline = None
    adv_history = []
    entropy_history = []
    reward_history = []
    genomes_all = []

    # hidden = self.shared.init_hidden(self.args.batch_size)
    total_loss = 0
    genome_epochs = args.genome_epochs
    for step in range(args.controller_max_step):
        bp_args = []
        log_probs_all = []
        for i in range(args.nprocs):
            # sample models
            genome, log_probs, entropies = self.controller.sample()
            # keep track of all log_probs for each genome
            log_probs_all.append(log_probs)
            # calculate reward
            np_entropies = entropies.data.cpu().numpy()
            # append entropies for all models
            entropy_history.extend(np_entropies)
            # rewards, valid_loss, genome_model = utils.get_reward(genome, np_entropies, self.traits, self.x, self.y, self.x_val, self.y_val)
            bp_args.append((genome, np_entropies, self.traits, self.x, self.y,
                            self.x_val, self.y_val, genome_epochs))

        if args.nprocs == 1:
            rewards_batch = [utils.get_reward(*bp_args[0])]
        else:
            rewards_batch = self.pool.starmap(utils.get_reward, bp_args)

        for i, (rewards, valid_loss, genome, model, bp_iters) in enumerate(rewards_batch):
            genomes_all.append((genome, float(valid_loss), self.controller_step, bp_iters))

            # check for the best model
            self.n_models += 1
            if self.best_genome is None or genome.fitness > self.best_genome.fitness:
                self.best_genome = genome
                self.best_model = model

            # set the trained weights back to the set of vocab genes
            try:
                # print('*'*100)
                for module in model.modules[:-1]:
                    id_ = module.id_
                    gene = model.genome.nodes[id_]
                    key_orig = gene.key_orig
                    gene_orig = self.vocab.get(key_orig, None)
                    # print(gene_orig.parameters.keys())
                    gene_orig.save_parameters(module.function.state_dict())
                    # print(self.vocab[key_orig].parameters.keys())
            except:
                ipdb.set_trace()

            """
            hist = {'time': time.time(),
                    'loss': float(valid_loss),
                    'model': str(model),
                    'num_params': utils.num_params(model)}
            self.history.append(hist)
            # wirte history to JSON for further analysis
            with open(self.history_file, 'a') as fout:
                fout.write(json.dumps(hist) + '\n')
            """

            # VIP: you get one reward per entropy
            # reward_history.extend(rewards)
            """
            I have to do mean of entropies here because the number of
            entropies varies based on network depth:
                R = 10 - valid_loss
                rewards = R + 1e-4 * entropies
            """
            rewards = np.mean(rewards)
            reward_history.append(rewards)

            # moving average baseline
            if baseline is None:
                baseline = rewards
            else:
                decay = 0.95  # ema_baseline_decay (very important)
                baseline = decay * baseline + (1 - decay) * rewards

            adv = rewards - baseline
            # adv_history.extend(adv)
            adv_history.append(adv)

            # policy loss
            # loss = -log_probs_all[i] * utils.get_variable(adv, args.cuda, requires_grad=False)
            loss = -log_probs_all[i] * utils.get_variable(
                [adv], args.cuda, requires_grad=False)
            # if args.entropy_mode == 'regularizer':
            #     loss -= args.entropy_coeff * entropies
            loss = loss.sum()  # or loss.mean()

            # update
            self.controller_optim.zero_grad()
            loss.backward()

            if args.controller_grad_clip > 0:
                to.nn.utils.clip_grad_norm(
                    self.controller.model.parameters(),
                    args.controller_grad_clip)
            self.controller_optim.step()

            self.controller_step += 1
            assert self.controller_step == self.n_models, (
                self.controller_step, self.n_models)

            total_loss += utils.to_item(loss.data)

            # if ((step % args.log_step) == 0) and (step > 0):
            # if ((step * args.nprocs % args.log_step) == 0) and (step > 0):
            if self.controller_step % args.log_step == 0:
                logger.info('-' * 100)
                logger.info('summarizing and resetting: reward_history, adv_history, entropy_history')
                logger.info('step: {}, controller_step: {}, n_models: {}, max_layers: {}'
                            .format(step, self.controller_step, self.n_models,
                                    self.controller.max_layers))
                logger.info('len(reward_history): {}, len(adv_history): {}, len(entropy_history): {}, len(genomes_all): {}'
                            .format(len(reward_history), len(adv_history),
                                    len(entropy_history), len(genomes_all)))
                self._summarize_controller_train(total_loss, adv_history,
                                                 entropy_history, reward_history,
                                                 avg_reward_base)
                reward_history, adv_history, entropy_history = [], [], []
                total_loss = 0
                # update max number of layers (do it here so stats are comparable)
                self.controller.set_max_layers(self.controller.max_layers + 1)
                self.controller.save()

            if self.controller_step % args.log_step_genome == 0:
                self._summarize_best_genome(genomes_all)
                genomes_all = []

            # check for stopping
            elapsed_time = time.time() - self.start_time
            if args.max_time is not None:
                if elapsed_time >= args.max_time:
                    logger.info('Stopping b/c max time exceeded: {}'.format(elapsed_time))
                    return True
            else:
                n_gens = int(np.round(self.n_models / args.log_step_genome))
                if n_gens >= args.max_generations + 1:
                    logger.info('Stopping b/c max_generations: {}/{}. Run time: {}'
                                .format(n_gens, args.max_generations, elapsed_time))
                    return True

    logger.info('Controller finished training within time: {}'.format(elapsed_time))
    return True