# Methods of the DRL POI-recommendation agent (class definition not shown).
# Environment, DQN, and reward are project-local imports.
import logging

import numpy as np


def __init__(self, poi_info, user_KG, params):
    self.poi_info = poi_info
    self.user_KG = user_KG
    self.visit_counter = 0
    # Reward weights, passed straight through to the reward function.
    self.ll = params.ll
    self.lc = params.lc
    self.lp = params.lp
    self.poi_cat_dict = poi_info.poi_cat_dict
    self.poi_loc_dict = poi_info.poi_loc_dict
    self.poi_dist_mat = poi_info.poi_dist_mat
    self.cat_sim_mat = poi_info.cat_sim_mat
    self.memory_capacity = params.memory_capacity
    self.environment = Environment(user_KG.s_u.shape[1],
                                   self.poi_info.env_nt_1,
                                   self.poi_info.env_nt_2)
    self.dqn = DQN(self.environment,
                   user_KG.s_u.shape[1] + user_KG.s_KG.x.shape[1],
                   user_KG.s_KG.num_POI,
                   params.memory_capacity,
                   params.lr,
                   params.epsilon,
                   params.batch_size,
                   params.gamma,
                   params.target_replace_iter,
                   mode=params.priority_mode)
    # Start from a random POI prediction; the first reward is computed
    # against ground-truth POI index 0.
    self.predict_POI_index = np.random.randint(user_KG.s_KG.num_POI)
    self.r = reward(params.ll, params.lc, params.lp,
                    self.predict_POI_index, 0,
                    poi_info.poi_cat_dict, poi_info.poi_loc_dict,
                    poi_info.poi_dist_mat, poi_info.cat_sim_mat)
def fit(self, train_loader):
    num_visits = len(train_loader) * train_loader.batch_size
    for batch_ndx, sample in enumerate(train_loader):
        for i in range(len(sample.user_list)):
            # Step the environment to get the next user and KG states.
            s_u_, s_KG_ = self.environment(
                self.user_KG.s_u,
                self.user_KG.s_KG,
                sample.temporal_list[i].view(self.poi_info.env_nt_1,
                                             self.poi_info.env_nt_2),
                sample.user_list[i],
                sample.poi_list[i])
            self.dqn.store_transition(
                (self.user_KG.s_u[sample.user_list[i]], self.user_KG.s_KG),
                self.predict_POI_index,
                self.r,
                (s_u_[sample.user_list[i]], s_KG_))
            # Learn only once the replay memory has filled up.
            if self.dqn.memory_counter > self.memory_capacity:
                self.dqn.learn()
            # Advance the stored states for every visit except the last.
            if self.visit_counter < num_visits - 1:
                self.user_KG.s_u = s_u_
                self.user_KG.s_KG = s_KG_
            self.visit_counter += 1
            self.predict_POI_index = self.dqn.choose_action(
                (self.user_KG.s_u[sample.user_list[i]], self.user_KG.s_KG))
            # Reward of the new prediction against the ground-truth POI.
            self.r = reward(
                self.ll, self.lc, self.lp,
                self.predict_POI_index,
                int(sample.poi_list[
                    self.visit_counter % (train_loader.batch_size * batch_ndx + 1)]),
                self.poi_cat_dict, self.poi_loc_dict,
                self.poi_dist_mat, self.cat_sim_mat)
        logging.info("Training batch {} finished.".format(batch_ndx))
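# The project-local `reward` above is only visible through its call
# signature. The sketch below shows one plausible form, assuming ll weights
# geographic distance, lc weights category similarity, and lp is an
# exact-hit bonus; the project's real definition may differ (it also
# receives poi_loc_dict, unused in this sketch).
def reward_sketch(ll, lc, lp, pred_idx, true_idx,
                  poi_cat_dict, poi_loc_dict, poi_dist_mat, cat_sim_mat):
    # Penalize geographic distance between predicted and ground-truth POI.
    dist_term = -ll * poi_dist_mat[pred_idx][true_idx]
    # Reward similarity between the two POIs' categories.
    cat_term = lc * cat_sim_mat[poi_cat_dict[pred_idx]][poi_cat_dict[true_idx]]
    # Bonus when the prediction is exactly right.
    hit_term = lp if pred_idx == true_idx else 0.0
    return dist_term + cat_term + hit_term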
# One training episode: roll the arm forward under epsilon-greedy actions
# and fit Q on batches of TD targets. Assumes np, pi, random, utils, and
# the Q/actions/myarm/hyperparameter objects are defined earlier.
x[:, 0] = np.array([pi / 2, pi / 2, 0, 0])  # initial angles, zero velocities
y = np.zeros([num_actions, max_it])
mean_reward = 0
success = False
for i in range(max_it - 1):
    state = x[:, i]
    qval = Q.predict(state.reshape([1, n]))
    # Epsilon-greedy action selection.
    if random.random() < rand_eps:
        action = np.random.randint(0, num_actions)
    else:
        action = np.argmax(qval)
    u = actions[action]
    # Integrate the arm dynamics one step with fourth-order Runge-Kutta.
    new_state = utils.rung_kutta4(myarm, state.reshape([n, 1]), u, dt)
    x[:, i + 1] = new_state.reshape([4])
    reward = utils.reward(new_state, u, target_state)
    mean_reward += reward / max_it
    newQ = Q.predict(new_state.reshape([1, n]))
    maxQ = np.max(newQ)
    # Regression targets: start from the current predictions, then overwrite
    # the taken action's entry with the bootstrapped TD target (plus a large
    # bonus when the target state is reached).
    y[:, i] = qval[:]
    bonus = 1000 if utils.close_enough(new_state, target_state) else 0
    y[action, i] = reward + gamma * maxQ + bonus
    # Fit the network on the most recent batch of (state, target) pairs.
    if i != 0 and i % batch_size == 0:
        Q.update(np.transpose(x[:, i - batch_size:i]),
                 np.transpose(y[:, i - batch_size:i]))
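# utils.rung_kutta4 is not shown; the sketch below is the standard classic
# fourth-order Runge-Kutta step consistent with the call above. It assumes
# the arm object exposes its dynamics as arm.f(x, u) -> x_dot, which is a
# guess about the interface, not the project's actual API.
def rung_kutta4_sketch(arm, x, u, dt):
    k1 = arm.f(x, u)
    k2 = arm.f(x + 0.5 * dt * k1, u)
    k3 = arm.f(x + 0.5 * dt * k2, u)
    k4 = arm.f(x + dt * k3, u)
    # Weighted average of the four slope estimates.
    return x + (dt / 6.0) * (k1 + 2 * k2 + 2 * k3 + k4)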
# agent2 = cfr_plus_agent2.CFRAPlusgent(env, isAbs=False, CFR_num=1, tra_num=2)
agent3 = RandomAgent(action_num=env.action_num)
l = []

from rlcard.utils.logger import Logger

# Paths for training logs, CSV metrics, and figures.
root_path = './model_result/'
log_path = root_path + 'log.txt'
csv_path = root_path + 'performance.csv'
figure_path = root_path + 'figures/'
logger = Logger(xlabel='iteration', ylabel='exploitability',
                legend='DeepCFR+_model', log_path=log_path, csv_path=csv_path)
r = utils.reward()
'''
start = time.perf_counter()
e1 = np.mean(r.computer_reward(agent0, agent2, evaluate_num*20, Process_num, eval_env))
e2 = np.mean(r.computer_reward(agent1, agent2, evaluate_num*20, Process_num, eval_env))
end = time.perf_counter()
logger.log('episode {}:{:.5f},{:.5f} test time:{}'.format(0, e1, e2, end-start))
'''
for i in range(100):
    start = time.perf_counter()
    agent0.deepCFR(i, 8)
    # agent1.train(i, 8)  # 20*8*1*1
    # agent2.train(i, 8)
    # Mean payoff of agent0 against the random baseline.
    e1 = np.mean(
        r.computer_reward(agent0, agent3, evaluate_num * 50, Process_num,
                          eval_env))
def play(action):
    # Realize the stochastic click vector for the chosen action.
    ctr = s.realize(action)
    r, c = reward(ctr, s.gamma, s.disj)
    # In the cascade setting return the cascade feedback c; otherwise return
    # the raw per-position click indicators.
    feedback = c if cascade else [int(click) for click in ctr]
    return r, feedback, s.regret(action)
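# Example call, under the setup assumed above: `r` is the scalar reward, the
# middle element is the cascade feedback (or the click indicators), and the
# last element is the regret of the played action.
r, feedback, regret = play(action)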
# --- inner step loop body (fragment): take one move along the grid ---
action = utils.select_action(transition[pos[0], pos[1], :])
pos = utils.get_neighbors(pos, level)[action]
path.append(pos)
actions.append(action)
if all(pos == finish):
    completed = True
    break
# _ = input()

# --- per-episode update (fragment, runs once per iteration j) ---
path = np.array(path)
rv = utils.reward(path, completed, maxit)
transition = utils.update_transitions(transition, path, actions, rv)
pathlens.append(len(path))
if j % 1 == 0:  # always true here; acts as a frame-interval placeholder
    utils.vis_path(ax, path)
    utils.vis_transition(ax, transition)
    # Clear the previous iteration label before drawing the new one.
    for entry in ax.texts:
        entry.remove()
    bbox = dict(facecolor='k', edgecolor='black', alpha=0.3)
    ax.text(0, 0, 'iteration %i' % (j + 1), fontsize=14,
            va='center', ha='left', color='w', bbox=bbox)
    # Save frames for animation.
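# utils.select_action is not shown. A plausible sketch, assuming it samples
# an action in proportion to the non-negative transition weights of the
# current cell (the real implementation might instead be greedy or
# epsilon-greedy):
def select_action_sketch(weights):
    p = np.asarray(weights, dtype=float)
    p = p / p.sum()  # normalize weights into a probability distribution
    return np.random.choice(len(p), p=p)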
import numpy as np
import matlab.engine

from utils import reward

step_size = .2
epsilon = 1.0e-3
n_steps = 2000
frac_sec = 0.5
# Candidate perturbations to the burn fraction.
action_set = np.linspace(-.25, .25, 11)

for i in range(10):
    # Random-walk the burn fraction by one step from the action set.
    a = np.random.choice(action_set, 1)[0]
    frac_sec += a
    # Skip the simulation when the fraction leaves [0, 1]; note the
    # out-of-range value is carried into the next iteration.
    if frac_sec > 1 or frac_sec < 0:
        continue
    # Starting a fresh MATLAB engine every iteration is expensive; hoisting
    # start_matlab() above the loop would be faster.
    eng = matlab.engine.start_matlab()
    pressure_seq, prms = eng.sim_main(float(frac_sec), n_steps, nargout=2)
    pressure_seq = np.array(pressure_seq)
    prms = np.array(prms)
    r = reward(prms)
    print("Iteration: {}, Burn Fraction: {}, reward: {}".format(
        i, frac_sec, r))
    eng.quit()
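# The imported reward(prms) is not shown. Since the loop searches over the
# burn fraction, one hypothetical form is to penalize deviation of the
# simulated pressure RMS from a target; both the target value and the
# functional form below are assumptions.
def reward_sketch(prms, target_prms=1.0):
    # Closer to the target RMS pressure -> higher (less negative) reward.
    return -float(np.abs(np.mean(prms) - target_prms))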