def main():
    testbase = 'ub'
    dataset_name = "u1m.data"
    f = open('../testres.txt', 'a+')
    k_val = [5]
    per = [30]
    #per = [1.4]
    #metric = 'pr'
    metric = 'mae'
    #k_val = [5, 9, 13, 17, 21]
    test_name = "New User-based Test on: " + dataset_name + ' '
    if item_based:
        testbase = 'ib'
        test_name = "New Item-based test on: " + dataset_name + ' '

    iterate = product(per, k_val)
    for per, k in iterate:
        f.write('\n')
        timestamp = strftime("%Y-%m-%d %H:%M:%S", gmtime())
        f.write(test_name + timestamp + ' --> ')
        e = Evaluater(dataset_name, rec_type=testbase, k=k,
                      test_percentage=per, eval_metric=metric)
        f.write(str([per, k, e.eval_metric, e.sim_method.func_name]) + ' Error: ')
        f.write(str(e.evaluate()))
    f.close()
def iterate_epoch(self, model, lr, epoch, weight_decay=0, warmup=0,
                  lr_decay_rate=1, lr_decay_every=10, eval_every=5,
                  early_stop=False):
    eval_model = Evaluater(self.data_dir, model_name=self.model_name)
    #es = EarlyStop(self.data_dir[0:-6] + 'early_stopping/', self.model_name, patience=6)
    es = EarlyStop('../data_beauty_2core_es/early_stopping/', self.model_name, patience=6)
    plot_loss_list = []
    plot_score_list = []

    for i in range(epoch):
        plot_loss_list.extend(
            self.iterate_train(model, lr=lr, weight_decay=weight_decay,
                               print_every=10000))

        # early stop
        if early_stop:
            pre_model = es.early_stop(model)
            if pre_model:
                print('Early Stop epoch: {}'.format(i + 1))
                return eval_model.topn_map(pre_model)

        # lr scheduling
        if i > warmup:
            if (i - warmup) % lr_decay_every == 0:
                lr = lr * lr_decay_rate

        if (i + 1) % eval_every == 0:
            #score = eval_model.topn_precision(model)
            #print('epoch: {} precision: {}'.format(i, score))
            score = eval_model.topn_map(model)
            print('epoch: {} map: {}'.format(i, score))
            plot_score_list.append(score)

    #self._plot(plot_loss_list)
    #self._plot(plot_score_list)
    #return eval_model.topn_precision(model)
    return eval_model.topn_map(model)
def cal_loss_redundancy(resource, availability, replication, lease):
    replication_count = 0
    data_loss = 0
    data_count = 100
    data_iterated = 0
    lease_period = int(lease)
    desired_availability = float(availability)
    iteration = 0
    bad_iteration = 0

    if replication == 'random1copy' or replication == 'min1copy':
        copy_number = 1
    elif replication == 'random2copy' or replication == 'min2copy':
        copy_number = 2
    elif replication == 'random3copy' or replication == 'min3copy':
        copy_number = 3

    while iteration < 30:
        quantile = int(range_minute * 0.05)
        time_point = random.randint(start_minute + quantile, end_minute - quantile)
        job_count = len(interval_tree[time_point])

        # evaluate sizes of data set and job set
        if job_count < data_count * 3 * copy_number:
            # print "Error : job set is less than 3 times of data set"
            bad_iteration += 1
            continue

        availability_dict = dict()
        for data_index in range(data_count):
            data_name = "data" + str(data_index)
            availability_dict[data_name] = desired_availability

        match_job_dict = scheduler.schedule(time_point, lease_period, availability_dict)
        if not match_job_dict:
            # print "Error : match_job_dict is none"
            bad_iteration += 1
            continue

        for job in match_job_dict:
            replication_count += len(match_job_dict[job])

        evaluater = Evaluater(interval_tree, job_dict)
        data_iterated = data_iterated + data_count
        data_loss += evaluater.evaluate(time_point + lease_period, match_job_dict)
        iteration += 1

    data_loss_rate = float(data_loss) / float(data_iterated)
    redundancy_rate = float(replication_count) / float(data_iterated)
    print "data loss rate : ", data_loss_rate
    print "redundancy : ", redundancy_rate
    print "bad iteration : ", bad_iteration
    return (data_loss_rate, redundancy_rate)
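# A minimal, hedged usage sketch for cal_loss_redundancy above. It assumes the
# globals the function relies on (interval_tree, job_dict, scheduler, random,
# start_minute, end_minute, range_minute) are initialized elsewhere in this
# script; the argument values are illustrative only, taken from the replication
# branches above and the resource/availability tables used further down.
#
#   loss_rate, redundancy = cal_loss_redundancy('GLOW', '0.99', 'random2copy', '1440')
#   print "loss rate %s, redundancy %s" % (loss_rate, redundancy)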
def __init__(self, data_dir):
    # Passing an AmazonDataset instance directly would actually be faster, but...
    self.evaluater = Evaluater(data_dir)
    self.dataset = AmazonDataset(data_dir, model_name='TransE')

    edges = [[r[0], r[1]] for r in self.dataset.triplet_df.values]
    # add both user->item and item->user edges
    for r in self.dataset.triplet_df.values:
        if r[2] == 0:
            edges.append([r[1], r[0]])

    # load network
    self.G = nx.DiGraph()
    self.G.add_nodes_from(
        [i for i in range(len(self.dataset.entity_list))])
    self.G.add_edges_from(edges)
def objective(trial):
    start = time.time()

    # load hyperparameters
    # gamma = trial.suggest_loguniform('gamma', 1e-6, 1e-3)
    # lin_model = trial.suggest_categorical('lin_model', ['lasso', 'elastic'])
    alpha = trial.suggest_uniform('alpha', 0, 1)
    beta = trial.suggest_uniform('beta', 0, 0.5)

    data_dirs = ['../' + data_path + '/valid1/', '../' + data_path + '/valid2/']
    score_sum = 0
    for data_dir in data_dirs:
        # load data
        dataset = AmazonDataset(data_dir)
        # load model
        #slim = train_SLIM(data_dir, load=True)
        sim_mat = load_sim_mat('sim_mat' + data_dir[-2] + '.csr',
                               len(dataset.user_list), len(dataset.item_list))

        edges = [[r[0], r[1]] for r in dataset.triplet_df.values]
        # add both user->item and item->user edges
        for r in dataset.triplet_df.values:
            if r[2] == 0:
                edges.append([r[1], r[0]])

        # load network
        G = nx.DiGraph()
        G.add_nodes_from([i for i in range(len(dataset.entity_list))])
        G.add_edges_from(edges)

        evaluater = Evaluater(data_dir)
        #ranking_mat = get_ranking_mat(G, slim, alpha, beta, dataset)
        ranking_mat = get_ranking_mat(G, sim_mat, alpha, beta, dataset)
        #score = evaluater.topn_map(ranking_mat)
        score = evaluater.topn_precision(ranking_mat)
        score_sum += score

    mi, sec = time_since(time.time() - start)
    print('{}m{}s'.format(mi, sec))

    return -1 * score_sum / 2
def objective(trial):
    start = time.time()

    # hyperparameters
    #gamma = trial.suggest_loguniform('gamma', 1e-6, 1e-3)
    #lin_model = trial.suggest_categorical('lin_model', ['lasso', 'elastic'])
    #slim = train_SLIM(lin_model, gamma)
    alpha = trial.suggest_uniform('alpha', 0, 0.5)
    beta = trial.suggest_uniform('beta', 0, 0.5)
    gamma1 = trial.suggest_uniform('gamma1', 0, 1)
    gamma2 = trial.suggest_uniform('gamma2', 0, 1)
    gamma3 = trial.suggest_uniform('gamma3', 0, 1)
    gamma = [gamma1, gamma2, gamma3]

    data_dir = ['../data_luxury_5core/valid1', '../data_luxury_5core/valid2']
    score_sum = 0
    for i in range(len(data_dir)):
        # load data
        dataset = AmazonDataset(data_dir[i], model_name='TransE')

        edges = [[r[0], r[1]] for r in dataset.triplet_df.values]
        # add both user->item and item->user edges
        for r in dataset.triplet_df.values:
            if r[2] == 0:
                edges.append([r[1], r[0]])
        #user_items_test_dict = pickle.load(open('./data/user_items_test_dict.pickle', 'rb'))

        # load network
        G = nx.DiGraph()
        G.add_nodes_from([i for i in range(len(dataset.entity_list))])
        G.add_edges_from(edges)

        ranking_mat = get_ranking_mat(G, dataset, model[i], gamma, alpha, beta)
        #score = topn_precision(ranking_mat, user_items_test_dict)
        evaluater = Evaluater(data_dir[i])
        score = evaluater.topn_map(ranking_mat)
        score_sum += score

    mi, sec = time_since(time.time() - start)
    print('{}m{}sec'.format(mi, sec))

    return -1 * score_sum / 2
def objective(trial):
    start = time.time()

    # hyperparameters
    alpha = trial.suggest_uniform('alpha', 0, 0.5)
    beta = trial.suggest_uniform('beta', 0, 0.5)
    gamma1 = trial.suggest_uniform('gamma1', 0, 1)
    gamma2 = trial.suggest_uniform('gamma2', 0, 1)
    gamma3 = trial.suggest_uniform('gamma3', 0, 1)
    gamma = [gamma1, gamma2, gamma3]

    data_dir = ['../' + data_path + '/valid1', '../' + data_path + '/valid2']
    score_sum = 0
    for i in range(len(data_dir)):
        # load data
        dataset = AmazonDataset(data_dir[i], model_name='SparseTransE')

        # load network
        edges = [[r[0], r[1]] for r in dataset.triplet_df.values]
        # add both user->item and item->user edges
        for r in dataset.triplet_df.values:
            if r[2] == 0:
                edges.append([r[1], r[0]])
        G = nx.DiGraph()
        G.add_nodes_from([i for i in range(len(dataset.entity_list))])
        G.add_edges_from(edges)

        ranking_mat = get_ranking_mat(G, dataset, model[i], gamma, alpha, beta)
        #score = topn_precision(ranking_mat, user_items_test_dict)
        evaluater = Evaluater(data_dir[i])
        score = evaluater.topn_map(ranking_mat)
        score_sum += score

    mi, sec = time_since(time.time() - start)
    print('{}m{}sec'.format(mi, sec))

    return -1 * score_sum / 2
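# A minimal driver sketch for the objective functions above, assuming Optuna is
# the tuning library in use (implied by the trial.suggest_* calls). The number
# of trials and the final printout are illustrative, not taken from the original scripts.
import optuna

if __name__ == '__main__':
    study = optuna.create_study()           # minimizes by default, matching the negated score
    study.optimize(objective, n_trials=50)  # n_trials is an assumed value
    print(study.best_params)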
# load params
params = load_params()
alpha = params['alpha']
beta = params['beta']
gamma1 = params['gamma1']
gamma2 = params['gamma2']
gamma3 = params['gamma3']
gamma = [gamma1, gamma2, gamma3]

# load data
dataset = AmazonDataset(data_dir, model_name='TransE')

# load network
edges = [[r[0], r[1]] for r in dataset.triplet_df.values]
# add both user->item and item->user edges
for r in dataset.triplet_df.values:
    if r[2] == 0:
        edges.append([r[1], r[0]])
G = nx.DiGraph()
G.add_nodes_from([i for i in range(len(dataset.entity_list))])
G.add_edges_from(edges)

ranking_mat = get_ranking_mat(G, dataset, model, gamma, alpha, beta)
evaluater = Evaluater(data_dir)
score = evaluater.topn_map(ranking_mat)

mi, sec = time_since(time.time() - start)
print('{}m{}sec'.format(mi, sec))

np.savetxt('score_transe3.txt', np.array([score]))
def main(p):
    start = time.time()
    # pick the files whose names end with 'json.gz'
    file_name_list = filter(lambda x: x.endswith('json.gz'), os.listdir(p))
    # TODO check that there are exactly 24 files (glob module)
    for file_name in file_name_list:
        with open(os.path.join(p, file_name), 'r') as f:
            raw_json_file = gzip.GzipFile(fileobj=f)

            record_cleaner = Cleaner()
            record_grouper = Grouper(db)
            record_normalizer = Normalizer(db)
            mongo_helper = MongoHelper(db)
            counter = ActorCounter()
            evaluater = Evaluater()

            # data cleaning
            record_cleaner.set_dirty_data(raw_json_file)
            record_cleaner.clean()
            clean_record = record_cleaner.get_clean_data()
            log.log('clean record %s' % len(clean_record))

            # data processing
            # grouping
            record_grouper.set_records(clean_record)
            record_grouper.group()
            record_actor_exist = record_grouper.get_group_1()
            record_actor_new = record_grouper.get_group_2()
            log.log('record_actor_exist: %s' % len(record_actor_exist))
            log.log('record_actor_new: %s' % len(record_actor_new))

            # handle records whose actor already exists
            log.log('Begin processing actor-exist records...')
            # just drop the record's actor_attributes
            for record in record_actor_exist:
                del record['actor_attributes']
            log.log('Finished.')

            # handle records whose actor does not exist yet
            record_normalizer.set_records(record_actor_new)
            record_normalizer.normalize()
            record_actor_new = record_normalizer.get_record_actor_new()
            new_actors = record_normalizer.get_new_actors()

            # push today's locally collected new actors to the database
            actors = new_actors.values()
            mongo_helper.insert_new_actors(actors)
            # update the corresponding Redis counters for the new actors
            counter.count_actor_list(actors)

            # compute the val of each record
            evaluater.set_records(record_actor_exist)
            evaluater.evaluate()
            val_actor_exist = evaluater.get_val_cache()

            evaluater.set_records(record_actor_new)
            evaluater.evaluate()
            val_actor_new = evaluater.get_val_cache()

            # insert the records into the database
            mongo_helper.insert_new_reocrds(record_actor_new)
            mongo_helper.insert_new_reocrds(record_actor_exist)
            # write today's per-user val increments to the database
            mongo_helper.update_val(val_actor_new)
            mongo_helper.update_val(val_actor_exist)

            record_cleaner.free_mem()
            del record_cleaner
            del record_grouper
            del record_normalizer
            del mongo_helper
            del counter
            del evaluater

    # generate the CSV file
    util.grcount2csv()

    end = time.time()
    log.log('total: %s s' % (end - start))
# load data
data_dir = '../data_luxury_5core/test/'
dataset = AmazonDataset(data_dir)

# load model
slim_param = pickle.load(open('best_param_slim.pickle', 'rb'))
slim = train_SLIM2(data_dir, slim_param)

# load network
edges = [[r[0], r[1]] for r in dataset.triplet_df.values]
## add both user->item and item->user edges
for r in dataset.triplet_df.values:
    if r[2] == 0:
        edges.append([r[1], r[0]])
G = nx.DiGraph()
G.add_nodes_from([i for i in range(len(dataset.entity_list))])
G.add_edges_from(edges)

evaluater = Evaluater(data_dir)

model_mat = load_sim_mat('sim_mat_test.csr', len(dataset.user_list), len(dataset.item_list))
ranking_mat = get_ranking_mat(G, model_mat, alpha, beta, dataset)
#score = evaluater.topn_map(ranking_mat)
score = evaluater.topn_precision(ranking_mat)

mi, sec = time_since(time.time() - start)
print('{}m{}s'.format(mi, sec))

np.savetxt('score.txt', np.array([score]))
class Inference():
    def __init__(self, data_dir):
        # Passing an AmazonDataset instance directly would actually be faster, but...
        self.evaluater = Evaluater(data_dir)
        self.dataset = AmazonDataset(data_dir, model_name='TransE')

        edges = [[r[0], r[1]] for r in self.dataset.triplet_df.values]
        # add both user->item and item->user edges
        for r in self.dataset.triplet_df.values:
            if r[2] == 0:
                edges.append([r[1], r[0]])

        # load network
        self.G = nx.DiGraph()
        self.G.add_nodes_from(
            [i for i in range(len(self.dataset.entity_list))])
        self.G.add_edges_from(edges)

    def get_score(self, model, gamma, alpha, beta):
        ranking_mat = self.get_ranking_mat(model, gamma, alpha, beta)
        score = self.evaluater.topn_map(ranking_mat)
        return score

    def mk_sparse_sim_mat(self, model, gamma):
        item_idx = torch.tensor(
            [self.dataset.entity_list.index(i) for i in self.dataset.item_list],
            dtype=torch.long, device=device)
        user_idx = torch.tensor(
            [self.dataset.entity_list.index(u) for u in self.dataset.user_list],
            dtype=torch.long, device=device)
        brand_idx = torch.tensor(
            [self.dataset.entity_list.index(b) for b in self.dataset.brand_list],
            dtype=torch.long, device=device)

        # TODO: write this more cleanly
        item_embed = model.entity_embed(item_idx)
        item_sim_mat = F.relu(torch.mm(item_embed, torch.t(item_embed)))
        item_sim_mat = gamma[0] * scipy.sparse.csr_matrix(
            item_sim_mat.to('cpu').detach().numpy().copy())

        user_embed = model.entity_embed(user_idx)
        user_sim_mat = F.relu(torch.mm(user_embed, torch.t(user_embed)))
        user_sim_mat = gamma[1] * scipy.sparse.csr_matrix(
            user_sim_mat.to('cpu').detach().numpy().copy())

        brand_embed = model.entity_embed(brand_idx)
        brand_sim_mat = F.relu(torch.mm(brand_embed, torch.t(brand_embed)))
        brand_sim_mat = gamma[2] * scipy.sparse.csr_matrix(
            brand_sim_mat.to('cpu').detach().numpy().copy())

        M = scipy.sparse.block_diag(
            (item_sim_mat, user_sim_mat, brand_sim_mat))
        M_ = np.array(1 - M.sum(axis=1) / np.max(M.sum(axis=1)))
        M = M / np.max(M.sum(axis=1)) + scipy.sparse.diags(M_.transpose()[0])
        return M

    def pagerank_scipy(self, sim_mat, personal_vec=None, alpha=0.85, beta=0.01,
                       max_iter=100, tol=1.0e-6, weight='weight', dangling=None):
        N = len(self.G)
        if N == 0:
            return {}

        nodelist = self.G.nodes()
        M = nx.to_scipy_sparse_matrix(self.G, nodelist=nodelist,
                                      weight=weight, dtype=float)
        S = scipy.array(M.sum(axis=1)).flatten()
        S[S != 0] = 1.0 / S[S != 0]
        Q = scipy.sparse.spdiags(S.T, 0, *M.shape, format='csr')
        M = Q * M

        # merge the transition matrix with sim_mat
        #sim_mat = mk_sparse_sim_mat(G, item_mat)
        M = beta * M + (1 - beta) * sim_mat

        # initial vector
        x = scipy.repeat(1.0 / N, N)

        # Personalization vector
        p = personal_vec
        dangling_weights = p
        is_dangling = scipy.where(S == 0)[0]

        #print(x.shape)
        #print(M.shape)
        #print(p.shape)

        ppr_mat = []
        for i in range(p.shape[1]):
            ppr = self.power_iterate(N, M, x, p[:, i], dangling_weights[:, i],
                                     is_dangling, alpha, max_iter, tol)
            ppr_mat.append(ppr)
            #if i > 100:
            #    print(np.array(ppr_mat).shape)
            #    break

        return np.array(ppr_mat)

    def power_iterate(self, N, M, x, p, dangling_weights, is_dangling, alpha,
                      max_iter=500, tol=1.0e-6):
        #print(M.shape)
        #print(x.shape)
        #print(p.shape)
        # power iteration: make up to max_iter iterations
        for i in range(max_iter):
            xlast = x
            x = alpha * (x * M + sum(x[is_dangling]) * dangling_weights) + \
                (1 - alpha) * p
            # check convergence, l1 norm
            x = x / x.sum()
            err = scipy.absolute(x - xlast).sum()
            if err < N * tol:
                #return dict(zip(nodelist, map(float, x)))
                #print(i)
                return x

        # TODO: handle PageRank convergence failure properly
        #print(x.sum())
        #print(err)
        #print(N * tol)
        #raise NetworkXError('pagerank_scipy: power iteration failed to converge '
        #                    'in %d iterations.' % max_iter)
        #return dict(zip(nodelist, map(float, x)))
        return x

    def item_ppr(self, sim_mat, alpha, beta):
        # build personalization vectors (entity_size x user_size)
        user_idx = [self.dataset.entity_list.index(u)
                    for u in self.dataset.user_list]
        personal_vec = []
        for u in user_idx:
            val = np.zeros(len(self.G.nodes()))
            val[u] = 1
            personal_vec.append(val[np.newaxis, :])
        personal_vec = np.concatenate(personal_vec, axis=0).transpose()

        #ppr = pagerank_torch(G, sim_mat, personal_vec, alpha, beta)
        ppr = self.pagerank_scipy(sim_mat, personal_vec, alpha, beta)

        item_idx = [self.dataset.entity_list.index(i)
                    for i in self.dataset.item_list]
        pred = ppr[:, item_idx]
        #print(pred.shape)
        return pred

    def get_ranking_mat(self, model, gamma, alpha=0.85, beta=0.01):
        ranking_mat = []
        #sim_mat = reconstruct_kg(model)
        sim_mat = self.mk_sparse_sim_mat(model, gamma)
        pred = self.item_ppr(sim_mat, alpha, beta)
        #print(pred.shape)
        for i in range(len(self.dataset.user_list)):
            sorted_idx = np.argsort(np.array(pred[i]))[::-1]
            ranking_mat.append(sorted_idx)
            #break
        return ranking_mat
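# Hedged usage sketch for the Inference class above. The data directory comes
# from the sibling scripts in this section; the trained TransE model object and
# the gamma/alpha/beta values are placeholders, not values from the original code.
#
#   inference = Inference('../data_luxury_5core/test/')
#   score = inference.get_score(model, gamma=[0.5, 0.5, 0.5], alpha=0.85, beta=0.01)
#   print('test MAP: {}'.format(score))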
#percentage = [10, 20, 30]
#k = [5]
#percentage = [10]

prod = product(datafiles, k, metrics, r_method, n)
for filename, k, metric, r_m, n in prod:
    if filename == 'datahouse':
        X, y = getXy(filename)
    else:
        X, y = getXy(filename, delimiter=',')
    if n:
        normalize(X)
        print 'data is normalized'
    else:
        print 'data is not normalized'
    reg = KNeighborClassifier(X, y, k, metric, r_method=r_m)
    e = Evaluater(reg, test_percentage=30)
    print '\t', e.evaluate()
    print '\n'

#reg = KNeighborClassifier(X, y, m, sim_cosine, r_method="uniform")
#e = Evaluater(reg, test_percentage = l)
#print 'Test: k=%s, test/all_data=%s' % (m, l)
#print "\t", e.evaluate()
#del reg, e
        bad_iteration += 1
        continue

    availability_dict = dict()
    for data_index in range(data_count):
        data_name = "data" + str(data_index)
        availability_dict[data_name] = desired_availability
    # print availability_dict

    match_job_dict = scheduler.schedule(time_point, lease_period, availability_dict)
    if not match_job_dict:
        print "Error : match_job_dict is none"
        bad_iteration += 1
        continue

    # for job in match_job_dict:
    #     print job, match_job_dict[job]

    evaluater = Evaluater(interval_tree, job_dict)
    data_iterated = data_iterated + data_count
    data_loss += evaluater.evaluate(time_point + lease_period, match_job_dict)
    iteration += 1
    print "data_loss : ", data_loss
    print "data_iterated : ", data_iterated
    print "iteration, bad_iteration : ", iteration, bad_iteration

loss_rate_list.append(float(data_loss) / float(data_iterated))
print loss_rate_list
print loss_rate_list

resource_dict = {'SU-OG-CE': 'suogce', 'GLOW': 'glow', 'MWT2': 'mwt2'}
avail_dict = {'0.99': '099', '0.90': '090', '0.80': '080'}
file_name = (resource_dict[sys.argv[1]] + '_avail_' + avail_dict[sys.argv[2]] +
             '_replication_' + sys.argv[3] + '_lease_' + sys.argv[4] + '.txt')