def objective(trial):
    """Optuna objective: tune SLIM (alpha, l1_ratio) and return the negative mean MAP."""
    started = time.time()

    # Hyperparameters to tune.
    alpha = trial.suggest_loguniform('alpha', 1e-6, 1)
    l1_ratio = trial.suggest_uniform('l1_ratio', 0, 1)

    # NOTE(review): `lin_model` is not defined in this function (its
    # suggest_categorical was removed) — it must come from an enclosing or
    # module scope, otherwise this raises NameError. Verify.
    model = SLIM_model.SLIM(alpha, l1_ratio, len(user_list), len(item_list),
                            lin_model=lin_model)
    model.fit_multi(user_item_train_df)

    # Rank items for every user and average the per-user MAP.
    scorer = evaluate.Evaluater(user_item_test_df, len(user_list))
    model.predict()

    total = 0
    skipped = 0
    for user in range(len(user_list)):
        ranked_items = model.pred_ranking(user)
        user_score = scorer.topn_map(ranked_items, user)
        # Scores above 1 are treated as invalid sentinels and excluded.
        if user_score > 1:
            skipped += 1
            continue
        total += user_score

    mi, sec = time_since(time.time() - started)
    print('{}m{}sec'.format(mi, sec))

    # Optuna minimizes, so negate the mean score.
    return -1 * (total / (len(user_list) - skipped))
def objective(trial):
    """Optuna objective: tune the damping factor `alpha` of a graph-ranking
    model over two validation splits; return the negative mean MAP."""
    started = time.time()
    valid_dirs = ['../data_luxury_5core/valid1/', '../data_luxury_5core/valid2/']
    total = 0
    for path in valid_dirs:
        dataset = dataloader.AmazonDataset(path)

        # Forward (user->item) edges; for relation 0 also add the reverse edge.
        links = []
        for row in dataset.triplet_df.values:
            links.append([row[0], row[1]])
        for row in dataset.triplet_df.values:
            if row[2] == 0:
                links.append([row[1], row[0]])

        # Build the directed graph over all entities.
        graph = nx.DiGraph()
        graph.add_nodes_from(range(len(dataset.entity_list)))
        graph.add_edges_from(links)

        # Hyperparameter (same value is returned for both splits since the
        # parameter name is identical).
        alpha = trial.suggest_uniform('alpha', 0, 1)

        ranking = get_ranking_mat(graph, alpha, dataset)
        scorer = evaluate.Evaluater(path)
        total += scorer.topn_map(ranking)

    mi, sec = time_since(time.time() - started)
    print('{}m{}s'.format(mi, sec))

    # Negated mean over the two splits (Optuna minimizes).
    return -1 * total / 2
def objective(trial):
    """Optuna objective: tune SLIM (alpha, l1_ratio) over two validation
    splits; return the negative mean of the per-split mean MAP."""
    started = time.time()
    split_dirs = [
        '../' + data_path + '/valid1/bpr/',
        '../' + data_path + '/valid2/bpr/',
    ]

    # Hyperparameters (shared across both splits).
    alpha = trial.suggest_loguniform('alpha', 1e-6, 1)
    l1_ratio = trial.suggest_uniform('l1_ratio', 0, 1)

    total = 0
    for split_dir in split_dirs:
        # Load interaction matrices and id lists for this split.
        user_item_train_df = pd.read_csv(split_dir + 'user_item_train.csv')
        user_item_test_df = pd.read_csv(split_dir + 'user_item_test.csv')

        def read_ids(name):
            # One stripped id per line.
            with open(split_dir + name, 'r') as fp:
                return [line.replace('\n', '') for line in fp]

        user_list = read_ids('user_list.txt')
        item_list = read_ids('item_list.txt')

        # NOTE(review): `lin_model` is not defined in this function (its
        # suggest_categorical was removed) — it must come from an enclosing
        # or module scope, otherwise this raises NameError. Verify.
        model = SLIM_model.SLIM(alpha, l1_ratio, len(user_list), len(item_list),
                                lin_model=lin_model)
        model.fit(user_item_train_df)

        scorer = evaluate.Evaluater(user_item_test_df, len(user_list))
        model.predict()

        split_sum = 0
        skipped = 0
        for user in range(len(user_list)):
            ranked_items = model.pred_ranking(user)
            user_score = scorer.topn_map(ranked_items, user)
            # Scores above 1 are treated as invalid sentinels and excluded.
            if user_score > 1:
                skipped += 1
                continue
            split_sum += user_score

        total += split_sum / (len(user_list) - skipped)

    mi, sec = time_since(time.time() - started)
    print('{}m{}sec'.format(mi, sec))

    # Negated mean over the two splits (Optuna minimizes).
    return -1 * total / 2
def objective(trial):
    """Optuna objective: tune SLIM regularization (l1r, l2r) over two
    validation splits.

    Returns the negative mean of the per-split mean MAP (Optuna minimizes).
    """
    data_dirs = [
        '../data_beauty_2core_es/valid1/bpr/',
        '../data_beauty_2core_es/valid2/bpr/'
    ]
    l1r = trial.suggest_loguniform('l1r', 1e-6, 1)
    l2r = trial.suggest_loguniform('l2r', 1e-6, 1)
    params = {'l1r': l1r, 'l2r': l2r}
    score_sum = 0
    for data_dir in data_dirs:
        # Load id lists (one stripped id per line) and interaction matrices.
        user_list = []
        item_list = []
        with open(data_dir + 'user_list.txt', 'r') as f:
            for l in f:
                user_list.append(l.replace('\n', ''))
        with open(data_dir + 'item_list.txt', 'r') as f:
            for l in f:
                item_list.append(l.replace('\n', ''))
        user_item_train_df = pd.read_csv(data_dir + 'user_item_train.csv')
        user_item_test_df = pd.read_csv(data_dir + 'user_item_test.csv')
        eval_model = evaluate.Evaluater(user_item_test_df, len(user_list))

        # Train SLIM and persist the model + item map.
        trainmat = load_data(user_item_train_df, len(user_list), len(item_list))
        model = SLIM()
        train(model, params, trainmat)
        model.save_model(modelfname='model.csr',
                         mapfname='map.csr')  # filename to save the item map

        # Predict a (n_users, n_items) sparse score matrix.
        rec_mat = predict(user_item_train_df, len(user_list), len(item_list))

        # Evaluate: mean MAP over users with a valid score.
        map_sum = 0
        not_count = 0
        for i in range(rec_mat.shape[0]):
            # BUG FIX: the original did np.argsort(row_2d)[::-1] on a (1, n)
            # array — reversing axis 0 of a single-row array is a no-op, so
            # items were ranked ascending (worst first). Reverse the item
            # axis of the 1-D score row instead.
            row_scores = rec_mat.getrow(i).toarray()[0]
            rec_idx = np.argsort(row_scores)[::-1]
            score = eval_model.topn_map(rec_idx, i)
            # Scores above 1 are treated as invalid sentinels and excluded.
            if score > 1:
                not_count += 1
                continue
            map_sum += score
        score_sum += map_sum / (len(user_list) - not_count)

    return -1 * score_sum / 2
def iterate_epoch(self,
                  model,
                  lr,
                  epoch,
                  optimizer='Adam',
                  weight_decay=0,
                  warmup=0,
                  lr_decay_rate=1,
                  lr_decay_every=10,
                  eval_every=5,
                  early_stop=False):
    """Train `model` for `epoch` epochs with step LR decay and optional
    early stopping.

    Returns the MAP of the early-stopped snapshot if early stopping
    triggers, otherwise the MAP of the model after the final epoch.
    """
    if early_stop:
        # NOTE(review): path is hard-coded; the commented line derives it
        # from self.data_dir — confirm which is intended.
        #es = EarlyStop(self.data_dir[0:-10] + 'early_stopping/bpr', patience=6)
        es = EarlyStop('../data_beauty_2core_es/early_stopping/bpr', patience=6)
    eval_model = evaluate.Evaluater(self.data_dir)
    plot_loss_list = []
    plot_score_list = []
    for i in range(epoch):
        plot_loss_list.extend(
            self.iterate_train(model,
                               lr=lr,
                               optimizer=optimizer,
                               weight_decay=weight_decay,
                               print_every=500))
        # Early stop: when patience is exhausted, evaluate the snapshot
        # taken before performance degraded and return its score.
        if early_stop:
            pre_model = es.early_stop(model)
            if pre_model:
                # BUG FIX: log message said 'eposh' instead of 'epoch'.
                print('Early Stop epoch: {}'.format(i + 1))
                return eval_model.topn_map(pre_model)
        # LR scheduling: decay every `lr_decay_every` epochs past warmup.
        if i > warmup:
            if (i - warmup) % lr_decay_every == 0:
                lr = lr * lr_decay_rate
        # Periodic evaluation.
        if (i + 1) % eval_every == 0:
            score = eval_model.topn_map(model)
            plot_score_list.append(score)
            print('epoch: {} map: {}'.format(i, score))
    # For now, return the score of the last epoch.
    return eval_model.topn_map(model)
def analysis():
    # Show the relation-label distribution of the training corpus list.
    #
    # NOTE(review): the corpus list built from XML below is immediately
    # overwritten by the pickled training list (dead work unless
    # xml_dir_to_corpus_list has side effects), and `test_corpus_list` is
    # loaded but never used — both look like leftovers; confirm.
    corpus_list = data_structure.xml_dir_to_corpus_list(all_xml_dir)
    '''
    for corpus in corpus_list:
        for span in corpus.edu_spans:
            print corpus.text[span[0]:span[-1]+1].encode('utf-8')
    '''
    with open(train_corpus_list_file, "rb") as myFile:
        corpus_list = pickle.load(myFile)
    with open(test_corpus_list_file, "rb") as myFile:
        test_corpus_list = pickle.load(myFile)
    evaluater = evaluate.Evaluater()
    evaluater.show_relation_distribution_from_corpus_list(corpus_list)
def iterate_epoch(self,
                  model,
                  lr,
                  epoch,
                  optimizer='Adam',
                  weight_decay=0,
                  warmup=0,
                  lr_decay_rate=1,
                  lr_decay_every=10,
                  eval_every=5):
    """Train `model` for `epoch` epochs with step LR decay.

    Evaluates every `eval_every` epochs and returns the MAP of the model
    after the final epoch.
    """
    eval_model = evaluate.Evaluater(self.data_dir)
    loss_history = []
    score_history = []
    for epoch_idx in range(epoch):
        epoch_losses = self.iterate_train(model,
                                          lr=lr,
                                          optimizer=optimizer,
                                          weight_decay=weight_decay,
                                          print_every=500)
        loss_history.extend(epoch_losses)
        # LR scheduling: step-decay once past the warmup period.
        if epoch_idx > warmup and (epoch_idx - warmup) % lr_decay_every == 0:
            lr *= lr_decay_rate
        # Periodic evaluation.
        if (epoch_idx + 1) % eval_every == 0:
            current = eval_model.topn_map(model)
            score_history.append(current)
            print('epoch: {} map: {}'.format(epoch_idx, current))
    # Return the score after the last epoch.
    return eval_model.topn_map(model)
def test_from_corpus_list(model, corpus_list):
    # Evaluate `model` on every corpus and print aggregate results under
    # three settings:
    #   evaluater  - prediction vs. binary gold trees
    #   evaluater2 - (binary) prediction vs. multinuclear gold trees
    #   evaluater3 - prediction converted binary->multi vs. multinuclear gold
    evaluater = evaluate.Evaluater()
    evaluater2 = evaluate.Evaluater()
    evaluater3 = evaluate.Evaluater()
    count = 0
    print 'vocab_size', len(data_structure.word_to_ix)
    for corpus in corpus_list:
        #if count == int(sys.argv[1]):
        #    continue
        #print corpus.id
        #count += 1
        test_instance = data_structure.corpus_to_test_instance(corpus, binary=True)
        # multinuclear gold instance
        gold_multi_instance = data_structure.corpus_to_test_instance(
            corpus, binary=False)
        gold_binary_instance = data_structure.corpus_to_test_instance(
            corpus, binary=True)
        #print test_instance.fragments
        # Convert raw text fragments to the NN word-id representation.
        test_instance.fragments = text_to_nn_word_list(test_instance.fragments)
        model.zero_grad()
        #print 'test_instance'
        #data_structure.print_test_instance(test_instance)
        if lstm_crf_only_flag:
            # NOTE(review): in this branch `return_instance` is never
            # assigned, so the code below would raise NameError — confirm
            # this flag is always False here.
            _, labels = model(test_instance.fragments)
            #return_instance = data_structure.labels_to_words_in_test_instance(labels, instance)
        else:
            return_instance = model(test_instance)
        #print 'corpus',
        #data_structure.print_corpus(corpus)
        # Normalize predicted intra-relations to post-order for comparison.
        return_instance.i_relations = data_structure.relations_to_post_order(
            return_instance.i_relations)
        '''
        print 'return_instance'
        data_structure.print_test_instance(return_instance)
        print 'gold_binary_instance'
        data_structure.print_test_instance(gold_binary_instance)
        print 'gold_multi_instance'
        data_structure.print_test_instance(gold_multi_instance)
        '''
        # Per-corpus score against the binary gold.
        evaluater_tmp1 = evaluate.Evaluater()
        evaluater_tmp1.collect_eval_data(gold_binary_instance, return_instance)
        evaluater_tmp1.show_single_eval_result(gold_binary_instance, return_instance)
        evaluater.collect_eval_data(gold_binary_instance, return_instance)
        evaluater2.collect_eval_data(gold_multi_instance, return_instance)
        # Convert the predicted binary DU relations to multinuclear
        # pre-order, then score against the multinuclear gold.
        return_instance.du_i_relations = \
            data_structure.relations_binary_to_multi_preorder(return_instance.du_i_relations)
        evaluater3.collect_eval_data(gold_multi_instance, return_instance)
        # Per-corpus score against the multinuclear gold.
        evaluater_tmp2 = evaluate.Evaluater()
        evaluater_tmp2.collect_eval_data(gold_multi_instance, return_instance)
        evaluater_tmp2.show_single_eval_result(gold_multi_instance, return_instance)
    # Aggregate results over the whole corpus list.
    evaluater.show_eval_result()
    evaluater2.show_eval_result()
    evaluater3.show_eval_result()
    return
# NOTE(review): this chunk starts mid-stream — `f` (an open user-list file),
# `user_list`, `item_list`, `data_dir`, `alpha`, `l1_ratio`,
# `user_item_train_df` and `user_item_test_df` are defined above this excerpt.
for l in f:
    user_list.append(l.replace('\n', ''))
with open(data_dir + 'item_list.txt', 'r') as f:
    for l in f:
        item_list.append(l.replace('\n', ''))

# Fit SLIM with the elastic-net linear model.
lin_model = 'elastic'
model = SLIM_model.SLIM(alpha, l1_ratio, len(user_list), len(item_list),
                        lin_model=lin_model)
model.fit_multi(user_item_train_df)
#model.load_sim_mat('./sim_mat.txt', user_item_train_df)

# Evaluate: rank items for every user and average the per-user MAP.
eval_model = evaluate.Evaluater(user_item_test_df, len(user_list))
model.predict()
score_sum = 0
not_count = 0
for i in range(len(user_list)):
    rec_item_idx = model.pred_ranking(i)
    #score = eval_model.topn_precision(rec_item_idx, i)
    score = eval_model.topn_map(rec_item_idx, i)
    # Scores above 1 are treated as invalid sentinels and excluded.
    if score > 1:
        not_count += 1
        continue
    score_sum += score

# Negated mean MAP, written to score.txt for downstream tooling.
score = -1 * (score_sum / (len(user_list) - not_count))
print(score)
np.savetxt('score.txt', np.array([score]))
# NOTE(review): this chunk starts mid-stream — `l`, `entity_list`,
# `user_list`, `item_list` and `data_path` are defined above this excerpt.
entity_list.append(l.replace('\n', ''))
with open(data_path + '/user_list.txt', 'r') as f:
    for l in f:
        user_list.append(l.replace('\n', ''))
with open(data_path + '/item_list.txt', 'r') as f:
    for l in f:
        item_list.append(l.replace('\n', ''))
# NOTE(review): `user_idx` is never used below, and `entity_list.index(u)`
# makes this O(len(user_list) * len(entity_list)) — looks like a leftover.
user_idx = [entity_list.index(u) for u in user_list]

dataset = dataloader.AmazonDataset(data_path)

# Build the graph.
# Add both user->item and item->user edges (reverse edge when relation == 0).
edges = [[r[0], r[1]] for r in dataset.triplet_df.values]
for r in dataset.triplet_df.values:
    if r[2] == 0:
        edges.append([r[1], r[0]])

# load network
G = nx.DiGraph()
G.add_nodes_from([i for i in range(len(dataset.entity_list))])
G.add_edges_from(edges)

alpha = 0.85
# NOTE(review): elsewhere in this file get_ranking_mat is called as
# get_ranking_mat(G, alpha, dataset) — confirm this one-argument call
# matches the signature in scope here.
ranking_mat = get_ranking_mat(alpha)
evaluater = evaluate.Evaluater(data_path)
score = evaluater.topn_map(ranking_mat)
np.savetxt('score.txt', np.array([score]))
import dataloader
import evaluate
import model
import training
from influence import get_influence
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import optuna

data_dir = './data'
dataset = dataloader.AmazonDataset(data_dir)
evaluater = evaluate.Evaluater(data_dir)

# Load the trained model.
mf = torch.load('model.torch')

# Define the loss function.
loss_func = nn.BCELoss()

# Load an arbitrary test pair [u_i, i_i]:
# take the top-ranked item for a chosen user.
# NOTE(review): taking user_list[0] and then index() of it always yields 0 —
# the round-trip through the id is redundant; confirm intent.
target_user = dataset.user_list[0]
target_user = dataset.user_list.index(target_user)
ranking_idx = evaluater.predict(mf, target_user)
target_item = ranking_idx[0]

# Compute the influence of the training data:
## fetch the training samples that contain either u_i or i_i.