Example #1
0
def objective(trial):
    """Optuna objective: fit a SLIM model once and return the negative mean MAP.

    Args:
        trial: optuna.Trial used to sample hyperparameters.

    Returns:
        float: -(mean MAP over evaluated users); Optuna minimizes this value.
    """
    t0 = time.time()

    # Hyperparameters to tune.
    alpha = trial.suggest_loguniform('alpha', 1e-6, 1)
    l1_ratio = trial.suggest_uniform('l1_ratio', 0, 1)
    #lin_model = trial.suggest_categorical('lin_model', ['lasso', 'elastic'])

    # NOTE(review): `lin_model` is not defined in this function (its suggest is
    # commented out) — presumably a module-level global; confirm before running.
    model = SLIM_model.SLIM(alpha, l1_ratio, len(user_list), len(item_list), lin_model=lin_model)
    #model.fit(user_item_train_df)
    model.fit_multi(user_item_train_df)
    #model.load_sim_mat('./sim_mat.txt', user_item_train_df)

    # Evaluate the fitted model on the test interactions.
    eval_model = evaluate.Evaluater(user_item_test_df, len(user_list))
    model.predict()

    total = 0
    skipped = 0
    for user_idx in range(len(user_list)):
        ranking = model.pred_ranking(user_idx)
        #score = eval_model.topn_precision(ranking, user_idx)
        user_score = eval_model.topn_map(ranking, user_idx)
        # A score above 1 appears to be a sentinel for "no evaluation data"
        # for this user; such users are excluded from the mean.
        if user_score > 1:
            skipped += 1
        else:
            total += user_score

    mi, sec = time_since(time.time() - t0)
    print('{}m{}sec'.format(mi, sec))

    # Negate the mean because Optuna minimizes the objective.
    return -1 * (total / (len(user_list) - skipped))
Example #2
0
def objective(trial):
    """Optuna objective: score a graph-ranking model on two validation splits.

    Builds a directed user/item graph per split, computes a ranking matrix
    with the sampled `alpha`, and averages MAP across the splits.

    Args:
        trial: optuna.Trial used to sample hyperparameters.

    Returns:
        float: -(mean MAP over the validation splits); Optuna minimizes this.
    """
    start = time.time()

    data_dirs = ['../data_luxury_5core/valid1/', '../data_luxury_5core/valid2/']
    score_sum = 0

    # Hyperparameter — hoisted out of the loop: Optuna returns the cached
    # value for a repeated suggest of the same name within one trial, so this
    # is behavior-preserving and avoids redundant calls.
    alpha = trial.suggest_uniform('alpha', 0, 1)

    for data_path in data_dirs:
        # Load the dataset for this split.
        dataset = dataloader.AmazonDataset(data_path)
        # user->item edges; also add the reverse item->user edge when the
        # relation id is 0.
        edges = [[r[0], r[1]] for r in dataset.triplet_df.values]
        for r in dataset.triplet_df.values:
            if r[2] == 0:
                edges.append([r[1], r[0]])

        #user_items_test_dict = pickle.load(open(data_path + 'user_items_test_dict.pickle', 'rb'))

        # Build the directed graph over all entities.
        G = nx.DiGraph()
        G.add_nodes_from([i for i in range(len(dataset.entity_list))])
        G.add_edges_from(edges)

        ranking_mat = get_ranking_mat(G, alpha, dataset)
        evaluater = evaluate.Evaluater(data_path)
        score = evaluater.topn_map(ranking_mat)
        score_sum += score

    mi, sec = time_since(time.time() - start)
    print('{}m{}s'.format(mi, sec))
    # Average over the splits, negated because Optuna minimizes.
    return -1 * score_sum / len(data_dirs)
Example #3
0
def objective(trial):
    start = time.time()

    data_dirs = [
        '../' + data_path + '/valid1/bpr/', '../' + data_path + '/valid2/bpr/'
    ]
    score_sum = 0

    # ハイパラ
    alpha = trial.suggest_loguniform('alpha', 1e-6, 1)
    l1_ratio = trial.suggest_uniform('l1_ratio', 0, 1)
    #lin_model = trial.suggest_categorical('lin_model', ['lasso', 'elastic'])

    for data_dir in data_dirs:
        # データロード
        user_item_train_df = pd.read_csv(data_dir + 'user_item_train.csv')
        user_item_test_df = pd.read_csv(data_dir + 'user_item_test.csv')
        user_list = []
        item_list = []
        with open(data_dir + 'user_list.txt', 'r') as f:
            for l in f:
                user_list.append(l.replace('\n', ''))

        with open(data_dir + 'item_list.txt', 'r') as f:
            for l in f:
                item_list.append(l.replace('\n', ''))

        # define model and fit
        model = SLIM_model.SLIM(alpha,
                                l1_ratio,
                                len(user_list),
                                len(item_list),
                                lin_model=lin_model)
        model.fit(user_item_train_df)
        #model.fit_multi(user_item_train_df)
        #model.load_sim_mat('./sim_mat.txt', user_item_train_df)

        # evaluate
        eval_model = evaluate.Evaluater(user_item_test_df, len(user_list))
        ## predict
        model.predict()

        _score_sum = 0
        not_count = 0
        for i in range(len(user_list)):
            rec_item_idx = model.pred_ranking(i)
            #score = eval_model.topn_precision(rec_item_idx, i)
            score = eval_model.topn_map(rec_item_idx, i)
            if score > 1:
                not_count += 1
                continue
            _score_sum += score

        score_sum += _score_sum / (len(user_list) - not_count)

    mi, sec = time_since(time.time() - start)
    print('{}m{}sec'.format(mi, sec))

    return -1 * score_sum / 2
Example #4
0
def objective(trial):
    data_dirs = [
        '../data_beauty_2core_es/valid1/bpr/',
        '../data_beauty_2core_es/valid2/bpr/'
    ]

    l1r = trial.suggest_loguniform('l1r', 1e-6, 1)
    l2r = trial.suggest_loguniform('l2r', 1e-6, 1)

    params = {'l1r': l1r, 'l2r': l2r}

    score_sum = 0
    for data_dir in data_dirs:
        user_list = []
        item_list = []
        with open(data_dir + 'user_list.txt', 'r') as f:
            for l in f:
                user_list.append(l.replace('\n', ''))

        with open(data_dir + 'item_list.txt', 'r') as f:
            for l in f:
                item_list.append(l.replace('\n', ''))

        user_item_train_df = pd.read_csv(data_dir + 'user_item_train.csv')
        user_item_test_df = pd.read_csv(data_dir + 'user_item_test.csv')
        eval_model = evaluate.Evaluater(user_item_test_df, len(user_list))

        # train slim
        trainmat = load_data(user_item_train_df, len(user_list),
                             len(item_list))
        model = SLIM()
        train(model, params, trainmat)
        model.save_model(modelfname='model.csr',
                         mapfname='map.csr')  # filename to save the item map

        # predict
        rec_mat = predict(user_item_train_df, len(user_list), len(item_list))

        # eval
        start = time.time()
        map_sum = 0
        not_count = 0
        for i in range(rec_mat.shape[0]):
            rec_idx = np.argsort(rec_mat.getrow(i).toarray())[::-1]
            rec_idx = np.array(rec_idx)[0, :]

            score = eval_model.topn_map(rec_idx, i)

            if score > 1:
                not_count += 1
                continue
            map_sum += score

        score_sum += map_sum / (len(user_list) - not_count)

    return -1 * score_sum / 2
Example #5
0
    def iterate_epoch(self,
                      model,
                      lr,
                      epoch,
                      optimizer='Adam',
                      weight_decay=0,
                      warmup=0,
                      lr_decay_rate=1,
                      lr_decay_every=10,
                      eval_every=5,
                      early_stop=False):
        """Train `model` for `epoch` epochs with LR decay and optional early stop.

        Args:
            model: model to train (forwarded to self.iterate_train).
            lr: initial learning rate.
            epoch: number of epochs to run.
            optimizer: optimizer name forwarded to iterate_train.
            weight_decay: weight decay forwarded to iterate_train.
            warmup: number of epochs before LR decay kicks in.
            lr_decay_rate: multiplicative LR decay factor.
            lr_decay_every: decay the LR every this many epochs after warmup.
            eval_every: evaluate MAP every this many epochs.
            early_stop: when True, stop as soon as EarlyStop returns a
                best-so-far model and return that model's MAP.

        Returns:
            float: MAP of the early-stopped model, or of the final model.
        """
        if early_stop:
            #es = EarlyStop(self.data_dir[0:-10] + 'early_stopping/bpr', patience=6)
            es = EarlyStop('../data_beauty_2core_es/early_stopping/bpr',
                           patience=6)

        eval_model = evaluate.Evaluater(self.data_dir)
        plot_loss_list = []
        plot_score_list = []

        for i in range(epoch):
            plot_loss_list.extend(
                self.iterate_train(model,
                                   lr=lr,
                                   optimizer=optimizer,
                                   weight_decay=weight_decay,
                                   print_every=500))

            # Early stop: EarlyStop returns the best model once patience runs out.
            if early_stop:
                pre_model = es.early_stop(model)
                if pre_model:
                    # Fixed typo in the log message ('eposh' -> 'epoch').
                    print('Early Stop epoch: {}'.format(i + 1))
                    return eval_model.topn_map(pre_model)

            # LR scheduling: decay every lr_decay_every epochs after warmup.
            if i > warmup:
                if (i - warmup) % lr_decay_every == 0:
                    lr = lr * lr_decay_rate

            if (i + 1) % eval_every == 0:
                #score = eval_model.topn_precision(model)
                score = eval_model.topn_map(model)
                plot_score_list.append(score)
                print('epoch: {}  map: {}'.format(i, score))

        #self._plot(plot_loss_list)
        #self._plot(plot_score_list)

        # Return the score of the model after the final epoch.
        return eval_model.topn_map(model)
Example #6
0
def analysis():
    """Load pickled corpus lists and print their relation distribution."""

    # NOTE(review): this result is immediately overwritten by the pickle load
    # below — the call is kept only in case it has needed side effects; verify.
    corpus_list = data_structure.xml_dir_to_corpus_list(all_xml_dir)
    '''
	for corpus in corpus_list:
		for span in corpus.edu_spans:
			print corpus.text[span[0]:span[-1]+1].encode('utf-8')
	'''
    with open(train_corpus_list_file, "rb") as fp:
        corpus_list = pickle.load(fp)
    with open(test_corpus_list_file, "rb") as fp:
        test_corpus_list = pickle.load(fp)
    reporter = evaluate.Evaluater()
    reporter.show_relation_distribution_from_corpus_list(corpus_list)
Example #7
0
    def iterate_epoch(self,
                      model,
                      lr,
                      epoch,
                      optimizer='Adam',
                      weight_decay=0,
                      warmup=0,
                      lr_decay_rate=1,
                      lr_decay_every=10,
                      eval_every=5):
        """Run `epoch` training epochs with LR decay and periodic evaluation.

        Returns:
            float: MAP score of the model after the final epoch.
        """
        evaluator = evaluate.Evaluater(self.data_dir)
        loss_history = []
        score_history = []

        for ep in range(epoch):
            losses = self.iterate_train(model,
                                        lr=lr,
                                        optimizer=optimizer,
                                        weight_decay=weight_decay,
                                        print_every=500)
            loss_history.extend(losses)

            # LR scheduling: decay every `lr_decay_every` epochs once past warmup.
            if ep > warmup and (ep - warmup) % lr_decay_every == 0:
                lr *= lr_decay_rate

            # Periodic evaluation.
            if (ep + 1) % eval_every == 0:
                #score = eval_model.topn_precision(model)
                score = evaluator.topn_map(model)
                score_history.append(score)
                print('epoch: {}  map: {}'.format(ep, score))

        #self._plot(loss_history)
        #self._plot(score_history)

        # Return the score after the last epoch.
        return evaluator.topn_map(model)
def test_from_corpus_list(model, corpus_list):
    """Evaluate `model` on every corpus and print aggregate results.

    Three accumulating evaluations are collected:
      - evaluater:  binary gold vs the predicted instance
      - evaluater2: multinuclear gold vs the (binary) predicted instance
      - evaluater3: multinuclear gold vs the prediction converted to
        multinuclear pre-order form
    Per-corpus results are also printed via temporary Evaluaters.
    """
    evaluater = evaluate.Evaluater()
    evaluater2 = evaluate.Evaluater()
    evaluater3 = evaluate.Evaluater()
    count = 0
    print 'vocab_size', len(data_structure.word_to_ix)
    for corpus in corpus_list:

        #if count == int(sys.argv[1]):
        #	continue
        #print corpus.id
        #count += 1
        # Binary-branching test instance fed to the model.
        test_instance = data_structure.corpus_to_test_instance(corpus,
                                                               binary=True)
        # multinuclear gold instnace
        gold_multi_instance = data_structure.corpus_to_test_instance(
            corpus, binary=False)
        gold_binary_instance = data_structure.corpus_to_test_instance(
            corpus, binary=True)
        #print test_instance.fragments

        # Convert raw text fragments to the model's word-index representation.
        test_instance.fragments = text_to_nn_word_list(test_instance.fragments)
        model.zero_grad()
        #print 'test_instance'
        #data_structure.print_test_instance(test_instance)
        # NOTE(review): in this branch `return_instance` is never assigned, so
        # the code below raises NameError when lstm_crf_only_flag is set; the
        # commented line suggests labels_to_words_in_test_instance was meant
        # to produce it — confirm before enabling this path.
        if lstm_crf_only_flag:
            _, labels = model(test_instance.fragments)
            #return_instance = data_structure.labels_to_words_in_test_instance(labels, instance)

        else:
            return_instance = model(test_instance)
        #print 'corpus',
        #data_structure.print_corpus(corpus)
        # Normalize predicted relations to post-order before comparison.
        return_instance.i_relations = data_structure.relations_to_post_order(
            return_instance.i_relations)
        '''
		print 'return_instance'
		data_structure.print_test_instance(return_instance)
		print 'gold_binary_instance'
		data_structure.print_test_instance(gold_binary_instance)
		print 'gold_multi_instance'
		data_structure.print_test_instance(gold_multi_instance)
		'''

        # Per-corpus result against the binary gold.
        evaluater_tmp1 = evaluate.Evaluater()
        evaluater_tmp1.collect_eval_data(gold_binary_instance, return_instance)
        evaluater_tmp1.show_single_eval_result(gold_binary_instance,
                                               return_instance)

        evaluater.collect_eval_data(gold_binary_instance, return_instance)
        evaluater2.collect_eval_data(gold_multi_instance, return_instance)
        # Convert the binary prediction to multinuclear pre-order form for the
        # third evaluation.
        return_instance.du_i_relations = \
         data_structure.relations_binary_to_multi_preorder(return_instance.du_i_relations)
        evaluater3.collect_eval_data(gold_multi_instance, return_instance)

        # Per-corpus result against the multinuclear gold.
        evaluater_tmp2 = evaluate.Evaluater()
        evaluater_tmp2.collect_eval_data(gold_multi_instance, return_instance)
        evaluater_tmp2.show_single_eval_result(gold_multi_instance,
                                               return_instance)

    evaluater.show_eval_result()
    evaluater2.show_eval_result()
    evaluater3.show_eval_result()
    return
Example #9
0
        for l in f:
            user_list.append(l.replace('\n', ''))

    # Read the item id list (one id per line).
    with open(data_dir + 'item_list.txt', 'r') as f:
        for l in f:
            item_list.append(l.replace('\n', ''))

    # Fit a SLIM (elastic-net) model and score it.
    lin_model = 'elastic'
    model = SLIM_model.SLIM(alpha,
                            l1_ratio,
                            len(user_list),
                            len(item_list),
                            lin_model=lin_model)
    model.fit_multi(user_item_train_df)
    #model.load_sim_mat('./sim_mat.txt', user_item_train_df)
    eval_model = evaluate.Evaluater(user_item_test_df, len(user_list))
    model.predict()
    score_sum = 0
    not_count = 0
    for i in range(len(user_list)):
        rec_item_idx = model.pred_ranking(i)
        #score = eval_model.topn_precision(rec_item_idx, i)
        score = eval_model.topn_map(rec_item_idx, i)
        # Scores above 1 appear to mark users with no evaluation data; they
        # are excluded from the mean.
        if score > 1:
            not_count += 1
            continue
        score_sum += score

    # Negated mean MAP over evaluated users, persisted to score.txt.
    score = -1 * (score_sum / (len(user_list) - not_count))
    print(score)
    np.savetxt('score.txt', np.array([score]))
Example #10
0
            entity_list.append(l.replace('\n', ''))
            
    # Read the user id list (one id per line).
    with open(data_path + '/user_list.txt', 'r') as f:
        for l in f:
            user_list.append(l.replace('\n', ''))
            
    # Read the item id list (one id per line).
    with open(data_path + '/item_list.txt', 'r') as f:
        for l in f:
            item_list.append(l.replace('\n', ''))
            

    # Map each user id to its index in the global entity list.
    user_idx = [entity_list.index(u) for u in user_list]

    dataset = dataloader.AmazonDataset(data_path)
    # Build the graph
    # add both user->item and item->user edges
    edges = [[r[0], r[1]] for r in dataset.triplet_df.values]
    for r in dataset.triplet_df.values:
        if r[2] == 0:
            edges.append([r[1], r[0]])
        
    # load network
    G = nx.DiGraph()
    G.add_nodes_from([i for i in range(len(dataset.entity_list))])
    G.add_edges_from(edges)

    alpha = 0.85
    # NOTE(review): get_ranking_mat is called with only `alpha` here, leaving
    # the G/dataset built above unused — elsewhere it takes (G, alpha, dataset).
    # Presumably this variant reads them as globals; confirm the signature.
    ranking_mat = get_ranking_mat(alpha)
    evaluater = evaluate.Evaluater(data_path)
    score = evaluater.topn_map(ranking_mat)
    np.savetxt('score.txt', np.array([score]))
Example #11
0
import dataloader
import evaluate
import model
import training
from influence import get_influence

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import optuna

# Dataset and evaluation helpers for the influence computation below.
data_dir = './data'
dataset = dataloader.AmazonDataset(data_dir)
evaluater = evaluate.Evaluater(data_dir)

# Load the trained model.
mf = torch.load('model.torch')
# Define the loss function.
loss_func = nn.BCELoss()

# Load an arbitrary test pair [u_i, i_i]:
# the top-ranked item for a given user.
# NOTE(review): taking user_list[0] and then .index() of it round-trips back
# to 0 when entries are unique — presumably a placeholder for picking an
# arbitrary user; verify.
target_user = dataset.user_list[0]
target_user = dataset.user_list.index(target_user)
ranking_idx = evaluater.predict(mf, target_user)
target_item = ranking_idx[0]

# Compute the influence of the training data
## fetch the training examples that contain either u_i or i_i