Code Example #1
    '''Test Process for Metrics Exporting'''
    # df, user_num, item_num = load_rate(args.dataset, args.prepro, binary=False)
    # train_set, test_set = split_test(df, args.test_method, args.test_size)

    # temporarily use pre-split data files for the tuning test result
    train_set = pd.read_csv(f'./experiment_data/train_{args.dataset}_{args.prepro}_{args.test_method}.dat')
    test_set = pd.read_csv(f'./experiment_data/test_{args.dataset}_{args.prepro}_{args.test_method}.dat')
    if args.dataset in ['yelp']:
        train_set['timestamp'] = pd.to_datetime(train_set['timestamp'])
        test_set['timestamp'] = pd.to_datetime(test_set['timestamp'])
    df = pd.concat([train_set, test_set], ignore_index=True)
    user_num = df['user'].nunique()
    item_num = df['item'].nunique()
    
    # get ground truth
    test_ur = get_ur(test_set)
    total_train_ur = get_ur(train_set)

    # initial candidate item pool
    item_pool = set(range(item_num))
    candidates_num = args.cand_num

    print('='*50, '\n')
    # retrain model by the whole train set
    # build recommender model
    # model = SLIM(user_num, item_num, alpha=args.alpha, lam_bda=args.elastic, 
    #              max_iter=args.epochs, tol=args.tol)
    model = SLIM(user_num, item_num, l1_ratio=args.elastic, alpha=args.alpha)
    model.fit(train_set)

    print('Start Calculating Metrics......')
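    # --- (not part of the original snippet) a minimal sketch of the test-set
    # --- evaluation that typically follows, mirroring the candidate sampling
    # --- and metric code in Code Example #2; `model`, `test_ur`,
    # --- `total_train_ur`, `item_pool`, `candidates_num`, `defaultdict`,
    # --- `random`, `np`, `tqdm` and the *_at_k helpers are assumed to be
    # --- available from the surrounding script
    test_ucands = defaultdict(list)
    for k, v in test_ur.items():
        sample_num = candidates_num - len(v) if len(v) < candidates_num else 0
        sub_item_pool = item_pool - v - total_train_ur[k]  # remove GT & interacted
        sample_num = min(len(sub_item_pool), sample_num)
        # list() keeps random.sample valid on Python >= 3.11, where sets are rejected
        samples = random.sample(list(sub_item_pool), sample_num)
        test_ucands[k] = list(v | set(samples))

    preds = {}
    for u in tqdm(test_ucands.keys()):
        pred_rates = [model.predict(u, i) for i in test_ucands[u]]
        rec_idx = np.argsort(pred_rates)[::-1][:args.topk]
        top_n = np.array(test_ucands[u])[rec_idx]
        preds[u] = [1 if i in test_ur[u] else 0 for i in top_n]

    # e.g. hit ratio on the test set; the other metrics follow the same pattern
    print(f'HR@{args.topk}: {hr_at_k(preds, test_ur):.4f}')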
Code Example #2
def opt_func(params, mi=args.sc_met, topk=args.topk):
    factors = int(params['factors'])
    print(f'Parameter Settings: factors:{factors}')

    # store metrics result for final validation set
    fnl_metric = []
    for fold in range(fn):
        print(f'Start Validation [{fold + 1}]......')
        train = train_set_list[fold]
        validation = val_set_list[fold]

        # get ground truth
        train_ur = get_ur(train)
        val_ur = get_ur(validation)

        # build recommender model
        model = PureSVD(user_num, item_num, factors)
        model.fit(train)

        # build candidates set
        val_ucands = defaultdict(list)
        for k, v in val_ur.items():
            sample_num = candidates_num - len(v) if len(v) < candidates_num else 0
            sub_item_pool = item_pool - v - train_ur[k]  # remove GT & interacted
            sample_num = min(len(sub_item_pool), sample_num)
            samples = random.sample(sub_item_pool, sample_num)
            val_ucands[k] = list(v | set(samples))

        # get predict result
        print('')
        print('Generate recommend list...')
        print('')
        preds = {}
        for u in tqdm(val_ucands.keys()):
            pred_rates = [model.predict(u, i) for i in val_ucands[u]]
            rec_idx = np.argsort(pred_rates)[::-1][:topk]
            top_n = np.array(val_ucands[u])[rec_idx]
            preds[u] = top_n

        # convert rank list to binary-interaction
        for u in preds.keys():
            preds[u] = [1 if i in val_ur[u] else 0 for i in preds[u]]

        # calculate metrics for validation set
        pre_k = np.mean([precision_at_k(r, topk) for r in preds.values()])
        rec_k = recall_at_k(preds, val_ur, topk)
        hr_k = hr_at_k(preds, val_ur)
        map_k = map_at_k(preds.values())
        mrr_k = mrr_at_k(preds, topk)
        ndcg_k = np.mean([ndcg_at_k(r, topk) for r in preds.values()])

        tmp_metric = np.array([pre_k, rec_k, hr_k, map_k, mrr_k, ndcg_k])
        fnl_metric.append(tmp_metric)

    # get final validation metrics result by average operation
    fnl_metric = np.array(fnl_metric).mean(axis=0)
    print('=' * 20, 'Metrics for All Validation', '=' * 20)
    print(f'Precision@{topk}: {fnl_metric[0]:.4f}')
    print(f'Recall@{topk}: {fnl_metric[1]:.4f}')
    print(f'HR@{topk}: {fnl_metric[2]:.4f}')
    print(f'MAP@{topk}: {fnl_metric[3]:.4f}')
    print(f'MRR@{topk}: {fnl_metric[4]:.4f}')
    print(f'NDCG@{topk}: {fnl_metric[5]:.4f}')

    score = fnl_metric[metric_idx[mi]]

    # record all tuning result and settings
    fnl_metric = [f'{mt:.4f}' for mt in fnl_metric]
    line = ','.join(fnl_metric) + f',{factors}' + '\n'

    f.write(line)
    f.flush()

    return -score
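# (not part of the original snippet) a sketch of how opt_func above might be
# wired into a tuner.  The dict-style access params['factors'] suggests
# hyperopt; the search space, max_evals and log-file handling below are
# illustrative assumptions only.  Because opt_func returns -score, minimising
# it maximises the chosen metric.
from hyperopt import Trials, fmin, hp, tpe

space = {'factors': hp.quniform('factors', 10, 200, 10)}  # integer-like steps of 10
best = fmin(fn=opt_func, space=space, algo=tpe.suggest,
            max_evals=30, trials=Trials())
print(f'Best parameter setting found: {best}')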
Code Example #3
def opt_func(params, mi=args.sc_met, topk=args.topk):
    num_ng = int(params['num_ng'])
    hidden_factor = int(params['hidden_factor'])
    lr, lamda = params['lr'], params['lamda']
    print(f'Parameter Settings: num_ng:{num_ng}, hidden_factor:{hidden_factor}, '
          f'lr:{lr}, lamda:{lamda}')

    # store metrics result for final validation set
    fnl_metric = []
    for fold in range(fn):
        print(f'Start Validation [{fold + 1}]......')
        train = train_set_list[fold]
        validation = val_set_list[fold]

        # get ground truth
        train_ur = get_ur(train)
        val_ur = get_ur(validation)

        # format training data
        train_dataset = PairFMData(train, feat_idx_dict, item_num, num_ng,
                                   True)
        print('Finish construct FM torch-dataset......')
        train_loader = data.DataLoader(train_dataset,
                                       drop_last=True,
                                       batch_size=args.batch_size,
                                       shuffle=True,
                                       num_workers=4)

        # build recommender model
        model = PairFM(num_features, hidden_factor, args.batch_norm,
                       eval(args.dropout), args.epochs, lr, lamda, args.gpu,
                       args.loss_type)
        model.fit(train_loader)

        # build candidates set
        val_ucands = defaultdict(list)
        for k, v in val_ur.items():
            sample_num = candidates_num - len(v) if len(v) < candidates_num else 0
            sub_item_pool = item_pool - v - train_ur[k]  # remove GT & interacted
            sample_num = min(len(sub_item_pool), sample_num)
            samples = random.sample(sub_item_pool, sample_num)
            val_ucands[k] = list(v | set(samples))

        # get predict result
        print('')
        print('Generate recommend list...')
        print('')
        preds = {}
        for u in tqdm(val_ucands.keys()):
            # build a validation FM dataset for certain user u
            tmp = pd.DataFrame({
                'user': [u for _ in val_ucands[u]],
                'item': val_ucands[u],
                'rating': [0. for _ in val_ucands[u]],  # placeholder label, value is never used
            })
            tmp_dataset = PairFMData(tmp, feat_idx_dict, item_num, 0, False)
            tmp_loader = data.DataLoader(tmp_dataset,
                                         batch_size=candidates_num,
                                         shuffle=False,
                                         num_workers=0)
            # get top-N list with torch method
            for feat_i, feat_val_i, feat_j, feat_val_j, _ in tmp_loader:
                if torch.cuda.is_available():
                    feat_i = feat_i.cuda()
                    feat_val_i = feat_val_i.cuda()
                else:
                    feat_i = feat_i.cpu()
                    feat_val_i = feat_val_i.cpu()

                prediction = model.predict(feat_i, feat_val_i)
                prediction = prediction.clamp(min=-1.0, max=1.0)
                _, indices = torch.topk(prediction, topk)
                # indices live on the same device as prediction, so move them
                # back to CPU before indexing the CPU candidate tensor
                top_n = torch.take(torch.tensor(val_ucands[u]),
                                   indices.cpu()).numpy()

            preds[u] = top_n

        # convert rank list to binary-interaction
        for u in preds.keys():
            preds[u] = [1 if i in val_ur[u] else 0 for i in preds[u]]

        # calculate metrics for validation set
        pre_k = np.mean([precision_at_k(r, topk) for r in preds.values()])
        rec_k = recall_at_k(preds, val_ur, topk)
        hr_k = hr_at_k(preds, val_ur)
        map_k = map_at_k(preds.values())
        mrr_k = mrr_at_k(preds, topk)
        ndcg_k = np.mean([ndcg_at_k(r, topk) for r in preds.values()])

        tmp_metric = np.array([pre_k, rec_k, hr_k, map_k, mrr_k, ndcg_k])
        fnl_metric.append(tmp_metric)

    # get final validation metrics result by average operation
    fnl_metric = np.array(fnl_metric).mean(axis=0)
    print('=' * 20, 'Metrics for All Validation', '=' * 20)
    print(f'Precision@{topk}: {fnl_metric[0]:.4f}')
    print(f'Recall@{topk}: {fnl_metric[1]:.4f}')
    print(f'HR@{topk}: {fnl_metric[2]:.4f}')
    print(f'MAP@{topk}: {fnl_metric[3]:.4f}')
    print(f'MRR@{topk}: {fnl_metric[4]:.4f}')
    print(f'NDCG@{topk}: {fnl_metric[5]:.4f}')

    score = fnl_metric[metric_idx[mi]]

    # record all tuning result and settings
    fnl_metric = [f'{mt:.4f}' for mt in fnl_metric]
    line = ','.join(
        fnl_metric) + f',{num_ng},{hidden_factor},{lr},{lamda}' + '\n'

    f.write(line)
    f.flush()

    return -score
Code Example #4
train_set = pd.read_csv(
    f'./experiment_data/train_{args.dataset}_{args.prepro}_{args.test_method}.dat'
)
test_set = pd.read_csv(
    f'./experiment_data/test_{args.dataset}_{args.prepro}_{args.test_method}.dat'
)
if args.dataset in ['yelp']:
    train_set['timestamp'] = pd.to_datetime(train_set['timestamp'])
    test_set['timestamp'] = pd.to_datetime(test_set['timestamp'])

train_set['rating'] = 1.0
test_set['rating'] = 1.0
df = pd.concat([train_set, test_set], ignore_index=True)
user_num = df['user'].nunique()
item_num = df['item'].nunique()

# get ground truth
test_ur = get_ur(test_set)
total_train_ur = get_ur(train_set)

train_set_list, val_set_list, fn = split_validation(train_set, args.val_method,
                                                    args.fold_num)

# initial candidate item pool
item_pool = set(range(item_num))
candidates_num = args.cand_num

# store metrics result for final validation set
fnl_metric = []
for fold in range(fn):
    print(f'Start Validation [{fold + 1}]......')
    train = train_set_list[fold]
    validation = val_set_list[fold]
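    # (not part of the original snippet) the fold body is cut off here; in the
    # other code examples it continues by fitting a model on `train`, sampling
    # per-user candidate lists from the validation ground truth, generating
    # top-k lists and appending the metric vector to `fnl_metric`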
Code Example #5
def opt_func(params, mi=args.sc_met, topk=args.topk):
    sim_method, maxk = params[0], int(params[1])
    print(f'Parameter Settings: sim_method:{sim_method}, maxk: {maxk}')

    # store metrics result for test set
    fnl_metric = []
    for fold in range(fn):
        print(f'Start Validation [{fold + 1}]......')
        train = train_set_list[fold]
        validation = val_set_list[fold]

        # get ground truth
        train_ur = get_ur(train)
        val_ur = get_ur(validation)

        # build recommender model
        model = ItemKNNCF(
            user_num,
            item_num,
            maxk=maxk,  # use the value being tuned rather than args.maxk
            min_k=args.mink,
            similarity=sim_method,  # use the value being tuned rather than args.sim_method
            tune_or_not=True,
            serial=f'{args.dataset}-{args.prepro}-{args.val_method}-{fold}-{sim_method}'
        )
        model.fit(train)

        # build candidates set
        val_ucands = defaultdict(list)
        for k, v in val_ur.items():
            sample_num = candidates_num - len(v) if len(v) < candidates_num else 0
            sub_item_pool = item_pool - v - train_ur[k]  # remove GT & interacted
            sample_num = min(len(sub_item_pool), sample_num)
            samples = random.sample(sub_item_pool, sample_num)
            val_ucands[k] = list(v | set(samples))

        # get predict result
        # preds = {}
        # for u in tqdm(val_ucands.keys()):
        #     pred_rates = [model.predict(u, i) for i in val_ucands[u]]
        #     rec_idx = np.argsort(pred_rates)[::-1][:topk]
        #     top_n = np.array(val_ucands[u])[rec_idx]
        #     preds[u] = top_n
        cores = 32
        pool = ThreadPoolExecutor(cores)

        preds = {}
        ct = 0

        def func(u):
            pred_rates = [model.predict(u, i) for i in val_ucands[u]]
            rec_idx = np.argsort(pred_rates)[::-1][:topk]
            top_n = np.array(val_ucands[u])[rec_idx]
            preds[u] = top_n
            return 1

        for u in tqdm(val_ucands.keys()):
            c_r = pool.submit(func, u)
            ct += c_r.result()
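        # (not part of the original code) note that calling c_r.result()
        # right after each submit blocks on that task, so the pool above
        # effectively predicts one user at a time.  A variant that overlaps
        # the work would submit every user first and then collect, e.g.:
        #
        #     futures = [pool.submit(func, u) for u in val_ucands.keys()]
        #     for c_r in as_completed(futures):  # concurrent.futures.as_completed
        #         ct += c_r.result()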

        # convert rank list to binary-interaction
        for u in preds.keys():
            preds[u] = [1 if i in val_ur[u] else 0 for i in preds[u]]

        # calculate metrics for validation set
        pre_k = np.mean([precision_at_k(r, topk) for r in preds.values()])
        rec_k = recall_at_k(preds, val_ur, topk)
        hr_k = hr_at_k(preds, val_ur)
        map_k = map_at_k(preds.values())
        mrr_k = mrr_at_k(preds, topk)
        ndcg_k = np.mean([ndcg_at_k(r, topk) for r in preds.values()])

        tmp_metric = np.array([pre_k, rec_k, hr_k, map_k, mrr_k, ndcg_k])
        fnl_metric.append(tmp_metric)

    # get final validation metrics result by average operation
    fnl_metric = np.array(fnl_metric).mean(axis=0)
    print('=' * 20, 'Metrics for All Validation', '=' * 20)
    print(f'Precision@{topk}: {fnl_metric[0]:.4f}')
    print(f'Recall@{topk}: {fnl_metric[1]:.4f}')
    print(f'HR@{topk}: {fnl_metric[2]:.4f}')
    print(f'MAP@{topk}: {fnl_metric[3]:.4f}')
    print(f'MRR@{topk}: {fnl_metric[4]:.4f}')
    print(f'NDCG@{topk}: {fnl_metric[5]:.4f}')

    score = fnl_metric[metric_idx[mi]]

    # record all tuning result and settings
    fnl_metric = [f'{mt:.4f}' for mt in fnl_metric]
    line = ','.join(fnl_metric) + f',{sim_method},{maxk}' + '\n'

    f.write(line)
    f.flush()

    return -score
Code Example #6
def opt_func(params, mi=args.sc_met, topk=args.topk):
    num_ng, factor_num, num_layers = int(params['num_ng']), int(params['factor_num']), int(params['num_layers'])
    dropout, lr, batch_size, lamda = params['dropout'], params['lr'], params['batch_size'], params['lamda']
    print(f'Parameter Settings: num_ng:{num_ng}, factors:{factor_num}, layers:{num_layers}, '
          f'dropout:{dropout}, lr:{lr}, batch_size:{batch_size}, lamda:{lamda}')

    # store metrics result for final validation set
    fnl_metric = []
    for fold in range(fn):
        print(f'Start Validation [{fold + 1}]......')
        train = train_set_list[fold]
        validation = val_set_list[fold]

        # get ground truth
        train_ur = get_ur(train)
        val_ur = get_ur(validation)

        # start negative sampling
        train_sampled = negative_sampling(user_num, item_num, train, num_ng)
        # format training data
        train_dataset = PointMFData(train_sampled)
        train_loader = data.DataLoader(train_dataset, batch_size=batch_size, 
                                    shuffle=True, num_workers=4)

        # whether load pre-train model
        model_name = args.model_name
        assert model_name in ['MLP', 'GMF', 'NeuMF-end', 'NeuMF-pre']
        GMF_model_path = f'./tmp/{args.dataset}/CL/GMF.pt'
        MLP_model_path = f'./tmp/{args.dataset}/CL/MLP.pt'
        NeuMF_model_path = f'./tmp/{args.dataset}/CL/NeuMF.pt'

        if model_name == 'NeuMF-pre':
            assert os.path.exists(GMF_model_path), 'lack of GMF model'    
            assert os.path.exists(MLP_model_path), 'lack of MLP model'
            GMF_model = torch.load(GMF_model_path)
            MLP_model = torch.load(MLP_model_path)
        else:
            GMF_model = None
            MLP_model = None

        # build recommender model
        model = PointNeuMF(user_num, item_num, factor_num, num_layers, dropout, 
                            lr, args.epochs, lamda, args.model_name, GMF_model, MLP_model,
                            args.gpu, args.loss_type)
        model.fit(train_loader)

        # build candidates set
        val_ucands = defaultdict(list)
        for k, v in val_ur.items():
            sample_num = candidates_num - len(v) if len(v) < candidates_num else 0
            sub_item_pool = item_pool - v - train_ur[k] # remove GT & interacted
            sample_num = min(len(sub_item_pool), sample_num)
            samples = random.sample(sub_item_pool, sample_num)
            val_ucands[k] = list(v | set(samples))
        
        # get predict result
        print('')
        print('Generate recommend list...')
        print('')
        preds = {}
        for u in tqdm(val_ucands.keys()):
            # build a validation MF dataset for certain user u
            tmp = pd.DataFrame({'user': [u for _ in val_ucands[u]], 
                                'item': val_ucands[u], 
                                'rating': [0. for _ in val_ucands[u]], # placeholder label, value is never used
                                })
            tmp_dataset = PointMFData(tmp)
            tmp_loader = data.DataLoader(tmp_dataset, batch_size=candidates_num, 
                                        shuffle=False, num_workers=0)

            # get top-N list with torch method 
            for user_u, item_i, _ in tmp_loader:
                if torch.cuda.is_available():
                    user_u = user_u.cuda()
                    item_i = item_i.cuda()
                else:
                    user_u = user_u.cpu()
                    item_i = item_i.cpu()

                prediction = model.predict(user_u, item_i)
                _, indices = torch.topk(prediction, topk)
                # move indices back to CPU before indexing the CPU candidate tensor
                top_n = torch.take(torch.tensor(val_ucands[u]), indices.cpu()).numpy()

            preds[u] = top_n

        # convert rank list to binary-interaction
        for u in preds.keys():
            preds[u] = [1 if i in val_ur[u] else 0 for i in preds[u]]

        # calculate metrics for validation set
        pre_k = np.mean([precision_at_k(r, topk) for r in preds.values()])
        rec_k = recall_at_k(preds, val_ur, topk)
        hr_k = hr_at_k(preds, val_ur)
        map_k = map_at_k(preds.values())
        mrr_k = mrr_at_k(preds, topk)
        ndcg_k = np.mean([ndcg_at_k(r, topk) for r in preds.values()])

        tmp_metric = np.array([pre_k, rec_k, hr_k, map_k, mrr_k, ndcg_k])
        fnl_metric.append(tmp_metric)

    # get final validation metrics result by average operation
    fnl_metric = np.array(fnl_metric).mean(axis=0)
    print('='*20, 'Metrics for All Validation', '='*20)
    print(f'Precision@{topk}: {fnl_metric[0]:.4f}')
    print(f'Recall@{topk}: {fnl_metric[1]:.4f}')
    print(f'HR@{topk}: {fnl_metric[2]:.4f}')
    print(f'MAP@{topk}: {fnl_metric[3]:.4f}')
    print(f'MRR@{topk}: {fnl_metric[4]:.4f}')
    print(f'NDCG@{topk}: {fnl_metric[5]:.4f}')

    score = fnl_metric[metric_idx[mi]]

    # record all tuning result and settings
    fnl_metric = [f'{mt:.4f}' for mt in fnl_metric]
    line = ','.join(fnl_metric) + f',{num_ng},{factor_num},{num_layers},{dropout},{lr},{batch_size},{lamda}' + '\n'

    f.write(line)
    f.flush()

    return -score
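# (not part of the original snippet) a sketch of a search space matching the
# parameters unpacked by opt_func above, again assuming a hyperopt-style tuner
# as shown after Code Example #2; every range below is an illustrative
# assumption, not a recommended setting.
from hyperopt import fmin, hp, tpe

space = {
    'num_ng': hp.quniform('num_ng', 1, 10, 1),           # negatives per positive
    'factor_num': hp.quniform('factor_num', 8, 64, 8),   # embedding size
    'num_layers': hp.quniform('num_layers', 1, 3, 1),    # MLP depth
    'dropout': hp.uniform('dropout', 0.0, 0.5),
    'lr': hp.loguniform('lr', np.log(1e-4), np.log(1e-2)),
    'batch_size': hp.choice('batch_size', [128, 256, 512]),
    'lamda': hp.loguniform('lamda', np.log(1e-6), np.log(1e-2)),
}
best = fmin(fn=opt_func, space=space, algo=tpe.suggest, max_evals=30)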