Example #1
def experiment_unit_rand_split(exp_id, method, tr_data, te_data, iteration):
    '''
    One iteration of training and testing on a random split. The experiment
    ID, the method signature, and the iteration number together identify the
    cached model resource.
    '''

    # define mcpl_log style.
    mcpl_log = lambda msg: Logger.Log(msg, Logger.MSG_CATEGORY_EXP)

    result_resource_str = 'exp'      + exp_id + \
                          '_method'  + method.unique_str() + \
                          '_iter'    + str(iteration)
    # use a sub folder to store the experiment resource.
    sub_folder = exp_id + '/models/' + method.unique_str()

    # check resource for an existing trained model.
    trained_model = URM.LoadResource(URM.RTYPE_RESULT, result_resource_str,
                                     sub_folder)
    if not trained_model:

        # train the model using the training data.
        # NOTE: this is the most time-consuming part.
        mcpl_log('training models...')
        method.train(tr_data)

        # save resource
        trained_model = [method]
        URM.SaveResource(URM.RTYPE_RESULT, result_resource_str, trained_model,
                         sub_folder)

    # compute performance on the test data using the trained model.
    [method] = trained_model
    mcpl_log('computing evaluation metrics on the test data...')
    eval_result = rmse(te_data.data_val,
                       method.predict(te_data.data_row, te_data.data_col))

    return eval_result
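Logger, URM, and rmse come from the surrounding project and are not shown in this listing. A minimal sketch of how this unit might be driven, assuming a hypothetical split_rand routine that produces a (train, test) pair per iteration; neither the driver nor split_rand is part of the example's API:

import numpy as np

def run_experiment(exp_id, method, data, num_iter, split_rand):
    # HYPOTHETICAL driver: repeat the random split and average the
    # per-iteration RMSE returned by experiment_unit_rand_split.
    results = []
    for iteration in range(num_iter):
        tr_data, te_data = split_rand(data, iteration)  # fresh split each time
        results.append(experiment_unit_rand_split(exp_id, method, tr_data,
                                                  te_data, iteration))
    return float(np.mean(results))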
    
    
    
    
Example #2
def experiment_unit_leave_k_out(exp_id, method, data_tr, data_left, iteration,
                                top_n):
    '''
    One leave-k-out iteration. This method works on the column/row indices of
    data_tr and data_left, so the two matrices must be completely aligned
    both row-wise and column-wise.
    '''

    # define lko_log style.
    lko_log = lambda msg: Logger.Log(msg, Logger.MSG_CATEGORY_EXP)

    result_resource_str = 'exp'      + exp_id + \
                          '_method'  + method.unique_str() + \
                          '_iter'    + str(iteration)
    # use a sub folder to store the experiment resource.
    sub_folder = exp_id + '/models/' + method.unique_str()

    # check resource for an existing trained model.
    trained_model = URM.LoadResource(URM.RTYPE_RESULT, result_resource_str,
                                     sub_folder)
    if not trained_model:

        # train the model using the training data.
        # NOTE: this is the most time-consuming part.
        lko_log('training models...')
        method.train(data_tr)

        # save resource
        trained_model = [method]
        URM.SaveResource(URM.RTYPE_RESULT, result_resource_str, trained_model,
                         sub_folder)

    # compute performance on the held-out data using the trained model.
    [method] = trained_model
    lko_log('computing evaluation metrics on the test data...')

    eval_result = {}

    col_num = data_left.num_col
    pred_col = list(range(col_num))

    tr_data_csr = data_tr.get_sparse_matrix().tocsr()
    lo_data_csr = data_left.get_sparse_matrix().tocsr()

    prec_sum = 0.0
    rec_sum = 0.0

    for user_idx in range(data_left.num_row):
        # predict the entire row for this user.
        row_pred = method.predict_row(user_idx, pred_col)

        # rank the columns by predicted score (the result is a list of
        # column indices, best first).
        srt_col = [
            k[0] for k in sorted(
                enumerate(row_pred), key=lambda x: x[1], reverse=True)
        ]
        # columns already seen in training.
        tr_col = set(np.nonzero(tr_data_csr[user_idx, :])[1].tolist())
        # remove the training columns from the ranked list.
        te_srt_col = [col_pos for col_pos in srt_col if col_pos not in tr_col]
        # top-n (slicing safely truncates if fewer candidates remain).
        te_topk_col = te_srt_col[:top_n]
        # held-out column indices for this user.
        lo_col = set(np.nonzero(lo_data_csr[user_idx, :])[1].tolist())

        prec_sum += precision_itemlist(te_topk_col, lo_col)
        rec_sum += recall_itemlist(te_topk_col, lo_col)

    # average precision and recall over users.
    eval_result['prec'] = prec_sum / data_left.num_row
    eval_result['recall'] = rec_sum / data_left.num_row
    eval_result['rmse'] = rmse(
        data_left.data_val,
        method.predict(data_left.data_row, data_left.data_col))
    return eval_result
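precision_itemlist and recall_itemlist are not defined in this listing. A plausible minimal sketch of what the call sites above imply, assuming they score a recommended list against the held-out item set; these signatures are an assumption, not the project's confirmed API:

def precision_itemlist(ranked_items, heldout_set):
    # ASSUMED helper: fraction of recommended items found in the held-out set.
    if len(ranked_items) == 0:
        return 0.0
    hits = sum(1 for item in ranked_items if item in heldout_set)
    return float(hits) / len(ranked_items)

def recall_itemlist(ranked_items, heldout_set):
    # ASSUMED helper: fraction of held-out items that were recommended.
    if len(heldout_set) == 0:
        return 0.0
    hits = sum(1 for item in ranked_items if item in heldout_set)
    return float(hits) / len(heldout_set)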
Example #3
def experiment_unit_leave_k_out_map(exp_id, method, data_tr, data_left,
                                    iteration, max_rank):
    '''
    One leave-k-out iteration that evaluates ranking metrics at every cut-off
    up to max_rank. This method works on the column/row indices of data_tr
    and data_left, so the two matrices must be completely aligned both
    row-wise and column-wise.
    '''

    # define lko_log style.
    lko_log = lambda msg: Logger.Log(msg, Logger.MSG_CATEGORY_EXP)

    result_resource_str = 'exp'      + exp_id + \
                          '_method'  + method.unique_str() + \
                          '_iter'    + str(iteration)
    # use a sub folder to store the experiment resource.
    sub_folder = exp_id + '/models/' + method.unique_str()

    # check resource for an existing trained model.
    trained_model = URM.LoadResource(URM.RTYPE_RESULT, result_resource_str,
                                     sub_folder)
    if not trained_model:

        # train the model using the training data.
        # NOTE: this is the most time-consuming part.
        lko_log('training models...')
        method.train(data_tr)

        # save resource
        trained_model = [method]
        URM.SaveResource(URM.RTYPE_RESULT, result_resource_str, trained_model,
                         sub_folder)

    # compute performance on the held-out data using the trained model.
    [method] = trained_model
    lko_log('computing evaluation metrics on the test data...')

    eval_result = {}

    col_num = data_left.num_col
    pred_col = list(range(col_num))

    tr_data_csr = data_tr.get_sparse_matrix().tocsr()
    lo_data_csr = data_left.get_sparse_matrix().tocsr()

    perf_vect_prec = np.zeros(max_rank)  # precision at each rank.
    perf_vect_rec = np.zeros(max_rank)   # recall at each rank.
    perf_vect_hr = np.zeros(max_rank)    # hit rate (modification of Xia Ning's paper).

    for user_idx in range(data_left.num_row):
        # held-out column indices for this user.
        lo_col = set(np.nonzero(lo_data_csr[user_idx, :])[1].tolist())

        # skip users with no testing items.
        if len(lo_col) == 0:
            continue

        # predict the entire row for this user.
        row_pred = method.predict_row(user_idx, pred_col)

        # rank the columns by predicted score (the result is a list of
        # column indices, best first).
        srt_col = [
            k[0] for k in sorted(
                enumerate(row_pred), key=lambda x: x[1], reverse=True)
        ]

        # columns already seen in training.
        tr_col = set(np.nonzero(tr_data_csr[user_idx, :])[1].tolist())

        # remove the training columns from the ranked list, leaving only the
        # indices predicted for unseen items.
        te_srt_col = [col_pos for col_pos in srt_col if col_pos not in tr_col]

        # ranks run from 0 to max_rank - 1.
        hit = 0  # number of hits accumulated up to the current rank.

        for rk in range(max_rank):
            # a hit occurs when the item at this rank is in the held-out set;
            # ranks beyond the end of te_srt_col contribute no new hits.
            if (rk < len(te_srt_col)) and (te_srt_col[rk] in lo_col):
                hit += 1

            perf_vect_hr[rk] += float(hit) / len(lo_col)   # hit rate.
            perf_vect_prec[rk] += float(hit) / (rk + 1)    # precision.
            perf_vect_rec[rk] += float(hit) / len(lo_col)  # recall.

    # normalization over users.
    perf_vect_hr = perf_vect_hr / data_left.num_row
    perf_vect_prec = perf_vect_prec / data_left.num_row
    perf_vect_rec = perf_vect_rec / data_left.num_row

    eval_result['hit_rate'] = perf_vect_hr
    eval_result['precision'] = perf_vect_prec
    eval_result['recall'] = perf_vect_rec
    eval_result['RMSE'] = rmse(
        data_left.data_val,
        method.predict(data_left.data_row, data_left.data_col))
    return eval_result
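A sketch of inspecting the per-rank vectors returned above, assuming exp_id, method, data_tr, and data_left are already wired to real objects; the cut-off values are arbitrary. Rank k lives at index k - 1, since the inner loop counts ranks from 0:

eval_result = experiment_unit_leave_k_out_map(exp_id, method, data_tr,
                                              data_left, 0, 50)
for k in (5, 10, 20):
    # rank-k metrics sit at position k - 1 of each vector.
    print('precision@%d = %.4f, recall@%d = %.4f, hit rate@%d = %.4f'
          % (k, eval_result['precision'][k - 1],
             k, eval_result['recall'][k - 1],
             k, eval_result['hit_rate'][k - 1]))
print('RMSE = %.4f' % eval_result['RMSE'])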