Esempio n. 1
0
def main(args):
    """Read a rating CSV, build rating/timestamp CSR matrices, split them
    over time, and persist the splits as NPZ files next to the input data."""
    progress = WorkSplitter()

    # The CSV has no header row; supply the column names explicitly.
    raw = pd.read_csv(args.path + args.name,
                      names=['user', 'item', 'rating', 'timestamp'])

    # Densify the raw user/item ids into contiguous 0-based indices.
    raw['userID'] = pd.factorize(raw.user)[0]
    raw['itemID'] = pd.factorize(raw.item)[0]

    progress.section("Load Raw Data")
    # Same frame, two value columns: one matrix for ratings, one for timestamps.
    rating_matrix = getSparseMatrix(raw, row_name='userID',
                                    col_name='itemID', value_name='rating')
    timestamp_matrix = getSparseMatrix(raw, row_name='userID',
                                       col_name='itemID', value_name='timestamp')

    progress.section("Split CSR Matrices")
    rtrain, rvalid, rtest, nonzero_index, rtime = time_ordered_split(
        rating_matrix=rating_matrix,
        timestamp_matrix=timestamp_matrix,
        ratio=args.ratio,
        implicit=args.implicit)

    progress.section("Save NPZ")
    for matrix, label in ((rtrain, "Rtrain"), (rvalid, "Rvalid"),
                          (rtest, "Rtest"), (rtime, "Rtime")):
        save_numpy(matrix, args.path, label)
    save_array(nonzero_index, args.path, "Index")
Esempio n. 2
0
def main(args):
    """Load the Yelp ratings, split them by time, and save the splits as NPZ.

    Fix: removed a leftover ``import ipdb; ipdb.set_trace()`` debugger
    breakpoint that halted the pipeline before the NPZ files were written.
    """
    progress = WorkSplitter()

    progress.section("Parameter Setting")
    print("Data Path: {}".format(args.data_dir))
    print("Implicit User Feedback: {}".format(args.implicit))

    progress.section("Load Raw Data")
    # Sampled load restricted to the most active users/items.
    rating_matrix, timestamp_matrix = get_yelp_df(
        args.data_dir + args.data_name,
        sampling=True,
        top_user_num=args.top_user_num,
        top_item_num=args.top_item_num)

    progress.section("Split CSR Matrices")
    rtrain, rvalid, rtest, nonzero_index, rtime = time_ordered_split(
        rating_matrix=rating_matrix,
        timestamp_matrix=timestamp_matrix,
        ratio=args.ratio,
        implicit=args.implicit)

    progress.section("Save NPZ")
    save_numpy(rtrain, args.data_dir, "Rtrain")
    save_numpy(rvalid, args.data_dir, "Rvalid")
    save_numpy(rtest, args.data_dir, "Rtest")
    save_numpy(rtime, args.data_dir, "Rtime")
    save_array(nonzero_index, args.data_dir, "Index")
Esempio n. 3
0
def main(args):
    """Build rating and timestamp matrices from one CSV, split them over
    time (with sub-sampling), and save the resulting splits as NPZ."""
    progress = WorkSplitter()

    progress.section("Load Raw Data")
    # The same CSV is parsed twice: once for the ratings (value_name=None)
    # and once for the timestamps.
    common = dict(row_name='userId', col_name='itemId',
                  path=args.path, name=args.name, shape=args.shape)
    rating_matrix = load_pandas(value_name=None, **common)
    timestamp_matrix = load_pandas(value_name='Timestamp', **common)

    progress.section("Split CSR Matrices")
    rtrain, rvalid, rtest, nonzero_index, rtime = time_ordered_split(
        rating_matrix=rating_matrix,
        timestamp_matrix=timestamp_matrix,
        ratio=args.ratio,
        implicit=args.implicit,
        sampling=True,
        percentage=0.2)

    progress.section("Save NPZ")
    for matrix, label in ((rtrain, "Rtrain"), (rvalid, "Rvalid"),
                          (rtest, "Rtest"), (rtime, "Rtime")):
        save_numpy(matrix, args.path, label)
    save_array(nonzero_index, args.path, "Index")
Esempio n. 4
0
def main(args):
    """Prepare the Yahoo R3 data sets.

    Loads the user-collected and uniformly-random rating files, optionally
    binarizes them into implicit feedback, splits the random set with a fixed
    seed, saves all four matrices as NPZ, and prints pos/neg statistics.

    Fix: the implicit-feedback conversion used chained indexing
    (``df['rating'].loc[mask] = x``), which triggers pandas'
    SettingWithCopyWarning and may silently fail to write back; it now uses a
    single ``df.loc[mask, 'rating'] = x`` assignment on the frame.
    """
    progress = WorkSplitter()

    progress.section("Yahoo R3: Load Raw Data")
    user_df = pd.read_csv(args.path + args.dataset + args.user,
                          sep=args.sep,
                          header=None,
                          names=args.names)
    random_df = pd.read_csv(args.path + args.dataset + args.random,
                            sep=args.sep,
                            header=None,
                            names=args.names)

    if args.implicit:
        # Convert graded ratings to implicit feedback: ratings below the
        # threshold become -1 (negative), the rest become 1 (positive).
        # Masks are computed before mutating so the two writes are disjoint.
        for df in (user_df, random_df):
            low = df['rating'] < args.threshold
            df.loc[low, 'rating'] = -1
            df.loc[~low, 'rating'] = 1

    progress.section("Yahoo R3: Randomly Split Random Set")
    # Matrix shape is taken from the largest ids seen in the user set.
    m, n = max(user_df['uid']) + 1, max(user_df['iid']) + 1
    unif_train, validation, test = seed_randomly_split(df=random_df,
                                                       ratio=args.ratio,
                                                       split_seed=args.seed,
                                                       shape=(m, n))

    progress.section("Yahoo R3: Save NPZ")
    save_dir = args.path + args.dataset
    train = sparse.csr_matrix(
        (user_df['rating'], (user_df['uid'], user_df['iid'])),
        shape=(m, n),
        dtype='float32')
    save_numpy(train, save_dir, "S_c")
    save_numpy(unif_train, save_dir, "S_t")
    save_numpy(validation, save_dir, "S_va")
    save_numpy(test, save_dir, "S_te")

    progress.section("Yahoo R3: Statistics of Data Sets")

    def _report(label, matrix):
        # Fraction of positives (== 1) among the stored nonzero entries.
        num = matrix.count_nonzero()
        pos = np.sum(matrix == 1) / num
        print('* %s #num: %6d, pos: %.6f, neg: %.6f' %
              (label, num, pos, 1 - pos))

    _report('S_c ', train)
    _report('S_t ', unif_train)
    _report('S_va', validation)
    _report('S_te', test)
Esempio n. 5
0
def main(args):
    """Split ratings first by user, then by time, and save every resulting
    matrix (train/valid/active/test plus the timestamp matrix) as NPZ."""
    progress = WorkSplitter()

    progress.section("Parameter Setting")
    print("Data Path: {}".format(args.path))
    print("Validation: {}".format(args.validation))
    print("Implicit: {}".format(args.implicit))

    progress.section("Load Raw Data")
    rating_matrix = load_pandas(path=args.path, name=args.name,
                                shape=args.shape)
    timestamp_matrix = load_pandas(path=args.path, value_name='timestamp',
                                   name=args.name, shape=args.shape)

    progress.section("Split CSR Matrices")
    # Step 1: carve the users randomly into train/valid/test populations.
    rtrain, rvalid, rtest, _, _, rtime = split_user_randomly(
        rating_matrix=rating_matrix,
        timestamp_matrix=timestamp_matrix,
        ratio=args.split_user_ratio,
        implicit=args.implicit)

    # Step 2 (optional): re-split the training users over time to obtain
    # a validation set.
    if args.validation:
        rtrain, rvalid, _, _, _ = time_ordered_split(
            rating_matrix=rtrain,
            timestamp_matrix=rtime,
            ratio=args.split_train_valid_ratio,
            implicit=False,
            remove_empty=False)

    # Step 3: split the held-out users over time into an "active" history
    # part and the final test part.
    ractive, rtest, _, _, _ = time_ordered_split(
        rating_matrix=rtest,
        timestamp_matrix=rtime,
        ratio=args.split_active_test_ratio,
        implicit=False,
        remove_empty=False)

    progress.section("Save NPZ")
    for matrix, label in ((rtrain, "Rtrain"), (rvalid, "Rvalid"),
                          (ractive, "Ractive"), (rtest, "Rtest"),
                          (rtime, "Rtime")):
        save_numpy(matrix, args.path, label)
def main(args):
    """Load the tab-separated Yahoo track ratings, split them with a fixed
    seed (rating threshold 80, 20% sub-sampling), and save the splits."""
    progress = WorkSplitter()

    progress.section("Load Raw Data")
    rating_matrix = load_pandas_without_names(
        path=args.path,
        name=args.name,
        row_name='userId',
        sep='\t',
        col_name='trackId',
        value_name='rating',
        shape=args.shape,
        names=['userId', 'trackId', 'rating'])

    progress.section("Split CSR Matrices")
    rtrain, rvalid, rtest, nonzero_index = split_seed_randomly(
        rating_matrix=rating_matrix,
        ratio=args.ratio,
        threshold=80,
        implicit=args.implicit,
        sampling=True,
        percentage=0.2)
    print("Done splitting Yahoo dataset")

    progress.section("Save NPZ")
    for matrix, label in ((rtrain, "Rtrain"), (rvalid, "Rvalid"),
                          (rtest, "Rtest")):
        save_numpy(matrix, args.path, label)
    save_array(nonzero_index, args.path, "Index")
    print("Done saving data for yahoo after splitting")
Esempio n. 7
0
def main(args):
    """Load the Netflix ratings, split them over time, and save the splits."""
    progress = WorkSplitter()

    progress.section("Load Raw Data")
    # NOTE(review): data is read from args.folder but saved under args.path —
    # presumably deliberate separate input/output directories; confirm with
    # the argument parser before relying on it.
    rating_matrix, timestamp_matrix = load_netflix(path=args.folder,
                                                   shape=args.shape)

    progress.section("Split CSR Matrices")
    rtrain, rvalid, rtest, nonzero_index, rtime = time_ordered_split(
        rating_matrix=rating_matrix,
        timestamp_matrix=timestamp_matrix,
        ratio=args.ratio,
        implicit=args.implicit)

    progress.section("Save NPZ")
    for matrix, label in ((rtrain, "Rtrain"), (rvalid, "Rvalid"),
                          (rtest, "Rtest"), (rtime, "Rtime")):
        save_numpy(matrix, args.path, label)
    save_array(nonzero_index, args.path, "Index")
Esempio n. 8
0
def main(args):
    """Load the Yahoo ratings, split them with a seeded random split, and
    save the train/valid/test matrices plus the nonzero index as NPZ.

    Fix: removed commented-out alternative loading/splitting code that
    cluttered the block without documenting anything.
    """
    progress = WorkSplitter()

    progress.section("Load Raw Data")
    rating_matrix = load_yahoo(path=args.path,
                               name=args.name,
                               shape=args.shape)

    progress.section("Split CSR Matrices")
    rtrain, rvalid, rtest, nonzero_index = split_seed_randomly(
        rating_matrix=rating_matrix, ratio=args.ratio, implicit=args.implicit)
    print("Done splitting Yahoo dataset")

    progress.section("Save NPZ")
    save_numpy(rtrain, args.path, "Rtrain")
    save_numpy(rvalid, args.path, "Rvalid")
    save_numpy(rtest, args.path, "Rtest")
    save_array(nonzero_index, args.path, "Index")
    print("Done saving data for yahoo after splitting")
def main(args):
    progress = WorkSplitter()

    progress.section("Parameter Setting")
    print("Data Path: {}".format(args.data_dir))
    reviewJsonToronto = args.data_dir + args.data_name

    progress.section("Load data")
    df = get_yelp_df(path='', filename=reviewJsonToronto, sampling=True)
    print('Data loaded sucessfully')

    progress.section("Matrix Generation")
    rating_matrix, timestamp_matrix, I_C_matrix, IC_dictionary = get_rating_timestamp_matrix(
        df)
    # get ratingWuserAvg_matrix
    rating_array = rating_matrix.toarray()
    user_average_array = rating_array.sum(axis=1) / np.count_nonzero(
        rating_array, axis=1)
    init_UI = np.zeros(rating_array.shape)
    init_UI[rating_array.nonzero()] = 1

    #Creating rating with user average array array
    for i in range(user_average_array.shape[0]):
        init_UI[i] = init_UI[i] * (user_average_array[i] - 0.001)
    user_average_array = init_UI
    ratingWuserAvg_array = rating_array - user_average_array
    ratingWuserAvg_matrix = sparse.csr_matrix(ratingWuserAvg_array)

    progress.section("Split for training")
    rtrain_implicit, rvalid_implicit, rtest_implicit, rtrain_userAvg_implicit, rvalid_userAvg_implicit, \
    rtest_userAvg_implicit, nonzero_index, rtime, item_idx_matrix_train_implicit,item_idx_matrix_valid_implicit, item_idx_matrix_test_implicit \
    = time_ordered_splitModified(rating_matrix=rating_matrix, ratingWuserAvg_matrix=ratingWuserAvg_matrix, timestamp_matrix=timestamp_matrix,
                                                                     ratio=[0.5,0.2,0.3],
                                                                     implicit=True,
                                                                     remove_empty=False, threshold=3,sampling=False,
                                                                     sampling_ratio=0.1, trainSampling=0.95)

    rtrain, rvalid, rtest, rtrain_userAvg, rvalid_userAvg, rtest_userAvg, nonzero_index, rtime, \
    item_idx_matrix_train,item_idx_matrix_valid, item_idx_matrix_test = time_ordered_splitModified(rating_matrix=rating_matrix,
                                                                     ratingWuserAvg_matrix=ratingWuserAvg_matrix, timestamp_matrix=timestamp_matrix,
                                                                     ratio=[0.5,0.2,0.3],
                                                                     implicit=False,
                                                                     remove_empty=False, threshold=3,
                                                                     sampling=False, sampling_ratio=0.1,
                                                                     trainSampling=0.95)

    rtrain = rtrain + rvalid + rtest
    rtrain_implicit = rtrain_implicit + rvalid_implicit + rtest_implicit

    progress.section("Get UC Matrix")
    #Get UC matrices
    U_C_matrix_explicit, U_C_matrix_implicit = get_UC_Matrix(
        I_C_matrix, rtrain_implicit)

    progress.section("Get IK Similarity")
    IK_MATRIX = ikGeneration(df)
    IK_similarity = train(IK_MATRIX)
    '''
    progress.section("Get IC Similarity")
    IC_similarity = train(I_C_matrix)
    '''

    progress.section("Get IP, IS, ID Dictionary")
    #intersection = get_intersection()
    intersection_yonge_and_finch, intersection_bloor_and_bathurst, intersection_spadina_and_dundas,\
    intersection_queen_and_spadina, intersection_bloor_and_yonge, intersection_dundas_and_yonge = get_intersection()
    IP_df, IP_dictionary = get_IP_matrix_dictionary(df, IK_similarity)
    IS_dictionary = get_IS_dictionary(df)
    #ID_dictionary = get_ID_dictionary(df,list(set(df['business_num_id'])),intersection)
    ID_dictionary_yonge_and_finch = get_ID_dictionary(
        df, list(set(df['business_num_id'])), intersection_yonge_and_finch)
    ID_dictionary_bloor_and_bathurst = get_ID_dictionary(
        df, list(set(df['business_num_id'])), intersection_bloor_and_bathurst)
    ID_dictionary_spadina_and_dundas = get_ID_dictionary(
        df, list(set(df['business_num_id'])), intersection_spadina_and_dundas)
    ID_dictionary_queen_and_spadina = get_ID_dictionary(
        df, list(set(df['business_num_id'])), intersection_queen_and_spadina)
    ID_dictionary_bloor_and_yonge = get_ID_dictionary(
        df, list(set(df['business_num_id'])), intersection_bloor_and_yonge)
    ID_dictionary_dundas_and_yonge = get_ID_dictionary(
        df, list(set(df['business_num_id'])), intersection_dundas_and_yonge)

    progress.section("user item predict")
    user_item_prediction_score = predict(rtrain,
                                         110,
                                         IK_similarity,
                                         item_similarity_en=True)
    UI_Prediction_Matrix = prediction(user_item_prediction_score, rtrain)

    progress.section("Save datafiles csv")
    save_dataframe_csv(df, args.data_dir, "Dataframe")

    progress.section("Save datafiles JSON")
    saveDictToJson(IC_dictionary,
                   args.data_dir,
                   'icDictionary',
                   trainOrTest='train')
    saveDictToJson(IP_dictionary,
                   args.data_dir,
                   'ipDictionary',
                   trainOrTest='train')
    saveDictToJson(IS_dictionary,
                   args.data_dir,
                   'isDictionary',
                   trainOrTest='train')
    #saveDictToJson(ID_dictionary, args.data_dir, 'idDictionary', trainOrTest='train')
    saveDictToJson(ID_dictionary_yonge_and_finch,
                   args.data_dir,
                   'idDictionary_yongefinch',
                   trainOrTest='train')
    saveDictToJson(ID_dictionary_bloor_and_bathurst,
                   args.data_dir,
                   'idDictionary_bloorbathurst',
                   trainOrTest='train')
    saveDictToJson(ID_dictionary_spadina_and_dundas,
                   args.data_dir,
                   'idDictionary_spadinadundas',
                   trainOrTest='train')
    saveDictToJson(ID_dictionary_queen_and_spadina,
                   args.data_dir,
                   'idDictionary_queenspadina',
                   trainOrTest='train')
    saveDictToJson(ID_dictionary_bloor_and_yonge,
                   args.data_dir,
                   'idDictionary_blooryonge',
                   trainOrTest='train')
    saveDictToJson(ID_dictionary_dundas_and_yonge,
                   args.data_dir,
                   'idDictionary_dundasyonge',
                   trainOrTest='train')

    progress.section("Save datafiles Numpy")
    save_numpy_csr(rtrain, args.data_dir, "rtrain")
    save_numpy_csr(I_C_matrix, args.data_dir, "icmatrix")
    #save_numpy(user_item_prediction_score, args.data_dir, "predictionScore")
    save_numpy(IK_similarity, args.data_dir,
               "IKbased_II_similarity")  #Tina requested for this name
    save_numpy(UI_Prediction_Matrix, args.data_dir, "UI_prediction_matrix")
    '''