Esempio n. 1
0
def main(args):
    """Build and save time-ordered splits from a raw ratings CSV.

    The CSV at ``args.path + args.name`` is read with the column names
    ``user``, ``item``, ``rating`` and ``timestamp``.  User and item values
    are factorized into integer codes (``userID`` / ``itemID``), which are
    the row and column names handed to ``getSparseMatrix``.  The rating and
    timestamp matrices are then split by ``time_ordered_split`` and every
    returned piece is written under ``args.path``.

    Args:
        args: Namespace-like object providing ``path``, ``name``, ``ratio``
            and ``implicit``.
    """
    splitter = WorkSplitter()

    frame = pd.read_csv(args.path + args.name,
                        names=['user', 'item', 'rating', 'timestamp'])

    # pd.factorize returns (codes, uniques); only the integer codes are kept.
    for source, target in (('user', 'userID'), ('item', 'itemID')):
        frame[target] = pd.factorize(frame[source])[0]

    splitter.section("Load Raw Data")
    # Same row/column layout for both matrices; only the value column differs.
    ratings, timestamps = [
        getSparseMatrix(frame,
                        row_name='userID',
                        col_name='itemID',
                        value_name=value_column)
        for value_column in ('rating', 'timestamp')
    ]

    splitter.section("Split CSR Matrices")
    pieces = time_ordered_split(rating_matrix=ratings,
                                timestamp_matrix=timestamps,
                                ratio=args.ratio,
                                implicit=args.implicit)
    rtrain, rvalid, rtest, nonzero_index, rtime = pieces

    splitter.section("Save NPZ")
    outputs = (
        (rtrain, "Rtrain"),
        (rvalid, "Rvalid"),
        (rtest, "Rtest"),
        (rtime, "Rtime"),
    )
    for matrix, label in outputs:
        save_numpy(matrix, args.path, label)
    save_array(nonzero_index, args.path, "Index")
Esempio n. 2
0
def main(args):
    """Load rating/timestamp matrices via ``load_pandas`` and save a sampled split.

    Both matrices are loaded from the same ``args.path`` / ``args.name``
    source with ``userId`` rows and ``itemId`` columns; the rating matrix is
    requested with ``value_name=None`` and the timestamp matrix with
    ``value_name='Timestamp'``.  ``time_ordered_split`` is called with
    ``sampling=True`` and ``percentage=0.2``, and its outputs are written
    under ``args.path``.

    Args:
        args: Namespace-like object providing ``path``, ``name``, ``shape``,
            ``ratio`` and ``implicit``.
    """
    splitter = WorkSplitter()

    def load_matrix(value_name):
        # The two loads differ only in which value column is requested.
        return load_pandas(row_name='userId',
                           col_name='itemId',
                           value_name=value_name,
                           path=args.path,
                           name=args.name,
                           shape=args.shape)

    splitter.section("Load Raw Data")
    ratings = load_matrix(None)
    timestamps = load_matrix('Timestamp')

    splitter.section("Split CSR Matrices")
    pieces = time_ordered_split(rating_matrix=ratings,
                                timestamp_matrix=timestamps,
                                ratio=args.ratio,
                                implicit=args.implicit,
                                sampling=True,
                                percentage=0.2)
    rtrain, rvalid, rtest, nonzero_index, rtime = pieces

    splitter.section("Save NPZ")
    outputs = (
        (rtrain, "Rtrain"),
        (rvalid, "Rvalid"),
        (rtest, "Rtest"),
        (rtime, "Rtime"),
    )
    for matrix, label in outputs:
        save_numpy(matrix, args.path, label)
    save_array(nonzero_index, args.path, "Index")
Esempio n. 3
0
def main(args):
    """Load the Yelp data, split it in time order and save the pieces.

    Prints the data directory and the implicit-feedback flag, obtains the
    rating and timestamp matrices from ``get_yelp_df`` (called with
    ``sampling=True`` and the ``top_user_num`` / ``top_item_num`` limits
    from ``args``), splits them with ``time_ordered_split`` and writes every
    returned piece under ``args.data_dir``.

    Args:
        args: Namespace-like object providing ``data_dir``, ``data_name``,
            ``implicit``, ``top_user_num``, ``top_item_num`` and ``ratio``.
    """
    progress = WorkSplitter()

    progress.section("Parameter Setting")
    print("Data Path: {}".format(args.data_dir))
    print("Implicit User Feedback: {}".format(args.implicit))

    progress.section("Load Raw Data")
    rating_matrix, timestamp_matrix = get_yelp_df(
        args.data_dir + args.data_name,
        sampling=True,
        top_user_num=args.top_user_num,
        top_item_num=args.top_item_num)

    progress.section("Split CSR Matrices")
    rtrain, rvalid, rtest, nonzero_index, rtime = time_ordered_split(
        rating_matrix=rating_matrix,
        timestamp_matrix=timestamp_matrix,
        ratio=args.ratio,
        implicit=args.implicit)

    # NOTE: a leftover `ipdb.set_trace()` breakpoint used to sit here; it
    # stopped every run before anything was saved and required the optional
    # `ipdb` package, so it has been removed.

    progress.section("Save NPZ")
    save_numpy(rtrain, args.data_dir, "Rtrain")
    save_numpy(rvalid, args.data_dir, "Rvalid")
    save_numpy(rtest, args.data_dir, "Rtest")
    save_numpy(rtime, args.data_dir, "Rtime")
    save_array(nonzero_index, args.data_dir, "Index")
def main(args):
    """Load a tab-separated rating file and save a seeded random split.

    The rating matrix is loaded by ``load_pandas_without_names`` using the
    columns ``userId``, ``trackId`` and ``rating``.  It is split by
    ``split_seed_randomly`` with ``threshold=80``, ``sampling=True`` and
    ``percentage=0.2``; the three resulting matrices and the index array are
    written under ``args.path``.

    Args:
        args: Namespace-like object providing ``path``, ``name``, ``shape``,
            ``ratio`` and ``implicit``.
    """
    splitter = WorkSplitter()

    splitter.section("Load Raw Data")
    column_names = ['userId', 'trackId', 'rating']
    ratings = load_pandas_without_names(path=args.path,
                                        name=args.name,
                                        row_name='userId',
                                        sep='\t',
                                        col_name='trackId',
                                        value_name='rating',
                                        shape=args.shape,
                                        names=column_names)

    splitter.section("Split CSR Matrices")
    pieces = split_seed_randomly(rating_matrix=ratings,
                                 ratio=args.ratio,
                                 threshold=80,
                                 implicit=args.implicit,
                                 sampling=True,
                                 percentage=0.2)
    rtrain, rvalid, rtest, nonzero_index = pieces
    print("Done splitting Yahoo dataset")

    splitter.section("Save NPZ")
    for matrix, label in ((rtrain, "Rtrain"),
                          (rvalid, "Rvalid"),
                          (rtest, "Rtest")):
        save_numpy(matrix, args.path, label)
    save_array(nonzero_index, args.path, "Index")
    print("Done saving data for yahoo after splitting")
Esempio n. 5
0
def main(args):
    """Load the Netflix matrices, split them in time order and save the pieces.

    Note that the input is read from ``args.folder`` while all outputs are
    written under ``args.path``.

    Args:
        args: Namespace-like object providing ``folder``, ``shape``,
            ``ratio``, ``implicit`` and ``path``.
    """
    splitter = WorkSplitter()

    splitter.section("Load Raw Data")
    loaded = load_netflix(path=args.folder, shape=args.shape)
    ratings, timestamps = loaded

    splitter.section("Split CSR Matrices")
    pieces = time_ordered_split(
        rating_matrix=ratings,
        timestamp_matrix=timestamps,
        ratio=args.ratio,
        implicit=args.implicit,
    )
    rtrain, rvalid, rtest, nonzero_index, rtime = pieces

    splitter.section("Save NPZ")
    outputs = (
        (rtrain, "Rtrain"),
        (rvalid, "Rvalid"),
        (rtest, "Rtest"),
        (rtime, "Rtime"),
    )
    for matrix, label in outputs:
        save_numpy(matrix, args.path, label)
    save_array(nonzero_index, args.path, "Index")
Esempio n. 6
0
def main(args):
    """Load the Yahoo rating matrix and save a seeded random split.

    Only a rating matrix is loaded (via ``load_yahoo``); no timestamp matrix
    is involved, and the split is produced by ``split_seed_randomly``.  The
    three resulting matrices and the index array are written under
    ``args.path``.

    Args:
        args: Namespace-like object providing ``path``, ``name``, ``shape``,
            ``ratio`` and ``implicit``.
    """
    splitter = WorkSplitter()

    splitter.section("Load Raw Data")
    ratings = load_yahoo(path=args.path, name=args.name, shape=args.shape)

    splitter.section("Split CSR Matrices")
    pieces = split_seed_randomly(rating_matrix=ratings,
                                 ratio=args.ratio,
                                 implicit=args.implicit)
    rtrain, rvalid, rtest, nonzero_index = pieces
    print("Done splitting Yahoo dataset")

    splitter.section("Save NPZ")
    for matrix, label in ((rtrain, "Rtrain"),
                          (rvalid, "Rvalid"),
                          (rtest, "Rtest")):
        save_numpy(matrix, args.path, label)
    save_array(nonzero_index, args.path, "Index")
    print("Done saving data for yahoo after splitting")