Esempio n. 1
0
def main(args):
    """Build rating/timestamp matrices from a ratings CSV and save the splits.

    The CSV at ``args.path + args.name`` is read with the column names
    ``user``, ``item``, ``rating`` and ``timestamp`` supplied explicitly.
    Users and items are re-coded with ``pd.factorize`` into ``userID`` /
    ``itemID`` columns, which index the two matrices produced by
    ``getSparseMatrix``. Both matrices are handed to ``time_ordered_split``
    and every returned piece is written under ``args.path``.

    Args:
        args: Object exposing ``path``, ``name``, ``ratio`` and ``implicit``
            (presumably an ``argparse.Namespace`` -- confirm with the caller).
    """
    progress = WorkSplitter()

    csv_location = args.path + args.name
    frame = pd.read_csv(csv_location,
                        names=['user', 'item', 'rating', 'timestamp'])

    # Swap the raw identifiers for the integer codes returned by factorize.
    user_codes, _ = pd.factorize(frame.user)
    frame['userID'] = user_codes
    item_codes, _ = pd.factorize(frame.item)
    frame['itemID'] = item_codes

    # NOTE: this banner is emitted after the CSV has already been read.
    progress.section("Load Raw Data")
    axes = {'row_name': 'userID', 'col_name': 'itemID'}
    rating_matrix = getSparseMatrix(frame, value_name='rating', **axes)
    timestamp_matrix = getSparseMatrix(frame, value_name='timestamp', **axes)

    progress.section("Split CSR Matrices")
    split_result = time_ordered_split(rating_matrix=rating_matrix,
                                      timestamp_matrix=timestamp_matrix,
                                      ratio=args.ratio,
                                      implicit=args.implicit)
    rtrain, rvalid, rtest, nonzero_index, rtime = split_result

    progress.section("Save NPZ")
    matrices_to_save = (
        (rtrain, "Rtrain"),
        (rvalid, "Rvalid"),
        (rtest, "Rtest"),
        (rtime, "Rtime"),
    )
    for matrix, label in matrices_to_save:
        save_numpy(matrix, args.path, label)
    save_array(nonzero_index, args.path, "Index")
Esempio n. 2
0
def main(args):
    """Load rating and timestamp matrices, split them, and save the parts.

    ``load_pandas`` is called twice with the same ``userId`` / ``itemId``
    row/column names, path, file name and shape: once with
    ``value_name=None`` for the rating matrix and once with
    ``value_name='Timestamp'`` for the timestamp matrix. The pair is passed
    to ``time_ordered_split`` with ``sampling=True`` and ``percentage=0.2``
    fixed in code, and each returned piece is written under ``args.path``.

    Args:
        args: Object exposing ``path``, ``name``, ``shape``, ``ratio`` and
            ``implicit`` (presumably an ``argparse.Namespace`` -- confirm
            with the caller).
    """
    progress = WorkSplitter()

    def _load(value_column):
        # Both matrices share every loader argument except the value column.
        return load_pandas(row_name='userId',
                           col_name='itemId',
                           value_name=value_column,
                           path=args.path,
                           name=args.name,
                           shape=args.shape)

    progress.section("Load Raw Data")
    rating_matrix = _load(None)
    timestamp_matrix = _load('Timestamp')

    progress.section("Split CSR Matrices")
    split_result = time_ordered_split(rating_matrix=rating_matrix,
                                      timestamp_matrix=timestamp_matrix,
                                      ratio=args.ratio,
                                      implicit=args.implicit,
                                      sampling=True,
                                      percentage=0.2)
    rtrain, rvalid, rtest, nonzero_index, rtime = split_result

    progress.section("Save NPZ")
    matrices_to_save = (
        (rtrain, "Rtrain"),
        (rvalid, "Rvalid"),
        (rtest, "Rtest"),
        (rtime, "Rtime"),
    )
    for matrix, label in matrices_to_save:
        save_numpy(matrix, args.path, label)
    save_array(nonzero_index, args.path, "Index")
Esempio n. 3
0
def main(args):
    """Split users first, then split their interactions, and save the parts.

    Steps, in order:
      1. Print the data path, validation flag and implicit flag.
      2. Load the rating matrix (default ``load_pandas`` value column) and
         the timestamp matrix (``value_name='timestamp'``).
      3. Call ``split_user_randomly`` to obtain train / valid / test
         matrices plus a timestamp matrix.
      4. When ``args.validation`` is truthy, replace train / valid with the
         first two outputs of ``time_ordered_split`` applied to train.
      5. Apply ``time_ordered_split`` to test, yielding ``ractive`` and the
         final ``rtest``.
      6. Save Rtrain, Rvalid, Ractive, Rtest and Rtime under ``args.path``.

    Args:
        args: Object exposing ``path``, ``name``, ``shape``, ``validation``,
            ``implicit``, ``split_user_ratio``, ``split_train_valid_ratio``
            and ``split_active_test_ratio`` (presumably an
            ``argparse.Namespace`` -- confirm with the caller).
    """
    progress = WorkSplitter()

    progress.section("Parameter Setting")
    # Attributes are read one at a time, immediately before each print.
    for template, attribute in (("Data Path: {}", "path"),
                                ("Validation: {}", "validation"),
                                ("Implicit: {}", "implicit")):
        print(template.format(getattr(args, attribute)))

    progress.section("Load Raw Data")
    rating_matrix = load_pandas(path=args.path,
                                name=args.name,
                                shape=args.shape)
    timestamp_matrix = load_pandas(path=args.path,
                                   value_name='timestamp',
                                   name=args.name,
                                   shape=args.shape)

    progress.section("Split CSR Matrices")
    user_split = split_user_randomly(rating_matrix=rating_matrix,
                                     timestamp_matrix=timestamp_matrix,
                                     ratio=args.split_user_ratio,
                                     implicit=args.implicit)
    rtrain, rvalid, rtest, _, _, rtime = user_split

    def _split_by_time(matrix, ratio):
        # Keep only the first two of the five values the splitter returns.
        first, second, _, _, _ = time_ordered_split(rating_matrix=matrix,
                                                    timestamp_matrix=rtime,
                                                    ratio=ratio,
                                                    implicit=False,
                                                    remove_empty=False)
        return first, second

    if args.validation:
        rtrain, rvalid = _split_by_time(rtrain, args.split_train_valid_ratio)

    ractive, rtest = _split_by_time(rtest, args.split_active_test_ratio)

    progress.section("Save NPZ")
    matrices_to_save = (
        (rtrain, "Rtrain"),
        (rvalid, "Rvalid"),
        (ractive, "Ractive"),
        (rtest, "Rtest"),
        (rtime, "Rtime"),
    )
    for matrix, label in matrices_to_save:
        save_numpy(matrix, args.path, label)
Esempio n. 4
0
def main(args):
    """Load the Netflix matrices, split them, and save the parts.

    ``load_netflix`` reads from ``args.folder`` and returns the rating and
    timestamp matrices. They are passed to ``time_ordered_split`` and each
    returned piece is written under ``args.path`` (note: the output location
    is ``args.path``, not the ``args.folder`` used for input).

    Args:
        args: Object exposing ``folder``, ``shape``, ``ratio``, ``implicit``
            and ``path`` (presumably an ``argparse.Namespace`` -- confirm
            with the caller).
    """
    progress = WorkSplitter()

    progress.section("Load Raw Data")
    loaded = load_netflix(path=args.folder, shape=args.shape)
    rating_matrix, timestamp_matrix = loaded

    progress.section("Split CSR Matrices")
    split_result = time_ordered_split(
        rating_matrix=rating_matrix,
        timestamp_matrix=timestamp_matrix,
        ratio=args.ratio,
        implicit=args.implicit,
    )
    rtrain, rvalid, rtest, nonzero_index, rtime = split_result

    progress.section("Save NPZ")
    matrices_to_save = (
        (rtrain, "Rtrain"),
        (rvalid, "Rvalid"),
        (rtest, "Rtest"),
        (rtime, "Rtime"),
    )
    for matrix, label in matrices_to_save:
        save_numpy(matrix, args.path, label)
    save_array(nonzero_index, args.path, "Index")