def main(args):
    """Turn a raw ratings CSV into split matrices and write them to disk.

    The CSV at ``args.path + args.name`` is read with the column names
    ``user``, ``item``, ``rating`` and ``timestamp`` supplied explicitly.
    Users and items are re-encoded with ``pd.factorize`` into the
    ``userID`` / ``itemID`` columns, which index the rows and columns of the
    rating and timestamp matrices built by ``getSparseMatrix``.

    Args:
        args: Object exposing ``path``, ``name``, ``ratio`` and ``implicit``.
            ``path`` is used both as the prefix of the input file name
            (plain string concatenation) and as the output location.
    """
    progress = WorkSplitter()

    frame = pd.read_csv(
        args.path + args.name,
        names=['user', 'item', 'rating', 'timestamp'],
    )
    # factorize returns (codes, uniques); only the integer codes are kept.
    user_codes, _ = pd.factorize(frame.user)
    frame['userID'] = user_codes
    item_codes, _ = pd.factorize(frame.item)
    frame['itemID'] = item_codes

    # NOTE(review): this banner is emitted after the CSV has been read.
    progress.section("Load Raw Data")
    rating_matrix = getSparseMatrix(
        frame, row_name='userID', col_name='itemID', value_name='rating')
    timestamp_matrix = getSparseMatrix(
        frame, row_name='userID', col_name='itemID', value_name='timestamp')

    progress.section("Split CSR Matrices")
    split = time_ordered_split(
        rating_matrix=rating_matrix,
        timestamp_matrix=timestamp_matrix,
        ratio=args.ratio,
        implicit=args.implicit,
    )
    rtrain, rvalid, rtest, nonzero_index, rtime = split

    progress.section("Save NPZ")
    outputs = (
        (rtrain, "Rtrain"),
        (rvalid, "Rvalid"),
        (rtest, "Rtest"),
        (rtime, "Rtime"),
    )
    for matrix, label in outputs:
        save_numpy(matrix, args.path, label)
    save_array(nonzero_index, args.path, "Index")
def main(args):
    """Load rating/timestamp matrices with ``load_pandas``, split, and save.

    Both matrices are loaded from the same file (``args.path``/``args.name``)
    using ``userId`` rows and ``itemId`` columns; the rating matrix passes
    ``value_name=None`` and the timestamp matrix uses the ``Timestamp``
    column. The split is requested with ``sampling=True`` and
    ``percentage=0.2``.

    Args:
        args: Object exposing ``path``, ``name``, ``shape``, ``ratio`` and
            ``implicit``. Results are saved under ``args.path``.
    """
    progress = WorkSplitter()

    progress.section("Load Raw Data")
    # Arguments common to both loads; only ``value_name`` differs.
    load_kwargs = dict(
        row_name='userId',
        col_name='itemId',
        path=args.path,
        name=args.name,
        shape=args.shape,
    )
    rating_matrix = load_pandas(value_name=None, **load_kwargs)
    timestamp_matrix = load_pandas(value_name='Timestamp', **load_kwargs)

    progress.section("Split CSR Matrices")
    split = time_ordered_split(
        rating_matrix=rating_matrix,
        timestamp_matrix=timestamp_matrix,
        ratio=args.ratio,
        implicit=args.implicit,
        sampling=True,
        percentage=0.2,
    )
    rtrain, rvalid, rtest, nonzero_index, rtime = split

    progress.section("Save NPZ")
    outputs = (
        (rtrain, "Rtrain"),
        (rvalid, "Rvalid"),
        (rtest, "Rtest"),
        (rtime, "Rtime"),
    )
    for matrix, label in outputs:
        save_numpy(matrix, args.path, label)
    save_array(nonzero_index, args.path, "Index")
def main(args):
    """Load the Yelp data, split it in time order, and save the results.

    Prints the data directory and the implicit-feedback flag, obtains the
    rating and timestamp matrices from ``get_yelp_df`` (called with
    ``sampling=True`` and the configured user/item limits), passes them to
    ``time_ordered_split`` and writes the five returned objects under
    ``args.data_dir``.

    Args:
        args: Object exposing ``data_dir``, ``data_name``, ``implicit``,
            ``top_user_num``, ``top_item_num`` and ``ratio``. The input file
            is ``args.data_dir + args.data_name`` (plain string
            concatenation).
    """
    progress = WorkSplitter()

    progress.section("Parameter Setting")
    print("Data Path: {}".format(args.data_dir))
    print("Implicit User Feedback: {}".format(args.implicit))

    progress.section("Load Raw Data")
    rating_matrix, timestamp_matrix = get_yelp_df(
        args.data_dir + args.data_name,
        sampling=True,
        top_user_num=args.top_user_num,
        top_item_num=args.top_item_num,
    )

    progress.section("Split CSR Matrices")
    rtrain, rvalid, rtest, nonzero_index, rtime = time_ordered_split(
        rating_matrix=rating_matrix,
        timestamp_matrix=timestamp_matrix,
        ratio=args.ratio,
        implicit=args.implicit,
    )

    # The script runs unattended from split to save; no interactive
    # breakpoint belongs between these two steps.
    progress.section("Save NPZ")
    save_numpy(rtrain, args.data_dir, "Rtrain")
    save_numpy(rvalid, args.data_dir, "Rvalid")
    save_numpy(rtest, args.data_dir, "Rtest")
    save_numpy(rtime, args.data_dir, "Rtime")
    save_array(nonzero_index, args.data_dir, "Index")
def main(args):
    """Load a tab-separated rating file, split it randomly, and save.

    The rating matrix comes from ``load_pandas_without_names`` using the
    columns ``userId``, ``trackId`` and ``rating``. It is split by
    ``split_seed_randomly`` with ``threshold=80``, ``sampling=True`` and
    ``percentage=0.2``; the four returned objects are written under
    ``args.path``.

    Args:
        args: Object exposing ``path``, ``name``, ``shape``, ``ratio`` and
            ``implicit``.
    """
    progress = WorkSplitter()

    progress.section("Load Raw Data")
    column_names = ['userId', 'trackId', 'rating']
    rating_matrix = load_pandas_without_names(
        path=args.path,
        name=args.name,
        row_name='userId',
        sep='\t',
        col_name='trackId',
        value_name='rating',
        shape=args.shape,
        names=column_names,
    )

    progress.section("Split CSR Matrices")
    split = split_seed_randomly(
        rating_matrix=rating_matrix,
        ratio=args.ratio,
        threshold=80,
        implicit=args.implicit,
        sampling=True,
        percentage=0.2,
    )
    rtrain, rvalid, rtest, nonzero_index = split
    print("Done splitting Yahoo dataset")

    progress.section("Save NPZ")
    for matrix, label in ((rtrain, "Rtrain"), (rvalid, "Rvalid"), (rtest, "Rtest")):
        save_numpy(matrix, args.path, label)
    save_array(nonzero_index, args.path, "Index")
    print("Done saving data for yahoo after splitting")
def main(args):
    """Load the Netflix matrices, split them in time order, and save.

    ``load_netflix`` reads from ``args.folder`` and returns the rating and
    timestamp matrices; the five objects returned by ``time_ordered_split``
    are written under ``args.path`` (note: input and output locations are
    separate attributes).

    Args:
        args: Object exposing ``folder``, ``shape``, ``ratio``, ``implicit``
            and ``path``.
    """
    progress = WorkSplitter()

    progress.section("Load Raw Data")
    loaded = load_netflix(path=args.folder, shape=args.shape)
    rating_matrix, timestamp_matrix = loaded

    progress.section("Split CSR Matrices")
    split = time_ordered_split(
        rating_matrix=rating_matrix,
        timestamp_matrix=timestamp_matrix,
        ratio=args.ratio,
        implicit=args.implicit,
    )
    rtrain, rvalid, rtest, nonzero_index, rtime = split

    progress.section("Save NPZ")
    outputs = (
        (rtrain, "Rtrain"),
        (rvalid, "Rvalid"),
        (rtest, "Rtest"),
        (rtime, "Rtime"),
    )
    for matrix, label in outputs:
        save_numpy(matrix, args.path, label)
    save_array(nonzero_index, args.path, "Index")
def main(args):
    """Load the Yahoo rating matrix, split it randomly, and save.

    The matrix returned by ``load_yahoo`` is passed to
    ``split_seed_randomly``; the four returned objects are written under
    ``args.path``. No timestamp matrix is loaded or saved here.

    Args:
        args: Object exposing ``path``, ``name``, ``shape``, ``ratio`` and
            ``implicit``.
    """
    progress = WorkSplitter()

    progress.section("Load Raw Data")
    rating_matrix = load_yahoo(
        path=args.path,
        name=args.name,
        shape=args.shape,
    )

    progress.section("Split CSR Matrices")
    split = split_seed_randomly(
        rating_matrix=rating_matrix,
        ratio=args.ratio,
        implicit=args.implicit,
    )
    rtrain, rvalid, rtest, nonzero_index = split
    print("Done splitting Yahoo dataset")

    progress.section("Save NPZ")
    for matrix, label in ((rtrain, "Rtrain"), (rvalid, "Rvalid"), (rtest, "Rtest")):
        save_numpy(matrix, args.path, label)
    save_array(nonzero_index, args.path, "Index")
    print("Done saving data for yahoo after splitting")