def get_size(data_dir):
    if not exists(os.path.join(data_dir, "train_parquet")) or \
            not exists(os.path.join(data_dir, "test_parquet")):
        raise Exception("Train and test parquet data not found under " + data_dir)
    else:
        train_tbl = FeatureTable.read_parquet(os.path.join(data_dir, "train_parquet"))
        test_tbl = FeatureTable.read_parquet(os.path.join(data_dir, "test_parquet"))

    # get cat sizes: download the pickled category counts to a local temp dir and load them
    with tempfile.TemporaryDirectory() as local_path:
        get_remote_file_to_local(os.path.join(data_dir, "meta/categorical_sizes.pkl"),
                                 os.path.join(local_path, "categorical_sizes.pkl"))
        with open(os.path.join(local_path, "categorical_sizes.pkl"), 'rb') as f:
            cat_sizes_dic = pickle.load(f)

    # indicator_cols, embedding_cols and cross_cols are column lists defined elsewhere in this script
    indicator_sizes = [cat_sizes_dic[c] for c in indicator_cols]
    print("indicator sizes: ", indicator_sizes)
    embedding_sizes = [cat_sizes_dic[c] for c in embedding_cols]
    print("embedding sizes: ", embedding_sizes)
    cross_sizes = [cat_sizes_dic[c] for c in cross_cols]
    return train_tbl, test_tbl, indicator_sizes, embedding_sizes, cross_sizes
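
# --- Illustrative sketch, not part of the original script ---
# The size lists returned by get_size are typically used as embedding input dimensions.
# A minimal Keras example, assuming TensorFlow is available; the embedding dimension
# of 8 is an arbitrary illustrative choice.
import tensorflow as tf

def build_embedding_layers(embedding_sizes, embed_dim=8):
    # one Embedding layer per categorical column; input_dim must cover every encoded id
    return [tf.keras.layers.Embedding(input_dim=size, output_dim=embed_dim)
            for size in embedding_sizes]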
def load_dien_data(data_dir):
    tbl = FeatureTable.read_parquet(data_dir + "/data")
    # keep only the most recent interaction per user
    windowSpec1 = Window.partitionBy("user").orderBy(desc("time"))
    tbl = tbl.append_column("rank1", rank().over(windowSpec1))
    tbl = tbl.filter(col('rank1') == 1)
    train_data, test_data = tbl.split([0.8, 0.2], seed=1)
    usertbl = FeatureTable.read_parquet(data_dir + "/user_index/*")
    itemtbl = FeatureTable.read_parquet(data_dir + "/item_index/*")
    cattbl = FeatureTable.read_parquet(data_dir + "/category_index/*")
    n_uid = usertbl.get_stats("id", "max")["id"] + 1
    n_mid = itemtbl.get_stats("id", "max")["id"] + 1
    n_cat = cattbl.get_stats("id", "max")["id"] + 1
    train_data.show()
    print("train size: ", train_data.size())
    print("test size: ", test_data.size())
    print("user size: ", n_uid)
    print("item size: ", n_mid)
    return train_data, test_data, n_uid, n_mid, n_cat
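
# --- Illustrative sketch, not part of the original script ---
# The rank()-over-window filter in load_dien_data keeps only each user's most recent row.
# The same pattern on a plain Spark DataFrame, using the "user" and "time" column names
# from above (df is a hypothetical DataFrame):
from pyspark.sql import Window
from pyspark.sql.functions import col, desc, rank

def latest_row_per_user(df):
    w = Window.partitionBy("user").orderBy(desc("time"))
    return (df.withColumn("rank1", rank().over(w))
              .filter(col("rank1") == 1)
              .drop("rank1"))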
                          conf=conf)
    elif args.cluster_mode == "yarn":
        init_orca_context("yarn-client", cores=args.executor_cores,
                          num_nodes=args.num_executor, memory=args.executor_memory,
                          driver_cores=args.driver_cores, driver_memory=args.driver_memory,
                          conf=conf)

    start = time()
    train_paths = [os.path.join(args.input_train_folder, 'part-%05d.parquet' % i)
                   for i in args.train_files]
    train_tbl = FeatureTable.read_parquet(train_paths)
    train_tbl.df.printSchema()
    test_tbl = FeatureTable.read_parquet(args.input_test_folder)

    train_tbl = preprocess(train_tbl)
    test_tbl = preprocess(test_tbl)

    train_tbl, language_idx = train_tbl.category_encode("language")
    test_tbl = test_tbl.encode_string("language", language_idx)

    user_index = train_tbl.gen_string_idx({'src_cols': ['engaged_with_user_id', 'enaging_user_id'],
                                           'col_name': 'user_id'})
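
    # --- Illustrative follow-up, an assumption rather than part of the original script ---
    # The shared user_id index built above can be persisted so the same mapping is reused
    # when encoding other splits or at serving time; args.output_folder is a hypothetical
    # argument, and writing index tables to parquet mirrors what the other scripts here do.
    user_index.write_parquet(os.path.join(args.output_folder, "user_index"))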
                               driver_cores=args.driver_cores, driver_memory=args.driver_memory,
                               init_ray_on_spark=True)
    elif args.cluster_mode == "yarn":
        sc = init_orca_context("yarn-client", cores=args.executor_cores,
                               num_nodes=args.num_executor, memory=args.executor_memory,
                               driver_cores=args.driver_cores, driver_memory=args.driver_memory,
                               object_store_memory="10g", init_ray_on_spark=True)
    elif args.cluster_mode == "spark-submit":
        sc = init_orca_context("spark-submit")

    movielens_data = movielens.get_id_ratings("/tmp/movielens/")
    pddf = pd.DataFrame(movielens_data, columns=["user", "item", "label"])
    num_users, num_items = pddf["user"].max() + 1, pddf["item"].max() + 1

    # shift the 1-5 ratings to 0-4 labels for sparse_categorical_crossentropy
    full = FeatureTable.from_pandas(pddf)\
        .apply("label", "label", lambda x: x - 1, 'int')
    train, test = full.random_split([0.8, 0.2], seed=1)

    config = {"lr": 1e-3, "inter_op_parallelism": 4, "intra_op_parallelism": args.executor_cores}

    def model_creator(config):
        model = build_model(num_users, num_items, 5)
        print(model.summary())
        optimizer = tf.keras.optimizers.Adam(config["lr"])
        model.compile(optimizer=optimizer,
                      loss='sparse_categorical_crossentropy',
                      metrics=['sparse_categorical_crossentropy', 'accuracy'])
        return model

    steps_per_epoch = math.ceil(train.size() / args.batch_size)
    val_steps = math.ceil(test.size() / args.batch_size)
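
    # --- Hedged sketch of the training step that typically follows; not copied from the
    # original script. The Estimator import path and fit arguments are based on BigDL
    # Orca's TF2 Estimator, and args.epochs is a hypothetical argument.
    from bigdl.orca.learn.tf2 import Estimator

    est = Estimator.from_keras(model_creator=model_creator, config=config, backend="ray")
    est.fit(data=train.df,
            epochs=args.epochs,
            batch_size=args.batch_size,
            feature_cols=["user", "item"],
            label_cols=["label"],
            steps_per_epoch=steps_per_epoch,
            validation_data=test.df,
            validation_steps=val_steps)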
                          memory=args.executor_memory)
    elif args.cluster_mode == "standalone":
        init_orca_context("standalone", master=args.master, cores=args.executor_cores,
                          num_nodes=args.num_executor, memory=args.executor_memory,
                          driver_cores=args.driver_cores, driver_memory=args.driver_memory,
                          conf=conf)
    elif args.cluster_mode == "yarn":
        init_orca_context("yarn-client", cores=args.executor_cores,
                          num_nodes=args.num_executor, memory=args.executor_memory,
                          driver_cores=args.driver_cores, driver_memory=args.driver_memory,
                          conf=conf)

    start = time()
    val_tbl = FeatureTable.read_csv(args.input_file,
                                    delimiter="\x01",
                                    names=RecsysSchema().toColumns(),
                                    dtype=RecsysSchema().toDtype())
    val_tbl.df.printSchema()
    val_tbl.write_parquet(args.output_folder)
    end = time()
    print("Convert to parquet time: ", end - start)
    stop_orca_context()
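
# --- Optional sanity check, an assumption rather than part of the original script ---
# The converted output can be reloaded (while an Orca context is still active) to confirm
# that no rows were dropped during the CSV-to-parquet conversion.
def verify_conversion(output_folder, expected_rows):
    tbl = FeatureTable.read_parquet(output_folder)
    assert tbl.size() == expected_rows, "row count changed during conversion"
    return tbl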
                          cores=args.cores, num_nodes=args.num_nodes, memory=args.memory,
                          driver_cores=args.driver_cores, driver_memory=args.driver_memory,
                          conf=conf)
    elif args.cluster_mode == "spark-submit":
        init_orca_context("spark-submit")

    time_start = time()
    paths = [os.path.join(args.input_folder, "day_%d.parquet" % i) for i in args.day_range]
    tbl = FeatureTable.read_parquet(paths)
    idx_list = tbl.gen_string_idx(CAT_COLS, freq_limit=args.frequency_limit)

    if args.days == 24:  # Full Criteo dataset
        # Exclude the last path day_23.parquet since the first half of day_23 is separated for test.
        train_data = FeatureTable.read_parquet(paths[:-1])
        train_preprocessed = preprocess_and_save(train_data, idx_list, "train", args.output_folder)
        test_data = FeatureTable.read_parquet(os.path.join(args.input_folder, "day_23_test.parquet"))
        test_preprocessed = preprocess_and_save(test_data, idx_list, "test", args.output_folder)
    else:
        train_data = FeatureTable.read_parquet(paths)
        train_preprocessed = preprocess_and_save(train_data, idx_list, "train", args.output_folder)
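
# --- Hedged sketch of what preprocess_and_save might look like; the real implementation
# is not shown in this excerpt, and this version only reuses calls that appear elsewhere
# in these scripts (encode_string and write_parquet).
def preprocess_and_save_sketch(tbl, idx_list, name, output_folder):
    tbl = tbl.encode_string(CAT_COLS, idx_list)           # map raw categories to contiguous ids
    tbl.write_parquet(os.path.join(output_folder, name))  # persist the encoded split
    return tbl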
"len_links", "hashtags", "present_links", "present_domains" ] cat_cols = [ "engaged_with_user_is_verified", "enaging_user_is_verified", "present_media", "tweet_type", "language" ] ratio_cols = [ "engaged_with_user_follower_following_ratio", "enaging_user_follower_following_ratio" ] embed_cols = [ "enaging_user_id", "engaged_with_user_id", "hashtags", "present_links", "present_domains" ] useful_cols = num_cols + cat_cols + embed_cols train_tbl = FeatureTable.read_parquet(args.data_dir + "/train_parquet") test_tbl = FeatureTable.read_parquet(args.data_dir + "/test_parquet") full_tbl = train_tbl.concat(test_tbl, "outer") reindex_tbls = full_tbl.gen_reindex_mapping( embed_cols, freq_limit=args.frequency_limit) train_tbl, test_tbl, user_info, item_info = prepare_features( train_tbl, test_tbl, reindex_tbls) output_dir = args.data_dir + "/embed_reindex" for i, c in enumerate(embed_cols): reindex_tbls[i].write_parquet(output_dir + "_" + c) train_config = { "lr": 1e-3, "user_col_info": user_info, "item_col_info": item_info,
                          driver_memory=args.driver_memory, conf=conf)
    elif args.cluster_mode == "yarn":
        init_orca_context("yarn-client", cores=args.executor_cores,
                          num_nodes=args.num_executor, memory=args.executor_memory,
                          driver_cores=args.driver_cores, driver_memory=args.driver_memory,
                          conf=conf)
    elif args.cluster_mode == "spark-submit":
        init_orca_context("spark-submit")

    begin = time.time()
    transaction_tbl = FeatureTable.read_json(args.input_transaction).select(
        ['reviewerID', 'asin', 'unixReviewTime']) \
        .rename({'reviewerID': 'user', 'asin': 'item', 'unixReviewTime': 'time'}) \
        .dropna(columns=['user', 'item'])
    transaction_tbl.cache()
    print("transaction_tbl, ", transaction_tbl.size())

    item_tbl = FeatureTable.read_csv(args.input_meta, delimiter="\t", names=['item', 'category'])\
        .apply("category", "category", lambda x: x.lower() if x is not None else "default")
    item_tbl.cache()
    print("item_tbl, ", item_tbl.size())

    user_index = transaction_tbl.gen_string_idx('user', freq_limit=1)
    item_cat_indices = item_tbl.gen_string_idx(["item", "category"], freq_limit=1)
    item_size = item_cat_indices[0].size()

    item_tbl = item_tbl\