def __rshift__(self, operator):
    if callable(operator) and not (isinstance(operator, type) and issubclass(operator, Operator)):
        # implicit lambdaop conversion
        operator = LambdaOp(operator)
    return super().__rshift__(operator)
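# Illustrative sketch (not part of the original source): with the override above, a bare callable
# on the right-hand side of ">>" is wrapped in a LambdaOp automatically, so the two graphs below
# should be equivalent. The "geo" column name and the Rename postfix are assumptions for the example,
# mirroring the commented-out lines in the test further down.
def _example_implicit_lambdaop_conversion():
    explicit = ColumnSelector(["geo"]) >> LambdaOp(lambda col: col.str.slice(0, 2)) >> Rename(postfix="_country")
    implicit = ColumnSelector(["geo"]) >> (lambda col: col.str.slice(0, 2)) >> Rename(postfix="_country")
    return explicit, implicit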
def __rshift__(self, operator): """Transforms this ColumnGroup by applying an Operator Parameters ----------- operators: Operator or callable Returns ------- ColumnGroup """ if isinstance(operator, type) and issubclass(operator, Operator): # handle case where an operator class is passed operator = operator() elif callable(operator): # implicit lambdaop conversion. operator = LambdaOp(operator) if not isinstance(operator, Operator): raise ValueError( f"Expected operator or callable, got {operator.__class__}") child = ColumnGroup(operator.output_column_names(self.columns)) child.parents = [self] self.children.append(child) child.op = operator dependencies = operator.dependencies() if dependencies: child.dependencies = set() if not isinstance(dependencies, collections.abc.Sequence): dependencies = [dependencies] for dependency in dependencies: if not isinstance(dependency, ColumnGroup): dependency = ColumnGroup(dependency) dependency.children.append(child) child.parents.append(dependency) child.dependencies.add(dependency) return child
def test_nested_workflow_node():
    df = dispatch._make_df(
        {
            "geo": ["US>CA", "US>NY", "CA>BC", "CA>ON"],
            "user": ["User_A", "User_A", "User_A", "User_B"],
        }
    )
    dataset = Dataset(df)

    geo_selector = ColumnSelector(["geo"])
    country = geo_selector >> LambdaOp(lambda col: col.str.slice(0, 2)) >> Rename(postfix="_country")
    # country1 = geo_selector >> (lambda col: col.str.slice(0, 2)) >> Rename(postfix="_country1")
    # country2 = geo_selector >> (lambda col: col.str.slice(0, 2)) >> Rename(postfix="_country2")
    user = "user"
    # user2 = "user2"

    # make sure we can do a 'combo' categorify (cross based) of country+user
    # as well as categorifying the country and user columns on their own
    cats = country + user + [country + user] >> Categorify(encode_type="combo")

    workflow = Workflow(cats)
    workflow.fit_schema(dataset.infer_schema())

    df_out = workflow.fit_transform(dataset).to_ddf().compute(scheduler="synchronous")

    geo_country = df_out["geo_country"]
    assert geo_country[0] == geo_country[1]  # rows 0,1 are both 'US'
    assert geo_country[2] == geo_country[3]  # rows 2,3 are both 'CA'

    user = df_out["user"]
    assert user[0] == user[1] == user[2]
    assert user[3] != user[2]

    geo_country_user = df_out["geo_country_user"]
    assert geo_country_user[0] == geo_country_user[1]  # US / userA
    assert geo_country_user[2] != geo_country_user[0]  # same user but in canada

    # make sure we get an exception if we nest too deeply (can't handle arbitrarily deep
    # nested column groups - and the exceptions we would get in operators like Categorify
    # are super confusing for users)
    with pytest.raises(ValueError):
        cats = [[country + "user"] + country + "user"] >> Categorify(encode_type="combo")
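# Illustrative sketch (not part of the original source): the "+" operator in the test combines
# groups, and a nested list such as [country + user] asks Categorify(encode_type="combo") to
# jointly encode the cross of those columns, which is why the test expects a "geo_country_user"
# output column alongside the individually encoded "geo_country" and "user" columns.
def _example_combo_grouping(country):
    singles_and_cross = country + "user" + [country + "user"]
    return singles_and_cross >> Categorify(encode_type="combo")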
def create_workflow(data_bucket_folder, output_bucket_folder, hash_spec, devices, local_directory):
    rmm.reinitialize(managed_memory=False)

    # Load the document metadata tables and convert them to sparse COO matrices
    # for the ColumnSimilarity features below.
    documents_categories_path = os.path.join(data_bucket_folder, 'documents_categories.csv')
    documents_topics_path = os.path.join(data_bucket_folder, 'documents_topics.csv')
    documents_entities_path = os.path.join(data_bucket_folder, 'documents_entities.csv')

    documents_categories_cudf = cudf.read_csv(documents_categories_path)
    documents_topics_cudf = cudf.read_csv(documents_topics_path)
    documents_entities_cudf = cudf.read_csv(documents_entities_path)
    documents_entities_cudf['entity_id'] = (
        documents_entities_cudf['entity_id'].astype('category').cat.codes
    )

    categories = _df_to_coo(documents_categories_cudf, col='category_id')
    topics = _df_to_coo(documents_topics_cudf, col='topic_id')
    entities = _df_to_coo(documents_entities_cudf, col='entity_id')

    del documents_categories_cudf, documents_topics_cudf, documents_entities_cudf

    # Minimum click counts required before a column's computed CTR is kept (otherwise 0)
    ctr_thresh = {
        'ad_id': 5,
        'source_id_promo': 10,
        'publisher_id_promo': 10,
        'advertiser_id': 10,
        'campaign_id': 10,
        'document_id_promo': 5,
    }

    client = create_client(devices=devices, local_directory=local_directory)
    workflow = nvt.Workflow(
        cat_names=CATEGORICAL_COLUMNS,
        cont_names=CONTINUOUS_COLUMNS,
        label_name=['clicked'],
        client=client,
    )

    workflow.add_feature([
        # Country/state prefixes of the geo_location string
        LambdaOp(
            op_name='country',
            f=lambda col, gdf: col.str.slice(0, 2),
            columns=['geo_location'],
            replace=False,
        ),
        LambdaOp(
            op_name='state',
            f=lambda col, gdf: col.str.slice(0, 5),
            columns=['geo_location'],
            replace=False,
        ),
        LambdaOp(
            op_name='days_since_published',
            f=_calculate_delta,
            columns=['publish_time', 'publish_time_promo'],
            replace=False,
        ),
        FillMedian(columns=[
            'publish_time_days_since_published',
            'publish_time_promo_days_since_published',
        ]),
        # Per-category click sums and counts, used to derive CTR features
        JoinGroupby(
            columns=[
                'ad_id', 'source_id_promo', 'document_id_promo',
                'publisher_id_promo', 'advertiser_id', 'campaign_id',
            ],
            cont_names=['clicked'],
            out_path=output_bucket_folder,
            stats=['sum', 'count'],
        ),
        # CTR = clicked_sum / count, zeroed out when the count is below ctr_thresh
        LambdaOp(
            op_name='ctr',
            f=lambda col, gdf: (
                col / gdf[col.name.replace('_clicked_sum', '_count')]
            ).where(
                gdf[col.name.replace('_clicked_sum', '_count')]
                >= ctr_thresh[col.name.replace('_clicked_sum', '')],
                0,
            ),
            columns=[
                'ad_id_clicked_sum', 'source_id_promo_clicked_sum',
                'document_id_promo_clicked_sum', 'publisher_id_promo_clicked_sum',
                'advertiser_id_clicked_sum', 'campaign_id_clicked_sum',
            ],
            replace=False,
        ),
        FillMissing(columns=groupby_columns + ctr_columns),
        LogOp(columns=groupby_columns + [
            'publish_time_days_since_published',
            'publish_time_promo_days_since_published',
        ]),
        Normalize(columns=groupby_columns),
        # TF-IDF similarity between the event document and the promoted document
        ColumnSimilarity('doc_event_doc_ad_sim_categories', 'document_id', categories,
                         'document_id_promo', metric='tfidf', on_device=False),
        ColumnSimilarity('doc_event_doc_ad_sim_topics', 'document_id', topics,
                         'document_id_promo', metric='tfidf', on_device=False),
        ColumnSimilarity('doc_event_doc_ad_sim_entities', 'document_id', entities,
                         'document_id_promo', metric='tfidf', on_device=False),
    ])
    workflow.add_cat_preprocess([HashBucket(hash_spec)])
    workflow.finalize()

    return workflow
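# Illustrative usage sketch (not part of the original source): once finalized, a workflow built
# with this legacy add_feature/add_cat_preprocess API would typically be applied to an
# nvt.Dataset, writing transformed parquet files to disk as in the Criteo script below. The
# paths and record_stats flag here are assumptions for the example.
def _example_apply_legacy_workflow(workflow, train_paths, out_dir):
    train_ds = nvt.Dataset(train_paths, engine='parquet')
    # gather statistics and write the transformed output in one pass
    workflow.apply(train_ds, record_stats=True, output_path=out_dir)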
def create_workflow(data_bucket_folder, hash_spec, devices, local_directory, dask):
    rmm.reinitialize(managed_memory=False)

    # Load the document metadata tables and convert them to sparse COO matrices
    # for the ColumnSimilarity features below.
    documents_categories_path = os.path.join(data_bucket_folder, "documents_categories.csv")
    documents_topics_path = os.path.join(data_bucket_folder, "documents_topics.csv")
    documents_entities_path = os.path.join(data_bucket_folder, "documents_entities.csv")

    documents_categories_cudf = cudf.read_csv(documents_categories_path)
    documents_topics_cudf = cudf.read_csv(documents_topics_path)
    documents_entities_cudf = cudf.read_csv(documents_entities_path)
    documents_entities_cudf["entity_id"] = (
        documents_entities_cudf["entity_id"].astype("category").cat.codes
    )

    categories = _df_to_coo(documents_categories_cudf, col="category_id")
    topics = _df_to_coo(documents_topics_cudf, col="topic_id")
    entities = _df_to_coo(documents_entities_cudf, col="entity_id")

    del documents_categories_cudf, documents_topics_cudf, documents_entities_cudf

    # Minimum click counts required before a column's computed CTR is kept (otherwise 0)
    ctr_thresh = {
        "ad_id": 5,
        "source_id_promo": 10,
        "publisher_id_promo": 10,
        "advertiser_id": 10,
        "campaign_id": 10,
        "document_id_promo": 5,
    }

    ctr_inputs = ColumnGroup(CTR_INPUTS)
    cat_cols = ColumnGroup(CATEGORICAL_COLUMNS)

    # Country/state prefixes of the geo_location string
    geo_location = ColumnGroup(["geo_location"])
    country = geo_location >> (lambda col: col.str.slice(0, 2)) >> Rename(postfix="_country")
    state = geo_location >> (lambda col: col.str.slice(0, 5)) >> Rename(postfix="_state")
    geo_features = geo_location + country + state

    dates = ["publish_time", "publish_time_promo"]
    date_features = dates >> DaysSincePublished() >> FillMedian() >> LogOp

    # Per-category click sums and counts, used to derive CTR features
    stat_cols = ctr_inputs >> JoinGroupby(cont_cols=["clicked"], stats=["sum", "count"])

    # CTR = clicked_sum / count, zeroed out when the count is below ctr_thresh
    ctr_cols = (
        stat_cols
        - [column + "_count" for column in ctr_inputs.flattened_columns]
        >> LambdaOp(
            f=lambda col, gdf: (
                col / gdf[col.name.replace("_clicked_sum", "_count")]
            ).where(
                gdf[col.name.replace("_clicked_sum", "_count")]
                >= ctr_thresh[col.name.replace("_clicked_sum", "")],
                0,
            ),
            # the lambda divides by the *_count columns, so keep them as a dependency
            dependency=stat_cols
            - [column + "_clicked_sum" for column in ctr_inputs.flattened_columns],
        )
        >> Rename(f=lambda x: x.replace("_clicked_sum", "_ctr"))
    )

    stat_cols = stat_cols >> FillMissing() >> LogOp() >> Normalize()
    ctr_cols = ctr_cols >> FillMissing()

    cat_cols = cat_cols + geo_features >> HashBucket(hash_spec)

    features = date_features + ctr_cols + stat_cols + cat_cols + ["clicked", "display_id"]

    # TF-IDF similarity between the event document and the promoted document
    sim_features_categ = (
        [["document_id", "document_id_promo"]]
        >> ColumnSimilarity(categories, metric="tfidf", on_device=False)
        >> Rename(postfix="_categories")
    )
    sim_features_topics = (
        [["document_id", "document_id_promo"]]
        >> ColumnSimilarity(topics, metric="tfidf", on_device=False)
        >> Rename(postfix="_topics")
    )
    sim_features_entities = (
        [["document_id", "document_id_promo"]]
        >> ColumnSimilarity(entities, metric="tfidf", on_device=False)
        >> Rename(postfix="_entities")
    )
    sim_features = sim_features_categ + sim_features_topics + sim_features_entities

    client = create_client(devices=devices, local_directory=local_directory) if dask else None

    workflow = nvt.Workflow(column_group=features + sim_features, client=client)

    return workflow
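# Illustrative usage sketch (not part of the original source): with the column-group API, the
# returned workflow is fit on an nvt.Dataset and the transformed data can then be materialized,
# mirroring the fit_transform(...).to_ddf().compute() pattern used in the test above. The paths
# and engine argument are assumptions for the example.
def _example_fit_transform_column_group_workflow(workflow, train_paths):
    train_ds = nvt.Dataset(train_paths, engine="parquet")
    transformed = workflow.fit_transform(train_ds)
    return transformed.to_ddf().compute()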
def preprocess_criteo_parquet(
    input_path: str,
    output_path: str,
    client,
    frequency_threshold: int,
):
    train_days = [str(x) for x in CRITEO_TRAIN_DAYS]
    train_files = [
        os.path.join(input_path, x)
        for x in os.listdir(input_path)
        if x.startswith("day") and x.split(".")[0].split("_")[-1] in train_days
    ]
    valid_file = os.path.join(input_path, "day_23.part2.parquet")
    test_file = os.path.join(input_path, "day_23.part1.parquet")
    all_set = train_files + [valid_file] + [test_file]

    print(all_set, train_files, valid_file, test_file)
    print("Creating Workflow Object")

    workflow = Workflow(
        cat_names=CRITEO_CATEGORICAL_COLUMNS,
        cont_names=CRITEO_CONTINUOUS_COLUMNS,
        label_name=CRITEO_CLICK_COLUMNS,
    )

    # We want to assign 0 to all missing values and calculate log(x + 3) for present values,
    # so if we set missing values to -2, the result of log(1 + 2 + (-2)) is 0.
    workflow.add_cont_feature([
        FillMissing(fill_val=-2.0),
        LambdaOp(op_name='Add3ButMinusOneCauseLogAddsOne', f=lambda col, _: col.add(2.0)),
        LogOp(),  # Log(1+x)
    ])
    workflow.add_cat_preprocess(
        Categorify(freq_threshold=frequency_threshold, out_path=output_path)
    )
    workflow.finalize()

    print("Creating Dataset Iterator")
    all_ds = Dataset(all_set, engine="parquet", part_mem_fraction=ALL_DS_MEM_FRAC)
    trains_ds = Dataset(train_files, engine="parquet", part_mem_fraction=TRAIN_DS_MEM_FRAC)
    valid_ds = Dataset(valid_file, engine="parquet", part_mem_fraction=VALID_DS_MEM_FRAC)
    test_ds = Dataset(test_file, engine="parquet", part_mem_fraction=TEST_DS_MEM_FRAC)

    print("Running apply")
    out_train = os.path.join(output_path, "train")
    out_valid = os.path.join(output_path, "validation")
    out_test = os.path.join(output_path, "test")

    start = time()
    # Gather statistics (e.g. categorical encodings) over the full dataset first
    workflow.update_stats(all_ds)
    print(f"Gathering statistics time: {time() - start}")

    start = time()
    workflow.apply(trains_ds, record_stats=False, output_path=out_train)
    print(f"train preprocess time: {time() - start}")

    start = time()
    workflow.apply(valid_ds, record_stats=False, output_path=out_valid)
    print(f"valid preprocess time: {time() - start}")

    start = time()
    workflow.apply(test_ds, record_stats=False, output_path=out_test)
    print(f"test preprocess time: {time() - start}")

    save_model_size_config(workflow, output_path)
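# Illustrative sketch (not part of the original source): the FillMissing(-2) / add(2) / LogOp
# chain above maps missing values to exactly 0 and present values x to log(x + 3). Plain numpy
# is used here only to demonstrate the arithmetic; the real pipeline operates on cuDF columns.
def _example_log_transform_arithmetic():
    import numpy as np

    x = np.array([np.nan, 0.0, 1.0, 7.0])
    filled = np.where(np.isnan(x), -2.0, x)  # FillMissing(fill_val=-2.0)
    out = np.log1p(filled + 2.0)             # add 2, then log(1 + x)
    assert out[0] == 0.0                     # missing -> 0
    assert np.isclose(out[2], np.log(4.0))   # present x -> log(x + 3)
    return out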