def build_pipeline(column_names, data_geo):
    """Assemble the data-cleaning pipeline, one stage after another.

    Args:
        column_names: Forwarded to ``uniformize_missing`` and
            ``uniformize_boolean``.
        data_geo: Handed to the ``AdHogStageArg`` that wraps
            ``add_neighbourhood`` as its ``geo_data`` keyword argument.

    Returns:
        The result of adding every stage below, in order, to the initial
        ``filter_on_jurisdiction()`` stage.
    """
    print("Now building pipeline...")

    # The jurisdiction filter seeds the pipeline; everything else is added on.
    pipeline = filter_on_jurisdiction()

    # Order matters: each stage operates on the output of the previous one.
    later_stages = (
        # get rid of duplicates
        pdp.AdHocStage(transform=get_duplicate_rows),
        # uniformize missing values
        uniformize_missing(column_names),
        # add neighbourhood of data points based on latitude and longitude
        AdHogStageArg(transform=add_neighbourhood, geo_data=data_geo),
        # drop rows that do not belong to Amsterdam, according to lat/lon
        filter_on_neighbourhood(),
        # uniformize boolean values
        uniformize_boolean(column_names),
        # uniformize datetimes
        pdp.AdHocStage(transform=unify_datetimes),
        # uniformize monetary values and convert to numeric
        uniformize_monetary(),
        # uniformize percentage values and convert to numeric
        uniformize_percentage(),
        # add new boolean columns for values within certain columns
        pdp.AdHocStage(transform=expand_columns),
        # drop columns that contain no useful information
        drop_useless(),
    )
    for stage in later_stages:
        pipeline += stage

    return pipeline
def build_pipeline(data_calendar, drop_threshold=0.5):
    """Assemble the preprocessing-and-merge pipeline.

    Args:
        data_calendar: Handed to the ``AdHogStageArg`` that wraps
            ``merge_with_calendar`` as its ``data_calendar`` keyword argument.
        drop_threshold: Handed to the ``AdHogStageArg`` that wraps
            ``drop_columns_with_many_NaNs``; columns whose proportion of NaNs
            exceeds this value are meant to be dropped. Defaults to ``0.5``,
            the value that used to be hard-coded.

    Returns:
        The result of adding every stage below, in order, to the initial
        ``drop_textual_columns()`` stage.
    """
    # PREPROCESSING STAGES
    print("Now building pipeline...")

    # initialize the pipeline by dropping textual columns
    pipeline = drop_textual_columns()

    # drop the host location column
    pipeline += drop_host_location()

    # drop geometrical columns
    pipeline += drop_geometrical_columns()

    # drop useless 'amenities' and 'host_verifications' columns
    pipeline += pdp.AdHocStage(transform=filter_verifications_and_amenities)

    # The string-to-numeric conversion of 'calender_updated'
    # (convert_calender_updated_to_numeric) is skipped for now; the column is
    # dropped instead.
    print("Skip calendar step!")
    pipeline += drop_calendar_updated()

    # drop bed_type and property_type columns
    pipeline += pdp.AdHocStage(transform=drop_bed_and_property_type_columns)

    # encode 'host_response_time' as an ordinal variable (0-4)
    pipeline += pdp.AdHocStage(transform=encode_host_response_time_as_ordinal)

    # label encode the host neighbourhood column
    pipeline += pdp.AdHocStage(transform=label_encode_host_neighbourhood)

    # dummy encode some nominal columns
    pipeline += pdp.AdHocStage(transform=dummy_encode_nominal_columns)

    # convert column types to float
    pipeline += pdp.AdHocStage(transform=convert_all_to_numeric)

    # drop columns related to date/time
    pipeline += drop_datetime_columns()

    # columns with a proportion of NaNs > drop_threshold will be dropped
    pipeline += AdHogStageArg(
        transform=drop_columns_with_many_NaNs,
        drop_threshold=drop_threshold,
    )

    # mean impute remaining continuous columns
    pipeline += pdp.AdHocStage(transform=mean_impute)

    # mode impute remaining binary columns
    pipeline += pdp.AdHocStage(transform=mode_impute)

    # MERGE STAGES
    # pca transform the data
    pipeline += pdp.AdHocStage(transform=pca_transform)

    # drop rows with missing IDs
    pipeline += pdp.AdHocStage(transform=drop_missing_ids)

    # merge data with calendar data
    pipeline += AdHogStageArg(
        transform=merge_with_calendar,
        data_calendar=data_calendar,
    )

    return pipeline
def get_stage_engarde(line, stage_config: cf.ConfigView) -> pdp.PdPipelineStage:
    """
    Build an 'engarde' check stage described in `config.yaml`.

    The check is looked up on `engc` through `get_stage_parameters`, bound to
    the configured keyword arguments, wrapped with `df_copy`, and placed in a
    `pdpipe.AdHocStage` that receives the `staging` options. The `line`
    argument is not used here.

    Example
    -------
    Given the following segment of `config.yaml`,

        config.yaml
        ...
        pipelines:
          ...
          example_pipeline:
            - type: check
              function: none_missing
              staging:
                desc: Checks that there are no missing values (NaNs).
            ...
        ...

    return a `pdpipe.AdHocStage` that applies the check
    `engarde.checks.none_missing` to the dataframe. When the pipeline
    containing this stage is printed, the stage will appear as
    `[X] Checks that there are no missing values (NaNs).`.

    NOTE(review): the example names the check under the `function` key, but
    this code asks `get_stage_parameters` for the "check" key -- confirm which
    of the two is intended.
    """
    check_fn, check_kwargs, staging_opts = get_stage_parameters(
        engc, stage_config, "check"
    )
    bound_check = partial(check_fn, **check_kwargs)
    return pdp.AdHocStage(df_copy(bound_check), **staging_opts)
def _original_code():
    """Run the row-wise sales preprocessing and print the elapsed time.

    Reads ``processed_salesdata.csv``, pushes it through a pdpipe pipeline
    that derives per-row features and filters rows, then attaches running
    (cumulative) totals of sales, passenger flow and plus purchases per
    ``SKC``, accumulated in ``date`` order. Nothing is returned; the elapsed
    wall-clock time is printed.
    """
    start = time.time()
    salesdata = pd.read_csv("processed_salesdata.csv")

    one_day = np.timedelta64(1, 'D')

    def derive_row(row):
        # Derived values are computed from the raw (unclamped) inputs.
        standard_amount = row["original_price"] * row["sales"]
        return pd.Series({
            "standard_amount": standard_amount,
            # guard against division by zero / negative base amounts
            "sales_discount": (
                0 if standard_amount <= 0
                else row["sales_amount"] / standard_amount
            ),
            "week": int(row["date"].strftime('%W')),
            "days_on_counter": (row["date"] - row["shelf_date"]) / one_day,
            "life_cycle": (row["end_of_season"] - row["shelf_date"]) / one_day,
            "C1": 1 if row["category_group"] == "tops" else 0,
            "C2": 1 if row["category_group"] == "other" else 0,
            # negative counts are clamped to zero
            "sales": 0 if row["sales"] < 0 else row["sales"],
            "passenger_flow": (
                0 if row["passenger_flow"] < 0 else row["passenger_flow"]
            ),
            "plus_purchase": (
                0 if row["plus_purchase"] < 0 else row["plus_purchase"]
            ),
        })

    pline = pdp.PdPipeline([
        pdp.Schematize(COLUMNS),
        # collapse every category other than "tops" into "other"
        pdp.ApplyByCols("category_group",
                        lambda x: "tops" if x == "tops" else "other"),
        pdp.ApplyByCols(["date", "shelf_date", "end_of_season"],
                        pd.to_datetime),
        pdp.ApplyToRows(derive_row),
        # keep only rows observed within the product's life cycle
        pdp.AdHocStage(
            lambda df: df[df["days_on_counter"] <= df["life_cycle"]]),
        pdp.ColDrop("activity_level"),
    ])
    salesdata = pline.apply(salesdata, verbose=True, exraise=True)

    # Select the numeric columns explicitly before cumsum(): "date" is only
    # needed for ordering, and pandas >= 2.0 raises TypeError instead of
    # silently dropping a datetime column from a grouped cumsum.
    summed_cols = ["sales", "passenger_flow", "plus_purchase"]
    salesdata_cumitems = (
        salesdata[["SKC", "date"] + summed_cols]
        .sort_values(by=["SKC", "date"])
        .groupby(['SKC'])[summed_cols]
        .cumsum()
    )
    salesdata_cumitems.columns = [
        "total_sales", "total_passenger_flow", "total_plus_purchase"
    ]

    # Assignment aligns on the index, so the sort above does not reorder rows.
    for total_col in salesdata_cumitems.columns:
        salesdata[total_col] = salesdata_cumitems[total_col]

    print("consumed time(s)=", time.time() - start)
def get_stage_verify(line, stage_config: cf.ConfigView) -> pdp.PdPipelineStage:
    """
    Build a "verify" stage described in `config.yaml`.

    The user check is looked up on `line.module` under the "check" key, and
    the verifier is looked up on `engc` under the "type" key. The verifier is
    bound to the check and the configured keyword arguments, wrapped with
    `df_copy`, and placed in a `pdpipe.AdHocStage` that receives the
    `staging` options.

    Example
    -------
    Given `high_enough` in a module passed to the `Line` constructor,

        def high_enough(df, col_name, val):
            return df.loc[:, col_name] > val

    and the following segment of `config.yaml`,

        config.yaml
        ...
        pipelines:
          ...
          example_pipeline:
            ...
            - type: verify_all
              check: high_enough
              kwargs:
                col_name: prices
                val: 19
              staging:
                desc: Checks whether all prices are over $19.
            ...
        ...

    return a `pdpipe.AdHocStage` that applies the check `high_enough` using
    `engarde.checks.verify_all` to the dataframe with the arguments specified
    in `config.yaml`. When the pipeline containing this stage is printed, the
    stage will appear as `[X] Checks whether all prices are over $19.`.
    Additionally, `type: verify_any` could've been supplied instead of
    `type: verify_all`.
    """
    user_check, check_kwargs, staging_opts = get_stage_parameters(
        line.module, stage_config, "check"
    )
    # Only the verifier callable is needed from this second lookup.
    verifier, _kwargs_unused, _staging_unused = get_stage_parameters(
        engc, stage_config, "type"
    )
    bound_verifier = partial(verifier, check=user_check, **check_kwargs)
    return pdp.AdHocStage(df_copy(bound_verifier), **staging_opts)
def get_stage_transform(line, stage_config: cf.ConfigView) -> pdp.PdPipelineStage:
    """
    Build a "transform" stage described in `config.yaml`.

    The transform is looked up on `line.module` under the "function" key,
    bound to the configured keyword arguments, wrapped with `df_copy`, and
    placed in a `pdpipe.AdHocStage` that receives the `staging` options.

    Example
    -------
    Given `add_to_col` in a module passed to the `Line` constructor,

        def add_to_col(df, col_name, val):
            df.loc[:, col_name] = df.loc[:, col_name] + val
            return df

    and the following segment of `config.yaml`,

        config.yaml
        ...
        pipelines:
          ...
          example_pipeline:
            ...
            - type: transform
              function: add_to_col
              kwargs:
                col_name: prices
                val: 1.5
              staging:
                desc: Adds 1.5 to column 'prices'
                exmsg: Couldn't add to 'prices'.
            ...
        ...

    return a `pdpipe.AdHocStage` that applies `add_to_col` to the dataframe
    with the arguments specified in `config.yaml`. When the pipeline
    containing this stage is printed, the stage will appear as
    `[X] Adds 1.5 to column 'prices'`. If the stage fails, the `exmsg`,
    `Couldn't add to 'prices'.`, is relayed to the user.
    """
    transform_fn, transform_kwargs, staging_opts = get_stage_parameters(
        line.module, stage_config, "function"
    )
    bound_transform = partial(transform_fn, **transform_kwargs)
    return pdp.AdHocStage(df_copy(bound_transform), **staging_opts)