Example #1
def build_pipeline(column_names, data_geo):
    print("Now building pipeline...")
    # initialize the pipeline
    pipeline = filter_on_jurisdiction()
    # get rid of duplicates
    pipeline += pdp.AdHocStage(transform=get_duplicate_rows)
    # uniformize missing values
    pipeline += uniformize_missing(column_names)
    # add a neighbourhood column for each data point based on latitude and longitude
    kwargs = {'geo_data': data_geo}
    pipeline += AdHogStageArg(transform=add_neighbourhood, **kwargs)
    # drop rows that do not lie within Amsterdam according to latitude/longitude
    pipeline += filter_on_neighbourhood()
    # uniformize boolean values
    pipeline += uniformize_boolean(column_names)
    # uniformize datetimes
    pipeline += pdp.AdHocStage(transform=unify_datetimes)
    # uniformize monetary values and convert to numeric
    pipeline += uniformize_monetary()
    # uniformize percentage values and convert to numeric
    pipeline += uniformize_percentage()
    # add new boolean columns for values within certain columns
    pipeline += pdp.AdHocStage(transform=expand_columns)
    # drop columns that contain no useful information
    pipeline += drop_useless()

    return pipeline
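
`AdHogStageArg` is not part of pdpipe; it looks like a project-local helper for building an ad hoc stage from a transform that needs extra keyword arguments. A minimal sketch of such a helper, assuming it simply binds the keyword arguments before wrapping the transform:

from functools import partial

import pdpipe as pdp


def AdHogStageArg(transform, **kwargs):
    # bind the extra keyword arguments so the resulting callable takes
    # a single dataframe, as pdp.AdHocStage expects
    return pdp.AdHocStage(transform=partial(transform, **kwargs))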
Example #2
def build_pipeline(data_calendar):

    # PREPROCESSING STAGES

    print("Now building pipeline...")
    # initialize the pipeline by dropping textual columns
    pipeline = drop_textual_columns()
    # drop the host location column
    pipeline += drop_host_location()
    # drop geometrical columns
    pipeline += drop_geometrical_columns()
    # drop useless 'amenities' and 'host_verifications' columns
    pipeline += pdp.AdHocStage(transform=filter_verifications_and_amenities)
    # the 'calendar_updated' column is dropped rather than converted to numeric
    print("Skip calendar step!")
    pipeline += drop_calendar_updated()
    # pipeline += pdp.AdHocStage(transform=convert_calender_updated_to_numeric)
    # drop bed_type and property_type columns
    pipeline += pdp.AdHocStage(transform=drop_bed_and_property_type_columns)
    # encode 'host_response_time' as an ordinal variable (0-4)
    pipeline += pdp.AdHocStage(transform=encode_host_response_time_as_ordinal)
    # label encode the host neighbourhood column
    pipeline += pdp.AdHocStage(transform=label_encode_host_neighbourhood)
    # dummy encode some nominal columns
    pipeline += pdp.AdHocStage(transform=dummy_encode_nominal_columns)
    # convert column types to float
    pipeline += pdp.AdHocStage(transform=convert_all_to_numeric)
    # drop columns related to date/time
    pipeline += drop_datetime_columns()
    # set threshold for dropping columns; columns whose proportion of NaNs exceeds the threshold will be dropped
    drop_threshold = 0.5
    kwargs = {'drop_threshold': drop_threshold}
    pipeline += AdHogStageArg(transform=drop_columns_with_many_NaNs, **kwargs)
    # mean impute remaining continuous columns
    pipeline += pdp.AdHocStage(transform=mean_impute)
    # mode impute remaining binary columns
    pipeline += pdp.AdHocStage(transform=mode_impute)

    # MERGE STAGES

    # pca transform the data
    pipeline += pdp.AdHocStage(transform=pca_transform)
    # drop rows with missing IDs
    pipeline += pdp.AdHocStage(transform=drop_missing_ids)
    # merge data with calendar data
    kwargs = {'data_calendar': data_calendar}
    pipeline += AdHogStageArg(transform=merge_with_calendar, **kwargs)

    return pipeline
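
Once built, the pipeline is applied like any other pdpipe pipeline. A usage sketch, with `listings.csv` and `calendar.csv` as hypothetical input files:

import pandas as pd

data_calendar = pd.read_csv("calendar.csv")   # hypothetical input file
data_listings = pd.read_csv("listings.csv")   # hypothetical input file
pipeline = build_pipeline(data_calendar)
# verbose=True prints each stage's description as it runs
processed = pipeline.apply(data_listings, verbose=True)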
Example #3
def get_stage_engarde(line, stage_config: cf.ConfigView) -> pdp.PdPipelineStage:
    """
    Get an 'engarde' stage from `config.yaml`.

    Example
    -------

    Given the following segment of `config.yaml`,

        config.yaml
            ...
            pipelines:
              ...
              example_pipeline:
                - type: check
                  function: none_missing
                  staging:
                    desc: Checks that there are no missing values (NaNs).
                ...
            ...

    return a `pdpipe.AdHocStage` that applies the check `engarde.checks.none_missing` to
    the dataframe. When the pipeline containing this stage is printed, the stage will
    appear as `[X] Checks that there are no missing values (NaNs).`.
    """

    (check, kwargs, staging) = get_stage_parameters(engc, stage_config, "check")
    function = df_copy(partial(check, **kwargs))
    stage = pdp.AdHocStage(function, **staging)
    return stage
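
For the config segment above, the returned stage is roughly equivalent to the following direct construction (a sketch, assuming `engc` is the imported `engarde.checks` module and that `df_copy` hands the check a copy of the dataframe):

import engarde.checks as engc
import pdpipe as pdp

# engarde.checks.none_missing raises an AssertionError if any NaNs are present
stage = pdp.AdHocStage(
    lambda df: engc.none_missing(df.copy()),
    desc="Checks that there are no missing values (NaNs).",
)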
Example #4
def _original_code():
    start = time.time()
    salesdata = pd.read_csv("processed_salesdata.csv")
    pline = pdp.PdPipeline([
        pdp.Schematize(COLUMNS),
        pdp.ApplyByCols("category_group", lambda x: "tops"
                        if x == "tops" else "other"),
        pdp.ApplyByCols(["date", "shelf_date", "end_of_season"],
                        pd.to_datetime),
        pdp.ApplyToRows(lambda row: pd.Series({
            "standard_amount":
            row["original_price"] * row["sales"],
            "sales_discount":
            0 if (row["original_price"] * row["sales"] <= 0) else row[
                "sales_amount"] / ((row["original_price"] * row["sales"])),
            "week":
            int(row["date"].strftime('%W')),
            "days_on_counter":
            (row["date"] - row["shelf_date"]) / np.timedelta64(1, 'D'),
            "life_cycle": (row["end_of_season"] - row["shelf_date"]) /
            (np.timedelta64(1, 'D')),
            "C1":
            1 if row["category_group"] == "tops" else 0,
            "C2":
            1 if row["category_group"] == "other" else 0,
            "sales":
            0 if row["sales"] < 0 else row["sales"],
            "passenger_flow":
            0 if row["passenger_flow"] < 0 else (row["passenger_flow"]),
            "plus_purchase":
            0 if row["plus_purchase"] < 0 else (row["plus_purchase"]),
        })),
        pdp.AdHocStage(
            lambda df: df[df["days_on_counter"] <= df["life_cycle"]]),
        pdp.ColDrop("activity_level")
    ])
    salesdata = pline.apply(salesdata, verbose=True, exraise=True)

    # running per-SKC totals of sales, passenger flow, and plus purchases
    salesdata_cumitems = (
        salesdata[["SKC", "date", "sales", "passenger_flow", "plus_purchase"]]
        .sort_values(by=["SKC", "date"])
        .groupby("SKC")
        .cumsum()
    )
    salesdata_cumitems.columns = [
        "total_sales", "total_passenger_flow", "total_plus_purchase"
    ]
    salesdata["total_sales"] = salesdata_cumitems["total_sales"]
    salesdata["total_passenger_flow"] = salesdata_cumitems["total_passenger_flow"]
    salesdata["total_plus_purchase"] = salesdata_cumitems["total_plus_purchase"]
    print("consumed time(s)=", time.time() - start)
Example #5
def get_stage_verify(line, stage_config: cf.ConfigView) -> pdp.PdPipelineStage:
    """
    Get a "verify" stage from `config.yaml`.

    Example
    -------

    Given `high_enough` in a module passed to the `Line` constructor,

        def high_enough(df, col_name, val):
            return df.loc[:, col_name] > val

    and the following segment of `config.yaml`,

        config.yaml
            ...
            pipelines:
              ...
              example_pipeline:
                ...
                - type: verify_all
                  check: high_enough
                  kwargs:
                    col_name: prices
                    val: 19
                  staging:
                    desc: Checks whether all prices are over $19.
                ...
            ...

    return a `pdpipe.AdHocStage` that applies the check `high_enough` using
    `engarde.checks.verify_all` to the dataframe with the arguments specified in
    `config.yaml`. When the pipeline containing this stage is printed, the stage will
    appear as `[X] Checks whether all prices are over $19.`.

    Alternatively, `type: verify_any` may be supplied instead of `type:
    verify_all`.
    """

    (check, kwargs, staging) = get_stage_parameters(line.module, stage_config, "check")
    (verify, _, _) = get_stage_parameters(engc, stage_config, "type")
    function = df_copy(partial(verify, check=check, **kwargs))
    stage = pdp.AdHocStage(function, **staging)
    return stage
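
For the config segment above, the returned stage is roughly equivalent to this direct construction (a sketch; `engarde.checks.verify_all` asserts that the boolean frame returned by the check is True everywhere):

import engarde.checks as engc
import pdpipe as pdp

stage = pdp.AdHocStage(
    lambda df: engc.verify_all(df.copy(), high_enough, col_name="prices", val=19),
    desc="Checks whether all prices are over $19.",
)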
Example #6
def get_stage_transform(line, stage_config: cf.ConfigView) -> pdp.PdPipelineStage:
    """
    Get a "transform" stage from `config.yaml`.

    Example
    -------

    Given `add_to_col` in a module passed to the `Line` constructor,

        def add_to_col(df, col_name, val):
            df.loc[:, col_name] = df.loc[:, col_name] + val
            return df

    and the following segment of `config.yaml`,

        config.yaml
            ...
            pipelines:
              ...
              example_pipeline:
                ...
                - type: transform
                  function: add_to_col
                  kwargs:
                    col_name: prices
                    val: 1.5
                  staging:
                    desc: Adds 1.5 to column 'prices'
                    exmsg: Couldn't add to 'prices'.
                ...
            ...

    return a `pdpipe.AdHocStage` that applies `add_to_col` to the dataframe with the
    arguments specified in `config.yaml`. When the pipeline containing this stage is
    printed, the stage will appear as `[X] Adds 1.5 to column 'prices'`. If the stage
    fails, the `exmsg`, `Couldn't add to 'prices'.`, is relayed to the user.
    """

    (function, kwargs, staging) = get_stage_parameters(
        line.module, stage_config, "function"
    )
    function = df_copy(partial(function, **kwargs))
    stage = pdp.AdHocStage(function, **staging)
    return stage
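
All three stage factories wrap the configured callable in `df_copy`, which is not shown in these examples. A plausible sketch, assuming it is a decorator that passes the wrapped function a copy of the dataframe so config-driven stages never mutate the caller's frame:

def df_copy(function):
    # call `function` on a copy of the dataframe to avoid mutating the input
    def wrapper(df):
        return function(df.copy())
    return wrapper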