Beispiel #1
0
def test_load_features_data():
    """Check that ``MatrixBuilder.load_features_data`` returns the expected frames.

    The expected frames are built in pandas by left-merging each fixture
    feature table onto the entity/date frame and indexing the result by
    ``(entity_id, as_of_date)``. The same fixtures are then loaded into a
    temporary Postgres instance and the builder's output is compared to the
    expected frames one by one.

    Relies on fixtures and helpers defined outside this function
    (``labels``, ``states``, ``features_tables``, ``db_config``,
    ``experiment_hash``, ``create_entity_date_df``, ``create_schemas``, ...).
    """
    dates = [datetime.datetime(2016, 1, 1, 0, 0), datetime.datetime(2016, 2, 1, 0, 0)]

    # make dataframe for entity ids and dates
    ids_dates = create_entity_date_df(
        labels=labels,
        states=states,
        as_of_dates=dates,
        label_name="booking",
        label_type="binary",
        label_timespan="1 month",
    )

    features = [["f1", "f2"], ["f3", "f4"]]
    # make the expected dataframes of features to test against; there is one
    # per fixture feature table, and the i-th table carries the i-th group of
    # feature columns
    features_dfs = []
    for i, table in enumerate(features_tables):
        cols = ["entity_id", "as_of_date"] + features[i]
        temp_df = pd.DataFrame(table, columns=cols)
        temp_df["as_of_date"] = convert_string_column_to_date(temp_df["as_of_date"])
        features_dfs.append(
            ids_dates.merge(
                right=temp_df, how="left", on=["entity_id", "as_of_date"]
            ).set_index(["entity_id", "as_of_date"])
        )

    # create an engine and generate a table with fake feature data
    with testing.postgresql.Postgresql() as postgresql:
        engine = create_engine(postgresql.url())
        create_schemas(
            engine=engine, features_tables=features_tables, labels=labels, states=states
        )

        with get_matrix_storage_engine() as matrix_storage_engine:
            builder = MatrixBuilder(
                db_config=db_config,
                matrix_storage_engine=matrix_storage_engine,
                experiment_hash=experiment_hash,
                engine=engine,
            )

            # make the entity-date table
            entity_date_table_name = builder.make_entity_date_table(
                as_of_times=dates,
                label_type="binary",
                label_name="booking",
                state="active",
                matrix_type="train",
                matrix_uuid="my_uuid",
                label_timespan="1 month",
            )

            # one dictionary entry per feature group: features0, features1, ...
            feature_dictionary = {
                "features{}".format(i): feature_list
                for i, feature_list in enumerate(features)
            }

            # materialise the result so it can be counted and then iterated
            returned_features_dfs = list(
                builder.load_features_data(
                    as_of_times=dates,
                    feature_dictionary=feature_dictionary,
                    entity_date_table_name=entity_date_table_name,
                    matrix_uuid="my_uuid",
                )
            )

            # zip() stops at the shorter input, so without this check a builder
            # that returned too few (or no) frames would pass vacuously
            assert len(returned_features_dfs) == len(features_dfs), (
                "expected {} feature dataframes, got {}".format(
                    len(features_dfs), len(returned_features_dfs)
                )
            )

            # compare each returned frame with its expected counterpart
            # NOTE(review): elementwise == treats NaN as unequal to NaN, so this
            # only passes while the merged fixtures contain no missing values
            for result, df in zip(returned_features_dfs, features_dfs):
                test = result == df
                assert test.all().all()
Beispiel #2
0
def test_load_features_data():
    """Check that ``MatrixBuilder.load_features_data`` returns the expected frames.

    The expected frames are built in pandas by left-merging each fixture
    feature table onto the entity/date frame and indexing the result by
    ``(entity_id, as_of_date)``. The same fixtures are then loaded into a
    temporary Postgres instance, the entity-date table is created for the
    ``'state_one AND state_two'`` state filter, and the builder's output is
    compared to the expected frames one by one.

    Relies on fixtures and helpers defined outside this function
    (``labels``, ``states``, ``features_tables``, ``db_config``,
    ``create_entity_date_df``, ``create_schemas``, ...).
    """
    dates = [
        datetime.datetime(2016, 1, 1, 0, 0),
        datetime.datetime(2016, 2, 1, 0, 0)
    ]

    # make dataframe for entity ids and dates
    ids_dates = create_entity_date_df(labels=labels,
                                      states=states,
                                      as_of_dates=dates,
                                      state_one=True,
                                      state_two=True,
                                      label_name='booking',
                                      label_type='binary',
                                      label_timespan='1 month')

    features = [['f1', 'f2'], ['f3', 'f4']]
    # make the expected dataframes of features to test against; there is one
    # per fixture feature table, and the i-th table carries the i-th group of
    # feature columns
    features_dfs = []
    for i, table in enumerate(features_tables):
        cols = ['entity_id', 'as_of_date'] + features[i]
        temp_df = pd.DataFrame(table, columns=cols)
        temp_df['as_of_date'] = convert_string_column_to_date(
            temp_df['as_of_date'])
        features_dfs.append(
            ids_dates.merge(right=temp_df,
                            how='left',
                            on=['entity_id', 'as_of_date'
                                ]).set_index(['entity_id', 'as_of_date']))

    # create an engine and generate a table with fake feature data
    with testing.postgresql.Postgresql() as postgresql:
        engine = create_engine(postgresql.url())
        create_schemas(engine=engine,
                       features_tables=features_tables,
                       labels=labels,
                       states=states)

        with get_matrix_storage_engine() as matrix_storage_engine:
            builder = MatrixBuilder(
                db_config=db_config,
                matrix_storage_engine=matrix_storage_engine,
                engine=engine,
            )

            # make the entity-date table
            entity_date_table_name = builder.make_entity_date_table(
                as_of_times=dates,
                label_type='binary',
                label_name='booking',
                state='state_one AND state_two',
                matrix_type='train',
                matrix_uuid='my_uuid',
                label_timespan='1 month')

            # one dictionary entry per feature group: features0, features1, ...
            feature_dictionary = {
                'features{}'.format(i): feature_list
                for i, feature_list in enumerate(features)
            }

            # materialise the result so it can be counted and then iterated
            returned_features_dfs = list(
                builder.load_features_data(
                    as_of_times=dates,
                    feature_dictionary=feature_dictionary,
                    entity_date_table_name=entity_date_table_name,
                    matrix_uuid='my_uuid'))

            # zip() stops at the shorter input, so without this check a builder
            # that returned too few (or no) frames would pass vacuously
            assert len(returned_features_dfs) == len(features_dfs), (
                'expected {} feature dataframes, got {}'.format(
                    len(features_dfs), len(returned_features_dfs)))

            # compare each returned frame with its expected counterpart
            # NOTE(review): elementwise == treats NaN as unequal to NaN, so this
            # only passes while the merged fixtures contain no missing values
            for result, df in zip(returned_features_dfs, features_dfs):
                test = result == df
                assert test.all().all()