def test_load_labels_data():
    """Check that ``load_labels_data`` returns the expected labels.

    Creates the schemas in a ``testing.postgresql`` database, builds the
    entity-date table for two as-of dates, and compares the frame returned
    by ``MatrixBuilder.load_labels_data`` with a hand-written expectation.
    """
    # NOTE(review): another function with this exact name is visible further
    # down. If both live in the same module, the later definition replaces
    # this one and this test is never collected -- confirm which one to keep.

    # set up labeling config variables
    dates = [
        datetime.datetime(2016, 1, 1, 0, 0),
        datetime.datetime(2016, 2, 1, 0, 0),
    ]

    # The unused ``labels_df`` scaffold was removed: its ``set_index`` result
    # was discarded and the frame was never compared against anything.

    # create an engine and generate a table with fake feature data
    with testing.postgresql.Postgresql() as postgresql:
        engine = create_engine(postgresql.url())
        create_schemas(engine, features_tables, labels, states)

        with get_matrix_storage_engine() as matrix_storage_engine:
            builder = MatrixBuilder(
                db_config=db_config,
                matrix_storage_engine=matrix_storage_engine,
                engine=engine,
            )

            # make the entity-date table
            entity_date_table_name = builder.make_entity_date_table(
                as_of_times=dates,
                label_type='binary',
                label_name='booking',
                state='state_one AND state_two',
                matrix_type='train',
                matrix_uuid='my_uuid',
                label_timespan='1 month',
            )

            result = builder.load_labels_data(
                label_name=label_name,
                label_type=label_type,
                label_timespan='1 month',
                matrix_uuid='my_uuid',
                entity_date_table_name=entity_date_table_name,
            )

            # expected labels, indexed the same way as the builder's output
            df = pd.DataFrame.from_dict({
                'entity_id': [2, 3, 4, 4],
                'as_of_date': [dates[1], dates[1], dates[0], dates[1]],
                'booking': [0, 0, 1, 0],
            }).set_index(['entity_id', 'as_of_date'])

            # element-wise comparison; every cell must match
            test = result == df
            assert test.all().all()
def test_load_labels_data_include_missing_labels_as_false():
    """Check ``load_labels_data`` when missing labels are included as False.

    Same setup as the plain ``load_labels_data`` test, except the builder is
    created with ``include_missing_labels_in_train_as=False`` and an extra
    as-of date (2016-06-01) is requested. That date has entity 0 in the
    states table but no label row, so the expected frame gains one row for
    entity 0 with a label of 0.
    """
    # set up labeling config variables
    dates = [
        datetime.datetime(2016, 1, 1, 0, 0),
        datetime.datetime(2016, 2, 1, 0, 0),
        # extra date: entity 0 is present via the states table, but unlabeled
        datetime.datetime(2016, 6, 1, 0, 0),
    ]

    # The unused ``labels_df`` scaffold was removed: its ``set_index`` result
    # was discarded and the frame was never compared against anything.

    # create an engine and generate a table with fake feature data
    with testing.postgresql.Postgresql() as postgresql:
        engine = create_engine(postgresql.url())
        create_schemas(engine, features_tables, labels, states)

        with get_matrix_storage_engine() as matrix_storage_engine:
            builder = MatrixBuilder(
                db_config=db_config,
                matrix_storage_engine=matrix_storage_engine,
                experiment_hash=experiment_hash,
                engine=engine,
                include_missing_labels_in_train_as=False,
            )

            # make the entity-date table
            entity_date_table_name = builder.make_entity_date_table(
                as_of_times=dates,
                label_type="binary",
                label_name="booking",
                state="active",
                matrix_type="train",
                matrix_uuid="my_uuid",
                label_timespan="1 month",
            )

            result = builder.load_labels_data(
                label_name=label_name,
                label_type=label_type,
                label_timespan="1 month",
                matrix_uuid="my_uuid",
                entity_date_table_name=entity_date_table_name,
            )

            # the first row would not be here if we had not configured the
            # Builder to include missing labels as false
            df = pd.DataFrame.from_dict(
                {
                    "entity_id": [0, 2, 3, 4, 4],
                    "as_of_date": [
                        dates[2],
                        dates[1],
                        dates[1],
                        dates[0],
                        dates[1],
                    ],
                    "booking": [0, 0, 0, 1, 0],
                }
            ).set_index(["entity_id", "as_of_date"])

            # element-wise comparison; every cell must match
            test = result == df
            assert test.all().all()
def test_load_labels_data():
    """Check that ``load_labels_data`` returns the expected labels.

    Creates the schemas in a ``testing.postgresql`` database, builds the
    entity-date table for two as-of dates with the ``active`` state, and
    compares the frame returned by ``MatrixBuilder.load_labels_data`` with a
    hand-written expectation.
    """
    # set up labeling config variables
    dates = [
        datetime.datetime(2016, 1, 1, 0, 0),
        datetime.datetime(2016, 2, 1, 0, 0),
    ]

    # The unused ``labels_df`` scaffold was removed: its ``set_index`` result
    # was discarded and the frame was never compared against anything.

    # create an engine and generate a table with fake feature data
    with testing.postgresql.Postgresql() as postgresql:
        engine = create_engine(postgresql.url())
        create_schemas(engine, features_tables, labels, states)

        with get_matrix_storage_engine() as matrix_storage_engine:
            builder = MatrixBuilder(
                db_config=db_config,
                matrix_storage_engine=matrix_storage_engine,
                experiment_hash=experiment_hash,
                engine=engine,
            )

            # make the entity-date table
            entity_date_table_name = builder.make_entity_date_table(
                as_of_times=dates,
                label_type="binary",
                label_name="booking",
                state="active",
                matrix_type="train",
                matrix_uuid="my_uuid",
                label_timespan="1 month",
            )

            result = builder.load_labels_data(
                label_name=label_name,
                label_type=label_type,
                label_timespan="1 month",
                matrix_uuid="my_uuid",
                entity_date_table_name=entity_date_table_name,
            )

            # expected labels, indexed the same way as the builder's output
            df = pd.DataFrame.from_dict(
                {
                    "entity_id": [2, 3, 4, 4],
                    "as_of_date": [dates[1], dates[1], dates[0], dates[1]],
                    "booking": [0, 0, 1, 0],
                }
            ).set_index(["entity_id", "as_of_date"])

            # element-wise comparison; every cell must match
            test = result == df
            assert test.all().all()