def test_make_entity_date_table():
    """Verify that make_entity_date_table builds a table whose rows match
    the expected entity-id / as-of-date combinations."""
    as_of_dates = [
        datetime.datetime(2016, 1, 1, 0, 0),
        datetime.datetime(2016, 2, 1, 0, 0),
        datetime.datetime(2016, 3, 1, 0, 0),
    ]
    # expected entity-id/date combinations to compare the table against
    expected_ids_dates = create_entity_date_df(
        labels=labels,
        states=states,
        as_of_dates=as_of_dates,
        state_one=True,
        state_two=True,
        label_name="booking",
        label_type="binary",
        label_timespan="1 month",
    )
    with testing.postgresql.Postgresql() as postgresql:
        # spin up a throwaway database seeded with fake feature/label data
        engine = create_engine(postgresql.url())
        create_schemas(
            engine=engine,
            features_tables=features_tables,
            labels=labels,
            states=states,
        )
        with get_matrix_storage_engine() as matrix_storage_engine:
            builder = MatrixBuilder(
                db_config=db_config,
                matrix_storage_engine=matrix_storage_engine,
                experiment_hash=experiment_hash,
                engine=engine,
            )
            engine.execute("CREATE TABLE features.tmp_entity_date (a int, b date);")
            # exercise the table-creation path under test
            entity_date_table_name = builder.make_entity_date_table(
                as_of_times=as_of_dates,
                label_type="binary",
                label_name="booking",
                state="state_one AND state_two",
                matrix_uuid="my_uuid",
                matrix_type="train",
                label_timespan="1 month",
            )
            # pull the generated table back out of the database
            result = pd.read_sql(
                "select * from features.{} order by entity_id, as_of_date".format(
                    entity_date_table_name
                ),
                engine,
            )
            # every cell must match the expected dataframe
            comparison = result == expected_ids_dates
            assert comparison.all().all()
def test_replace_false_rerun(self):
    """A builder constructed with replace=False must skip rebuilding a
    matrix that already exists in storage."""
    with testing.postgresql.Postgresql() as postgresql:
        # throwaway database seeded with fake feature/label data
        engine = create_engine(postgresql.url())
        ensure_db(engine)
        create_schemas(
            engine=engine,
            features_tables=features_tables,
            labels=labels,
            states=states,
        )
        as_of_dates = [
            datetime.datetime(2016, 1, 1, 0, 0),
            datetime.datetime(2016, 2, 1, 0, 0),
            datetime.datetime(2016, 3, 1, 0, 0),
        ]
        with get_matrix_storage_engine() as matrix_storage_engine:
            builder = MatrixBuilder(
                db_config=db_config,
                matrix_storage_engine=matrix_storage_engine,
                engine=engine,
                replace=False,
            )
            feature_dictionary = {
                'features0': ['f1', 'f2'],
                'features1': ['f3', 'f4'],
            }
            matrix_metadata = {
                'matrix_id': 'hi',
                'state': 'state_one AND state_two',
                'label_name': 'booking',
                'end_time': datetime.datetime(2016, 3, 1, 0, 0),
                'feature_start_time': datetime.datetime(2016, 1, 1, 0, 0),
                'label_timespan': '1 month',
                'test_duration': '1 month',
                'indices': ['entity_id', 'as_of_date'],
            }
            uuid = metta.generate_uuid(matrix_metadata)
            # both build calls use identical arguments, so share them
            build_kwargs = dict(
                as_of_times=as_of_dates,
                label_name='booking',
                label_type='binary',
                feature_dictionary=feature_dictionary,
                matrix_metadata=matrix_metadata,
                matrix_uuid=uuid,
                matrix_type='test',
            )
            builder.build_matrix(**build_kwargs)
            assert len(matrix_storage_engine.get_store(uuid).matrix) == 5
            # second run: entity-date table creation must be skipped entirely
            builder.make_entity_date_table = Mock()
            builder.build_matrix(**build_kwargs)
            assert not builder.make_entity_date_table.called
def test_load_labels_data():
    """Check that load_labels_data's query produces the expected label rows."""
    as_of_dates = [
        datetime.datetime(2016, 1, 1, 0, 0),
        datetime.datetime(2016, 2, 1, 0, 0),
    ]
    # reference dataframe built from the raw labels fixture
    labels_df = pd.DataFrame(
        labels,
        columns=[
            'entity_id',
            'as_of_date',
            'label_timespan',
            'label_name',
            'label_type',
            'label',
        ],
    )
    labels_df['as_of_date'] = convert_string_column_to_date(labels_df['as_of_date'])
    # NOTE(review): set_index's return value is discarded (not inplace), so
    # this is a no-op; labels_df is unused below, so behavior is unaffected
    labels_df.set_index(['entity_id', 'as_of_date'])
    with testing.postgresql.Postgresql() as postgresql:
        # throwaway database seeded with fake feature/label data
        engine = create_engine(postgresql.url())
        create_schemas(engine, features_tables, labels, states)
        with get_matrix_storage_engine() as matrix_storage_engine:
            builder = MatrixBuilder(
                db_config=db_config,
                matrix_storage_engine=matrix_storage_engine,
                engine=engine,
            )
            # the entity-date table is a prerequisite for loading labels
            entity_date_table_name = builder.make_entity_date_table(
                as_of_times=as_of_dates,
                label_type='binary',
                label_name='booking',
                state='state_one AND state_two',
                matrix_type='train',
                matrix_uuid='my_uuid',
                label_timespan='1 month',
            )
            result = builder.load_labels_data(
                label_name=label_name,
                label_type=label_type,
                label_timespan='1 month',
                matrix_uuid='my_uuid',
                entity_date_table_name=entity_date_table_name,
            )
            expected = pd.DataFrame.from_dict(
                {
                    'entity_id': [2, 3, 4, 4],
                    'as_of_date': [
                        as_of_dates[1],
                        as_of_dates[1],
                        as_of_dates[0],
                        as_of_dates[1],
                    ],
                    'booking': [0, 0, 1, 0],
                }
            ).set_index(['entity_id', 'as_of_date'])
            comparison = result == expected
            assert comparison.all().all()
def test_replace_false_rerun(self):
    """A builder constructed with replace=False must not rebuild a matrix
    that is already present in storage."""
    with testing.postgresql.Postgresql() as postgresql:
        # throwaway database seeded with fake feature/label data
        engine = create_engine(postgresql.url())
        ensure_db(engine)
        create_schemas(
            engine=engine,
            features_tables=features_tables,
            labels=labels,
            states=states,
        )
        as_of_dates = [
            datetime.datetime(2016, 1, 1, 0, 0),
            datetime.datetime(2016, 2, 1, 0, 0),
            datetime.datetime(2016, 3, 1, 0, 0),
        ]
        with get_matrix_storage_engine() as matrix_storage_engine:
            builder = MatrixBuilder(
                db_config=db_config,
                matrix_storage_engine=matrix_storage_engine,
                experiment_hash=experiment_hash,
                engine=engine,
                replace=False,
            )
            feature_dictionary = {
                "features0": ["f1", "f2"],
                "features1": ["f3", "f4"],
            }
            matrix_metadata = {
                "matrix_id": "hi",
                "state": "active",
                "label_name": "booking",
                "end_time": datetime.datetime(2016, 3, 1, 0, 0),
                "feature_start_time": datetime.datetime(2016, 1, 1, 0, 0),
                "label_timespan": "1 month",
                "test_duration": "1 month",
                "indices": ["entity_id", "as_of_date"],
            }
            uuid = filename_friendly_hash(matrix_metadata)
            # both build calls use identical arguments, so share them
            build_kwargs = dict(
                as_of_times=as_of_dates,
                label_name="booking",
                label_type="binary",
                feature_dictionary=feature_dictionary,
                matrix_metadata=matrix_metadata,
                matrix_uuid=uuid,
                matrix_type="test",
            )
            builder.build_matrix(**build_kwargs)
            assert len(matrix_storage_engine.get_store(uuid).design_matrix) == 5
            # second run: entity-date table creation must be skipped entirely
            builder.make_entity_date_table = Mock()
            builder.build_matrix(**build_kwargs)
            assert not builder.make_entity_date_table.called
def test_load_labels_data_include_missing_labels_as_false():
    """
    Check load_labels_data when the builder is configured with
    include_missing_labels_in_train_as=False: an entity-date pair that is
    present in the states table but has no label must appear in the result
    with a label of 0.
    """
    # same as the other load_labels_data test, except we include an extra
    # date, 2016-06-01: entity 0 is active on that date via the states
    # table, but has no label for it
    as_of_dates = [
        datetime.datetime(2016, 1, 1, 0, 0),
        datetime.datetime(2016, 2, 1, 0, 0),
        datetime.datetime(2016, 6, 1, 0, 0),
    ]
    # reference dataframe built from the raw labels fixture
    labels_df = pd.DataFrame(
        labels,
        columns=[
            "entity_id",
            "as_of_date",
            "label_timespan",
            "label_name",
            "label_type",
            "label",
        ],
    )
    labels_df["as_of_date"] = convert_string_column_to_date(labels_df["as_of_date"])
    labels_df.set_index(["entity_id", "as_of_date"])
    with testing.postgresql.Postgresql() as postgresql:
        # throwaway database seeded with fake feature/label data
        engine = create_engine(postgresql.url())
        create_schemas(engine, features_tables, labels, states)
        with get_matrix_storage_engine() as matrix_storage_engine:
            builder = MatrixBuilder(
                db_config=db_config,
                matrix_storage_engine=matrix_storage_engine,
                experiment_hash=experiment_hash,
                engine=engine,
                include_missing_labels_in_train_as=False,
            )
            # the entity-date table is a prerequisite for loading labels
            entity_date_table_name = builder.make_entity_date_table(
                as_of_times=as_of_dates,
                label_type="binary",
                label_name="booking",
                state="active",
                matrix_type="train",
                matrix_uuid="my_uuid",
                label_timespan="1 month",
            )
            result = builder.load_labels_data(
                label_name=label_name,
                label_type=label_type,
                label_timespan="1 month",
                matrix_uuid="my_uuid",
                entity_date_table_name=entity_date_table_name,
            )
            # the first row (entity 0 on 2016-06-01) would not be here if
            # the builder had not been configured to include missing labels
            # as false
            expected = pd.DataFrame.from_dict(
                {
                    "entity_id": [0, 2, 3, 4, 4],
                    "as_of_date": [
                        as_of_dates[2],
                        as_of_dates[1],
                        as_of_dates[1],
                        as_of_dates[0],
                        as_of_dates[1],
                    ],
                    "booking": [0, 0, 0, 1, 0],
                }
            ).set_index(["entity_id", "as_of_date"])
            # BUG FIX: the original ended with a bare `assert` severed from
            # its expression (`test.all().all()` on the next line), which
            # is a SyntaxError; the statement is rejoined here
            test = result == expected
            assert test.all().all()
def test_load_labels_data():
    """Check that load_labels_data's query produces the expected label rows."""
    as_of_dates = [
        datetime.datetime(2016, 1, 1, 0, 0),
        datetime.datetime(2016, 2, 1, 0, 0),
    ]
    # reference dataframe built from the raw labels fixture
    labels_df = pd.DataFrame(
        labels,
        columns=[
            "entity_id",
            "as_of_date",
            "label_timespan",
            "label_name",
            "label_type",
            "label",
        ],
    )
    labels_df["as_of_date"] = convert_string_column_to_date(labels_df["as_of_date"])
    # NOTE(review): set_index's return value is discarded (not inplace), so
    # this is a no-op; labels_df is unused below, so behavior is unaffected
    labels_df.set_index(["entity_id", "as_of_date"])
    with testing.postgresql.Postgresql() as postgresql:
        # throwaway database seeded with fake feature/label data
        engine = create_engine(postgresql.url())
        create_schemas(engine, features_tables, labels, states)
        with get_matrix_storage_engine() as matrix_storage_engine:
            builder = MatrixBuilder(
                db_config=db_config,
                matrix_storage_engine=matrix_storage_engine,
                experiment_hash=experiment_hash,
                engine=engine,
            )
            # the entity-date table is a prerequisite for loading labels
            entity_date_table_name = builder.make_entity_date_table(
                as_of_times=as_of_dates,
                label_type="binary",
                label_name="booking",
                state="active",
                matrix_type="train",
                matrix_uuid="my_uuid",
                label_timespan="1 month",
            )
            result = builder.load_labels_data(
                label_name=label_name,
                label_type=label_type,
                label_timespan="1 month",
                matrix_uuid="my_uuid",
                entity_date_table_name=entity_date_table_name,
            )
            expected = pd.DataFrame.from_dict(
                {
                    "entity_id": [2, 3, 4, 4],
                    "as_of_date": [
                        as_of_dates[1],
                        as_of_dates[1],
                        as_of_dates[0],
                        as_of_dates[1],
                    ],
                    "booking": [0, 0, 1, 0],
                }
            ).set_index(["entity_id", "as_of_date"])
            comparison = result == expected
            assert comparison.all().all()
def test_load_features_data():
    """Check that load_features_data returns one dataframe per feature
    table, each matching a left-join of the entity-date frame onto that
    table's fake feature rows."""
    as_of_dates = [
        datetime.datetime(2016, 1, 1, 0, 0),
        datetime.datetime(2016, 2, 1, 0, 0),
    ]
    # expected entity-id/date combinations
    ids_dates = create_entity_date_df(
        labels=labels,
        states=states,
        as_of_dates=as_of_dates,
        label_name="booking",
        label_type="binary",
        label_timespan="1 month",
    )
    features = [["f1", "f2"], ["f3", "f4"]]
    # build one expected dataframe per fake feature table
    expected_dfs = []
    for feature_names, table in zip(features, features_tables):
        raw = pd.DataFrame(table, columns=["entity_id", "as_of_date"] + feature_names)
        raw["as_of_date"] = convert_string_column_to_date(raw["as_of_date"])
        merged = ids_dates.merge(right=raw, how="left", on=["entity_id", "as_of_date"])
        expected_dfs.append(merged.set_index(["entity_id", "as_of_date"]))
    with testing.postgresql.Postgresql() as postgresql:
        # throwaway database seeded with fake feature/label data
        engine = create_engine(postgresql.url())
        create_schemas(
            engine=engine,
            features_tables=features_tables,
            labels=labels,
            states=states,
        )
        with get_matrix_storage_engine() as matrix_storage_engine:
            builder = MatrixBuilder(
                db_config=db_config,
                matrix_storage_engine=matrix_storage_engine,
                experiment_hash=experiment_hash,
                engine=engine,
            )
            # the entity-date table is a prerequisite for loading features
            entity_date_table_name = builder.make_entity_date_table(
                as_of_times=as_of_dates,
                label_type="binary",
                label_name="booking",
                state="active",
                matrix_type="train",
                matrix_uuid="my_uuid",
                label_timespan="1 month",
            )
            feature_dictionary = {
                "features{}".format(i): feature_list
                for i, feature_list in enumerate(features)
            }
            returned_features_dfs = builder.load_features_data(
                as_of_times=as_of_dates,
                feature_dictionary=feature_dictionary,
                entity_date_table_name=entity_date_table_name,
                matrix_uuid="my_uuid",
            )
            # each returned dataframe must equal its expected counterpart
            for result, expected in zip(returned_features_dfs, expected_dfs):
                assert (result == expected).all().all()
def test_make_entity_date_table_include_missing_labels():
    """
    Verify that make_entity_date_table includes entity-date pairs present
    in the states table but absent from the labels table when the builder
    is configured with include_missing_labels_in_train_as=False.
    """
    # same as the other make_entity_date_table test except there is an
    # extra date, 2016-06-01: entity 0 is active on that date via the
    # states table, but has no label
    dates = [
        datetime.datetime(2016, 1, 1, 0, 0),
        datetime.datetime(2016, 2, 1, 0, 0),
        datetime.datetime(2016, 3, 1, 0, 0),
        datetime.datetime(2016, 6, 1, 0, 0),
    ]
    # expected entity-id/date combinations
    ids_dates = create_entity_date_df(
        labels=labels,
        states=states,
        as_of_dates=dates,
        label_name="booking",
        label_type="binary",
        label_timespan="1 month",
    )
    # add the labelless entity-date combo as an expected row.
    # FIX: pd.concat replaces DataFrame.append, which was deprecated in
    # pandas 1.4 and removed in pandas 2.0
    ids_dates = pd.concat(
        [
            ids_dates,
            pd.DataFrame([{"entity_id": 0, "as_of_date": datetime.date(2016, 6, 1)}]),
        ],
        ignore_index=True,
    )
    with testing.postgresql.Postgresql() as postgresql:
        # create an engine and generate a table with fake feature data
        engine = create_engine(postgresql.url())
        create_schemas(
            engine=engine,
            features_tables=features_tables,
            labels=labels,
            states=states,
        )
        with get_matrix_storage_engine() as matrix_storage_engine:
            builder = MatrixBuilder(
                db_config=db_config,
                matrix_storage_engine=matrix_storage_engine,
                experiment_hash=experiment_hash,
                include_missing_labels_in_train_as=False,
                engine=engine,
            )
            engine.execute("CREATE TABLE features.tmp_entity_date (a int, b date);")
            # call the function to test the creation of the table
            entity_date_table_name = builder.make_entity_date_table(
                as_of_times=dates,
                label_type="binary",
                label_name="booking",
                state="active",
                matrix_uuid="my_uuid",
                matrix_type="train",
                label_timespan="1 month",
            )
            # read the generated table back out of the database
            result = pd.read_sql(
                "select * from features.{} order by entity_id, as_of_date".format(
                    entity_date_table_name
                ),
                engine,
            )
            # compare row sets irrespective of ordering
            assert sorted(result.values.tolist()) == sorted(ids_dates.values.tolist())
def test_load_features_data():
    """Check that load_features_data returns one dataframe per feature
    table, each matching a left-join of the entity-date frame onto that
    table's fake feature rows."""
    as_of_dates = [
        datetime.datetime(2016, 1, 1, 0, 0),
        datetime.datetime(2016, 2, 1, 0, 0),
    ]
    # expected entity-id/date combinations
    ids_dates = create_entity_date_df(
        labels=labels,
        states=states,
        as_of_dates=as_of_dates,
        state_one=True,
        state_two=True,
        label_name='booking',
        label_type='binary',
        label_timespan='1 month',
    )
    features = [['f1', 'f2'], ['f3', 'f4']]
    # build one expected dataframe per fake feature table
    expected_dfs = []
    for feature_names, table in zip(features, features_tables):
        raw = pd.DataFrame(table, columns=['entity_id', 'as_of_date'] + feature_names)
        raw['as_of_date'] = convert_string_column_to_date(raw['as_of_date'])
        merged = ids_dates.merge(right=raw, how='left', on=['entity_id', 'as_of_date'])
        expected_dfs.append(merged.set_index(['entity_id', 'as_of_date']))
    with testing.postgresql.Postgresql() as postgresql:
        # throwaway database seeded with fake feature/label data
        engine = create_engine(postgresql.url())
        create_schemas(
            engine=engine,
            features_tables=features_tables,
            labels=labels,
            states=states,
        )
        with get_matrix_storage_engine() as matrix_storage_engine:
            builder = MatrixBuilder(
                db_config=db_config,
                matrix_storage_engine=matrix_storage_engine,
                engine=engine,
            )
            # the entity-date table is a prerequisite for loading features
            entity_date_table_name = builder.make_entity_date_table(
                as_of_times=as_of_dates,
                label_type='binary',
                label_name='booking',
                state='state_one AND state_two',
                matrix_type='train',
                matrix_uuid='my_uuid',
                label_timespan='1 month',
            )
            feature_dictionary = {
                'features{}'.format(i): feature_list
                for i, feature_list in enumerate(features)
            }
            returned_features_dfs = builder.load_features_data(
                as_of_times=as_of_dates,
                feature_dictionary=feature_dictionary,
                entity_date_table_name=entity_date_table_name,
                matrix_uuid='my_uuid',
            )
            # each returned dataframe must equal its expected counterpart
            for result, expected in zip(returned_features_dfs, expected_dfs):
                assert (result == expected).all().all()