def test_training_window(entityset):
    property_feature = Count(entityset['log']['id'], entityset['customers'])
    top_level_agg = Count(entityset['customers']['id'], entityset[u'régions'])

    # make sure features that have a direct dependency on a higher level agg
    # so we have multiple "filter eids" in get_pandas_data_slice,
    # and we go through the loop to pull data with a training_window param more than once
    dagg = DirectFeature(top_level_agg, entityset['customers'])

    # for now, warns if last_time_index not present
    times = [datetime(2011, 4, 9, 12, 31),
             datetime(2011, 4, 10, 11),
             datetime(2011, 4, 10, 13, 10, 1)]
    cutoff_time = pd.DataFrame({'time': times, 'instance_id': [0, 1, 2]})
    feature_matrix = calculate_feature_matrix([property_feature, dagg],
                                              entityset,
                                              cutoff_time=cutoff_time,
                                              training_window='2 hours')

    entityset.add_last_time_indexes()

    with pytest.raises(AssertionError):
        feature_matrix = calculate_feature_matrix([property_feature],
                                                  entityset,
                                                  cutoff_time=cutoff_time,
                                                  training_window=Timedelta(2, 'observations', entity='log'))

    feature_matrix = calculate_feature_matrix([property_feature, dagg],
                                              entityset,
                                              cutoff_time=cutoff_time,
                                              training_window='2 hours')

    prop_values = [5, 5, 1]
    dagg_values = [3, 2, 1]
    assert (feature_matrix[property_feature.get_name()] == prop_values).values.all()
    assert (feature_matrix[dagg.get_name()] == dagg_values).values.all()
def test_saveprogress(entityset):
    times = list([datetime(2011, 4, 9, 10, 30, i * 6) for i in range(5)] +
                 [datetime(2011, 4, 9, 10, 31, i * 9) for i in range(4)] +
                 [datetime(2011, 4, 9, 10, 40, 0)] +
                 [datetime(2011, 4, 10, 10, 40, i) for i in range(2)] +
                 [datetime(2011, 4, 10, 10, 41, i * 3) for i in range(3)] +
                 [datetime(2011, 4, 10, 11, 10, i * 3) for i in range(2)])
    cutoff_time = pd.DataFrame({'time': times, 'instance_id': range(17)})
    property_feature = IdentityFeature(entityset['log']['value']) > 10
    save_progress = tempfile.mkdtemp()
    fm_save = calculate_feature_matrix([property_feature],
                                       entityset,
                                       cutoff_time=cutoff_time,
                                       save_progress=save_progress)
    _, _, files = next(os.walk(save_progress))
    files = [os.path.join(save_progress, file) for file in files]
    # there are 17 datetime files created above
    assert len(files) == 17
    list_df = []
    for file_ in files:
        df = pd.read_csv(file_, index_col="id", header=0)
        list_df.append(df)
    merged_df = pd.concat(list_df)
    merged_df.set_index(pd.DatetimeIndex(times), inplace=True, append=True)
    fm_no_save = calculate_feature_matrix([property_feature],
                                          entityset,
                                          cutoff_time=cutoff_time)
    assert np.all((merged_df.sort_index().values) == (fm_save.sort_index().values))
    assert np.all((fm_no_save.sort_index().values) == (fm_save.sort_index().values))
    assert np.all((fm_no_save.sort_index().values) == (merged_df.sort_index().values))
    shutil.rmtree(save_progress)
def test_cutoff_time_extra_columns(entityset):
    es = entityset
    agg_feat = Count(es['customers']['id'], es[u'régions'])
    dfeat = DirectFeature(agg_feat, es['customers'])

    cutoff_df = pd.DataFrame({'time': [pd.Timestamp('2011-04-09 10:30:06'),
                                       pd.Timestamp('2011-04-09 10:30:03'),
                                       pd.Timestamp('2011-04-08 10:30:00')],
                              'instance_id': [0, 1, 0],
                              'label': [True, True, False]},
                             columns=['time', 'instance_id', 'label'])
    fm = calculate_feature_matrix([dfeat], entityset, cutoff_time=cutoff_df)
    # check column was added to end of matrix
    assert 'label' == fm.columns[-1]
    # check column was sorted by time like the rest of the feature matrix
    true_series = pd.Series([False, True, True], index=[0, 1, 0])
    assert (fm['label'] == true_series).all()

    fm_2 = calculate_feature_matrix([dfeat],
                                    entityset,
                                    cutoff_time=cutoff_df,
                                    approximate="2 days")
    # check column was added to end of matrix
    assert 'label' in fm_2.columns
    # check column was sorted by time like the rest of the feature matrix
    true_series = pd.Series([False, True, True], index=[0, 1, 0])
    assert (fm_2['label'] == true_series).all()
def test_string_time_values_in_cutoff_time(entityset):
    times = ['2011-04-09 10:31:27', '2011-04-09 10:30:18']
    cutoff_time = pd.DataFrame({'time': times, 'instance_id': [0, 0]})
    agg_feature = Sum(entityset['log']['value'], entityset['customers'])

    with pytest.raises(TypeError):
        calculate_feature_matrix([agg_feature], entityset, cutoff_time=cutoff_time)
def test_cfm_no_cutoff_time_index(entityset): es = entityset agg_feat = Count(es['log']['id'], es['sessions']) agg_feat4 = Sum(agg_feat, es['customers']) dfeat = DirectFeature(agg_feat4, es['sessions']) cutoff_time = pd.DataFrame({ 'time': [datetime(2013, 4, 9, 10, 31, 19), datetime(2013, 4, 9, 11, 0, 0)], 'instance_id': [0, 2] }) feature_matrix = calculate_feature_matrix([dfeat, agg_feat], entityset, cutoff_time_in_index=False, approximate=Timedelta(12, 's'), cutoff_time=cutoff_time) assert feature_matrix.index.name == 'id' assert feature_matrix.index.values.tolist() == [0, 2] assert feature_matrix[dfeat.get_name()].tolist() == [10, 10] assert feature_matrix[agg_feat.get_name()].tolist() == [5, 1] cutoff_time = pd.DataFrame({ 'time': [datetime(2011, 4, 9, 10, 31, 19), datetime(2011, 4, 9, 11, 0, 0)], 'instance_id': [0, 2] }) feature_matrix_2 = calculate_feature_matrix([dfeat, agg_feat], entityset, cutoff_time_in_index=False, approximate=Timedelta(10, 's'), cutoff_time=cutoff_time) assert feature_matrix_2.index.name == 'id' assert feature_matrix_2.index.tolist() == [0, 2] assert feature_matrix_2[dfeat.get_name()].tolist() == [7, 10] assert feature_matrix_2[agg_feat.get_name()].tolist() == [5, 1]
def test_dask_persisted_entityset(entityset, capsys): times = list([datetime(2011, 4, 9, 10, 30, i * 6) for i in range(5)] + [datetime(2011, 4, 9, 10, 31, i * 9) for i in range(4)] + [datetime(2011, 4, 9, 10, 40, 0)] + [datetime(2011, 4, 10, 10, 40, i) for i in range(2)] + [datetime(2011, 4, 10, 10, 41, i * 3) for i in range(3)] + [datetime(2011, 4, 10, 11, 10, i * 3) for i in range(2)]) labels = [False] * 3 + [True] * 2 + [False] * 9 + [True] + [False] * 2 cutoff_time = pd.DataFrame({'time': times, 'instance_id': range(17)}) property_feature = IdentityFeature(entityset['log']['value']) > 10 with cluster() as (scheduler, [a, b]): dkwargs = {'cluster': scheduler['address']} feature_matrix = calculate_feature_matrix([property_feature], entityset=entityset, cutoff_time=cutoff_time, verbose=True, chunk_size=.13, dask_kwargs=dkwargs, approximate='1 hour') assert (feature_matrix == labels).values.all() feature_matrix = calculate_feature_matrix([property_feature], entityset=entityset, cutoff_time=cutoff_time, verbose=True, chunk_size=.13, dask_kwargs=dkwargs, approximate='1 hour') captured = capsys.readouterr() assert "Using EntitySet persisted on the cluster as dataset " in captured[0] assert (feature_matrix == labels).values.all()
def test_empty_child_dataframe(): parent_df = pd.DataFrame({"id": [1]}) child_df = pd.DataFrame({"id": [1, 2, 3], "parent_id": [1, 1, 1], "time_index": pd.date_range(start='1/1/2018', periods=3), "value": [10, 5, 2]}) es = ft.EntitySet(id="blah") es.entity_from_dataframe(entity_id="parent", dataframe=parent_df, index="id") es.entity_from_dataframe(entity_id="child", dataframe=child_df, index="id", time_index="time_index") es.add_relationship(ft.Relationship(es["parent"]["id"], es["child"]["parent_id"])) # create regular agg count = Count(es["child"]['id'], es["parent"]) # create agg feature that requires multiple arguments trend = Trend([es["child"]['value'], es["child"]['time_index']], es["parent"]) # create aggs with where where = ft.Feature(es["child"]["value"]) == 1 count_where = Count(es["child"]['id'], es["parent"], where=where) trend_where = Trend([es["child"]['value'], es["child"]['time_index']], es["parent"], where=where) # cutoff time before all rows fm = ft.calculate_feature_matrix(entityset=es, features=[count, count_where, trend, trend_where], cutoff_time=pd.Timestamp("12/31/2017")) names = [count.get_name(), count_where.get_name(), trend.get_name(), trend_where.get_name()] assert_array_equal(fm[names], [[0, 0, np.nan, np.nan]]) # cutoff time after all rows, but where clause filters all rows fm2 = ft.calculate_feature_matrix(entityset=es, features=[count_where, trend_where], cutoff_time=pd.Timestamp("1/4/2018")) names = [count_where.get_name(), trend_where.get_name()] assert_array_equal(fm2[names], [[0, np.nan]])
def test_approximate_time_split_returns_the_same_result(entityset):
    es = entityset
    agg_feat = Count(es['log']['id'], es['sessions'])
    agg_feat2 = Sum(agg_feat, es['customers'])
    dfeat = DirectFeature(agg_feat2, es['sessions'])

    cutoff_df = pd.DataFrame({'time': [pd.Timestamp('2011-04-09 10:07:30'),
                                       pd.Timestamp('2011-04-09 10:07:40')],
                              'instance_id': [0, 0]})

    feature_matrix_at_once = calculate_feature_matrix([dfeat, agg_feat],
                                                      entityset,
                                                      approximate=Timedelta(10, 's'),
                                                      cutoff_time=cutoff_df)
    divided_matrices = []
    separate_cutoff = [cutoff_df.iloc[0:1], cutoff_df.iloc[1:]]
    # Make sure indexes are different
    # Note that this step is unnecessary and is done to showcase the issue here
    separate_cutoff[0].index = [0]
    separate_cutoff[1].index = [1]
    for ct in separate_cutoff:
        fm = calculate_feature_matrix([dfeat, agg_feat],
                                      entityset,
                                      approximate=Timedelta(10, 's'),
                                      cutoff_time=ct)
        divided_matrices.append(fm)
    feature_matrix_from_split = pd.concat(divided_matrices)
    assert feature_matrix_from_split.shape == feature_matrix_at_once.shape
    for i1, i2 in zip(feature_matrix_at_once.index, feature_matrix_from_split.index):
        assert (pd.isnull(i1) and pd.isnull(i2)) or (i1 == i2)
    for c in feature_matrix_from_split:
        for i1, i2 in zip(feature_matrix_at_once[c], feature_matrix_from_split[c]):
            assert (pd.isnull(i1) and pd.isnull(i2)) or (i1 == i2)
def test_integer_time_index_datetime_cutoffs(int_es):
    times = [datetime.now()] * 17
    cutoff_df = pd.DataFrame({'time': times, 'instance_id': range(17)})
    property_feature = IdentityFeature(int_es['log']['value']) > 10

    with pytest.raises(TypeError):
        calculate_feature_matrix([property_feature],
                                 cutoff_time=cutoff_df,
                                 cutoff_time_in_index=True)
def test_cfm_returns_original_time_indexes(entityset): es = entityset agg_feat = Count(es['customers']['id'], es[u'régions']) dfeat = DirectFeature(agg_feat, es['customers']) agg_feat_2 = Count(es['sessions']['id'], es['customers']) cutoff_df = pd.DataFrame({'time': [pd.Timestamp('2011-04-09 10:30:06'), pd.Timestamp('2011-04-09 10:30:03'), pd.Timestamp('2011-04-08 10:30:00')], 'instance_id': [0, 1, 0]}) sorted_df = cutoff_df.sort_values(['time', 'instance_id'], kind='mergesort') # no approximate fm = calculate_feature_matrix([dfeat], entityset, cutoff_time=cutoff_df, cutoff_time_in_index=True) instance_level_vals = fm.index.get_level_values(0).values time_level_vals = fm.index.get_level_values(1).values assert (instance_level_vals == sorted_df['instance_id'].values).all() assert (time_level_vals == sorted_df['time'].values).all() # approximate, in different windows, no unapproximated aggs fm2 = calculate_feature_matrix([dfeat], entityset, cutoff_time=cutoff_df, cutoff_time_in_index=True, approximate="1 m") instance_level_vals = fm2.index.get_level_values(0).values time_level_vals = fm2.index.get_level_values(1).values assert (instance_level_vals == sorted_df['instance_id'].values).all() assert (time_level_vals == sorted_df['time'].values).all() # approximate, in different windows, unapproximated aggs fm2 = calculate_feature_matrix([dfeat, agg_feat_2], entityset, cutoff_time=cutoff_df, cutoff_time_in_index=True, approximate="1 m") instance_level_vals = fm2.index.get_level_values(0).values time_level_vals = fm2.index.get_level_values(1).values assert (instance_level_vals == sorted_df['instance_id'].values).all() assert (time_level_vals == sorted_df['time'].values).all() # approximate, in same window, no unapproximated aggs fm3 = calculate_feature_matrix([dfeat], entityset, cutoff_time=cutoff_df, cutoff_time_in_index=True, approximate="2 d") instance_level_vals = fm3.index.get_level_values(0).values time_level_vals = fm3.index.get_level_values(1).values assert (instance_level_vals == sorted_df['instance_id'].values).all() assert (time_level_vals == sorted_df['time'].values).all() # approximate, in same window, unapproximated aggs fm3 = calculate_feature_matrix([dfeat, agg_feat_2], entityset, cutoff_time=cutoff_df, cutoff_time_in_index=True, approximate="2 d") instance_level_vals = fm3.index.get_level_values(0).values time_level_vals = fm3.index.get_level_values(1).values assert (instance_level_vals == sorted_df['instance_id'].values).all() assert (time_level_vals == sorted_df['time'].values).all()
def test_datetime_index_mixed_cutoff(entityset): times = list([datetime(2011, 4, 9, 10, 30, i * 6) for i in range(5)] + [datetime(2011, 4, 9, 10, 31, i * 9) for i in range(4)] + [17] + [datetime(2011, 4, 10, 10, 40, i) for i in range(2)] + [datetime(2011, 4, 10, 10, 41, i * 3) for i in range(3)] + [datetime(2011, 4, 10, 11, 10, i * 3) for i in range(2)]) labels = [False] * 3 + [True] * 2 + [False] * 9 + [False] * 2 + [True] instances = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 16, 15, 14] cutoff_df = pd.DataFrame({'time': times, 'instance_id': instances, 'labels': labels}) cutoff_df = cutoff_df[['time', 'instance_id', 'labels']] property_feature = IdentityFeature(entityset['log']['value']) > 10 with pytest.raises(TypeError): calculate_feature_matrix([property_feature], cutoff_time=cutoff_df) times[9] = "foobar" cutoff_df['time'] = times with pytest.raises(ValueError): calculate_feature_matrix([property_feature], cutoff_time=cutoff_df) cutoff_df['time'].iloc[9] = '2018-04-02 18:50:45.453216' with pytest.raises(TypeError): calculate_feature_matrix([property_feature], cutoff_time=cutoff_df) times[9] = '17' cutoff_df['time'] = times with pytest.raises(ValueError): calculate_feature_matrix([property_feature], cutoff_time=cutoff_df)
def test_integer_time_index_mixed_cutoff(int_es):
    times_dt = list(range(8, 17)) + [datetime(2011, 1, 1), 19, 20, 21, 22, 25, 24, 23]
    labels = [False] * 3 + [True] * 2 + [False] * 9 + [False] * 2 + [True]
    instances = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 16, 15, 14]
    cutoff_df = pd.DataFrame({'time': times_dt,
                              'instance_id': instances,
                              'labels': labels})
    cutoff_df = cutoff_df[['time', 'instance_id', 'labels']]
    property_feature = IdentityFeature(int_es['log']['value']) > 10

    with pytest.raises(TypeError):
        calculate_feature_matrix([property_feature], cutoff_time=cutoff_df)

    times_str = list(range(8, 17)) + ["foobar", 19, 20, 21, 22, 25, 24, 23]
    cutoff_df['time'] = times_str
    with pytest.raises(TypeError):
        calculate_feature_matrix([property_feature], cutoff_time=cutoff_df)

    times_date_str = list(range(8, 17)) + ['2018-04-02', 19, 20, 21, 22, 25, 24, 23]
    cutoff_df['time'] = times_date_str
    with pytest.raises(TypeError):
        calculate_feature_matrix([property_feature], cutoff_time=cutoff_df)

    times_int_str = list(range(8, 17)) + ['17', 19, 20, 21, 22, 25, 24, 23]
    cutoff_df['time'] = times_int_str
    with pytest.raises(TypeError):
        calculate_feature_matrix([property_feature], cutoff_time=cutoff_df)
def test_uses_full_entity_feat_of_approximate(entityset): es = entityset agg_feat = Sum(es['log']['value'], es['sessions']) agg_feat2 = Sum(agg_feat, es['customers']) agg_feat3 = Min(agg_feat, es['customers']) dfeat = DirectFeature(agg_feat2, es['sessions']) dfeat2 = DirectFeature(agg_feat3, es['sessions']) p = Percentile(dfeat) # only dfeat2 should be approximated # because Percentile needs all values feature_matrix_only_dfeat2 = calculate_feature_matrix( [dfeat2], instance_ids=[0, 2], approximate=Timedelta(10, 's'), cutoff_time_in_index=True, cutoff_time=[datetime(2011, 4, 9, 10, 31, 19), datetime(2011, 4, 9, 11, 0, 0)]) assert feature_matrix_only_dfeat2[dfeat2.get_name()].tolist() == [1, 0] feature_matrix_approx = calculate_feature_matrix( [p, dfeat, dfeat2, agg_feat], instance_ids=[0, 2], approximate=Timedelta(10, 's'), cutoff_time_in_index=True, cutoff_time=[datetime(2011, 4, 9, 10, 31, 19), datetime(2011, 4, 9, 11, 0, 0)]) assert feature_matrix_only_dfeat2[dfeat2.get_name()].tolist() == feature_matrix_approx[dfeat2.get_name()].tolist() feature_matrix_small_approx = calculate_feature_matrix( [p, dfeat, dfeat2, agg_feat], instance_ids=[0, 2], approximate=Timedelta(10, 'ms'), cutoff_time_in_index=True, cutoff_time=[datetime(2011, 4, 9, 10, 31, 19), datetime(2011, 4, 9, 11, 0, 0)]) feature_matrix_no_approx = calculate_feature_matrix( [p, dfeat, dfeat2, agg_feat], instance_ids=[0, 2], cutoff_time_in_index=True, cutoff_time=[datetime(2011, 4, 9, 10, 31, 19), datetime(2011, 4, 9, 11, 0, 0)]) for f in [p, dfeat, agg_feat]: for fm1, fm2 in combinations([feature_matrix_approx, feature_matrix_small_approx, feature_matrix_no_approx], 2): assert fm1[f.get_name()].tolist() == fm2[f.get_name()].tolist()
def test_approximate_dfeat_of_need_all_values(entityset): es = entityset p = Percentile(es['log']['value']) agg_feat = Sum(p, es['sessions']) agg_feat2 = Sum(agg_feat, es['customers']) dfeat = DirectFeature(agg_feat2, es['sessions']) times = [datetime(2011, 4, 9, 10, 31, 19), datetime(2011, 4, 9, 11, 0, 0)] cutoff_time = pd.DataFrame({'time': times, 'instance_id': [0, 2]}) feature_matrix = calculate_feature_matrix([dfeat, agg_feat], entityset, approximate=Timedelta(10, 's'), cutoff_time_in_index=True, cutoff_time=cutoff_time) log_df = es['log'].df instances = [0, 2] cutoffs = [pd.Timestamp('2011-04-09 10:31:19'), pd.Timestamp('2011-04-09 11:00:00')] approxes = [pd.Timestamp('2011-04-09 10:31:10'), pd.Timestamp('2011-04-09 11:00:00')] true_vals = [] true_vals_approx = [] for instance, cutoff, approx in zip(instances, cutoffs, approxes): log_data_cutoff = log_df[log_df['datetime'] < cutoff] log_data_cutoff['percentile'] = log_data_cutoff['value'].rank(pct=True) true_agg = log_data_cutoff.loc[log_data_cutoff['session_id'] == instance, 'percentile'].fillna(0).sum() true_vals.append(round(true_agg, 3)) log_data_approx = log_df[log_df['datetime'] < approx] log_data_approx['percentile'] = log_data_approx['value'].rank(pct=True) true_agg_approx = log_data_approx.loc[log_data_approx['session_id'].isin([0, 1, 2]), 'percentile'].fillna(0).sum() true_vals_approx.append(round(true_agg_approx, 3)) lapprox = [round(x, 3) for x in feature_matrix[dfeat.get_name()].tolist()] test_list = [round(x, 3) for x in feature_matrix[agg_feat.get_name()].tolist()] assert lapprox == true_vals_approx assert test_list == true_vals
def test_custom_primitive_time_as_arg(es): def time_since_last(values, time): time_since = time - values.iloc[0] return time_since.total_seconds() TimeSinceLast = make_agg_primitive(time_since_last, [DatetimeTimeIndex], Numeric, uses_calc_time=True) assert TimeSinceLast.name == "time_since_last" f = TimeSinceLast(es["log"]["datetime"], es["customers"]) fm = ft.calculate_feature_matrix([f], entityset=es, instance_ids=[0, 1, 2], cutoff_time=datetime(2015, 6, 8)) correct = [131376600, 131289600, 131287800] # note: must round to nearest second assert all(fm[f.get_name()].round().values == correct) error_text = "'time' is a restricted keyword. Please use a different keyword." with pytest.raises(ValueError, match=error_text): make_agg_primitive(time_since_last, [DatetimeTimeIndex], Numeric, uses_calc_time=False)
def test_cutoff_time_correctly(entityset):
    property_feature = Count(entityset['log']['id'], entityset['customers'])
    feature_matrix = calculate_feature_matrix([property_feature],
                                              instance_ids=[0, 1, 2],
                                              cutoff_time=[datetime(2011, 4, 10),
                                                           datetime(2011, 4, 11),
                                                           datetime(2011, 4, 7)])
    labels = [0, 10, 5]
    assert (feature_matrix == labels).values.all()
def test_time_since_last_custom(es): def time_since_last(values, time=None): time_since = time - values.iloc[0] return time_since.total_seconds() TimeSinceLast = make_agg_primitive(time_since_last, [DatetimeTimeIndex], Numeric, name="time_since_last", uses_calc_time=True) f = TimeSinceLast(es["log"]["datetime"], es["customers"]) fm = calculate_feature_matrix([f], entityset=es, instance_ids=[0, 1, 2], cutoff_time=datetime(2015, 6, 8)) correct = [131376600, 131289600, 131287800] # note: must round to nearest second assert all(fm[f.get_name()].round().values == correct) with pytest.raises(ValueError): TimeSinceLast = make_agg_primitive(time_since_last, [DatetimeTimeIndex], Numeric, uses_calc_time=False)
def test_encode_features_handles_pass_columns(entityset): f1 = IdentityFeature(entityset["log"]["product_id"]) f2 = IdentityFeature(entityset["log"]["value"]) features = [f1, f2] cutoff_time = pd.DataFrame({'instance_id': range(6), 'time': entityset['log'].df['datetime'][0:6], 'label': [i % 2 for i in range(6)]}, columns=["instance_id", "time", "label"]) feature_matrix = calculate_feature_matrix(features, entityset, cutoff_time) assert 'label' in feature_matrix.columns feature_matrix_encoded, features_encoded = encode_features(feature_matrix, features) feature_matrix_encoded_shape = feature_matrix_encoded.shape # to_encode should keep product_id as a string, and not create 3 additional columns to_encode = [] feature_matrix_encoded, features_encoded = encode_features(feature_matrix, features, to_encode=to_encode) assert feature_matrix_encoded_shape != feature_matrix_encoded.shape to_encode = ['value'] feature_matrix_encoded, features_encoded = encode_features(feature_matrix, features, to_encode=to_encode) assert feature_matrix_encoded_shape != feature_matrix_encoded.shape assert 'label' in feature_matrix_encoded.columns
def test_median(es):
    f = Median(es["log"]["value_many_nans"], es["customers"])
    fm = calculate_feature_matrix([f],
                                  instance_ids=[0, 1, 2],
                                  cutoff_time=datetime(2015, 6, 8))
    correct = [1, 3, np.nan]
    np.testing.assert_equal(fm[f.get_name()].values, correct)
def test_approximate_child_aggs_handled_correctly(entityset):
    es = entityset
    agg_feat = Count(es['customers']['id'], es['regions'])
    dfeat = DirectFeature(agg_feat, es['customers'])
    agg_feat_2 = Count(es['log']['value'], es['customers'])
    cutoff_df = pd.DataFrame({'time': [pd.Timestamp('2011-04-08 10:30:00'),
                                       pd.Timestamp('2011-04-09 10:30:06')],
                              'instance_id': [0, 0]})

    fm = calculate_feature_matrix([dfeat],
                                  approximate=Timedelta(10, 's'),
                                  cutoff_time=cutoff_df)
    fm_2 = calculate_feature_matrix([dfeat, agg_feat_2],
                                    approximate=Timedelta(10, 's'),
                                    cutoff_time=cutoff_df)
    assert fm[dfeat.get_name()].tolist() == [2, 3]
    assert fm_2[agg_feat_2.get_name()].tolist() == [0, 2]
def test_cfm_approximate_correct_ordering():
    trips = {
        'trip_id': [i for i in range(1000)],
        'flight_time': [datetime(1998, 4, 2) for i in range(350)] + [datetime(1997, 4, 3) for i in range(650)],
        'flight_id': [randint(1, 25) for i in range(1000)],
        'trip_duration': [randint(1, 999) for i in range(1000)]
    }
    df = pd.DataFrame.from_dict(trips)
    es = EntitySet('flights')
    es.entity_from_dataframe("trips",
                             dataframe=df,
                             index="trip_id",
                             time_index='flight_time')
    es.normalize_entity(base_entity_id="trips",
                        new_entity_id="flights",
                        index="flight_id",
                        make_time_index=True)
    features = dfs(entityset=es, target_entity='trips', features_only=True)
    flight_features = [feature for feature in features
                       if isinstance(feature, DirectFeature) and
                       isinstance(feature.base_features[0], AggregationPrimitive)]
    property_feature = IdentityFeature(es['trips']['trip_id'])
    # direct_agg_feat = DirectFeature(Sum(es['trips']['trip_duration'],
    #                                     es['flights']),
    #                                 es['trips'])
    cutoff_time = pd.DataFrame.from_dict({'instance_id': df['trip_id'],
                                          'time': df['flight_time']})
    time_feature = IdentityFeature(es['trips']['flight_time'])
    feature_matrix = calculate_feature_matrix(flight_features + [property_feature, time_feature],
                                              cutoff_time_in_index=True,
                                              cutoff_time=cutoff_time)
    feature_matrix.index.names = ['instance', 'time']
    assert(np.all(feature_matrix.reset_index('time').reset_index()[['instance', 'time']].values ==
                  feature_matrix[['trip_id', 'flight_time']].values))
    feature_matrix_2 = calculate_feature_matrix(flight_features + [property_feature, time_feature],
                                                cutoff_time=cutoff_time,
                                                cutoff_time_in_index=True,
                                                approximate=Timedelta(2, 'd'))
    feature_matrix_2.index.names = ['instance', 'time']
    assert(np.all(feature_matrix_2.reset_index('time').reset_index()[['instance', 'time']].values ==
                  feature_matrix_2[['trip_id', 'flight_time']].values))
    for column in feature_matrix:
        for x, y in zip(feature_matrix[column], feature_matrix_2[column]):
            assert ((pd.isnull(x) and pd.isnull(y)) or (x == y))
def test_time_since_last(es):
    f = TimeSinceLast(es["log"]["datetime"], es["customers"])
    fm = calculate_feature_matrix([f],
                                  instance_ids=[0, 1, 2],
                                  cutoff_time=datetime(2015, 6, 8))

    correct = [131376600, 131289600, 131287800]
    # note: must round to nearest second
    assert all(fm[f.get_name()].round().values == correct)
def test_parallel_failure_raises_correct_error(entityset): times = list([datetime(2011, 4, 9, 10, 30, i * 6) for i in range(5)] + [datetime(2011, 4, 9, 10, 31, i * 9) for i in range(4)] + [datetime(2011, 4, 9, 10, 40, 0)] + [datetime(2011, 4, 10, 10, 40, i) for i in range(2)] + [datetime(2011, 4, 10, 10, 41, i * 3) for i in range(3)] + [datetime(2011, 4, 10, 11, 10, i * 3) for i in range(2)]) cutoff_time = pd.DataFrame({'time': times, 'instance_id': range(17)}) property_feature = IdentityFeature(entityset['log']['value']) > 10 with pytest.raises(AssertionError): calculate_feature_matrix([property_feature], entityset=entityset, cutoff_time=cutoff_time, verbose=True, chunk_size=.13, n_jobs=0, approximate='1 hour')
def test_cutoff_time_correctly(entityset):
    property_feature = Count(entityset['log']['id'], entityset['customers'])
    times = [datetime(2011, 4, 10), datetime(2011, 4, 11), datetime(2011, 4, 7)]
    cutoff_time = pd.DataFrame({'time': times, 'instance_id': [0, 1, 2]})
    feature_matrix = calculate_feature_matrix([property_feature],
                                              entityset,
                                              cutoff_time=cutoff_time)
    labels = [0, 10, 5]
    assert (feature_matrix == labels).values.all()
def head(self, n=10, cutoff_time=None):
    """See values for feature

    Args:
        n (int) : Number of instances to return.
        cutoff_time (pd.Timestamp, optional) : Calculate the feature values as of this point in time.

    Returns:
        :class:`pd.DataFrame` : Pandas DataFrame
    """
    from featuretools import calculate_feature_matrix
    cfm = calculate_feature_matrix([self], cutoff_time=cutoff_time).head(n)
    return cfm
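# Hedged usage sketch for the head() helper above, built on a small throwaway
# entityset in the same old-style feature API used elsewhere in this file; the
# entity and column names are illustrative assumptions, and the Sum import path
# reflects older featuretools releases.
def example_feature_head_preview():
    import pandas as pd

    import featuretools as ft
    from featuretools.primitives import Sum  # location may differ by version

    parent_df = pd.DataFrame({"id": [1, 2]})
    child_df = pd.DataFrame({"id": [1, 2, 3],
                             "parent_id": [1, 1, 2],
                             "time_index": pd.date_range("2018-01-01", periods=3),
                             "value": [10, 5, 2]})
    es = ft.EntitySet(id="head_example")
    es.entity_from_dataframe(entity_id="parent", dataframe=parent_df, index="id")
    es.entity_from_dataframe(entity_id="child", dataframe=child_df, index="id",
                             time_index="time_index")
    es.add_relationship(ft.Relationship(es["parent"]["id"], es["child"]["parent_id"]))

    total_value = Sum(es["child"]["value"], es["parent"])
    # preview the first computed values as of a chosen cutoff time
    return total_value.head(n=2, cutoff_time=pd.Timestamp("2018-01-02"))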
def test_cutoff_time_naming(entityset): es = entityset agg_feat = Count(es['customers']['id'], es[u'régions']) dfeat = DirectFeature(agg_feat, es['customers']) cutoff_df = pd.DataFrame({'time': [pd.Timestamp('2011-04-08 10:30:00'), pd.Timestamp('2011-04-09 10:30:06')], 'instance_id': [0, 0]}) cutoff_df_index_name = cutoff_df.rename(columns={"instance_id": "id"}) cutoff_df_time_name = cutoff_df.rename(columns={"time": "cutoff_time"}) cutoff_df_index_name_time_name = cutoff_df.rename(columns={"instance_id": "id", "time": "cutoff_time"}) cutoff_df_wrong_index_name = cutoff_df.rename(columns={"instance_id": "wrong_id"}) fm1 = calculate_feature_matrix([dfeat], entityset, cutoff_time=cutoff_df) for test_cutoff in [cutoff_df_index_name, cutoff_df_time_name, cutoff_df_index_name_time_name]: fm2 = calculate_feature_matrix([dfeat], entityset, cutoff_time=test_cutoff) assert all((fm1 == fm2.values).values) with pytest.raises(AttributeError): calculate_feature_matrix([dfeat], entityset, cutoff_time=cutoff_df_wrong_index_name)
def test_approximate_dfeat_of_dfeat_of_agg_on_target(entityset):
    es = entityset
    agg_feat = Count(es['log']['id'], es['sessions'])
    agg_feat2 = Sum(agg_feat, es['customers'])
    dfeat = DirectFeature(agg_feat2, es['log'])
    times = [datetime(2011, 4, 9, 10, 31, 19), datetime(2011, 4, 9, 11, 0, 0)]
    cutoff_time = pd.DataFrame({'time': times, 'instance_id': [0, 2]})
    feature_matrix = calculate_feature_matrix([dfeat],
                                              entityset,
                                              approximate=Timedelta(10, 's'),
                                              cutoff_time=cutoff_time)
    assert feature_matrix[dfeat.get_name()].tolist() == [7, 10]
def test_calc_feature_matrix(entityset): times = list([datetime(2011, 4, 9, 10, 30, i * 6) for i in range(5)] + [datetime(2011, 4, 9, 10, 31, i * 9) for i in range(4)] + [datetime(2011, 4, 9, 10, 40, 0)] + [datetime(2011, 4, 10, 10, 40, i) for i in range(2)] + [datetime(2011, 4, 10, 10, 41, i * 3) for i in range(3)] + [datetime(2011, 4, 10, 11, 10, i * 3) for i in range(2)]) labels = [False] * 3 + [True] * 2 + [False] * 9 + [True] + [False] * 2 property_feature = IdentityFeature(entityset['log']['value']) > 10 feature_matrix = calculate_feature_matrix([property_feature], instance_ids=range(17), cutoff_time=times, verbose=True) assert (feature_matrix == labels).values.all() with pytest.raises(AssertionError): feature_matrix = calculate_feature_matrix('features', instance_ids=range(17), cutoff_time=times) with pytest.raises(AssertionError): feature_matrix = calculate_feature_matrix([], instance_ids=range(17), cutoff_time=times) with pytest.raises(AssertionError): feature_matrix = calculate_feature_matrix([1, 2, 3], instance_ids=range(17), cutoff_time=times) with pytest.raises(TypeError): calculate_feature_matrix([property_feature], instance_ids=range(17), cutoff_time=17)
def test_approximate_returns_correct_empty_default_values(entityset):
    es = entityset
    agg_feat = Count(es['log']['id'], es['customers'])
    dfeat = DirectFeature(agg_feat, es['sessions'])

    cutoff_df = pd.DataFrame({'time': [pd.Timestamp('2011-04-08 11:00:00'),
                                       pd.Timestamp('2011-04-09 11:00:00')],
                              'instance_id': [0, 0]})

    fm = calculate_feature_matrix([dfeat],
                                  approximate=Timedelta(10, 's'),
                                  cutoff_time=cutoff_df)
    assert fm[dfeat.get_name()].tolist() == [0, 10]
def test_inplace_encodes_features(entityset): f1 = IdentityFeature(entityset["log"]["product_id"]) features = [f1] feature_matrix = calculate_feature_matrix(features, entityset, instance_ids=[0, 1, 2, 3, 4, 5]) feature_matrix_shape = feature_matrix.shape feature_matrix_encoded, features_encoded = encode_features(feature_matrix, features) assert feature_matrix_encoded.shape != feature_matrix_shape assert feature_matrix.shape == feature_matrix_shape # inplace they should be the same feature_matrix_encoded, features_encoded = encode_features(feature_matrix, features, inplace=True) assert feature_matrix_encoded.shape == feature_matrix.shape
def calculate_feature_matrix(self, X, target_entity=None, entityset=None, entities=None, relationships=None):
    if entityset is None:
        entityset = self._get_entityset(X, target_entity, entities, relationships)
    if self.training_window is not None:
        entityset.add_last_time_indexes()

    cutoff_time = None
    if self.time_index:
        cutoff_time = X[[self.index, self.time_index]]
        cutoff_time = cutoff_time.rename(columns={self.time_index: 'time'})

    X = ft.calculate_feature_matrix(
        self.features,
        entityset=entityset,
        cutoff_time=cutoff_time,
        training_window=self.training_window,
        n_jobs=self.n_jobs,
        verbose=self.verbose,
    )
    return X
def test_count_null_and_make_agg_primitive(es): def count_func(values, count_null=False): if len(values) == 0: return 0 if count_null: values = values.fillna(0) return values.count() def count_generate_name(self): where_str = self._where_str() use_prev_str = self._use_prev_str() return u"COUNT(%s%s%s)" % (self.child_entity.name, where_str, use_prev_str) Count = make_agg_primitive(count_func, [[Index], [Variable]], Numeric, name="count", stack_on_self=False, cls_attributes={"generate_name": count_generate_name}) count_null = Count(es['log']['value'], es['sessions'], count_null=True) feature_matrix = calculate_feature_matrix([count_null], entityset=es) values = [5, 4, 1, 2, 3, 2] assert (values == feature_matrix[count_null.get_name()]).all()
def produce(self, *, inputs: Input, timeout: float = None, iterations: int = None) -> CallResult[Output]: if self._features is None: raise ValueError('Must call fit() before calling produce()') if not isinstance(inputs, Dataset): raise ValueError('Inputs to produce() must be a Dataset') features = self._features parsed = self._parse_inputs( inputs, entities_to_normalize=self._entities_normalized, # original_entityset=self._entityset, parse_target=False) entityset = parsed['entityset'] target = self._target instance_ids = parsed['instance_ids'] feature_matrix = ft.calculate_feature_matrix( features, entityset=entityset, instance_ids=instance_ids, cutoff_time_in_index=False) fm_with_metadata = self._format_fm_after_cfm(feature_matrix, instance_ids, features, target, entityset, inputs.metadata) return CallResult(fm_with_metadata)
def test_arithmetic_of_identity(es): logs = es['log'] to_test = [(AddNumeric, [0., 7., 14., 21.]), (SubtractNumeric, [0, 3, 6, 9]), (MultiplyNumeric, [0, 10, 40, 90]), (DivideNumeric, [np.nan, 2.5, 2.5, 2.5])] features = [] for test in to_test: features.append( ft.Feature([logs['value'], logs['value_2']], primitive=test[0])) df = ft.calculate_feature_matrix(entityset=es, features=features, instance_ids=[0, 1, 2, 3]) for i, test in enumerate(to_test[:-1]): v = df[features[i].get_name()].values.tolist() assert v == test[1] i, test = 3, to_test[-1] v = df[features[i].get_name()].values.tolist() assert (np.isnan(v[0])) assert v[1:] == test[1][1:]
def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: if not self._fitted: raise PrimitiveNotFittedError("Primitive not fitted.") es = self._make_entityset(inputs.copy()) fm = ft.calculate_feature_matrix( entityset=es, features=self.features, chunk_size=self.chunk_size ) # make sure the feature matrix is ordered the same as the input fm = fm.reindex(es[self._target_resource_id].df.index) fm = fm.reset_index(drop=True) # d3m wants index to increment by 1 # treat inf as null like fit step fm = fm.replace([np.inf, -np.inf], np.nan) # todo add this metadata handle fm = add_metadata(fm, self.features) fm = self._add_labels(fm, inputs) return CallResult(fm)
def test_compare_of_agg(es): count_logs = ft.Feature(es['log']['id'], parent_entity=es['sessions'], primitive=Count) to_test = [(EqualScalar, [False, False, False, True]), (NotEqualScalar, [True, True, True, False]), (LessThanScalar, [False, False, True, False]), (LessThanEqualToScalar, [False, False, True, True]), (GreaterThanScalar, [True, True, False, False]), (GreaterThanEqualToScalar, [True, True, False, True])] features = [] for test in to_test: features.append(ft.Feature(count_logs, primitive=test[0](2))) df = ft.calculate_feature_matrix(entityset=es, features=features, instance_ids=[0, 1, 2, 3]) df = to_pandas(df, index='id', sort_index=True) for i, test in enumerate(to_test): v = df[features[i].get_name()].values.tolist() assert v == test[1]
def test_arithmetic_of_val(es): to_test = [(AddNumericScalar, [2.0, 7.0, 12.0, 17.0]), (SubtractNumericScalar, [-2.0, 3.0, 8.0, 13.0]), (ScalarSubtractNumericFeature, [2.0, -3.0, -8.0, -13.0]), (MultiplyNumericScalar, [0, 10, 20, 30]), (DivideNumericScalar, [0, 2.5, 5, 7.5]), (DivideByFeature, [np.inf, 0.4, 0.2, 2 / 15.0])] features = [] for test in to_test: features.append(ft.Feature(es['log']['value'], primitive=test[0](2))) features.append(ft.Feature(es['log']['value']) / 0) df = ft.calculate_feature_matrix(entityset=es, features=features, instance_ids=[0, 1, 2, 3]) for f, test in zip(features, to_test): v = df[f.get_name()].values.tolist() assert v == test[1] test = [np.nan, np.inf, np.inf, np.inf] v = df[features[-1].get_name()].values.tolist() assert (np.isnan(v[0])) assert v[1:] == test[1:]
def test_get_filepath(es): class Mod4(TransformPrimitive): '''Return base feature modulo 4''' name = "mod4" input_types = [Numeric] return_type = Numeric def get_function(self): filepath = self.get_filepath("featuretools_unit_test_example.csv") reference = pd.read_csv(filepath, header=None, squeeze=True) def map_to_word(x): def _map(x): if pd.isnull(x): return x return reference[int(x) % 4] return pd.Series(x).apply(_map) return map_to_word feat = ft.Feature(es['log']['value'], primitive=Mod4) df = ft.calculate_feature_matrix(features=[feat], entityset=es, instance_ids=range(17)) assert pd.isnull(df["MOD4(value)"][15]) assert df["MOD4(value)"][0] == 0 assert df["MOD4(value)"][14] == 2 fm, fl = ft.dfs(entityset=es, target_entity="log", agg_primitives=[], trans_primitives=[Mod4]) assert fm["MOD4(value)"][0] == 0 assert fm["MOD4(value)"][14] == 2 assert pd.isnull(fm["MOD4(value)"][15])
def test_boolean_multiply(boolean_mult_es): es = boolean_mult_es to_test = [('numeric', 'numeric'), ('numeric', 'bool'), ('bool', 'numeric'), ('bool', 'bool')] features = [] for row in to_test: features.append( ft.Feature(es["test"][row[0]]) * ft.Feature(es["test"][row[1]])) fm = ft.calculate_feature_matrix(entityset=es, features=features) if isinstance(fm, dd.DataFrame): fm = fm.compute() df = es['test'].df if isinstance(df, dd.DataFrame): df = df.compute() for row in to_test: col_name = '{} * {}'.format(row[0], row[1]) if row[0] == 'bool' and row[1] == 'bool': assert fm[col_name].equals(df[row[0]] & df[row[1]]) else: assert fm[col_name].equals(df[row[0]] * df[row[1]])
def test_binary_encoding(): feature_matrix, features, f1, f2, f3, f4, es, ids = create_feature_matrix() enc = Encoder(method='binary') fm_encoded = enc.fit_transform(feature_matrix, features) encoder = BinaryEnc(fitted_encoder=enc, category='product_id') encoded = encoder(['car', 'toothpaste', 'coke zero', 'coke zero']) encoded_results = [[0, 0, 0, 0], [1, 1, 0, 0], [0, 1, 1, 1]] assert (encoded == encoded_results).all() product_feature = ft.Feature([f1], primitive=BinaryEnc(enc, 0)) cc_feature = ft.Feature([f4], primitive=BinaryEnc(enc, 1)) features = [product_feature, f2, f3, cc_feature] assert len(features) == len(enc.get_features()) # __eq__ does not support multioutput columns yet for i in range(len(enc.get_features())): assert features[i].unique_name() == enc.get_features()[i].unique_name() features = enc.get_features() feature_matrix = ft.calculate_feature_matrix(features, es, instance_ids=ids) assert (fm_encoded == feature_matrix).all().all()
def test_text_primitives(es): words = ft.Feature(es['log']['comments'], primitive=NumWords) chars = ft.Feature(es['log']['comments'], primitive=NumCharacters) features = [words, chars] df = ft.calculate_feature_matrix(entityset=es, features=features, instance_ids=range(15)) word_counts = [ 514, 3, 3, 644, 1268, 1269, 177, 172, 79, 240, 1239, 3, 3, 3, 3 ] char_counts = [ 3392, 10, 10, 4116, 7961, 7580, 992, 957, 437, 1325, 6322, 10, 10, 10, 10 ] word_values = df[words.get_name()].values char_values = df[chars.get_name()].values assert len(word_values) == 15 for i, v in enumerate(word_values): assert v == word_counts[i] for i, v in enumerate(char_values): assert v == char_counts[i]
def test_calc_feature_matrix(entityset): times = list([datetime(2011, 4, 9, 10, 30, i * 6) for i in range(5)] + [datetime(2011, 4, 9, 10, 31, i * 9) for i in range(4)] + [datetime(2011, 4, 9, 10, 40, 0)] + [datetime(2011, 4, 10, 10, 40, i) for i in range(2)] + [datetime(2011, 4, 10, 10, 41, i * 3) for i in range(3)] + [datetime(2011, 4, 10, 11, 10, i * 3) for i in range(2)]) labels = [False] * 3 + [True] * 2 + [False] * 9 + [True] + [False] * 2 property_feature = IdentityFeature(entityset['log']['value']) > 10 feature_matrix = calculate_feature_matrix([property_feature], entityset, instance_ids=range(17), cutoff_time=times, verbose=True) assert (feature_matrix == labels).values.all() with pytest.raises(AssertionError): feature_matrix = calculate_feature_matrix('features', entityset, instance_ids=range(17), cutoff_time=times) with pytest.raises(AssertionError): feature_matrix = calculate_feature_matrix([], entityset, instance_ids=range(17), cutoff_time=times) with pytest.raises(AssertionError): feature_matrix = calculate_feature_matrix([1, 2, 3], entityset, instance_ids=range(17), cutoff_time=times) with pytest.raises(TypeError): calculate_feature_matrix([property_feature], entityset, instance_ids=range(17), cutoff_time=17)
def build_transaction_data(): """ Builds a data set from raw card and transaction data using the featuretools package. The resulting data set will be strictly concerned with transactions shown in the historical transactions CSV, and linking them to the proper card. :return: training, testing feature matrices """ logger = logging.getLogger(__name__) logger.info("Reading in card data") customer_df = pd.read_csv("data/raw/train.csv") customer_df['first_active_month'] = pd.to_datetime( customer_df['first_active_month'] + "-01") customer_df.drop(columns='target', inplace=True) logger.info("Reading in transactions") transactions_df = pd.read_csv("data/raw/historical_transactions.csv", dtype=TRANSACTION_LOAD_DTYPES) transactions_df['authorized_flag'] = np.where( transactions_df['authorized_flag'] == 'Y', 1, 0) transactions_df.reset_index(inplace=True) logger.info("Creating training entity set") es_train = ft.EntitySet() es_train = es_train.entity_from_dataframe(entity_id='customer', dataframe=customer_df, index='card_id', time_index='first_active_month', variable_types=CARD_TYPES) es_train = es_train.entity_from_dataframe(entity_id='transactions', dataframe=transactions_df, index='index', variable_types=TRANSACTION_TYPES) del customer_df gc.collect() logger.info("Defining relationships") relationship = ft.Relationship(es_train['customer']['card_id'], es_train['transactions']['card_id']) es_train = es_train.add_relationship(relationship) feature_matrix, feature_defs = ft.dfs(entityset=es_train, target_entity='customer') train_feature_matrix_enc, features_enc = ft.encode_features( feature_matrix, feature_defs) ft.save_features(features_enc, "feature_definitions") saved_features = ft.load_features('feature_definitions') logger.info("Loading test data") customer_df = pd.read_csv("data/raw/test.csv") customer_df['first_active_month'] = pd.to_datetime( customer_df['first_active_month'] + "-01") logger.info("Creating testing entity set") es_test = ft.EntitySet() es_test = es_test.entity_from_dataframe(entity_id='customer', dataframe=customer_df, index='card_id', time_index='first_active_month', variable_types=CARD_TYPES) es_test = es_test.entity_from_dataframe(entity_id='transactions', dataframe=transactions_df, index='index', variable_types=TRANSACTION_TYPES) es_test = es_test.add_relationship(relationship) test_feature_matrix_enc = ft.calculate_feature_matrix( saved_features, es_test) for col in train_feature_matrix_enc.columns: logger.debug(f"Normalizing feature [{col}]") old_min, old_max = train_feature_matrix_enc[col].agg(['min', 'max']) if (old_min == old_max): logger.debug(f"Droping feature [{col}] due to lack of variation") train_feature_matrix_enc.drop(columns=col, inplace=True) test_feature_matrix_enc.drop(columns=col, inplace=True) continue train_feature_matrix_enc[col] = normalize_series( series=train_feature_matrix_enc[col], min_max=(old_min, old_max)) assert col in test_feature_matrix_enc.columns test_feature_matrix_enc[col] = normalize_series( series=test_feature_matrix_enc[col], min_max=(old_min, old_max)) logger.info("Dropping SKEW features.") # TODO: Determine why these have lower counts than other features drop_cols = [c for c in train_feature_matrix_enc.columns if "SKEW" in c] train_feature_matrix_enc.drop(columns=drop_cols, inplace=True) test_feature_matrix_enc.drop(columns=drop_cols, inplace=True) return train_feature_matrix_enc, test_feature_matrix_enc
def test_empty_child_dataframe(parent_child): parent_df, child_df = parent_child if not isinstance(parent_df, pd.DataFrame): parent_vtypes = {'id': variable_types.Index} child_vtypes = { 'id': variable_types.Index, 'parent_id': variable_types.Numeric, 'time_index': variable_types.Datetime, 'value': variable_types.Numeric, 'cat': variable_types.Categorical } else: parent_vtypes = None child_vtypes = None es = ft.EntitySet(id="blah") es.entity_from_dataframe(entity_id="parent", dataframe=parent_df, index="id", variable_types=parent_vtypes) es.entity_from_dataframe(entity_id="child", dataframe=child_df, index="id", time_index="time_index", variable_types=child_vtypes) es.add_relationship( ft.Relationship(es["parent"]["id"], es["child"]["parent_id"])) # create regular agg count = ft.Feature(es["child"]['id'], parent_entity=es["parent"], primitive=Count) # create agg feature that requires multiple arguments trend = ft.Feature([es["child"]['value'], es["child"]['time_index']], parent_entity=es["parent"], primitive=Trend) # create multi-output agg feature n_most_common = ft.Feature(es["child"]['cat'], parent_entity=es["parent"], primitive=NMostCommon) # create aggs with where where = ft.Feature(es["child"]["value"]) == 1 count_where = ft.Feature(es["child"]['id'], parent_entity=es["parent"], where=where, primitive=Count) trend_where = ft.Feature([es["child"]['value'], es["child"]['time_index']], parent_entity=es["parent"], where=where, primitive=Trend) n_most_common_where = ft.Feature(es["child"]['cat'], parent_entity=es["parent"], where=where, primitive=NMostCommon) if isinstance(parent_df, pd.DataFrame): features = [ count, count_where, trend, trend_where, n_most_common, n_most_common_where ] names = [ count.get_name(), count_where.get_name(), trend.get_name(), trend_where.get_name(), *n_most_common.get_feature_names(), *n_most_common_where.get_feature_names() ] values = [ 0, 0, np.nan, np.nan, *np.full(n_most_common.number_output_features, np.nan), *np.full(n_most_common_where.number_output_features, np.nan) ] else: features = [count, count_where] names = [count.get_name(), count_where.get_name()] values = [0, 0] # cutoff time before all rows fm = ft.calculate_feature_matrix(entityset=es, features=features, cutoff_time=pd.Timestamp("12/31/2017")) fm = to_pandas(fm) assert_array_equal(fm[names], [values]) # cutoff time after all rows, but where clause filters all rows if isinstance(parent_df, pd.DataFrame): features = [count_where, trend_where, n_most_common_where] names = [ count_where.get_name(), trend_where.get_name(), *n_most_common_where.get_feature_names() ] values = [ 0, np.nan, *np.full(n_most_common_where.number_output_features, np.nan) ] else: features = [count_where] names = [count_where.get_name()] values = [0] fm2 = ft.calculate_feature_matrix(entityset=es, features=features, cutoff_time=pd.Timestamp("1/4/2018")) fm2 = to_pandas(fm2) assert_array_equal(fm2[names], [values])
def test_make_agg_feat_where_different_identity_feat(es): feats = [] where_cmps = [ LessThanScalar, GreaterThanScalar, LessThanEqualToScalar, GreaterThanEqualToScalar, EqualScalar, NotEqualScalar, ] for where_cmp in where_cmps: feats.append( ft.Feature( es["log"].ww["id"], parent_dataframe_name="sessions", where=ft.Feature( es["log"].ww["datetime"], primitive=where_cmp(datetime(2011, 4, 10, 10, 40, 1)), ), primitive=Count, )) df = ft.calculate_feature_matrix(entityset=es, features=feats, instance_ids=[0, 1, 2, 3]) df = to_pandas(df, index="id", sort_index=True) for i, where_cmp in enumerate(where_cmps): name = feats[i].get_name() instances = df[name] v0, v1, v2, v3 = instances[0:4] if where_cmp == LessThanScalar: assert v0 == 5 assert v1 == 4 assert v2 == 1 assert v3 == 1 elif where_cmp == GreaterThanScalar: assert v0 == 0 assert v1 == 0 assert v2 == 0 assert v3 == 0 elif where_cmp == LessThanEqualToScalar: assert v0 == 5 assert v1 == 4 assert v2 == 1 assert v3 == 2 elif where_cmp == GreaterThanEqualToScalar: assert v0 == 0 assert v1 == 0 assert v2 == 0 assert v3 == 1 elif where_cmp == EqualScalar: assert v0 == 0 assert v1 == 0 assert v2 == 0 assert v3 == 1 elif where_cmp == NotEqualScalar: assert v0 == 5 assert v1 == 4 assert v2 == 1 assert v3 == 1
def test_handles_primitive_function_name_uniqueness(entityset): class SumTimesN(AggregationPrimitive): name = "sum_times_n" input_types = [Numeric] return_type = Numeric def __init__(self, n): self.n = n def get_function(self): def my_function(values): return values.sum() * self.n return my_function def generate_name(self, base_feature_names, child_entity_id, parent_entity_id, where_str, use_prev_str): base_features_str = ", ".join(base_feature_names) return u"%s(%s.%s%s%s, n=%s)" % (self.name.upper(), child_entity_id, base_features_str, where_str, use_prev_str, self.n) # works as expected f1 = ft.Feature(entityset["log"]["value"], parent_entity=entityset["customers"], primitive=SumTimesN(n=1)) fm = ft.calculate_feature_matrix(features=[f1], entityset=entityset) value_sum = pd.Series([56, 26, 0]) assert all(fm[f1.get_name()].sort_index() == value_sum) # works as expected f2 = ft.Feature(entityset["log"]["value"], parent_entity=entityset["customers"], primitive=SumTimesN(n=2)) fm = ft.calculate_feature_matrix(features=[f2], entityset=entityset) double_value_sum = pd.Series([112, 52, 0]) assert all(fm[f2.get_name()].sort_index() == double_value_sum) # same primitive, same variable, different args fm = ft.calculate_feature_matrix(features=[f1, f2], entityset=entityset) assert all(fm[f1.get_name()].sort_index() == value_sum) assert all(fm[f2.get_name()].sort_index() == double_value_sum) # different primtives, same function returned by get_function, # different base features f3 = ft.Feature(entityset["log"]["value"], parent_entity=entityset["customers"], primitive=Sum) f4 = ft.Feature(entityset["log"]["purchased"], parent_entity=entityset["customers"], primitive=NumTrue) fm = ft.calculate_feature_matrix(features=[f3, f4], entityset=entityset) purchased_sum = pd.Series([10, 1, 1]) assert all(fm[f3.get_name()].sort_index() == value_sum) assert all(fm[f4.get_name()].sort_index() == purchased_sum)\ # different primtives, same function returned by get_function, # same base feature class Sum1(AggregationPrimitive): """Sums elements of a numeric or boolean feature.""" name = "sum1" input_types = [Numeric] return_type = Numeric stack_on_self = False stack_on_exclude = [Count] default_value = 0 def get_function(self): return np.sum class Sum2(AggregationPrimitive): """Sums elements of a numeric or boolean feature.""" name = "sum2" input_types = [Numeric] return_type = Numeric stack_on_self = False stack_on_exclude = [Count] default_value = 0 def get_function(self): return np.sum class Sum3(AggregationPrimitive): """Sums elements of a numeric or boolean feature.""" name = "sum3" input_types = [Numeric] return_type = Numeric stack_on_self = False stack_on_exclude = [Count] default_value = 0 def get_function(self): return np.sum f5 = ft.Feature(entityset["log"]["value"], parent_entity=entityset["customers"], primitive=Sum1) f6 = ft.Feature(entityset["log"]["value"], parent_entity=entityset["customers"], primitive=Sum2) f7 = ft.Feature(entityset["log"]["value"], parent_entity=entityset["customers"], primitive=Sum3) fm = ft.calculate_feature_matrix(features=[f5, f6, f7], entityset=entityset) assert all(fm[f5.get_name()].sort_index() == value_sum) assert all(fm[f6.get_name()].sort_index() == value_sum) assert all(fm[f7.get_name()].sort_index() == value_sum)
def test_cum_sum_numpy_group_on_nan(pd_es): class CumSumNumpy(TransformPrimitive): """Returns the cumulative sum after grouping""" name = "cum_sum" input_types = [ColumnSchema(semantic_tags={"numeric"})] return_type = ColumnSchema(semantic_tags={"numeric"}) uses_full_dataframe = True def get_function(self): def cum_sum(values): return values.cumsum().values return cum_sum log_value_feat = ft.IdentityFeature(pd_es["log"].ww["value"]) pd_es["log"]["product_id"] = ( ["coke zero"] * 3 + ["car"] * 2 + ["toothpaste"] * 3 + ["brown bag"] * 2 + ["shoes"] + [np.nan] * 4 + ["coke_zero"] * 2 ) pd_es["log"]["value"][16] = 10 cum_sum = ft.Feature( log_value_feat, groupby=ft.IdentityFeature(pd_es["log"].ww["product_id"]), primitive=CumSumNumpy, ) assert cum_sum.get_name() == "CUM_SUM(value) by product_id" features = [cum_sum] df = ft.calculate_feature_matrix( entityset=pd_es, features=features, instance_ids=range(17) ) cvalues = df[cum_sum.get_name()].values assert len(cvalues) == 17 cum_sum_values = [ 0, 5, 15, 15, 35, 0, 1, 3, 3, 3, 0, np.nan, np.nan, np.nan, np.nan, np.nan, 10, ] assert len(cvalues) == len(cum_sum_values) for i, v in enumerate(cum_sum_values): if np.isnan(v): assert np.isnan(cvalues[i]) else: assert v == cvalues[i]
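# Pure-pandas sketch (illustrative, not part of the test above) of why rows with
# a missing group key come back as NaN: pandas groupby drops NaN keys, so grouped
# transforms leave those rows unfilled.
def example_groupby_drops_nan_keys():
    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"product_id": ["a", "a", np.nan, "b"],
                       "value": [1, 2, 3, 4]})
    # the NaN-key row is excluded from every group, so its transformed value is NaN
    cumsum = df.groupby("product_id")["value"].transform("cumsum")
    # expected: [1.0, 3.0, NaN, 4.0]
    return cumsum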
def test_uses_full_entity_feat_of_approximate(entityset): es = entityset agg_feat = Sum(es['log']['value'], es['sessions']) agg_feat2 = Sum(agg_feat, es['customers']) agg_feat3 = Min(agg_feat, es['customers']) dfeat = DirectFeature(agg_feat2, es['sessions']) dfeat2 = DirectFeature(agg_feat3, es['sessions']) p = Percentile(dfeat) # only dfeat2 should be approximated # because Percentile needs all values feature_matrix_only_dfeat2 = calculate_feature_matrix( [dfeat2], entityset, instance_ids=[0, 2], approximate=Timedelta(10, 's'), cutoff_time_in_index=True, cutoff_time=[ datetime(2011, 4, 9, 10, 31, 19), datetime(2011, 4, 9, 11, 0, 0) ]) assert feature_matrix_only_dfeat2[dfeat2.get_name()].tolist() == [1, 0] feature_matrix_approx = calculate_feature_matrix( [p, dfeat, dfeat2, agg_feat], entityset, instance_ids=[0, 2], approximate=Timedelta(10, 's'), cutoff_time_in_index=True, cutoff_time=[ datetime(2011, 4, 9, 10, 31, 19), datetime(2011, 4, 9, 11, 0, 0) ]) assert feature_matrix_only_dfeat2[dfeat2.get_name()].tolist( ) == feature_matrix_approx[dfeat2.get_name()].tolist() feature_matrix_small_approx = calculate_feature_matrix( [p, dfeat, dfeat2, agg_feat], entityset, instance_ids=[0, 2], approximate=Timedelta(10, 'ms'), cutoff_time_in_index=True, cutoff_time=[ datetime(2011, 4, 9, 10, 31, 19), datetime(2011, 4, 9, 11, 0, 0) ]) feature_matrix_no_approx = calculate_feature_matrix( [p, dfeat, dfeat2, agg_feat], entityset, instance_ids=[0, 2], cutoff_time_in_index=True, cutoff_time=[ datetime(2011, 4, 9, 10, 31, 19), datetime(2011, 4, 9, 11, 0, 0) ]) for f in [p, dfeat, agg_feat]: for fm1, fm2 in combinations([ feature_matrix_approx, feature_matrix_small_approx, feature_matrix_no_approx ], 2): assert fm1[f.get_name()].tolist() == fm2[f.get_name()].tolist()
def test_agg_same_method_name(es): """ Pandas relies on the function name when calculating aggregations. This means if a two primitives with the same function name are applied to the same column, pandas can't differentiate them. We have a work around to this based on the name property that we test here. """ # TODO: Update to work with Dask and Spark if es.dataframe_type != Library.PANDAS.value: pytest.xfail("Need to update to work with Dask and Spark EntitySets") # test with normally defined functions class Sum(AggregationPrimitive): name = "sum" input_types = [ColumnSchema(semantic_tags={"numeric"})] return_type = ColumnSchema(semantic_tags={"numeric"}) def get_function(self): def custom_primitive(x): return x.sum() return custom_primitive class Max(AggregationPrimitive): name = "max" input_types = [ColumnSchema(semantic_tags={"numeric"})] return_type = ColumnSchema(semantic_tags={"numeric"}) def get_function(self): def custom_primitive(x): return x.max() return custom_primitive f_sum = ft.Feature( es["log"].ww["value"], parent_dataframe_name="customers", primitive=Sum ) f_max = ft.Feature( es["log"].ww["value"], parent_dataframe_name="customers", primitive=Max ) fm = ft.calculate_feature_matrix([f_sum, f_max], entityset=es) assert fm.columns.tolist() == [f_sum.get_name(), f_max.get_name()] # test with lambdas class Sum(AggregationPrimitive): name = "sum" input_types = [ColumnSchema(semantic_tags={"numeric"})] return_type = ColumnSchema(semantic_tags={"numeric"}) def get_function(self): return lambda x: x.sum() class Max(AggregationPrimitive): name = "max" input_types = [ColumnSchema(semantic_tags={"numeric"})] return_type = ColumnSchema(semantic_tags={"numeric"}) def get_function(self): return lambda x: x.max() f_sum = ft.Feature( es["log"].ww["value"], parent_dataframe_name="customers", primitive=Sum ) f_max = ft.Feature( es["log"].ww["value"], parent_dataframe_name="customers", primitive=Max ) fm = ft.calculate_feature_matrix([f_sum, f_max], entityset=es) assert fm.columns.tolist() == [f_sum.get_name(), f_max.get_name()]
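# Minimal pure-pandas sketch (illustrative, not part of the test above) of the
# collision the docstring describes: two aggregation callables that share the
# same __name__ are indistinguishable to pandas when passed together to agg().
def example_pandas_agg_name_collision():
    import pandas as pd

    def custom_primitive(x):  # stands in for Sum.get_function()
        return x.sum()

    sum_func = custom_primitive

    def custom_primitive(x):  # noqa: F811 -- same __name__, different behavior (Max)
        return x.max()

    max_func = custom_primitive

    df = pd.DataFrame({"customer": [1, 1, 2], "value": [10, 20, 5]})
    # both callables report __name__ == 'custom_primitive', so recent pandas
    # raises an error ("Function names must be unique ...") here
    try:
        return df.groupby("customer")["value"].agg([sum_func, max_func])
    except Exception as err:
        return err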
def compute_features(features, cutoff_time):
    feature_matrix = ft.calculate_feature_matrix(features,
                                                 cutoff_time=cutoff_time,
                                                 approximate='36d')
    return feature_matrix
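# Hedged usage sketch for compute_features above. It assumes the older
# featuretools API used throughout this file, where features returned by
# ft.dfs(features_only=True) carry a reference to their entityset, so
# calculate_feature_matrix can run without an explicit entityset argument.
# The demo entityset and cutoff rows are illustrative assumptions.
def example_compute_features():
    import pandas as pd
    import featuretools as ft

    es = ft.demo.load_mock_customer(return_entityset=True)
    features = ft.dfs(entityset=es, target_entity="customers", features_only=True)

    # one cutoff time per customer: each row is computed using only data
    # observed up to 'time'; approximate='36d' trades precision for speed
    cutoff_time = pd.DataFrame({"instance_id": [1, 2, 3],
                                "time": pd.to_datetime(["2014-01-01"] * 3)})
    return compute_features(features, cutoff_time)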
def test_cfm_returns_original_time_indexes(entityset): es = entityset agg_feat = Count(es['customers']['id'], es[u'régions']) dfeat = DirectFeature(agg_feat, es['customers']) agg_feat_2 = Count(es['sessions']['id'], es['customers']) cutoff_df = pd.DataFrame({ 'time': [ pd.Timestamp('2011-04-09 10:30:06'), pd.Timestamp('2011-04-09 10:30:03'), pd.Timestamp('2011-04-08 10:30:00') ], 'instance_id': [0, 1, 0] }) sorted_df = cutoff_df.sort_values(['time', 'instance_id'], kind='mergesort') # no approximate fm = calculate_feature_matrix([dfeat], entityset, cutoff_time=cutoff_df, cutoff_time_in_index=True) instance_level_vals = fm.index.get_level_values(0).values time_level_vals = fm.index.get_level_values(1).values assert (instance_level_vals == sorted_df['instance_id'].values).all() assert (time_level_vals == sorted_df['time'].values).all() # approximate, in different windows, no unapproximated aggs fm2 = calculate_feature_matrix([dfeat], entityset, cutoff_time=cutoff_df, cutoff_time_in_index=True, approximate="1 m") instance_level_vals = fm2.index.get_level_values(0).values time_level_vals = fm2.index.get_level_values(1).values assert (instance_level_vals == sorted_df['instance_id'].values).all() assert (time_level_vals == sorted_df['time'].values).all() # approximate, in different windows, unapproximated aggs fm2 = calculate_feature_matrix([dfeat, agg_feat_2], entityset, cutoff_time=cutoff_df, cutoff_time_in_index=True, approximate="1 m") instance_level_vals = fm2.index.get_level_values(0).values time_level_vals = fm2.index.get_level_values(1).values assert (instance_level_vals == sorted_df['instance_id'].values).all() assert (time_level_vals == sorted_df['time'].values).all() # approximate, in same window, no unapproximated aggs fm3 = calculate_feature_matrix([dfeat], entityset, cutoff_time=cutoff_df, cutoff_time_in_index=True, approximate="2 d") instance_level_vals = fm3.index.get_level_values(0).values time_level_vals = fm3.index.get_level_values(1).values assert (instance_level_vals == sorted_df['instance_id'].values).all() assert (time_level_vals == sorted_df['time'].values).all() # approximate, in same window, unapproximated aggs fm3 = calculate_feature_matrix([dfeat, agg_feat_2], entityset, cutoff_time=cutoff_df, cutoff_time_in_index=True, approximate="2 d") instance_level_vals = fm3.index.get_level_values(0).values time_level_vals = fm3.index.get_level_values(1).values assert (instance_level_vals == sorted_df['instance_id'].values).all() assert (time_level_vals == sorted_df['time'].values).all()
def run_featuretools(self, read_in_data_if_needed=True, export_to_csv=False): # TODO: This should eventually be dynamic. dataset_filenames = ['POS_CASH_balance.csv', 'application_test.csv', 'application_train.csv', 'bureau.csv',\ 'bureau_balance.csv', 'credit_card_balance.csv', 'installments_payments.csv', 'previous_application.csv'] if self.datasets == []: self.read_all_data(dataset_filenames=dataset_filenames) for data in self.datasets: if data.name == 'POS_CASH_balance': pos = data.data elif data.name == 'application_test': test = data.data elif data.name == 'application_train': train_full = data.data elif data.name == 'bureau': bureau = data.data elif data.name == 'bureau_balance': bureau_balance = data.data elif data.name == 'credit_card_balance': cc_bal = data.data elif data.name == 'installments_payments': inst = data.data elif data.name == 'previous_application': prev_app = data.data train = train_full.drop('TARGET', axis=1) train_y = train_full['TARGET'] print('Creating entity set.') # Create new entityset es = ft.EntitySet(id='train') print('Creating train entity.') print(str(pd.Timestamp.now())) es = es.entity_from_dataframe(entity_id='train', dataframe=train, index='SK_ID_CURR') print('Creating bureau entity.') print(str(pd.Timestamp.now())) es = es.entity_from_dataframe(entity_id='bureau', dataframe=bureau, index='SK_ID_BUREAU') print('Creating bureau_bal entity.') print(str(pd.Timestamp.now())) es = es.entity_from_dataframe(entity_id='bureau_bal', dataframe=bureau_balance, make_index=True, index='bureau_bal_id') print('Creating pos entity.') print(str(pd.Timestamp.now())) es = es.entity_from_dataframe(entity_id='pos', dataframe=pos, make_index=True, index='pos_id') print('Creating cc_bal entity.') print(str(pd.Timestamp.now())) es = es.entity_from_dataframe(entity_id='cc_bal', dataframe=cc_bal, make_index=True, index='cc_bal_id') print('Creating inst entity.') print(str(pd.Timestamp.now())) es = es.entity_from_dataframe(entity_id='inst', dataframe=inst, make_index=True, index='inst_id') print('Creating prev_app entity.') print(str(pd.Timestamp.now())) es = es.entity_from_dataframe(entity_id='prev_app', dataframe=prev_app, index='SK_ID_PREV') print('Creating relationships.') print(str(pd.Timestamp.now())) # Create relationships print('Creating r_train_bureau.') print(str(pd.Timestamp.now())) r_train_bureau = ft.Relationship(es['train']['SK_ID_CURR'], es['bureau']['SK_ID_CURR']) es = es.add_relationship(r_train_bureau) print('Creating r_bureau_bureau_bal.') print(str(pd.Timestamp.now())) r_bureau_bureau_bal = ft.Relationship(es['bureau']['SK_ID_BUREAU'], es['bureau_bal']['SK_ID_BUREAU']) es = es.add_relationship(r_bureau_bureau_bal) print('Creating r_train_pos.') print(str(pd.Timestamp.now())) r_train_pos = ft.Relationship(es['train']['SK_ID_CURR'], es['pos']['SK_ID_CURR']) es = es.add_relationship(r_train_pos) print('Creating r_train_cc_bal.') print(str(pd.Timestamp.now())) r_train_cc_bal = ft.Relationship(es['train']['SK_ID_CURR'], es['cc_bal']['SK_ID_CURR']) es = es.add_relationship(r_train_cc_bal) print('Creating r_train_inst.') print(str(pd.Timestamp.now())) r_train_inst = ft.Relationship(es['train']['SK_ID_CURR'], es['inst']['SK_ID_CURR']) es = es.add_relationship(r_train_inst) print('Creating r_train_prev_app.') print(str(pd.Timestamp.now())) r_train_prev_app = ft.Relationship(es['train']['SK_ID_CURR'], es['prev_app']['SK_ID_CURR']) es = es.add_relationship(r_train_prev_app) print('Creating r_prev_app_pos.') print(str(pd.Timestamp.now())) r_prev_app_pos = 
ft.Relationship(es['prev_app']['SK_ID_PREV'], es['pos']['SK_ID_PREV']) es = es.add_relationship(r_prev_app_pos) print('Creating r_prev_app_inst.') print(str(pd.Timestamp.now())) r_prev_app_inst = ft.Relationship(es['prev_app']['SK_ID_PREV'], es['inst']['SK_ID_PREV']) es = es.add_relationship(r_prev_app_inst) print('Creating r_prev_app_cc_bal.') print(str(pd.Timestamp.now())) r_prev_app_cc_bal = ft.Relationship(es['prev_app']['SK_ID_PREV'], es['cc_bal']['SK_ID_PREV']) es = es.add_relationship(r_prev_app_cc_bal) # Create new features using specified primitives # Documentation: https://docs.featuretools.com/generated/featuretools.dfs.html print('Creating actual features.') print(str(pd.Timestamp.now())) feature_matrix, feature_defs = ft.dfs(entityset = es, target_entity = 'train', \ agg_primitives = ['mean', 'max', 'last'] # trans_primitives = ['years', 'month', 'subtract', 'divide'] ) self.featuretools_feature_set = feature_matrix self.featuretools_feature_names = feature_defs # One hot encode categorical features feature_matrix_enc, feature_defs_enc = ft.encode_features( feature_matrix, feature_defs) # Create entity set for test print('Creating test entity') ts = ft.EntitySet(id='test') print('Creating test entity.') print(str(pd.Timestamp.now())) ts = ts.entity_from_dataframe(entity_id='test', dataframe=test, index='SK_ID_CURR') print('Creating bureau entity.') print(str(pd.Timestamp.now())) ts = ts.entity_from_dataframe(entity_id='bureau', dataframe=bureau, index='SK_ID_BUREAU') print('Creating bureau_bal entity.') print(str(pd.Timestamp.now())) ts = ts.entity_from_dataframe(entity_id='bureau_bal', dataframe=bureau_balance, make_index=True, index='bureau_bal_id') print('Creating pos entity.') print(str(pd.Timestamp.now())) ts = ts.entity_from_dataframe(entity_id='pos', dataframe=pos, make_index=True, index='pos_id') print('Creating cc_bal entity.') print(str(pd.Timestamp.now())) ts = ts.entity_from_dataframe(entity_id='cc_bal', dataframe=cc_bal, make_index=True, index='cc_bal_id') print('Creating inst entity.') print(str(pd.Timestamp.now())) ts = ts.entity_from_dataframe(entity_id='inst', dataframe=inst, make_index=True, index='inst_id') print('Creating prev_app entity.') print(str(pd.Timestamp.now())) ts = ts.entity_from_dataframe(entity_id='prev_app', dataframe=prev_app, index='SK_ID_PREV') print('Creating relationships.') print(str(pd.Timestamp.now())) # Create relationships print('Creating r_test_bureau.') print(str(pd.Timestamp.now())) r_test_bureau = ft.Relationship(ts['test']['SK_ID_CURR'], ts['bureau']['SK_ID_CURR']) ts = ts.add_relationship(r_test_bureau) print('Creating r_bureau_bureau_bal.') print(str(pd.Timestamp.now())) r_bureau_bureau_bal = ft.Relationship(ts['bureau']['SK_ID_BUREAU'], ts['bureau_bal']['SK_ID_BUREAU']) ts = ts.add_relationship(r_bureau_bureau_bal) print('Creating r_test_pos.') print(str(pd.Timestamp.now())) r_test_pos = ft.Relationship(ts['test']['SK_ID_CURR'], ts['pos']['SK_ID_CURR']) ts = ts.add_relationship(r_test_pos) print('Creating r_test_cc_bal.') print(str(pd.Timestamp.now())) r_test_cc_bal = ft.Relationship(ts['test']['SK_ID_CURR'], ts['cc_bal']['SK_ID_CURR']) ts = ts.add_relationship(r_test_cc_bal) print('Creating r_test_inst.') print(str(pd.Timestamp.now())) r_test_inst = ft.Relationship(ts['test']['SK_ID_CURR'], ts['inst']['SK_ID_CURR']) ts = ts.add_relationship(r_test_inst) print('Creating r_test_prev_app.') print(str(pd.Timestamp.now())) r_test_prev_app = ft.Relationship(ts['test']['SK_ID_CURR'], ts['prev_app']['SK_ID_CURR']) ts = 
ts.add_relationship(r_test_prev_app) print('Creating r_prev_app_pos.') print(str(pd.Timestamp.now())) r_prev_app_pos = ft.Relationship(ts['prev_app']['SK_ID_PREV'], ts['pos']['SK_ID_PREV']) ts = ts.add_relationship(r_prev_app_pos) print('Creating r_prev_app_inst.') print(str(pd.Timestamp.now())) r_prev_app_inst = ft.Relationship(ts['prev_app']['SK_ID_PREV'], ts['inst']['SK_ID_PREV']) ts = ts.add_relationship(r_prev_app_inst) print('Creating r_prev_app_cc_bal.') print(str(pd.Timestamp.now())) r_prev_app_cc_bal = ft.Relationship(ts['prev_app']['SK_ID_PREV'], ts['cc_bal']['SK_ID_PREV']) ts = ts.add_relationship(r_prev_app_cc_bal) # Calculate the same feature definitions against the test entity set # Documentation: https://docs.featuretools.com/generated/featuretools.dfs.html print('Creating actual features.') print(str(pd.Timestamp.now())) feature_matrix_test = ft.calculate_feature_matrix( features=feature_defs, entityset=ts) # One hot encode categorical features feature_matrix_test_enc, feature_defs_test_enc = ft.encode_features( feature_matrix_test, feature_defs) print('Done running featuretools!') if export_to_csv: print('Exporting features to CSV.') pd.DataFrame(feature_matrix_enc).to_csv('featuretools_feature.csv') train_y.to_csv('train_y.csv') pd.DataFrame(feature_matrix_test_enc).to_csv( 'featuretools_features_test.csv')
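The relationship wiring in run_featuretools above is highly repetitive. A minimal sketch of the same idea, assuming the featuretools 0.x API used in this method and the es/ts EntitySets built above, drives it from a table of (parent_entity, key_column, child_entity) triples; build_relationships is a hypothetical helper, not part of the original code.

import featuretools as ft

# Sketch only: the same nine relationships as above, expressed as data.
RELATIONSHIP_SPECS = [
    ('train', 'SK_ID_CURR', 'bureau'),
    ('bureau', 'SK_ID_BUREAU', 'bureau_bal'),
    ('train', 'SK_ID_CURR', 'pos'),
    ('train', 'SK_ID_CURR', 'cc_bal'),
    ('train', 'SK_ID_CURR', 'inst'),
    ('train', 'SK_ID_CURR', 'prev_app'),
    ('prev_app', 'SK_ID_PREV', 'pos'),
    ('prev_app', 'SK_ID_PREV', 'inst'),
    ('prev_app', 'SK_ID_PREV', 'cc_bal'),
]

def build_relationships(es, root='train'):
    # `root` lets the same specs serve the train ('train') and test ('test') entity sets.
    specs = [(root if parent == 'train' else parent, key, child)
             for parent, key, child in RELATIONSHIP_SPECS]
    for parent, key, child in specs:
        print('Creating relationship {} -> {} on {}.'.format(parent, child, key))
        es = es.add_relationship(ft.Relationship(es[parent][key], es[child][key]))
    return es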
def test_empty_child_dataframe(): parent_df = pd.DataFrame({"id": [1]}) child_df = pd.DataFrame({ "id": [1, 2, 3], "parent_id": [1, 1, 1], "time_index": pd.date_range(start='1/1/2018', periods=3), "value": [10, 5, 2] }) es = ft.EntitySet(id="blah") es.entity_from_dataframe(entity_id="parent", dataframe=parent_df, index="id") es.entity_from_dataframe(entity_id="child", dataframe=child_df, index="id", time_index="time_index") es.add_relationship( ft.Relationship(es["parent"]["id"], es["child"]["parent_id"])) # create regular agg count = ft.Feature(es["child"]['id'], parent_entity=es["parent"], primitive=Count) # create agg feature that requires multiple arguments trend = ft.Feature([es["child"]['value'], es["child"]['time_index']], parent_entity=es["parent"], primitive=Trend) # create aggs with where where = ft.Feature(es["child"]["value"]) == 1 count_where = ft.Feature(es["child"]['id'], parent_entity=es["parent"], where=where, primitive=Count) trend_where = ft.Feature([es["child"]['value'], es["child"]['time_index']], parent_entity=es["parent"], where=where, primitive=Trend) # cutoff time before all rows fm = ft.calculate_feature_matrix( entityset=es, features=[count, count_where, trend, trend_where], cutoff_time=pd.Timestamp("12/31/2017")) names = [ count.get_name(), count_where.get_name(), trend.get_name(), trend_where.get_name() ] assert_array_equal(fm[names], [[0, 0, np.nan, np.nan]]) # cutoff time after all rows, but where clause filters all rows fm2 = ft.calculate_feature_matrix(entityset=es, features=[count_where, trend_where], cutoff_time=pd.Timestamp("1/4/2018")) names = [count_where.get_name(), trend_where.get_name()] assert_array_equal(fm2[names], [[0, np.nan]])
def gen_feature_matrix(entityset, features_only=False, feature_matrix_encode=False, saved_features=None): '''Compute and return (feature_matrix, feature_defs) from a featuretools EntitySet entityset: the EntitySet to compute features from features_only: only return feature_defs, do not actually compute the feature_matrix feature_matrix_encode: whether to return the encoded feature_matrix (one-hot encode categorical variables) saved_features: load a pre-defined features file and compute the feature_matrix based on it ''' if 'goldstandard' in entityset.entity_dict.keys(): goldstandard_exist = True goldstandard_id = 'goldstandard' else: goldstandard_exist = False goldstandard_id = None ##FIX manual partition by person_id does NOT improve Dask computing performance # ignore 'partition' columns in every entity when building features # ignore_variables = dict() # for entity in entityset.entities: # if 'partition' in [v.name for v in entity.variables]: # ignore_variables[entity.id] = ['partition'] ##CAUTION when the entityset is backed by Dask dataframes, only a limited set of primitives is supported # agg_primitives_all=['avg_time_between', 'count', 'all', 'entropy', 'last', 'num_unique', 'n_most_common', # 'min', 'std', 'median', 'mean', 'percent_true', 'trend', 'sum', 'time_since_last', 'any', # 'num_true', 'time_since_first', 'first', 'max', 'mode', 'skew'] # agg_primitives_dask=['count', 'all', 'num_unique', #'n_most_common', # 'min', 'std', 'mean', 'percent_true', 'sum', 'any', # 'num_true', 'max'] ## define features per entity (table) agg_primitives = [ 'mean', 'max', 'min', 'std', 'last', 'skew', 'time_since_last' ] # 'trend' # trend takes extremely long time to compute include_variables = { 'measurement': ['measurement_datetime', 'value_as_number', 'measurement_concept_id'], 'observation': ['observation_concept_id', 'observation_datetime', 'value_as_number'] } agg_primitives_device_exposure = [ 'count', 'avg_time_between', 'time_since_first' ] include_entities_device_exposure = ['device_exposure'] trans_primitives = ['age'] groupby_trans_primitives = [] include_entities = ['person'] primitive_options = { tuple(trans_primitives): { 'include_entities': include_entities }, tuple(agg_primitives): { 'include_variables': include_variables }, tuple(agg_primitives_device_exposure): { 'include_entities': include_entities_device_exposure }, } ignore_entities = [ goldstandard_id, 'condition_occurrence', 'drug_exposure', 'observation_period', 'procedure_occurrence', 'visit_occurrence' ] ignore_variables = {} where_primitives = agg_primitives entityset['measurement'][ 'measurement_concept_id'].interesting_values = entityset[ 'measurement'].df['measurement_concept_id'].unique() entityset['observation'][ 'observation_concept_id'].interesting_values = entityset[ 'observation'].df['observation_concept_id'].unique() # if isinstance(entityset.entities[0].df, pandas.DataFrame): # agg_primitives = agg_primitives_all # else: # agg_primitives = agg_primitives_dask # build features if saved_features is None: with yaspin(color="yellow") as spinner: spinner.write( "No features definition file specified, calculating feature matrix from scratch ... " ) feature_defs = ft.dfs( entityset=entityset, target_entity="person", features_only=True, agg_primitives=agg_primitives + agg_primitives_device_exposure, trans_primitives=trans_primitives, groupby_trans_primitives=groupby_trans_primitives, primitive_options=primitive_options, ignore_entities=ignore_entities, ignore_variables=ignore_variables, where_primitives=where_primitives, max_depth=2) spinner.write("> generated {} features".format(len(feature_defs))) if features_only: return feature_defs tic = time.perf_counter() feature_matrix = ft.calculate_feature_matrix( feature_defs, entityset) if isinstance(entityset.entities[0].df, dd.DataFrame): feature_matrix = feature_matrix.compute() toc = time.perf_counter() spinner.write( f"> feature matrix calculation completed in {toc - tic:0.4f} seconds" ) if feature_matrix_encode: feature_matrix_enc, features_enc = ft.encode_features( feature_matrix, feature_defs) spinner.write( "> generated {} encoded features and the feature matrix". format(len(features_enc))) spinner.ok("Done") else: with yaspin(color="yellow") as spinner: spinner.write( "Using saved features from {} ... ".format(saved_features)) feature_defs = ft.load_features(saved_features) spinner.write("> {} features loaded from {}".format( len(feature_defs), saved_features)) tic = time.perf_counter() feature_matrix = ft.calculate_feature_matrix( feature_defs, entityset) if isinstance(entityset.entities[0].df, dd.DataFrame): feature_matrix = feature_matrix.compute() toc = time.perf_counter() spinner.write( f"> feature matrix calculation completed in {toc - tic:0.4f} seconds" ) if feature_matrix_encode: # encode here as well so feature_matrix_enc is defined when saved features are used feature_matrix_enc, features_enc = ft.encode_features( feature_matrix, feature_defs) spinner.write( "> generated {} encoded features and the feature matrix". format(len(features_enc))) spinner.ok("Done") if goldstandard_exist: if isinstance(entityset.entities[0].df, dd.DataFrame): goldstandard = entityset['goldstandard'].df.compute() else: goldstandard = entityset['goldstandard'].df if feature_matrix_encode: feature_matrix = feature_matrix_enc if goldstandard_exist: feature_matrix = feature_matrix.merge(goldstandard, on='person_id', how='right') return feature_matrix, feature_defs
def dfsWindow(self, target_entity, time_scope=None, training_window=None, cutoff_times=None, max_depth=1, chunk_size=None, n_jobs=1): '''Runs dfs on the target_entity and outputs a feature matrix with features based on the training_window and time_scope relative to cutoff times. If no training_window, time_scope, or cutoff_times are specified, regular dfs will run without using cutoff times. target_entity: str. Name of target_entity in entity set to run dfs on. The index of the target_entity must match the instance_id column in the cutoff_times table. time_scope: 'daily', 'weekly' or 'monthly'. Assumes 7 days in a week and 30 days in a month. training_window: list of integers that refer to the number of days, weeks, or months depending on the time_scope. Ex. [1, 2] for time_scope='monthly' returns features based on the last month and last 2 months from the cutoff date. cutoff_times: Pandas dataframe with instance_id, cutoff_dates, and label (label is optional). Any columns after instance_id and cutoff_dates will not be used for feature synthesis. The instance_id column must match the index of the target entity. max_depth: integer, defines how many levels of dfs to run. For example, if max_depth = 2 on a transactions table, features returned include avg. transactions and avg. of avg. transactions. chunk_size: integer, float, None, or "cutoff time". Number of rows of the output feature matrix to calculate at a time. If passed an integer greater than 0, it will use that many rows per chunk. If passed a float value between 0 and 1, sets the chunk size to that percentage of all instances. If passed "cutoff time", rows are split per cutoff time. n_jobs: integer. The number of parallel processes to use when creating the feature matrix. ''' orig_window = training_window if (time_scope is None) or (training_window is None) or (cutoff_times is None): self.df, feature_defs = ft.dfs( entityset=self.es, target_entity=target_entity, agg_primitives=self.agg_primitives, trans_primitives=self.trans_primitives, where_primitives=self.where_primitives, max_depth=max_depth, features_only=False, verbose=1, chunk_size=chunk_size, n_jobs=n_jobs) else: self.df, feature_defs = ft.dfs( entityset=self.es, target_entity=target_entity, cutoff_time=cutoff_times, agg_primitives=self.agg_primitives, trans_primitives=self.trans_primitives, where_primitives=self.where_primitives, max_depth=max_depth, features_only=False, verbose=1, chunk_size=chunk_size, n_jobs=n_jobs, cutoff_time_in_index=True) if time_scope == 'daily': training_window = [int(x) for x in orig_window] for i in range(len(training_window)): feature_matrix = ft.calculate_feature_matrix( entityset=self.es, features=feature_defs, cutoff_time=cutoff_times, chunk_size=chunk_size, cutoff_time_in_index=True, n_jobs=n_jobs, training_window=ft.Timedelta(training_window[i], "d")) suffix = '_' + str(orig_window[i]) + 'day' feature_matrix = feature_matrix.add_suffix(suffix) self.df = pd.concat([self.df, feature_matrix], axis=1, join='inner') elif time_scope == 'monthly': training_window = [x * 30 for x in orig_window] for i in range(len(training_window)): feature_matrix = ft.calculate_feature_matrix( entityset=self.es, features=feature_defs, cutoff_time=cutoff_times, chunk_size=chunk_size, cutoff_time_in_index=True, n_jobs=n_jobs, training_window=ft.Timedelta(training_window[i], "d")) suffix = '_' + str(orig_window[i]) + 'mos' feature_matrix = feature_matrix.add_suffix(suffix) self.df = pd.concat([self.df, feature_matrix], axis=1, join='inner') elif time_scope == 'weekly': 
training_window = [x * 7 for x in orig_window] for i in range(len(training_window)): feature_matrix, feature_defs = ft.dfs( entityset=self.es, target_entity=target_entity, cutoff_time=cutoff_times, agg_primitives=self.agg_primitives, trans_primitives=self.trans_primitives, where_primitives=self.where_primitives, max_depth=max_depth, features_only=False, verbose=1, chunk_size=chunk_size, cutoff_time_in_index=True, n_jobs=n_jobs, training_window=ft.Timedelta(training_window[i], "d")) suffix = '_' + str(orig_window[i]) + 'wks' feature_matrix = feature_matrix.add_suffix(suffix) self.df = pd.concat([self.df, feature_matrix], axis=1, join='inner') else: print("ERROR: time_scope entered is not one of the options.") drop_duplicates = DropDuplicate() self.df = drop_duplicates.fit_transform(self.df) for i in self.df.columns: self.feature_defs.append(i) return self.df
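A hypothetical usage sketch of dfsWindow, assuming pipeline is an instance of the surrounding class with self.es and the primitive lists already populated, 'customers' is the target entity, and the cutoff table follows the instance_id / cutoff_dates / label layout described in the docstring.

import pandas as pd

cutoff_times = pd.DataFrame({
    'instance_id': [101, 102],  # must match the target entity's index values
    'cutoff_dates': pd.to_datetime(['2019-06-30', '2019-07-31']),
    'label': [0, 1],  # optional, not used for feature synthesis
})
# Features over 1- and 3-month training windows relative to each cutoff date;
# the windowed columns come back suffixed with '_1mos' and '_3mos'.
fm = pipeline.dfsWindow('customers', time_scope='monthly', training_window=[1, 3],
                        cutoff_times=cutoff_times, max_depth=2, n_jobs=1)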
def test_empty_child_dataframe(parent_child): parent_df, child_df = parent_child child_ltypes = { 'parent_id': Integer, 'time_index': Datetime, 'value': Double, 'cat': Categorical } es = ft.EntitySet(id="blah") es.add_dataframe(dataframe_name="parent", dataframe=parent_df, index="id") es.add_dataframe(dataframe_name="child", dataframe=child_df, index="id", time_index="time_index", logical_types=child_ltypes) es.add_relationship("parent", "id", "child", "parent_id") # create regular agg count = ft.Feature(es["child"].ww["id"], parent_dataframe_name="parent", primitive=Count) # create agg feature that requires multiple arguments trend = ft.Feature([ ft.Feature(es["child"].ww["value"]), ft.Feature(es["child"].ww['time_index']) ], parent_dataframe_name="parent", primitive=Trend) # create multi-output agg feature n_most_common = ft.Feature(es["child"].ww["cat"], parent_dataframe_name="parent", primitive=NMostCommon) # create aggs with where where = ft.Feature(es["child"].ww["value"]) == 1 count_where = ft.Feature(es["child"].ww["id"], parent_dataframe_name="parent", where=where, primitive=Count) trend_where = ft.Feature([ ft.Feature(es["child"].ww["value"]), ft.Feature(es["child"].ww["time_index"]) ], parent_dataframe_name="parent", where=where, primitive=Trend) n_most_common_where = ft.Feature(es["child"].ww["cat"], parent_dataframe_name="parent", where=where, primitive=NMostCommon) if isinstance(parent_df, pd.DataFrame): features = [ count, count_where, trend, trend_where, n_most_common, n_most_common_where ] data = { count.get_name(): pd.Series([0], dtype="Int64"), count_where.get_name(): pd.Series([0], dtype="Int64"), trend.get_name(): pd.Series([np.nan], dtype="float"), trend_where.get_name(): pd.Series([np.nan], dtype="float") } for name in n_most_common.get_feature_names(): data[name] = pd.Series([np.nan], dtype="category") for name in n_most_common_where.get_feature_names(): data[name] = pd.Series([np.nan], dtype="category") else: features = [count, count_where] data = { count.get_name(): pd.Series([0], dtype="Int64"), count_where.get_name(): pd.Series([0], dtype="Int64") } answer = pd.DataFrame(data) # cutoff time before all rows fm = ft.calculate_feature_matrix(entityset=es, features=features, cutoff_time=pd.Timestamp("12/31/2017")) fm = to_pandas(fm) for column in data.keys(): pd.testing.assert_series_equal(fm[column], answer[column], check_names=False, check_index=False) # cutoff time after all rows, but where clause filters all rows if isinstance(parent_df, pd.DataFrame): features = [count_where, trend_where, n_most_common_where] data = { count_where.get_name(): pd.Series([0], dtype="Int64"), trend_where.get_name(): pd.Series([np.nan], dtype="float") } for name in n_most_common_where.get_feature_names(): data[name] = pd.Series([np.nan], dtype="category") else: features = [count_where] data = {count_where.get_name(): pd.Series([0], dtype="Int64")} answer = pd.DataFrame(data) fm2 = ft.calculate_feature_matrix(entityset=es, features=features, cutoff_time=pd.Timestamp("1/4/2018")) fm2 = to_pandas(fm2) for column in data.keys(): pd.testing.assert_series_equal(fm2[column], answer[column], check_names=False, check_index=False)
def test_handles_primitive_function_name_uniqueness(es): if not all(isinstance(entity.df, pd.DataFrame) for entity in es.entities): pytest.xfail( "Fails with Dask and Koalas due to conflicting aggregation primitive names" ) class SumTimesN(AggregationPrimitive): name = "sum_times_n" input_types = [Numeric] return_type = Numeric def __init__(self, n): self.n = n def get_function(self, agg_type='pandas'): def my_function(values): return values.sum() * self.n return my_function # works as expected f1 = ft.Feature(es["log"]["value"], parent_entity=es["customers"], primitive=SumTimesN(n=1)) fm = ft.calculate_feature_matrix(features=[f1], entityset=es) value_sum = pd.Series([56, 26, 0]) assert all(fm[f1.get_name()].sort_index() == value_sum) # works as expected f2 = ft.Feature(es["log"]["value"], parent_entity=es["customers"], primitive=SumTimesN(n=2)) fm = ft.calculate_feature_matrix(features=[f2], entityset=es) double_value_sum = pd.Series([112, 52, 0]) assert all(fm[f2.get_name()].sort_index() == double_value_sum) # same primitive, same variable, different args fm = ft.calculate_feature_matrix(features=[f1, f2], entityset=es) assert all(fm[f1.get_name()].sort_index() == value_sum) assert all(fm[f2.get_name()].sort_index() == double_value_sum) # different primitives, same function returned by get_function, # different base features f3 = ft.Feature(es["log"]["value"], parent_entity=es["customers"], primitive=Sum) f4 = ft.Feature(es["log"]["purchased"], parent_entity=es["customers"], primitive=NumTrue) fm = ft.calculate_feature_matrix(features=[f3, f4], entityset=es) purchased_sum = pd.Series([10, 1, 1]) assert all(fm[f3.get_name()].sort_index() == value_sum) assert all(fm[f4.get_name()].sort_index() == purchased_sum) # different primitives, same function returned by get_function, # same base feature class Sum1(AggregationPrimitive): """Sums elements of a numeric or boolean feature.""" name = "sum1" input_types = [Numeric] return_type = Numeric stack_on_self = False stack_on_exclude = [Count] default_value = 0 def get_function(self, agg_type='pandas'): return np.sum class Sum2(AggregationPrimitive): """Sums elements of a numeric or boolean feature.""" name = "sum2" input_types = [Numeric] return_type = Numeric stack_on_self = False stack_on_exclude = [Count] default_value = 0 def get_function(self, agg_type='pandas'): return np.sum class Sum3(AggregationPrimitive): """Sums elements of a numeric or boolean feature.""" name = "sum3" input_types = [Numeric] return_type = Numeric stack_on_self = False stack_on_exclude = [Count] default_value = 0 def get_function(self, agg_type='pandas'): return np.sum f5 = ft.Feature(es["log"]["value"], parent_entity=es["customers"], primitive=Sum1) f6 = ft.Feature(es["log"]["value"], parent_entity=es["customers"], primitive=Sum2) f7 = ft.Feature(es["log"]["value"], parent_entity=es["customers"], primitive=Sum3) fm = ft.calculate_feature_matrix(features=[f5, f6, f7], entityset=es) assert all(fm[f5.get_name()].sort_index() == value_sum) assert all(fm[f6.get_name()].sort_index() == value_sum) assert all(fm[f7.get_name()].sort_index() == value_sum)
def sample(self, n=10, cutoff_time=None): from featuretools import calculate_feature_matrix cfm = calculate_feature_matrix([self], cutoff_time=cutoff_time) return cfm.sample(n)
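A hypothetical usage sketch, assuming this sample method is defined on a feature object (here called feature) that is already bound to an entityset.

import pandas as pd

# Draw 10 random rows of the feature's values as of a single cutoff time.
sampled = feature.sample(n=10, cutoff_time=pd.Timestamp('2011-04-10 11:00:00'))
print(sampled)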