Esempio n. 1
0
def test_accepts_cutoff_time_df(dataframes, relationships):
    """Check that ``dfs`` accepts a pandas DataFrame as ``cutoff_time``.

    Three cutoff rows are supplied for the "transactions" dataframe; the
    resulting matrix must contain three rows and one column per feature.
    """
    cutoffs = pd.DataFrame({"instance_id": [1, 2, 3], "time": [10, 12, 15]})
    matrix, feature_defs = dfs(
        dataframes=dataframes,
        relationships=relationships,
        target_dataframe_name="transactions",
        cutoff_time=cutoffs,
    )
    matrix = to_pandas(matrix, index="id", sort_index=True)

    row_count = len(matrix.index)
    column_count = len(matrix.columns)
    assert row_count == 3
    assert column_count == len(feature_defs)
Esempio n. 2
0
def test_replace_inf_values(divide_by_zero_es):
    """Check that ``replace_inf_values`` removes +/-inf from a feature matrix.

    For each division primitive a feature matrix is built on the ``zero``
    dataframe, which must contain at least one infinite value.  The matrix is
    then cleaned twice: once with the default replacement and once with the
    custom replacement ``'custom_val'``.  Neither result may contain +inf or
    -inf, and the custom result must contain the custom value.
    """
    div_by_scalar = DivideNumericScalar(value=0)
    div_by_feature = DivideByFeature(value=1)
    div_by_feature_neg = DivideByFeature(value=-1)
    for primitive in [
            'divide_numeric', div_by_scalar, div_by_feature, div_by_feature_neg
    ]:
        fm, _ = ft.dfs(entityset=divide_by_zero_es,
                       target_dataframe_name='zero',
                       trans_primitives=[primitive])

        # Convert once and reuse the values for both membership checks.
        raw_values = to_pandas(fm).values
        assert np.inf in raw_values or -np.inf in raw_values

        replaced_fm = to_pandas(replace_inf_values(fm))
        assert np.inf not in replaced_fm.values
        assert -np.inf not in replaced_fm.values

        custom_value_fm = to_pandas(
            replace_inf_values(fm, replacement_value='custom_val'))
        assert np.inf not in custom_value_fm.values
        # Check the custom-replacement matrix here, not ``replaced_fm`` again.
        assert -np.inf not in custom_value_fm.values
        assert 'custom_val' in custom_value_fm.values
def test_accepts_cutoff_time_compose(entities, relationships):
    """Check that ``dfs`` accepts label times from ``cp.LabelMaker`` as cutoffs.

    Labels are searched per ``card_id`` on the transactions frame, their
    ``time`` column is converted to numeric and ``card_id`` is renamed to
    ``id`` before being handed to ``dfs`` as ``cutoff_time``.  The result must
    have six rows and exactly one more column than there are features
    (presumably the label column carried over from the cutoff frame).
    """
    def fraud_occured(df):
        return df['fraud'].any()

    label_maker = cp.LabelMaker(
        target_entity='card_id',
        time_index='transaction_time',
        labeling_function=fraud_occured,
        window_size=1,
    )

    transactions = to_pandas(entities['transactions'][0])
    labels = label_maker.search(transactions, num_examples_per_instance=-1)

    # Shape the label times into the cutoff-time layout passed to dfs.
    labels['time'] = pd.to_numeric(labels['time'])
    labels.rename({'card_id': 'id'}, axis=1, inplace=True)

    matrix, feature_defs = dfs(
        entities=entities,
        relationships=relationships,
        target_entity="cards",
        cutoff_time=labels,
    )
    matrix = to_pandas(matrix, index='id')

    row_count = len(matrix.index)
    column_count = len(matrix.columns)
    assert row_count == 6
    assert column_count == len(feature_defs) + 1
def test_with_features_built_from_es_metadata(es):
    """Check that a feature defined on ``es.metadata`` can be calculated on ``es``.

    A Count of log ids per customer is built from the metadata entityset and
    evaluated for instance 0; the expected count is 10.
    """
    meta = es.metadata
    count_feature = ft.Feature(
        meta['log']['id'],
        parent_entity=meta['customers'],
        primitive=Count,
    )

    calculator = FeatureSetCalculator(
        es,
        time_last=None,
        feature_set=FeatureSet([count_feature]),
    )
    result = to_pandas(calculator.run(np.array([0])), index='id')

    first_value = result[count_feature.get_name()].values[0]
    assert first_value == 10
def test_ignores_instance_ids_if_cutoff_df(entities, relationships):
    """Check that ``instance_ids`` is ignored when ``cutoff_time`` is a DataFrame.

    Five instance ids are passed alongside a three-row cutoff frame; the
    resulting matrix must have three rows and one column per feature.
    """
    cutoffs = pd.DataFrame({"instance_id": [1, 2, 3], "time": [10, 12, 15]})
    requested_ids = [1, 2, 3, 4, 5]

    matrix, feature_defs = dfs(
        entities=entities,
        relationships=relationships,
        target_entity="transactions",
        cutoff_time=cutoffs,
        instance_ids=requested_ids,
    )
    matrix = to_pandas(matrix, index='id')

    row_count = len(matrix.index)
    column_count = len(matrix.columns)
    assert row_count == 3
    assert column_count == len(feature_defs)
Esempio n. 6
0
def lt(es):
    """Build label times from the ``log`` entity of ``es``.

    A ``cp.LabelMaker`` labels each window with whether the summed ``value``
    column exceeds 10.  The search result is returned with its
    ``cutoff_time`` column renamed to ``time``.
    """
    def label_func(df):
        return df['value'].sum() > 10

    label_maker = cp.LabelMaker(
        target_entity='id',
        time_index='datetime',
        labeling_function=label_func,
        window_size='1m',
    )

    log_df = to_pandas(es['log'].df)
    found = label_maker.search(log_df, num_examples_per_instance=-1)
    return found.rename(columns={'cutoff_time': 'time'})
 def test_parent(self, values_es, true_values_lti):
     """Check the last time index of ``values`` against ``true_values_lti``.

     Covers an entity with a time index whose instances all appear in the
     child entity.  The test is xfailed unless every entity is backed by a
     pandas DataFrame.
     """
     has_non_pandas = any(
         not isinstance(entity.df, pd.DataFrame)
         for entity in values_es.entities)
     if has_non_pandas:
         pytest.xfail(
             'possible issue with either normalize_entity or add_last_time_indexes'
         )

     values_es.add_last_time_indexes()
     values_entity = values_es['values']
     assert len(values_entity.last_time_index) == 11

     computed = to_pandas(values_entity.last_time_index).sort_index()
     for actual, expected in zip(computed, true_values_lti):
         # Two missing values count as a match; otherwise compare directly.
         both_missing = pd.isnull(actual) and pd.isnull(expected)
         assert both_missing or actual == expected
Esempio n. 8
0
def test_override_cmp_from_column(es):
    """Check that ``>`` on a column-based feature yields a comparison feature.

    The feature ``log.value > 1`` is calculated for instances 0, 1 and 2 and
    compared element by element with the expected booleans.
    """
    greater_than_one = ft.Feature(es['log'].ww['value']) > 1
    expected_flags = [False, True, True]

    matrix = ft.calculate_feature_matrix(
        entityset=es,
        features=[greater_than_one],
        instance_ids=[0, 1, 2],
    )
    matrix = to_pandas(matrix, index='id', sort_index=True)

    computed = matrix[greater_than_one.get_name()].tolist()
    for position, wanted in enumerate(expected_flags):
        assert computed[position] == wanted
def extra_session_df(es):
    """Return the ``sessions`` data with one extra row (id 6) added.

    The sessions frame is converted to pandas, the new row is concatenated,
    and the result is sorted by index.  It is then converted back to the
    library backing ``es['sessions'].df`` (Dask or, when available, Koalas).
    """
    row_values = {
        'customer_id': 2,
        'device_name': 'PC',
        'device_type': 0,
        'id': 6
    }
    row = pd.DataFrame(row_values, index=pd.Index([6], name='id'))
    df = to_pandas(es['sessions'].df)
    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported
    # equivalent and is available in older pandas releases as well.
    df = pd.concat([df, row], sort=True).sort_index()
    if isinstance(es['sessions'].df, dd.DataFrame):
        df = dd.from_pandas(df, npartitions=3)
    # ``ks`` is falsy when Koalas is not installed, so guard before isinstance.
    if ks and isinstance(es['sessions'].df, ks.DataFrame):
        df = ks.from_pandas(df)
    return df
Esempio n. 10
0
def test_eq(es):
    """Exercise deep equality between entities.

    The ``log`` entity must equal itself and the ``log`` entity of a freshly
    built ecommerce entityset, and the comparison must leave its ``latlong``
    column unchanged.  It must then stop comparing equal once interesting
    values are added to the other entity, and must never equal ``customers``
    as its id, index, time index and secondary time index are overwritten
    one step at a time.
    """
    def deep_eq(left, right):
        # Call __eq__ explicitly so the ``deep`` flag can be passed.
        return left.__eq__(right, deep=True)

    fresh_es = make_ecommerce_entityset()
    latlong_before = es['log'].df['latlong'].copy()

    assert deep_eq(es['log'], es['log'])
    assert deep_eq(es['log'], fresh_es['log'])
    assert all(
        to_pandas(es['log'].df['latlong']).eq(to_pandas(latlong_before)))

    fresh_es['log'].add_interesting_values()
    assert not deep_eq(es['log'], fresh_es['log'])

    # The mutations below are cumulative; their order matters.
    es['log'].id = 'customers'
    es['log'].index = 'notid'
    assert not deep_eq(es['customers'], es['log'])

    es['log'].index = 'id'
    assert not deep_eq(es['customers'], es['log'])

    es['log'].time_index = 'signup_date'
    assert not deep_eq(es['customers'], es['log'])

    es['log'].secondary_time_index = {
        'cancel_date': ['cancel_reason', 'cancel_date'],
    }
    assert not deep_eq(es['customers'], es['log'])
Esempio n. 11
0
def test_make_agg_feat_where_count(es):
    """Check a Count aggregation restricted by a ``where`` clause.

    Log ids are counted per session, keeping only rows whose ``product_id``
    equals 'coke zero'; instance 0 is expected to yield 3.
    """
    is_coke_zero = IdentityFeature(es['log']['product_id']) == 'coke zero'
    count_feature = ft.Feature(
        es['log']['id'],
        parent_entity=es['sessions'],
        where=is_coke_zero,
        primitive=Count,
    )

    calculator = FeatureSetCalculator(
        es,
        time_last=None,
        feature_set=FeatureSet([count_feature]),
    )
    result = to_pandas(calculator.run(np.array([0])))

    counted = result[count_feature.get_name()][0]
    assert counted == 3
Esempio n. 12
0
def test_use_previous_pd_dateoffset(es):
    """Check that ``use_previous`` accepts a ``pd.DateOffset``.

    A Count of log ids per customer is limited to a window of 47 hours plus
    60 minutes before the cutoff time; customers 0, 1 and 2 are expected to
    yield 1, 5 and 2.
    """
    window = pd.DateOffset(hours=47, minutes=60)
    windowed_count = ft.Feature(
        es["log"]["id"],
        parent_entity=es["customers"],
        use_previous=window,
        primitive=Count,
    )

    matrix = ft.calculate_feature_matrix(
        [windowed_count],
        es,
        cutoff_time=pd.Timestamp('2011-04-11 10:31:30'),
        instance_ids=[0, 1, 2],
    )
    matrix = to_pandas(matrix, index='id', sort_index=True)

    # Only one feature was requested, so inspect the first column.
    first_column = matrix.columns[0]
    assert (matrix[first_column] == [1, 5, 2]).all()
    def test_parent_no_time_index_missing(self, es, extra_session_df,
                                          true_sessions_lti):
        """Check ``sessions`` last time index when one session has no children.

        Covers an entity without a time index where not every instance has
        child rows: the extra session (id 6) has no log rows, so its expected
        last time index is NaT.
        """
        sessions_entity = es['sessions']

        # Add a session instance that has no associated log instances.
        sessions_entity.update_data(extra_session_df)
        es.add_last_time_indexes()

        # sessions has no time index of its own, so the default is NaT.
        true_sessions_lti[6] = pd.NaT

        assert len(sessions_entity.last_time_index) == 7
        computed = to_pandas(sessions_entity.last_time_index).sort_index()
        for actual, expected in zip(computed, true_sessions_lti):
            both_missing = pd.isnull(actual) and pd.isnull(expected)
            assert both_missing or actual == expected
Esempio n. 14
0
def test_make_agg_feat_using_prev_time(es):
    """Check a Count aggregation limited by ``use_previous``.

    Log ids are counted per session over the 10 seconds before ``time_last``.
    Instance 0 is evaluated at two different ``time_last`` values and must
    yield 2 and then 1.
    """
    windowed_count = ft.Feature(
        es['log']['id'],
        parent_entity=es['sessions'],
        use_previous=Timedelta(10, 's'),
        primitive=Count,
    )
    feature_set = FeatureSet([windowed_count])

    # (time_last, expected count for instance 0), evaluated in this order.
    scenarios = [
        (datetime(2011, 4, 9, 10, 30, 10), 2),
        (datetime(2011, 4, 9, 10, 30, 30), 1),
    ]
    for time_last, expected in scenarios:
        calculator = FeatureSetCalculator(
            es,
            time_last=time_last,
            feature_set=feature_set,
        )
        result = to_pandas(calculator.run(np.array([0])))
        counted = result[windowed_count.get_name()][0]
        assert counted == expected
Esempio n. 15
0
def test_with_features_built_from_es_metadata(es):
    """Check that a feature defined on ``es.metadata`` can be calculated on ``es``.

    A Count of log ids per customer is built from the metadata entityset and
    evaluated for instance 0; the expected count is 10.
    """
    meta = es.metadata
    count_feature = ft.Feature(
        meta["log"].ww["id"],
        parent_dataframe_name="customers",
        primitive=Count,
    )

    calculator = FeatureSetCalculator(
        es,
        time_last=None,
        feature_set=FeatureSet([count_feature]),
    )
    result = to_pandas(calculator.run(np.array([0])), index="id")

    first_value = result[count_feature.get_name()].values[0]
    assert first_value == 10
    def test_multiple_children_left_missing(
        self, es, extra_session_df, wishlist_df, true_sessions_lti
    ):
        """Check ``sessions`` last time index with a second child dataframe.

        The sessions dataframe gets an extra row that has no log rows, and a
        ``wishlist_log`` child is added that does contain a row for that
        session.  The expected last time indexes for sessions 1, 3 and 6 are
        updated to the wishlist timestamps before comparing.  The test is
        xfailed for Spark-backed entitysets.
        """
        if es.dataframe_type == Library.SPARK.value:
            pytest.xfail("Cannot make index on a Spark DataFrame")

        # add row to sessions so not all session instances are in log
        es.replace_dataframe(dataframe_name="sessions", df=extra_session_df)

        # add row to wishlist df so new session instance is in wishlist_log
        row_values = {
            "session_id": 6,
            "datetime": pd.Timestamp("2011-04-11 11:11:11"),
            "product_id": "toothpaste",
        }
        row = pd.DataFrame(row_values, index=pd.RangeIndex(start=7, stop=8))
        # DataFrame.append was removed in pandas 2.0; pd.concat is the
        # supported equivalent.
        df = pd.concat([wishlist_df, row])
        if es.dataframe_type == Library.DASK.value:
            df = dd.from_pandas(df, npartitions=2)
        logical_types = {
            "session_id": Integer,
            "datetime": Datetime,
            "product_id": Categorical,
        }
        es.add_dataframe(
            dataframe_name="wishlist_log",
            dataframe=df,
            index="id",
            make_index=True,
            time_index="datetime",
            logical_types=logical_types,
        )
        es.add_relationship("sessions", "id", "wishlist_log", "session_id")
        es.add_last_time_indexes()

        # test all instances in right child
        sessions = es["sessions"]

        # now wishlist_log has newer events for 3 session ids
        true_sessions_lti[1] = pd.Timestamp("2011-4-9 10:31:30")
        true_sessions_lti[3] = pd.Timestamp("2011-4-10 10:41:00")
        true_sessions_lti[6] = pd.Timestamp("2011-04-11 11:11:11")

        # The last-time-index column name is read from the Woodwork metadata.
        lti_name = sessions.ww.metadata.get("last_time_index")
        assert len(sessions[lti_name]) == 7
        sorted_lti = to_pandas(sessions[lti_name]).sort_index()
        for v1, v2 in zip(sorted_lti, true_sessions_lti):
            # Two missing values count as a match.
            assert (pd.isnull(v1) and pd.isnull(v2)) or v1 == v2
    def test_multiple_children_all_combined(self, es, extra_session_df,
                                            wishlist_df, true_sessions_lti):
        """Check ``sessions`` last time index when two children cover it jointly.

        Some session instances appear only in ``log``, some only in the added
        ``wishlist_log``, and every instance appears in at least one of them.
        The expected last time indexes for sessions 1 and 6 are updated to
        the wishlist timestamps before comparing.
        """
        # test some instances in right, some in left, all when combined
        sessions = es['sessions']

        # add row to sessions so not all session instances are in log
        sessions.update_data(extra_session_df)

        # add row to wishlist_log so extra session has child instance
        row_values = {
            'session_id': 6,
            'datetime': pd.Timestamp("2011-04-11 11:11:11"),
            'product_id': 'toothpaste'
        }
        row = pd.DataFrame(row_values, index=pd.RangeIndex(start=7, stop=8))
        # DataFrame.append was removed in pandas 2.0; pd.concat is the
        # supported equivalent.
        df = pd.concat([wishlist_df, row])

        # drop instance 4 so wishlist_log does not have session id 3 instance
        df.drop(4, inplace=True)
        if isinstance(es.entities[0].df, dd.DataFrame):
            df = dd.from_pandas(df, npartitions=2)
        # ``ks`` is falsy when Koalas is not installed, so guard first.
        if ks and isinstance(es.entities[0].df, ks.DataFrame):
            df = ks.from_pandas(df)
        variable_types = {
            'id': ft.variable_types.variable.Index,
            'session_id': ft.variable_types.variable.Numeric,
            'datetime': ft.variable_types.variable.DatetimeTimeIndex,
            'product_id': ft.variable_types.variable.Categorical
        }
        es.entity_from_dataframe(entity_id="wishlist_log",
                                 dataframe=df,
                                 index='id',
                                 make_index=True,
                                 time_index='datetime',
                                 variable_types=variable_types)
        relationship = Relationship(es['sessions']['id'],
                                    es['wishlist_log']['session_id'])
        es.add_relationship(relationship)
        es.add_last_time_indexes()

        # wishlist has newer events for 2 sessions
        true_sessions_lti[1] = pd.Timestamp("2011-4-9 10:31:30")
        true_sessions_lti[6] = pd.Timestamp("2011-04-11 11:11:11")

        assert len(sessions.last_time_index) == 7
        sorted_lti = to_pandas(sessions.last_time_index).sort_index()
        for v1, v2 in zip(sorted_lti, true_sessions_lti):
            # Two missing values count as a match.
            assert (pd.isnull(v1) and pd.isnull(v2)) or v1 == v2
Esempio n. 18
0
def test_override_cmp(es):
    """Check the overloaded comparison operators on aggregation features.

    ``count`` (Count of log ids per session) is compared against scalars and
    against ``_sum`` (Sum of log values per session) with ``>``, ``>=``,
    ``<``, ``<=`` and ``!=``.  Each resulting feature is calculated for
    sessions 0, 1 and 2 and compared with its expected booleans.
    """
    count = ft.Feature(es["log"].ww["id"],
                       parent_dataframe_name="sessions",
                       primitive=Count)
    _sum = ft.Feature(es["log"].ww["value"],
                      parent_dataframe_name="sessions",
                      primitive=Sum)

    # Each feature sits next to its expectation so the two cannot drift out
    # of step (the ``!=`` features used to be computed but never asserted).
    cases = [
        (count > 1, [True, True, False]),
        (count > _sum, [False, False, True]),
        (count >= 1, [True, True, True]),
        (count >= _sum, [False, False, True]),
        (count < 10, [True, True, True]),
        (count < _sum, [True, True, False]),
        (count <= 10, [True, True, True]),
        (count <= _sum, [True, True, False]),
        # count >= 1 everywhere and count > 1 only for sessions 0 and 1,
        # so session 2 has count == 1.
        (count != 1, [True, True, False]),
        # count is strictly below _sum for sessions 0 and 1 and strictly
        # above it for session 2, so the two never coincide.
        (count != _sum, [True, True, True]),
    ]
    features = [feature for feature, _ in cases]

    df = ft.calculate_feature_matrix(entityset=es,
                                     features=features,
                                     instance_ids=[0, 1, 2])
    df = to_pandas(df, index="id", sort_index=True)
    for feature, expected in cases:
        v = df[feature.get_name()].tolist()
        assert v == expected
def test_direct_from_identity(es):
    """Check a DirectFeature that exposes ``sessions.device_type`` on ``log``.

    The feature is calculated for log instances 0 and 5.  Koalas-backed
    entitysets are expected to return the values as strings.
    """
    device_type = Feature(es['sessions'].ww['device_type'])
    direct = DirectFeature(base_feature=device_type,
                           child_dataframe_name='log')

    calculator = FeatureSetCalculator(
        es,
        feature_set=FeatureSet([direct]),
        time_last=None,
    )
    result = to_pandas(calculator.run(np.array([0, 5])),
                       index='id',
                       sort_index=True)
    observed = result[direct.get_name()].tolist()

    is_koalas = es.dataframe_type == Library.KOALAS.value
    expected = ['0', '1'] if is_koalas else [0, 1]
    assert observed == expected
Esempio n. 20
0
def test_make_agg_feat_where_count(es):
    """Check a Count aggregation restricted by a ``where`` clause.

    Log ids are counted per session, keeping only rows whose ``product_id``
    equals "coke zero"; instance 0 is expected to yield 3.
    """
    is_coke_zero = IdentityFeature(es["log"].ww["product_id"]) == "coke zero"
    count_feature = ft.Feature(
        es["log"].ww["id"],
        parent_dataframe_name="sessions",
        where=is_coke_zero,
        primitive=Count,
    )

    calculator = FeatureSetCalculator(
        es,
        time_last=None,
        feature_set=FeatureSet([count_feature]),
    )
    result = to_pandas(calculator.run(np.array([0])))

    counted = result[count_feature.get_name()][0]
    assert counted == 3
def test_equal_categorical(simple_es):
    """Check the Equal primitive on two categorical columns.

    ``value`` and ``value2`` of the ``values`` dataframe are compared.  Unless
    the entityset is Koalas-backed, the two columns must have different
    category sets.  The computed ``value = value2`` column must match the
    expected booleans.
    """
    values_df = simple_es['values']
    left = ft.IdentityFeature(values_df.ww['value'])
    right = ft.IdentityFeature(simple_es['values'].ww['value2'])
    equal_feature = ft.Feature([left, right], primitive=Equal)

    matrix = ft.calculate_feature_matrix(entityset=simple_es,
                                         features=[equal_feature])

    if simple_es.dataframe_type != Library.KOALAS.value:
        # Koalas does not support categorical dtype
        left_categories = set(simple_es['values']['value'].cat.categories)
        right_categories = set(simple_es['values']['value2'].cat.categories)
        assert left_categories != right_categories

    matrix = to_pandas(matrix, index='id', sort_index=True)
    observed = matrix['value = value2'].to_list()
    assert observed == [True, False, False, True]
Esempio n. 22
0
def test_direct_from_identity(es):
    """Check a DirectFeature that exposes ``sessions.device_type`` on ``log``.

    The feature is calculated for log instances 0 and 5.  Spark-backed
    entitysets are expected to return the values as strings.
    """
    device_type = Feature(es["sessions"].ww["device_type"])
    direct = DirectFeature(base_feature=device_type,
                           child_dataframe_name="log")

    calculator = FeatureSetCalculator(
        es,
        feature_set=FeatureSet([direct]),
        time_last=None,
    )
    result = to_pandas(calculator.run(np.array([0, 5])),
                       index="id",
                       sort_index=True)
    observed = result[direct.get_name()].tolist()

    is_spark = es.dataframe_type == Library.SPARK.value
    expected = ["0", "1"] if is_spark else [0, 1]
    assert observed == expected
Esempio n. 23
0
def lt(es):
    """Build label times from the ``log`` dataframe of ``es``.

    A ``cp.LabelMaker`` labels each window with whether the summed ``value``
    column exceeds 10.  The search result is returned with its
    ``cutoff_time`` column renamed to ``time``.
    """
    def label_func(df):
        return df["value"].sum() > 10

    label_maker = cp.LabelMaker(
        target_dataframe_name="id",
        time_index="datetime",
        labeling_function=label_func,
        window_size="1m",
    )

    log_df = to_pandas(es["log"])
    found = label_maker.search(log_df, num_examples_per_instance=-1)
    return found.rename(columns={"cutoff_time": "time"})
Esempio n. 24
0
    def test_parent_no_time_index_missing(self, es, extra_session_df,
                                          true_sessions_lti):
        """Check ``sessions`` last time index when one session has no children.

        Covers a dataframe without a time index where not every instance has
        child rows: the extra session (id 6) has no log rows, so its expected
        last time index is NaT.
        """
        # Swap in session data containing an instance with no log instances.
        es.replace_dataframe(dataframe_name='sessions', df=extra_session_df)
        es.add_last_time_indexes()

        # sessions has no time index of its own, so the default is NaT.
        true_sessions_lti[6] = pd.NaT
        sessions_df = es['sessions']

        # The last-time-index column name is read from the Woodwork metadata.
        lti_column = sessions_df.ww.metadata.get('last_time_index')
        assert len(sessions_df[lti_column]) == 7

        computed = to_pandas(sessions_df[lti_column]).sort_index()
        for actual, expected in zip(computed, true_sessions_lti):
            both_missing = pd.isnull(actual) and pd.isnull(expected)
            assert both_missing or actual == expected
Esempio n. 25
0
def test_equal_categorical(simple_es):
    """Check the Equal primitive on two categorical variables.

    ``value`` and ``value2`` of the ``values`` entity are compared.  When
    every entity is backed by a pandas or Dask DataFrame, the two columns
    must have different category sets.  The computed ``value = value2``
    column must match the expected booleans.
    """
    inputs = [simple_es['values']['value'], simple_es['values']['value2']]
    equal_feature = ft.Feature(inputs, primitive=Equal)

    matrix = ft.calculate_feature_matrix(entityset=simple_es,
                                         features=[equal_feature])

    categorical_capable = all(
        isinstance(entity.df, (pd.DataFrame, dd.DataFrame))
        for entity in simple_es.entities)
    if categorical_capable:
        # Koalas does not support categorical dtype
        left_categories = set(simple_es['values'].df['value'].cat.categories)
        right_categories = set(
            simple_es['values'].df['value2'].cat.categories)
        assert left_categories != right_categories

    matrix = to_pandas(matrix, index='id', sort_index=True)
    observed = matrix['value = value2'].to_list()
    assert observed == [True, False, False, True]
Esempio n. 26
0
    def test_multiple_children_all_combined(self, es, extra_session_df,
                                            wishlist_df, true_sessions_lti):
        """Check ``sessions`` last time index when two children cover it jointly.

        Some session instances appear only in ``log``, some only in the added
        ``wishlist_log``, and every instance appears in at least one of them.
        The expected last time indexes for sessions 1 and 6 are updated to
        the wishlist timestamps before comparing.  The test is xfailed for
        Koalas-backed entitysets.
        """
        # ``ks`` is falsy when Koalas is not installed, so guard first.
        if ks and isinstance(es.dataframes[0], ks.DataFrame):
            pytest.xfail('Cannot make index on a Koalas DataFrame')

        # add row to sessions so not all session instances are in log
        es.replace_dataframe(dataframe_name='sessions', df=extra_session_df)

        # add row to wishlist_log so extra session has child instance
        row_values = {
            'session_id': 6,
            'datetime': pd.Timestamp("2011-04-11 11:11:11"),
            'product_id': 'toothpaste'
        }
        row = pd.DataFrame(row_values, index=pd.RangeIndex(start=7, stop=8))
        # DataFrame.append was removed in pandas 2.0; pd.concat is the
        # supported equivalent.
        df = pd.concat([wishlist_df, row])

        # drop instance 4 so wishlist_log does not have session id 3 instance
        df.drop(4, inplace=True)
        if es.dataframe_type == Library.DASK.value:
            df = dd.from_pandas(df, npartitions=2)
        logical_types = {
            'session_id': Integer,
            'datetime': Datetime,
            'product_id': Categorical
        }
        es.add_dataframe(dataframe_name="wishlist_log",
                         dataframe=df,
                         index='id',
                         make_index=True,
                         time_index='datetime',
                         logical_types=logical_types)
        es.add_relationship('sessions', 'id', 'wishlist_log', 'session_id')
        es.add_last_time_indexes()

        # test some instances in right, some in left, all when combined
        sessions = es['sessions']

        # wishlist has newer events for 2 sessions
        true_sessions_lti[1] = pd.Timestamp("2011-4-9 10:31:30")
        true_sessions_lti[6] = pd.Timestamp("2011-04-11 11:11:11")

        # The last-time-index column name is read from the Woodwork metadata.
        lti_name = sessions.ww.metadata.get('last_time_index')
        assert len(sessions[lti_name]) == 7
        sorted_lti = to_pandas(sessions[lti_name]).sort_index()
        for v1, v2 in zip(sorted_lti, true_sessions_lti):
            # Two missing values count as a match.
            assert (pd.isnull(v1) and pd.isnull(v2)) or v1 == v2
Esempio n. 27
0
def test_make_agg_feat_of_agg_feat(es):
    """Check an aggregation stacked on top of another aggregation.

    Log ids are counted per session and those counts are summed per
    customer; instance 0 is expected to yield 10.
    """
    per_session_count = ft.Feature(
        es['log']['id'],
        parent_entity=es['sessions'],
        primitive=Count,
    )
    per_customer_sum = ft.Feature(
        per_session_count,
        parent_entity=es['customers'],
        primitive=Sum,
    )

    calculator = FeatureSetCalculator(
        es,
        time_last=None,
        feature_set=FeatureSet([per_customer_sum]),
    )
    result = to_pandas(calculator.run(np.array([0])), index='id')

    first_value = result[per_customer_sum.get_name()].values[0]
    assert first_value == 10
Esempio n. 28
0
def extra_session_df(es):
    """Return the ``sessions`` data with one extra row (id 6) added.

    The sessions dataframe is converted to pandas, the new row is
    concatenated, and the result is sorted by index.  It is then converted
    back to the entityset's dataframe library (Dask or Koalas) if needed.
    """
    row_values = {
        'customer_id': 2,
        'device_name': 'PC',
        'device_type': 0,
        'id': 6
    }
    row = pd.DataFrame(row_values, index=pd.Index([6], name='id'))
    df = to_pandas(es['sessions'])
    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported
    # equivalent and is available in older pandas releases as well.
    df = pd.concat([df, row], sort=True).sort_index()
    if es.dataframe_type == Library.DASK.value:
        df = dd.from_pandas(df, npartitions=3)
    elif es.dataframe_type == Library.KOALAS.value:
        # Koalas can't handle object dtypes
        df = df.astype('string')
        df = ks.from_pandas(df)
    return df
Esempio n. 29
0
def test_make_agg_feat_of_agg_feat(es):
    """Check an aggregation stacked on top of another aggregation.

    Log ids are counted per session and those counts are summed per
    customer; instance 0 is expected to yield 10.
    """
    per_session_count = ft.Feature(
        es["log"].ww["id"],
        parent_dataframe_name="sessions",
        primitive=Count,
    )
    per_customer_sum = ft.Feature(
        per_session_count,
        parent_dataframe_name="customers",
        primitive=Sum,
    )

    calculator = FeatureSetCalculator(
        es,
        time_last=None,
        feature_set=FeatureSet([per_customer_sum]),
    )
    result = to_pandas(calculator.run(np.array([0])), index="id")

    first_value = result[per_customer_sum.get_name()].values[0]
    assert first_value == 10
Esempio n. 30
0
def test_override_boolean(es):
    """Check the ``OR``, ``AND`` and ``~`` combinators on boolean features.

    Two scalar comparisons of a per-session log count (greater than 1, less
    than 10) are combined three ways and calculated for sessions 0, 1 and 2.
    """
    log_count = ft.Feature(es['log']['id'],
                           parent_entity=es['sessions'],
                           primitive=Count)
    above_one = ft.Feature(log_count, primitive=GreaterThanScalar(1))
    below_ten = ft.Feature(log_count, primitive=LessThanScalar(10))

    features = [
        above_one.OR(below_ten),
        above_one.AND(below_ten),
        ~(above_one.AND(below_ten)),
    ]
    # Expected values per feature, in the same order as ``features``.
    expectations = [
        [True, True, True],
        [True, True, False],
        [False, False, True],
    ]

    matrix = ft.calculate_feature_matrix(entityset=es,
                                         features=features,
                                         instance_ids=[0, 1, 2])
    matrix = to_pandas(matrix, index='id', sort_index=True)
    for feature, expected in zip(features, expectations):
        observed = matrix[feature.get_name()].values.tolist()
        assert observed == expected