Python make_agg_primitive Beispiele, featuretools.primitives.base.make_agg_primitive Python Beispiele

Beispiel #1

0

Datei anzeigen

Datei: test_agg_feats.py Projekt: rgolovnya/featuretools

def test_custom_primitive_time_as_arg(es):
    """Build a custom aggregation whose function takes a ``time`` argument.

    With ``uses_calc_time=True`` the primitive must be created, named after
    the wrapped function, and produce the expected values for three
    customers. With ``uses_calc_time=False`` the same function must be
    rejected with a ``ValueError`` about the restricted ``time`` keyword.
    """
    # Neither the function name nor the ``time`` parameter may be renamed:
    # the assertions below depend on both.
    def time_since_last(values, time):
        return (time - values.iloc[0]).total_seconds()

    TimeSinceLast = make_agg_primitive(
        time_since_last,
        [DatetimeTimeIndex],
        Numeric,
        uses_calc_time=True,
    )
    assert TimeSinceLast.name == "time_since_last"

    feature = TimeSinceLast(es["log"]["datetime"], es["customers"])
    feature_matrix = ft.calculate_feature_matrix(
        [feature],
        entityset=es,
        instance_ids=[0, 1, 2],
        cutoff_time=datetime(2015, 6, 8),
    )

    # note: must round to nearest second
    expected_seconds = [131376600, 131289600, 131287800]
    rounded = feature_matrix[feature.get_name()].round().values
    assert all(rounded == expected_seconds)

    error_text = "'time' is a restricted keyword.  Please use a different keyword."
    with pytest.raises(ValueError, match=error_text):
        make_agg_primitive(
            time_since_last,
            [DatetimeTimeIndex],
            Numeric,
            uses_calc_time=False,
        )

Beispiel #2

0

Datei anzeigen

def test_time_since_last_custom(pd_es):
    """Exercise a custom time-aware aggregation built from column schemas.

    The primitive is created with ``uses_calc_time=True`` and evaluated for
    three customers at a fixed cutoff time; creating it again with
    ``uses_calc_time=False`` must raise a ``ValueError``.
    """
    # The ``time`` keyword must keep its name: the final check expects it to
    # be reported as a restricted keyword.
    def time_since_last(values, time=None):
        return (time - values.iloc[0]).total_seconds()

    def time_index_schema():
        # A new schema object per call, as in the hand-written calls.
        return ColumnSchema(logical_type=Datetime,
                            semantic_tags={'time_index'})

    def numeric_schema():
        return ColumnSchema(semantic_tags={'numeric'})

    TimeSinceLast = make_agg_primitive(time_since_last,
                                       [time_index_schema()],
                                       numeric_schema(),
                                       name="time_since_last",
                                       uses_calc_time=True)
    feature = ft.Feature(pd_es["log"].ww["datetime"],
                         parent_dataframe_name="customers",
                         primitive=TimeSinceLast)
    feature_matrix = ft.calculate_feature_matrix(
        [feature],
        entityset=pd_es,
        instance_ids=[0, 1, 2],
        cutoff_time=datetime(2015, 6, 8))

    # note: must round to nearest second
    expected_seconds = [131376600, 131289600, 131287800]
    rounded = feature_matrix[feature.get_name()].round().values
    assert all(rounded == expected_seconds)

    error_text = "'time' is a restricted keyword.  Please use a different keyword."
    with pytest.raises(ValueError, match=error_text):
        make_agg_primitive(time_since_last,
                           [time_index_schema()],
                           numeric_schema(),
                           uses_calc_time=False)

Beispiel #3

0

Datei anzeigen

Datei: test_agg_feats.py Projekt: yuv4r4j/featuretools

def test_custom_primitive_time_as_arg(es):
    """Verify a custom aggregation that takes a ``time`` argument.

    The primitive built with ``uses_calc_time=True`` is attached to a
    feature through ``ft.Feature`` and evaluated at a fixed cutoff time.
    Building it with ``uses_calc_time=False`` must raise ``ValueError``.
    """
    # Keep both the function name (it becomes the primitive name asserted
    # below) and the ``time`` parameter name (it triggers the error case).
    def time_since_last(values, time):
        return (time - values.iloc[0]).total_seconds()

    TimeSinceLast = make_agg_primitive(
        time_since_last,
        [DatetimeTimeIndex],
        Numeric,
        uses_calc_time=True,
    )
    assert TimeSinceLast.name == "time_since_last"

    feature = ft.Feature(
        es["log"]["datetime"],
        parent_entity=es["customers"],
        primitive=TimeSinceLast,
    )
    feature_matrix = ft.calculate_feature_matrix(
        [feature],
        entityset=es,
        instance_ids=[0, 1, 2],
        cutoff_time=datetime(2015, 6, 8),
    )

    # note: must round to nearest second
    expected_seconds = [131376600, 131289600, 131287800]
    rounded = feature_matrix[feature.get_name()].round().values
    assert all(rounded == expected_seconds)

    error_text = "'time' is a restricted keyword.  Please use a different keyword."
    with pytest.raises(ValueError, match=error_text):
        make_agg_primitive(
            time_since_last,
            [DatetimeTimeIndex],
            Numeric,
            uses_calc_time=False,
        )

Beispiel #4

0

Datei anzeigen

Datei: test_agg_feats.py Projekt: mikewcasale/featuretools

def test_agg_same_method_name(es):
    """
        Pandas identifies aggregations by the name of the function applied. When
        two primitives wrap functions with the same name and are applied to the
        same column, pandas cannot tell them apart. The primitive's name property
        is used as a workaround, and this test exercises that workaround.
    """
    # TODO: Update to work with Dask and Koalas
    if any(not isinstance(entity.df, pd.DataFrame) for entity in es.entities):
        pytest.xfail(
            "Cannot use primitives made with make_agg_primitives with Dask or Koalas EntitySets"
        )

    def check_both_columns(sum_primitive, max_primitive):
        # One feature per primitive on the same base column; the matrix must
        # expose both columns, in the order requested.
        features = [
            ft.Feature(es["log"]["value"],
                       parent_entity=es["customers"],
                       primitive=primitive)
            for primitive in (sum_primitive, max_primitive)
        ]
        matrix = ft.calculate_feature_matrix(features, entityset=es)
        assert matrix.columns.tolist() == [
            feature.get_name() for feature in features
        ]

    # test with normally defined functions
    # (the two definitions intentionally share the name ``custom_primitive``)
    def custom_primitive(x):
        return x.sum()

    named_sum = make_agg_primitive(custom_primitive,
                                   input_types=[Numeric],
                                   return_type=Numeric,
                                   name="sum")

    def custom_primitive(x):  # noqa: F811
        return x.max()

    named_max = make_agg_primitive(custom_primitive,
                                   input_types=[Numeric],
                                   return_type=Numeric,
                                   name="max")
    check_both_columns(named_sum, named_max)

    # test with lambdas
    lambda_sum = make_agg_primitive(lambda x: x.sum(),
                                    input_types=[Numeric],
                                    return_type=Numeric,
                                    name="sum")
    lambda_max = make_agg_primitive(lambda x: x.max(),
                                    input_types=[Numeric],
                                    return_type=Numeric,
                                    name="max")
    check_both_columns(lambda_sum, lambda_max)

Beispiel #5

0

Datei anzeigen

Datei: test_agg_feats.py Projekt: yuv4r4j/featuretools

def test_agg_same_method_name(es):
    """
        Pandas uses the applied function's name to label aggregations, so two
        primitives whose functions share a name collide when applied to the same
        column. The primitive's name property serves as the workaround that this
        test checks.
    """

    def build_feature(primitive):
        # Aggregate log.value up to customers with the given primitive.
        return ft.Feature(es["log"]["value"],
                          parent_entity=es["customers"],
                          primitive=primitive)

    def assert_columns_in_order(sum_primitive, max_primitive):
        f_sum = build_feature(sum_primitive)
        f_max = build_feature(max_primitive)
        matrix = ft.calculate_feature_matrix([f_sum, f_max], entityset=es)
        expected_columns = [f_sum.get_name(), f_max.get_name()]
        assert matrix.columns.tolist() == expected_columns

    # test with normally defined functions
    # (both are deliberately called ``custom_primitive``)
    def custom_primitive(x):
        return x.sum()

    sum_from_def = make_agg_primitive(
        custom_primitive,
        input_types=[Numeric],
        return_type=Numeric,
        name="sum",
    )

    def custom_primitive(x):  # noqa: F811
        return x.max()

    max_from_def = make_agg_primitive(
        custom_primitive,
        input_types=[Numeric],
        return_type=Numeric,
        name="max",
    )
    assert_columns_in_order(sum_from_def, max_from_def)

    # test with lambdas
    sum_from_lambda = make_agg_primitive(
        lambda x: x.sum(),
        input_types=[Numeric],
        return_type=Numeric,
        name="sum",
    )
    max_from_lambda = make_agg_primitive(
        lambda x: x.max(),
        input_types=[Numeric],
        return_type=Numeric,
        name="max",
    )
    assert_columns_in_order(sum_from_lambda, max_from_lambda)

Beispiel #6

0

Datei anzeigen

Datei: test_agg_feats.py Projekt: mikewcasale/featuretools

def test_make_three_most_common(pd_es):
    """Check a custom multi-output primitive returning the top three values.

    The primitive declares three output features; the resulting columns for
    customers 0, 1 and 2 are compared with the expected product ids.
    """
    # The function name is kept because the expected column names below are
    # spelled ``PD_TOP3(...)``.
    def pd_top3(x):
        top = np.array(x.value_counts()[:3].index)
        shortfall = 3 - len(top)
        if shortfall > 0:
            # Pad with NaN so that exactly three values are returned.
            top = np.append(top, np.full(shortfall, np.nan))
        return top

    NMostCommoner = make_agg_primitive(
        function=pd_top3,
        input_types=[Discrete],
        return_type=Discrete,
        number_output_features=3,
    )

    fm, _ = ft.dfs(
        entityset=pd_es,
        target_entity="customers",
        instance_ids=[0, 1, 2],
        agg_primitives=[NMostCommoner],
        trans_primitives=[],
    )

    column_names = ["PD_TOP3(log.product_id)[%s]" % i for i in range(3)]
    df = fm[column_names]

    # coke zero and toothpaste have same number of occurrences,
    # so just check that the top two match
    assert set(df.iloc[0].values[:2]) == {'coke zero', 'toothpaste'}
    assert df.iloc[0].values[2] in ['car', 'brown bag']

    second_row = pd.Series(
        ['coke zero', 'Haribo sugar-free gummy bears', np.nan])
    assert df.iloc[1].reset_index(drop=True).equals(second_row)

    third_row = pd.Series(['taco clock', np.nan, np.nan])
    assert df.iloc[2].reset_index(drop=True).equals(third_row)

Beispiel #7

0

Datei anzeigen

Datei: test_agg_feats.py Projekt: yszoke/PredictionSystem

def test_override_multi_feature_names(pd_es):
    """Check that ``generate_names`` can be overridden on a custom primitive.

    A three-output primitive is created with a custom ``generate_names``
    class attribute; every name that function yields for the three expected
    base columns must appear in the feature matrix.
    """
    def gen_custom_names(primitive, base_feature_names, relationship_path_name,
                         parent_entity_id, where_str, use_prev_str):
        # The ``%s`` slot survives ``format`` and is filled with the output
        # index afterwards.
        template = 'Custom_%s({}.{})'.format(parent_entity_id,
                                             base_feature_names)
        output_count = primitive.number_output_features
        return [template % index for index in range(output_count)]

    # Name kept as in the original test; it is the function wrapped by the
    # primitive.
    def pd_top3(x):
        top = np.array(x.value_counts()[:3].index)
        shortfall = 3 - len(top)
        if shortfall > 0:
            top = np.append(top, np.full(shortfall, np.nan))
        return top

    num_features = 3
    NMostCommoner = make_agg_primitive(
        function=pd_top3,
        input_types=[Numeric],
        return_type=Discrete,
        number_output_features=num_features,
        cls_attributes={"generate_names": gen_custom_names},
    )

    fm, _ = ft.dfs(
        entityset=pd_es,
        target_entity="products",
        instance_ids=[0, 1, 2],
        agg_primitives=[NMostCommoner],
        trans_primitives=[],
    )

    base_names = [['value'], ['value_2'], ['value_many_nans']]
    expected_names = [
        generated
        for base in base_names
        for generated in gen_custom_names(NMostCommoner, base, None,
                                          'products', None, None)
    ]

    for expected in expected_names:
        assert expected in fm.columns

Beispiel #8

0

Datei anzeigen

Datei: test_deep_feature_synthesis.py Projekt: xmyyj001/featuretools

def test_pickle_features_with_custom_primitive(es):
    """Save and reload DFS features that include a custom primitive.

    Builds features with a custom ``NewMean`` aggregation, writes both the
    entityset and the features next to this test file, reloads the features
    and checks that hashes and entitysets match and that the feature file is
    smaller than the entityset (in memory and pickled).

    The files written to disk are removed in a ``finally`` block so that a
    failing assertion does not leave ``test_feature`` / ``test_entityset``
    behind in the test directory.
    """
    NewMean = make_agg_primitive(
        np.nanmean,
        name="NewMean",
        input_types=[Numeric],
        return_type=Numeric,
        description="Calculate means ignoring nan values")
    dfs_obj = DeepFeatureSynthesis(target_entity_id='sessions',
                                   entityset=es,
                                   agg_primitives=[Last, Mean, NewMean],
                                   trans_primitives=[],
                                   max_features=20)

    features_no_pickle = dfs_obj.build_features()
    # A generator avoids building a throwaway list just to feed ``any``.
    assert any(isinstance(feat, NewMean) for feat in features_no_pickle)
    dir_path = os.path.dirname(os.path.realpath(__file__))
    filepath = os.path.join(dir_path, 'test_feature')
    es_filepath = os.path.join(dir_path, 'test_entityset')

    try:
        # pickle entityset
        save_obj_pickle(es, es_filepath)

        ft.save_features(features_no_pickle, filepath)
        features_pickle = ft.load_features(filepath)
        for feat_1, feat_2 in zip(features_no_pickle, features_pickle):
            assert feat_1.hash() == feat_2.hash()
            assert feat_1.entityset == feat_2.entityset

        # file is smaller than entityset in memory
        assert os.path.getsize(filepath) < asizeof(es)

        # file is smaller than entityset pickled
        assert os.path.getsize(filepath) < os.path.getsize(es_filepath)
    finally:
        # Clean up whatever was written, even when an assertion failed.
        # The existence check keeps a failed save from being masked by a
        # FileNotFoundError raised here.
        for written_path in (filepath, es_filepath):
            if os.path.exists(written_path):
                os.remove(written_path)

Beispiel #9

0

Datei anzeigen

Datei: test_agg_feats.py Projekt: rgolovnya/featuretools

def test_custom_primitive_multiple_inputs(es):
    """Run DFS with a custom aggregation that takes two input columns.

    The resulting column is compared with the expected per-session values,
    first without and then with a ``where`` clause generated after
    ``add_interesting_values``.
    """
    def mean_sunday(numeric, datetime):
        '''
        Finds the mean of non-null values of a feature that occurred on Sundays
        '''
        weekdays = pd.DatetimeIndex(datetime).weekday.values
        frame = pd.DataFrame({'numeric': numeric, 'time': weekdays})
        sunday_rows = frame[frame['time'] == 6]
        return sunday_rows['numeric'].mean()

    def assert_equal_or_both_null(actual, expected):
        # Null entries cannot be compared with ==, so a pair of nulls is
        # accepted as a match.
        for x, y in zip(actual, expected):
            assert ((pd.isnull(x) and pd.isnull(y)) or (x == y))

    MeanSunday = make_agg_primitive(
        function=mean_sunday,
        input_types=[Numeric, Datetime],
        return_type=Numeric,
    )

    fm, features = ft.dfs(
        entityset=es,
        target_entity="sessions",
        agg_primitives=[MeanSunday],
        trans_primitives=[],
    )
    mean_sunday_value = pd.Series([None, None, None, 2.5, 7, None])
    assert_equal_or_both_null(fm["MEAN_SUNDAY(log.value, datetime)"],
                              mean_sunday_value)

    es.add_interesting_values()
    mean_sunday_value_priority_0 = pd.Series([None, None, None, 2.5, 0, None])
    fm, features = ft.dfs(
        entityset=es,
        target_entity="sessions",
        agg_primitives=[MeanSunday],
        trans_primitives=[],
        where_primitives=[MeanSunday],
    )
    where_feat = "MEAN_SUNDAY(log.value, datetime WHERE priority_level = 0)"
    assert_equal_or_both_null(fm[where_feat], mean_sunday_value_priority_0)

Beispiel #10

0

Datei anzeigen

def test_count_null_and_make_agg_primitive(pd_es):
    """Build a ``count`` primitive with a ``count_null`` option and use it.

    The primitive gets a custom ``generate_name`` and is instantiated with
    ``count_null=True``; the resulting per-session counts are compared with
    the expected values.
    """
    # ``count_null`` must keep its name: it is passed by keyword when the
    # primitive is instantiated below.
    def count_func(values, count_null=False):
        if len(values) == 0:
            return 0
        # Filling nulls first makes them count as values.
        counted = values.fillna(0) if count_null else values
        return counted.count()

    def count_generate_name(self, base_feature_names, relationship_path_name,
                            parent_dataframe_name, where_str, use_prev_str):
        name_parts = (relationship_path_name, where_str, use_prev_str)
        return u"COUNT(%s%s%s)" % name_parts

    input_types = [[ColumnSchema(semantic_tags={'foreign_key'})],
                   [ColumnSchema()]]
    Count = make_agg_primitive(
        count_func,
        input_types,
        ColumnSchema(semantic_tags={'numeric'}),
        name="count",
        stack_on_self=False,
        cls_attributes={"generate_name": count_generate_name},
    )
    count_null = ft.Feature(
        pd_es['log'].ww['value'],
        parent_dataframe_name='sessions',
        primitive=Count(count_null=True),
    )
    feature_matrix = ft.calculate_feature_matrix([count_null], entityset=pd_es)
    values = [5, 4, 1, 2, 3, 2]
    matches = values == feature_matrix[count_null.get_name()]
    assert matches.all()

Beispiel #11

0

Datei anzeigen

Datei: test_agg_feats.py Projekt: yuv4r4j/featuretools

def test_custom_primitive_multiple_inputs(es):
    """Check a two-input custom aggregation through ``ft.dfs``.

    Values are verified for the plain feature and, after interesting values
    have been added to the entityset, for the ``WHERE priority_level = 0``
    variant.
    """
    def mean_sunday(numeric, datetime):
        '''
        Finds the mean of non-null values of a feature that occurred on Sundays
        '''
        day_of_week = pd.DatetimeIndex(datetime).weekday.values
        table = pd.DataFrame({'numeric': numeric, 'time': day_of_week})
        is_sunday = table['time'] == 6
        return table[is_sunday]['numeric'].mean()

    MeanSunday = make_agg_primitive(function=mean_sunday,
                                    input_types=[Numeric, Datetime],
                                    return_type=Numeric)

    def run_dfs(**extra_options):
        # Shared DFS invocation; only the optional where_primitives differ.
        return ft.dfs(entityset=es,
                      target_entity="sessions",
                      agg_primitives=[MeanSunday],
                      trans_primitives=[],
                      **extra_options)

    def check_column(matrix, column, expected):
        # Two nulls count as equal; otherwise the values must match.
        for x, y in zip(matrix[column], expected):
            assert ((pd.isnull(x) and pd.isnull(y)) or (x == y))

    fm, features = run_dfs()
    mean_sunday_value = pd.Series([None, None, None, 2.5, 7, None])
    check_column(fm, "MEAN_SUNDAY(log.value, datetime)", mean_sunday_value)

    es.add_interesting_values()
    mean_sunday_value_priority_0 = pd.Series([None, None, None, 2.5, 0, None])
    fm, features = run_dfs(where_primitives=[MeanSunday])
    where_feat = "MEAN_SUNDAY(log.value, datetime WHERE priority_level = 0)"
    check_column(fm, where_feat, mean_sunday_value_priority_0)

Beispiel #12

0

Datei anzeigen

Datei: test_agg_feats.py Projekt: yuv4r4j/featuretools

def test_make_three_most_common(es):
    """Check a custom three-output primitive against a table of expectations.

    Each row of the resulting ``PD_TOP3`` columns is compared with the
    matching row of ``true_results``; the first row is compared loosely
    because of a tie in the data.
    """
    # Function name kept: the expected columns are named ``PD_TOP3(...)``.
    def pd_top3(x):
        top = np.array(x.value_counts()[:3].index)
        shortfall = 3 - len(top)
        if shortfall > 0:
            # Pad with NaN up to three entries.
            top = np.append(top, np.full(shortfall, np.nan))
        return top

    NMostCommoner = make_agg_primitive(
        function=pd_top3,
        input_types=[Discrete],
        return_type=Discrete,
        number_output_features=3,
    )

    fm, _ = ft.dfs(
        entityset=es,
        target_entity="customers",
        agg_primitives=[NMostCommoner],
        trans_primitives=[],
    )

    true_results = pd.DataFrame([
        ['coke zero', 'toothpaste', "car"],
        ['coke zero', 'Haribo sugar-free gummy bears', np.nan],
        ['taco clock', np.nan, np.nan],
    ])
    df = fm[["PD_TOP3(log.product_id)__%s" % i for i in range(3)]]

    for row in range(df.shape[0]):
        expected_row = true_results.iloc[row]
        actual_row = df.iloc[row]
        if row != 0:
            for i1, i2 in zip(expected_row, actual_row):
                assert (pd.isnull(i1) and pd.isnull(i2)) or (i1 == i2)
            continue

        # coke zero and toothpaste have same number of occurrences
        # so just check that the top two match
        assert set(expected_row.values[:2]) == set(actual_row.values[:2])
        assert actual_row.values[2] in ("brown bag", "car")

Beispiel #13

0

Datei anzeigen

Datei: test_agg_feats.py Projekt: yuv4r4j/featuretools

def test_count_null_and_make_agg_primitive(es):
    """Create a ``count`` primitive with a ``count_null`` flag and evaluate it.

    The primitive carries its own ``generate_name`` and is instantiated with
    ``count_null=True``; the per-session results are compared with the
    expected counts.
    """
    # The keyword ``count_null`` is part of the primitive's interface (see
    # ``Count(count_null=True)`` below) and must not be renamed.
    def count_func(values, count_null=False):
        if len(values) == 0:
            return 0
        # With count_null set, nulls are filled so that they get counted.
        to_count = values.fillna(0) if count_null else values
        return to_count.count()

    def count_generate_name(self, base_feature_names, relationship_path_name,
                            parent_entity_id, where_str, use_prev_str):
        pieces = (relationship_path_name, where_str, use_prev_str)
        return u"COUNT(%s%s%s)" % pieces

    Count = make_agg_primitive(
        count_func,
        [[Index], [Variable]],
        Numeric,
        name="count",
        stack_on_self=False,
        cls_attributes={"generate_name": count_generate_name},
    )
    count_null = ft.Feature(
        es['log']['value'],
        parent_entity=es['sessions'],
        primitive=Count(count_null=True),
    )
    feature_matrix = ft.calculate_feature_matrix([count_null], entityset=es)
    values = [5, 4, 1, 2, 3, 2]
    comparison = values == feature_matrix[count_null.get_name()]
    assert comparison.all()

Beispiel #14

0

Datei anzeigen

Datei: test_primitive_base.py Projekt: xxh422735676/featuretools

def test_uses_calc_time():
    """Call a ``uses_calc_time`` primitive directly with an explicit time.

    The first datetime in the series is one day before the supplied time,
    so the primitive must return 86400 seconds.
    """
    # ``time`` keeps its name because it is passed by keyword below.
    def time_since_last(values, time=None):
        return (time - values.iloc[0]).total_seconds()

    TimeSinceLast = make_agg_primitive(
        time_since_last,
        [DatetimeTimeIndex],
        Numeric,
        name="time_since_last",
        uses_calc_time=True,
    )
    primitive = TimeSinceLast()
    datetimes = pd.Series([datetime(2015, 6, 7), datetime(2015, 6, 6)])
    elapsed = primitive(datetimes, time=datetime(2015, 6, 8))
    assert 86400.0 == elapsed

Beispiel #15

0

Datei anzeigen

Datei: test_agg_feats.py Projekt: rgolovnya/featuretools

def test_agg_same_method_name(es):
    """
        Pandas tells aggregations apart by the name of the applied function, so
        two primitives built from identically named functions clash when used on
        the same column. The name property provides the workaround tested here.
    """

    def assert_both_columns(sum_primitive, max_primitive):
        # Apply both primitives to log.value for customers and check that the
        # matrix contains both columns in the requested order.
        f_sum = sum_primitive(es["log"]["value"], es["customers"])
        f_max = max_primitive(es["log"]["value"], es["customers"])
        matrix = ft.calculate_feature_matrix([f_sum, f_max], entityset=es)
        expected_columns = [f_sum.get_name(), f_max.get_name()]
        assert matrix.columns.tolist() == expected_columns

    # test with normally defined functions
    # (the shared name ``custom_primitive`` is intentional)
    def custom_primitive(x):
        return x.sum()

    sum_from_def = make_agg_primitive(
        custom_primitive,
        input_types=[Numeric],
        return_type=Numeric,
        name="sum",
    )

    def custom_primitive(x):  # noqa: F811
        return x.max()

    max_from_def = make_agg_primitive(
        custom_primitive,
        input_types=[Numeric],
        return_type=Numeric,
        name="max",
    )
    assert_both_columns(sum_from_def, max_from_def)

    # test with lambdas
    sum_from_lambda = make_agg_primitive(
        lambda x: x.sum(),
        input_types=[Numeric],
        return_type=Numeric,
        name="sum",
    )
    max_from_lambda = make_agg_primitive(
        lambda x: x.max(),
        input_types=[Numeric],
        return_type=Numeric,
        name="max",
    )
    assert_both_columns(sum_from_lambda, max_from_lambda)

Beispiel #16

0

Datei anzeigen

def test_custom_primitive_default_kwargs(es):
    """Check that keyword arguments given to a custom primitive are stored.

    Two features are built from the same primitive class with different
    ``n`` values; each feature must keep its own base feature and its own
    ``n`` in ``primitive.kwargs``.
    """
    def sum_n_times(numeric, n=1):
        # ``np.float`` was only an alias of the builtin ``float``; it was
        # deprecated in NumPy 1.20 and removed in 1.24, so the builtin is
        # used directly.
        return np.nan_to_num(numeric).sum(dtype=float) * n

    SumNTimes = make_agg_primitive(function=sum_n_times,
                                   input_types=[Numeric],
                                   return_type=Numeric)

    sum_n_1_n = 1
    sum_n_1_base_f = ft.Feature(es['log']['value'])
    sum_n_1 = ft.Feature([sum_n_1_base_f],
                         parent_entity=es['sessions'],
                         primitive=SumNTimes(n=sum_n_1_n))
    sum_n_2_n = 2
    sum_n_2_base_f = ft.Feature(es['log']['value_2'])
    sum_n_2 = ft.Feature([sum_n_2_base_f],
                         parent_entity=es['sessions'],
                         primitive=SumNTimes(n=sum_n_2_n))
    assert sum_n_1_base_f == sum_n_1.base_features[0]
    assert sum_n_1_n == sum_n_1.primitive.kwargs['n']
    assert sum_n_2_base_f == sum_n_2.base_features[0]
    assert sum_n_2_n == sum_n_2.primitive.kwargs['n']

Beispiel #17

0

Datei anzeigen

Datei: test_agg_feats.py Projekt: rgolovnya/featuretools

def test_custom_primitive_default_kwargs(es):
    """Check that keyword arguments given to a custom primitive are stored.

    The primitive class is called directly to build two features with
    different ``n`` values; each must keep its own base feature and its own
    ``n`` in ``kwargs``.
    """
    def sum_n_times(numeric, n=1):
        # ``np.float`` was only an alias of the builtin ``float``; it was
        # deprecated in NumPy 1.20 and removed in 1.24, so the builtin is
        # used directly.
        return np.nan_to_num(numeric).sum(dtype=float) * n

    SumNTimes = make_agg_primitive(function=sum_n_times,
                                   input_types=[Numeric],
                                   return_type=Numeric)

    sum_n_1_n = 1
    sum_n_1_base_f = Feature(es['log']['value'])
    sum_n_1 = SumNTimes([sum_n_1_base_f], es['sessions'], n=sum_n_1_n)
    sum_n_2_n = 2
    sum_n_2_base_f = Feature(es['log']['value_2'])
    sum_n_2 = SumNTimes([sum_n_2_base_f], es['sessions'], n=sum_n_2_n)
    assert sum_n_1_base_f == sum_n_1.base_features[0]
    assert sum_n_1_n == sum_n_1.kwargs['n']
    assert sum_n_2_base_f == sum_n_2.base_features[0]
    assert sum_n_2_n == sum_n_2.kwargs['n']

Beispiel #18

0

Datei anzeigen

def test_custom_primitive_default_kwargs(es):
    """Check that keyword arguments given to a custom primitive are stored.

    Two features are built from the same schema-based primitive class with
    different ``n`` values; each feature must keep its own base feature and
    its own ``n`` in ``primitive.kwargs``.
    """
    def sum_n_times(numeric, n=1):
        # ``np.float`` was only an alias of the builtin ``float``; it was
        # deprecated in NumPy 1.20 and removed in 1.24, so the builtin is
        # used directly.
        return np.nan_to_num(numeric).sum(dtype=float) * n

    SumNTimes = make_agg_primitive(
        function=sum_n_times,
        input_types=[ColumnSchema(semantic_tags={'numeric'})],
        return_type=ColumnSchema(semantic_tags={'numeric'}))

    sum_n_1_n = 1
    sum_n_1_base_f = ft.Feature(es['log'].ww['value'])
    sum_n_1 = ft.Feature([sum_n_1_base_f],
                         parent_dataframe_name='sessions',
                         primitive=SumNTimes(n=sum_n_1_n))
    sum_n_2_n = 2
    sum_n_2_base_f = ft.Feature(es['log'].ww['value_2'])
    sum_n_2 = ft.Feature([sum_n_2_base_f],
                         parent_dataframe_name='sessions',
                         primitive=SumNTimes(n=sum_n_2_n))
    assert sum_n_1_base_f == sum_n_1.base_features[0]
    assert sum_n_1_n == sum_n_1.primitive.kwargs['n']
    assert sum_n_2_base_f == sum_n_2.base_features[0]
    assert sum_n_2_n == sum_n_2.primitive.kwargs['n']

Beispiel #19

0

Datei anzeigen

Datei: test_agg_feats.py Projekt: rgolovnya/featuretools

def test_count_null_and_make_agg_primitive(es):
    """Create a ``count`` primitive with a ``count_null`` flag and evaluate it.

    The primitive class is called directly with ``count_null=True`` and its
    per-session results are compared with the expected counts.
    """
    # ``count_null`` is passed by keyword further down; keep the name.
    def count_func(values, count_null=False):
        if len(values) == 0:
            return 0
        # Filling nulls first makes them count as values.
        to_count = values.fillna(0) if count_null else values
        return to_count.count()

    def count_generate_name(self):
        # Suffixes are computed first, then the child entity id is read.
        suffixes = (self._where_str(), self._use_prev_str())
        return u"COUNT(%s%s%s)" % ((self.child_entity.id,) + suffixes)

    Count = make_agg_primitive(
        count_func,
        [[Index], [Variable]],
        Numeric,
        name="count",
        stack_on_self=False,
        cls_attributes={"generate_name": count_generate_name},
    )
    count_null = Count(es['log']['value'], es['sessions'], count_null=True)
    feature_matrix = ft.calculate_feature_matrix([count_null], entityset=es)
    values = [5, 4, 1, 2, 3, 2]
    comparison = values == feature_matrix[count_null.get_name()]
    assert comparison.all()

Beispiel #20

0

Datei anzeigen

def test_agg_same_method_name(es):
    """
        Pandas distinguishes aggregations by the applied function's name. Two
        primitives wrapping same-named functions therefore collide when applied
        to the same column. The name property is the workaround, tested here.
    """
    # TODO: Update to work with Dask and Koalas
    if es.dataframe_type != Library.PANDAS.value:
        pytest.xfail(
            "Cannot use primitives made with make_agg_primitives with Dask or Koalas EntitySets"
        )

    def numeric_schema():
        # A fresh schema per use, matching the hand-written calls.
        return ColumnSchema(semantic_tags={'numeric'})

    def make_numeric_primitive(function, name):
        return make_agg_primitive(function,
                                  input_types=[numeric_schema()],
                                  return_type=numeric_schema(),
                                  name=name)

    def assert_both_columns(sum_primitive, max_primitive):
        # Both primitives target log.value aggregated to customers; the
        # matrix must list both columns in the requested order.
        f_sum = ft.Feature(es["log"].ww["value"],
                           parent_dataframe_name="customers",
                           primitive=sum_primitive)
        f_max = ft.Feature(es["log"].ww["value"],
                           parent_dataframe_name="customers",
                           primitive=max_primitive)
        matrix = ft.calculate_feature_matrix([f_sum, f_max], entityset=es)
        assert matrix.columns.tolist() == [f_sum.get_name(), f_max.get_name()]

    # test with normally defined functions
    # (both are intentionally named ``custom_primitive``)
    def custom_primitive(x):
        return x.sum()

    sum_from_def = make_numeric_primitive(custom_primitive, "sum")

    def custom_primitive(x):  # noqa: F811
        return x.max()

    max_from_def = make_numeric_primitive(custom_primitive, "max")
    assert_both_columns(sum_from_def, max_from_def)

    # test with lambdas
    sum_from_lambda = make_numeric_primitive(lambda x: x.sum(), "sum")
    max_from_lambda = make_numeric_primitive(lambda x: x.max(), "max")
    assert_both_columns(sum_from_lambda, max_from_lambda)

Beispiel #21

0

Datei anzeigen

Datei: custom_sum.py Projekt: john-rice/featuretools

from woodwork.column_schema import ColumnSchema

from featuretools.primitives.base import make_agg_primitive

# Aggregation primitive created at import time from a lambda that applies the
# builtin ``sum`` to its input. It is registered under the name "CustomSum"
# and both its input and its return type are numeric-tagged column schemas.
CustomSum = make_agg_primitive(
    lambda x: sum(x),
    name="CustomSum",
    input_types=[ColumnSchema(semantic_tags={'numeric'})],
    return_type=ColumnSchema(semantic_tags={'numeric'}))

Beispiel #22

0

Datei anzeigen

Datei: custom_mean.py Projekt: zwcdp/featuretools

from featuretools.primitives.base import make_agg_primitive
from featuretools.variable_types import Numeric

# Aggregation primitive created at import time from a lambda computing
# ``sum(x) / len(x)``; registered as "CustomMean" with Numeric input and
# Numeric return type.
# NOTE(review): the lambda divides by len(x), so an empty input would raise
# ZeroDivisionError unless the caller never passes one - confirm.
CustomMean = make_agg_primitive(lambda x: sum(x) / len(x),
                                name="CustomMean",
                                input_types=[Numeric],
                                return_type=Numeric)

Beispiel #23

0

Datei anzeigen

from featuretools.primitives.base import make_agg_primitive
from featuretools.variable_types import Numeric

# Aggregation primitive created at import time from a lambda that applies the
# builtin ``max`` to its input; registered as "CustomMax" with Numeric input
# and Numeric return type.
CustomMax = make_agg_primitive(lambda x: max(x),
                               name="CustomMax",
                               input_types=[Numeric],
                               return_type=Numeric)

Beispiel #24

0

Datei anzeigen

from featuretools.primitives.base import make_agg_primitive
from featuretools.variable_types import Numeric

# Aggregation primitive created at import time from a lambda that applies the
# builtin ``sum`` to its input; registered as "CustomSum" with Numeric input
# and Numeric return type.
CustomSum = make_agg_primitive(lambda x: sum(x),
                               name="CustomSum",
                               input_types=[Numeric],
                               return_type=Numeric)