Ejemplo n.º 1
0
def test_time_since_last_custom(es):
    """Check a custom calc-time aggregation built with ``make_agg_primitive``.

    A ``time_since_last`` primitive is created with ``uses_calc_time=True``,
    evaluated for instance ids 0-2 at a fixed cutoff and compared against
    known second counts.  The same function must then be rejected with a
    ``ValueError`` when ``uses_calc_time=False``.
    """
    def time_since_last(values, time=None):
        # Seconds between the first entry of ``values`` and ``time``.
        return (time - values.iloc[0]).total_seconds()

    TimeSinceLast = make_agg_primitive(
        time_since_last,
        [DatetimeTimeIndex],
        Numeric,
        name="time_since_last",
        uses_calc_time=True,
    )
    feature = TimeSinceLast(es["log"]["datetime"], es["customers"])
    matrix = ft.calculate_feature_matrix(
        [feature],
        entityset=es,
        instance_ids=[0, 1, 2],
        cutoff_time=datetime(2015, 6, 8),
    )

    expected_seconds = [131376600, 131289600, 131287800]
    # note: must round to nearest second
    observed_seconds = matrix[feature.get_name()].round().values
    assert all(observed_seconds == expected_seconds)

    # Without uses_calc_time the ``time`` argument name is refused.
    error_text = "'time' is a restricted keyword.  Please use a different keyword."
    with pytest.raises(ValueError, match=error_text):
        make_agg_primitive(
            time_since_last,
            [DatetimeTimeIndex],
            Numeric,
            uses_calc_time=False,
        )
Ejemplo n.º 2
0
def test_custom_primitive_time_as_arg(es):
    """Check a custom aggregation whose function takes ``time`` positionally.

    The primitive name is expected to default to the function name, the
    computed values must match known second counts, and building the
    primitive with ``uses_calc_time=False`` must raise ``ValueError``.
    """
    def time_since_last(values, time):
        # Seconds between the first entry of ``values`` and ``time``.
        return (time - values.iloc[0]).total_seconds()

    TimeSinceLast = make_agg_primitive(
        time_since_last,
        [DatetimeTimeIndex],
        Numeric,
        uses_calc_time=True,
    )
    assert TimeSinceLast.name == "time_since_last"

    feature = TimeSinceLast(es["log"]["datetime"], es["customers"])
    matrix = calculate_feature_matrix(
        [feature],
        entityset=es,
        instance_ids=[0, 1, 2],
        cutoff_time=datetime(2015, 6, 8),
    )

    expected_seconds = [131376600, 131289600, 131287800]
    # note: must round to nearest second
    observed_seconds = matrix[feature.get_name()].round().values
    assert all(observed_seconds == expected_seconds)

    with pytest.raises(ValueError):
        make_agg_primitive(
            time_since_last,
            [DatetimeTimeIndex],
            Numeric,
            uses_calc_time=False,
        )
Ejemplo n.º 3
0
def test_custom_primitive_time_as_arg(es):
    """Exercise a custom aggregation that receives the calculation time.

    Verifies the default primitive name, the values produced for instance
    ids 0-2 at the 2015-06-08 cutoff, and that ``uses_calc_time=False`` is
    rejected with ``ValueError`` for a function that takes ``time``.
    """
    def time_since_last(values, time):
        # Elapsed seconds from the first entry of ``values`` up to ``time``.
        elapsed = time - values.iloc[0]
        return elapsed.total_seconds()

    input_types = [DatetimeTimeIndex]
    TimeSinceLast = make_agg_primitive(time_since_last,
                                       input_types,
                                       Numeric,
                                       uses_calc_time=True)
    assert TimeSinceLast.name == "time_since_last"

    time_feature = TimeSinceLast(es["log"]["datetime"], es["customers"])
    cutoff = datetime(2015, 6, 8)
    feature_matrix = calculate_feature_matrix([time_feature],
                                              entityset=es,
                                              instance_ids=[0, 1, 2],
                                              cutoff_time=cutoff)

    # note: must round to nearest second
    rounded = feature_matrix[time_feature.get_name()].round().values
    assert all(rounded == [131376600, 131289600, 131287800])

    with pytest.raises(ValueError):
        make_agg_primitive(time_since_last,
                           [DatetimeTimeIndex],
                           Numeric,
                           uses_calc_time=False)
Ejemplo n.º 4
0
def get_feature_matrix(df, n_jobs=1, verbose=True):
    """Run deep feature synthesis over ``df``, aggregated per ``bookingID``.

    ``df`` is loaded as a ``records`` entity with a generated ``id`` index,
    a ``bookings`` entity is split off on ``bookingID``, and ``ft.dfs`` is
    called on ``bookings`` with eight custom diff-based aggregations plus
    the built-in ``count``/``mean``/``max``/``min``/``std``.

    Args:
        df: DataFrame to load; the columns named in ``numeric_columns`` are
            declared numeric and ``bookingID`` is used as the booking key.
        n_jobs: Forwarded to ``ft.dfs``.
        verbose: Forwarded to ``ft.dfs``.

    Returns:
        The result of ``ft.dfs`` for the ``bookings`` target entity.
    """
    numeric_columns = (
        'Accuracy',
        'Bearing',
        'acceleration_x',
        'acceleration_y',
        'acceleration_z',
        'gyro_x',
        'gyro_y',
        'gyro_z',
        'second',
        'Speed',
    )
    entity_set = ft.EntitySet('safety_data')
    entity_set.entity_from_dataframe(
        entity_id='records',
        index='id',
        make_index=True,
        dataframe=df,
        variable_types={column: vtypes.Numeric for column in numeric_columns},
    )
    entity_set.normalize_entity(
        base_entity_id='records',
        new_entity_id='bookings',
        index='bookingID',
    )

    # Custom aggregation functions, wrapped in the order they are listed.
    diff_functions = (
        mean_diff,
        max_diff,
        min_diff,
        std_diff,
        mean_diff_abs,
        max_diff_abs,
        min_diff_abs,
        std_diff_abs,
    )
    custom_primitives = [
        make_agg_primitive(function=diff_function,
                           input_types=[Numeric],
                           return_type=Numeric)
        for diff_function in diff_functions
    ]
    builtin_primitives = ['count', 'mean', 'max', 'min', 'std']

    return ft.dfs(
        entityset=entity_set,
        target_entity='bookings',
        agg_primitives=custom_primitives + builtin_primitives,
        n_jobs=n_jobs,
        verbose=verbose,
    )
 def _make_agg_primitives(self):
     """Create the two calc-time aggregation primitives and store them on ``self``.

     Sets ``self.days_since_last`` and ``self.month_of_cutoff_point`` (both
     built over a ``DatetimeTimeIndex`` input with a ``Numeric`` result) and
     ``self.user_defined_agg_primitives``.
     """
     days_primitive = make_agg_primitive(
         name='days_since_last',
         description="Time since last related instance",
         function=self._days_since_last,
         input_types=[DatetimeTimeIndex],
         return_type=Numeric,
         uses_calc_time=True,
     )
     self.days_since_last = days_primitive

     month_primitive = make_agg_primitive(
         name='month_of_cutoff_point',
         description="month_of_cutoff_point",
         function=self._month_of_cutoff_point,
         input_types=[DatetimeTimeIndex],
         return_type=Numeric,
         uses_calc_time=True,
     )
     self.month_of_cutoff_point = month_primitive

     # NOTE(review): only 'month_of_cutoff_point' is listed here, not
     # 'days_since_last' -- confirm that is intentional.
     self.user_defined_agg_primitives = ['month_of_cutoff_point']
Ejemplo n.º 6
0
def test_pickle_features_with_custom_primitive(es):
    """Round-trip features that use a custom primitive through save/load.

    Builds features for ``sessions`` with a ``NewMean`` primitive, saves
    them next to this test file, reloads them, and checks that hashes and
    entitysets match and that the feature file is smaller than both
    ``getsize(es)`` and the pickled entityset.

    The two files written by the test are removed in a ``finally`` block so
    that a failing assertion does not leave them behind.
    """
    NewMean = make_agg_primitive(
        np.nanmean,
        name="NewMean",
        input_types=[Numeric],
        return_type=Numeric,
        description="Calculate means ignoring nan values")
    dfs_obj = DeepFeatureSynthesis(target_entity_id='sessions',
                                   entityset=es,
                                   agg_primitives=[Last, Mean, NewMean],
                                   trans_primitives=[],
                                   max_features=20)

    features_no_pickle = dfs_obj.build_features()
    assert any(isinstance(feat, NewMean) for feat in features_no_pickle)

    dir_path = os.path.dirname(os.path.realpath(__file__))
    filepath = os.path.join(dir_path, 'test_feature')
    es_filepath = os.path.join(dir_path, 'test_entityset')

    try:
        # pickle entityset
        save_obj_pickle(es, es_filepath)

        ft.save_features(features_no_pickle, filepath)
        features_pickle = ft.load_features(filepath)
        for feat_1, feat_2 in zip(features_no_pickle, features_pickle):
            assert feat_1.hash() == feat_2.hash()
            assert feat_1.entityset == feat_2.entityset

        # file is smaller than entityset in memory
        assert os.path.getsize(filepath) < getsize(es)

        # file is smaller than entityset pickled
        assert os.path.getsize(filepath) < os.path.getsize(es_filepath)
    finally:
        # Either file may be missing if an earlier step raised, so only
        # remove what was actually written.
        for written_path in (filepath, es_filepath):
            if os.path.exists(written_path):
                os.remove(written_path)
def test_pickle_features_with_custom_primitive(es):
    """Round-trip features that use a custom primitive through save/load.

    Builds features for ``sessions`` with a ``NewMean`` primitive, saves
    them next to this test file, reloads them, and checks that hashes and
    entitysets match and that the feature file is smaller than both
    ``getsize(es)`` and the pickled entityset.

    The two files written by the test are removed in a ``finally`` block so
    that a failing assertion does not leave them behind.
    """
    NewMean = make_agg_primitive(
        np.nanmean,
        name="NewMean",
        input_types=[Numeric],
        return_type=Numeric,
        description="Calculate means ignoring nan values")
    dfs_obj = DeepFeatureSynthesis(target_entity_id='sessions',
                                   entityset=es,
                                   agg_primitives=[Last, Mean, NewMean],
                                   trans_primitives=[],
                                   max_features=20)

    features_no_pickle = dfs_obj.build_features()
    assert any(isinstance(feat, NewMean) for feat in features_no_pickle)

    dir_path = os.path.dirname(os.path.realpath(__file__))
    filepath = os.path.join(dir_path, 'test_feature')
    es_filepath = os.path.join(dir_path, 'test_entityset')

    try:
        # pickle entityset
        save_obj_pickle(es, es_filepath)

        ft.save_features(features_no_pickle, filepath)
        features_pickle = ft.load_features(filepath)
        for feat_1, feat_2 in zip(features_no_pickle, features_pickle):
            assert feat_1.hash() == feat_2.hash()
            assert feat_1.entityset == feat_2.entityset

        # file is smaller than entityset in memory
        assert os.path.getsize(filepath) < getsize(es)

        # file is smaller than entityset pickled
        assert os.path.getsize(filepath) < os.path.getsize(es_filepath)
    finally:
        # Either file may be missing if an earlier step raised, so only
        # remove what was actually written.
        for written_path in (filepath, es_filepath):
            if os.path.exists(written_path):
                os.remove(written_path)
Ejemplo n.º 8
0
def test_count_null_and_make_agg_primitive(es):
    """Build a ``count`` primitive with a ``count_null`` option and check its output.

    The primitive is created through ``make_agg_primitive`` with a custom
    ``generate_name`` and applied to ``log.value`` per session with
    ``count_null=True``; the resulting column is compared to known counts.
    """
    def count_func(values, count_null=False):
        if not len(values):
            return 0

        # With count_null, nulls are filled first so they are counted too.
        counted = values.fillna(0) if count_null else values
        return counted.count()

    def count_generate_name(self):
        where_part = self._where_str()
        use_prev_part = self._use_prev_str()
        name_parts = (self.child_entity.id, where_part, use_prev_part)
        return u"COUNT(%s%s%s)" % name_parts

    Count = make_agg_primitive(
        count_func,
        [[Index], [Variable]],
        Numeric,
        name="count",
        stack_on_self=False,
        cls_attributes={"generate_name": count_generate_name},
    )
    null_count_feature = Count(es['log']['value'],
                               es['sessions'],
                               count_null=True)
    matrix = ft.calculate_feature_matrix([null_count_feature], entityset=es)

    expected_counts = [5, 4, 1, 2, 3, 2]
    observed = matrix[null_count_feature.get_name()]
    assert (expected_counts == observed).all()
def test_pickle_features_with_custom_primitive(es):
    """Round-trip features that use a custom ``NewMax`` primitive through save/load.

    Builds feature definitions for ``sessions`` with ``ft.dfs``, saves them
    next to this test file, reloads them, and checks that hashes and
    entitysets match and that the feature file is smaller than both
    ``asizeof(es)`` and the pickled entityset.

    The two files written by the test are removed in a ``finally`` block so
    that a failing assertion does not leave them behind.
    """
    # NOTE(review): the description mentions means although the function
    # takes a maximum; it looks copy-pasted -- confirm before changing it.
    NewMax = make_agg_primitive(
        lambda x: max(x),
        name="NewMax",
        input_types=[Numeric],
        return_type=Numeric,
        description="Calculate means ignoring nan values")

    features_no_pickle = ft.dfs(target_entity='sessions',
                                entityset=es,
                                agg_primitives=["Last", "Mean", NewMax],
                                features_only=True)

    assert any(
        isinstance(feat.primitive, NewMax) for feat in features_no_pickle)

    dir_path = os.path.dirname(os.path.realpath(__file__))
    filepath = os.path.join(dir_path, 'test_feature')
    es_filepath = os.path.join(dir_path, 'test_entityset')

    try:
        # pickle entityset
        save_obj_pickle(es, es_filepath)

        ft.save_features(features_no_pickle, filepath)
        features_pickle = ft.load_features(filepath)
        for feat_1, feat_2 in zip(features_no_pickle, features_pickle):
            assert feat_1.hash() == feat_2.hash()
            assert feat_1.entityset == feat_2.entityset

        # file is smaller than entityset in memory
        assert os.path.getsize(filepath) < asizeof(es)

        # file is smaller than entityset pickled
        assert os.path.getsize(filepath) < os.path.getsize(es_filepath)
    finally:
        # Either file may be missing if an earlier step raised, so only
        # remove what was actually written.
        for written_path in (filepath, es_filepath):
            if os.path.exists(written_path):
                os.remove(written_path)
Ejemplo n.º 10
0
def test_custom_primitive_multiple_inputs(es):
    """Check a two-input custom aggregation, with and without a WHERE clause.

    ``mean_sunday`` is wrapped as a primitive taking a numeric and a
    datetime input, run through ``dfs`` for ``sessions``, and compared with
    known values; the run is then repeated after adding interesting values
    so that the ``priority_level = 0`` WHERE variant can be checked as well.
    """
    def mean_sunday(numeric, datetime):
        '''
        Finds the mean of non-null values of a feature that occurred on Sundays
        '''
        weekdays = pd.DatetimeIndex(datetime).weekday.values
        frame = pd.DataFrame({'numeric': numeric, 'time': weekdays})
        sunday_rows = frame[frame['time'] == 6]
        return sunday_rows['numeric'].mean()

    def same_or_both_null(left, right):
        # Nulls never compare equal, so two nulls are accepted explicitly.
        return (pd.isnull(left) and pd.isnull(right)) or (left == right)

    MeanSunday = make_agg_primitive(function=mean_sunday,
                                    input_types=[Numeric, Datetime],
                                    return_type=Numeric)

    fm, features = dfs(entityset=es,
                       target_entity="sessions",
                       agg_primitives=[MeanSunday],
                       trans_primitives=[])
    expected_plain = pd.Series([None, None, None, 2.5, 7, None])
    plain_column = fm["MEAN_SUNDAY(log.value, datetime)"]
    for observed, expected in zip(plain_column, expected_plain):
        assert same_or_both_null(observed, expected)

    es.add_interesting_values()
    expected_priority_0 = pd.Series([None, None, None, 2.5, 0, None])
    fm, features = dfs(entityset=es,
                       target_entity="sessions",
                       agg_primitives=[MeanSunday],
                       trans_primitives=[],
                       where_primitives=[MeanSunday])
    where_feat = "MEAN_SUNDAY(log.value, datetime WHERE priority_level = 0)"
    for observed, expected in zip(fm[where_feat], expected_priority_0):
        assert same_or_both_null(observed, expected)
Ejemplo n.º 11
0
def test_custom_primitive_multiple_inputs(es):
    """Check a two-input custom aggregation, with and without a WHERE clause.

    ``mean_sunday`` is wrapped as a primitive taking a numeric and a
    datetime input, run through ``ft.dfs`` for ``sessions``, and compared
    with known values; the run is repeated after adding interesting values
    so that the ``priority_level = 0`` WHERE variant can be checked as well.
    """
    def mean_sunday(numeric, datetime):
        '''
        Finds the mean of non-null values of a feature that occurred on Sundays
        '''
        weekdays = pd.DatetimeIndex(datetime).weekday.values
        frame = pd.DataFrame({'numeric': numeric, 'time': weekdays})
        on_sunday = frame['time'] == 6
        return frame[on_sunday]['numeric'].mean()

    def assert_columns_match(observed_values, expected_values):
        # Two nulls count as a match; otherwise values must be equal.
        for observed, expected in zip(observed_values, expected_values):
            assert ((pd.isnull(observed) and pd.isnull(expected))
                    or (observed == expected))

    MeanSunday = make_agg_primitive(function=mean_sunday,
                                    input_types=[Numeric, Datetime],
                                    return_type=Numeric)

    fm, features = ft.dfs(entityset=es,
                          target_entity="sessions",
                          agg_primitives=[MeanSunday],
                          trans_primitives=[])
    mean_sunday_value = pd.Series([None, None, None, 2.5, 7, None])
    assert_columns_match(fm["MEAN_SUNDAY(log.value, datetime)"],
                         mean_sunday_value)

    es.add_interesting_values()
    mean_sunday_value_priority_0 = pd.Series([None, None, None, 2.5, 0, None])
    fm, features = ft.dfs(entityset=es,
                          target_entity="sessions",
                          agg_primitives=[MeanSunday],
                          trans_primitives=[],
                          where_primitives=[MeanSunday])
    where_feat = "MEAN_SUNDAY(log.value, datetime WHERE priority_level = 0)"
    assert_columns_match(fm[where_feat], mean_sunday_value_priority_0)
Ejemplo n.º 12
0
def custom_aggregation(func, *args):
    """Wrap ``func`` as a Numeric -> Numeric aggregation primitive with ``args`` bound.

    The primitive calls ``func(x, *args)`` and is named ``func.__name__``
    immediately followed by the ``args`` joined with underscores.

    Args:
        func: Callable taking the values followed by ``*args``.
        *args: Extra positional arguments passed to ``func`` on every call.

    Returns:
        The result of ``make_agg_primitive`` for the wrapped function.
    """
    arg_suffix = "_".join(str(arg) for arg in args)
    primitive_name = func.__name__ + arg_suffix
    return make_agg_primitive(
        function=lambda x: func(x, *args),
        input_types=[Numeric],
        return_type=Numeric,
        name=primitive_name,
    )
Ejemplo n.º 13
0
def test_warns_with_unused_custom_primitives(pd_es):
    """Check when DFS warns about custom primitives that end up unused.

    For one custom transform primitive and one custom aggregation
    primitive, a first ``dfs`` call must emit ``UnusedPrimitiveWarning``
    with the exact expected text, and a second call on another target
    entity must not emit that warning.

    The "no warning" cases promote ``UnusedPrimitiveWarning`` to an error
    through the ``warnings`` module.  The earlier ``pytest.warns(None)``
    form is deprecated in pytest and never inspected the recorded warnings,
    so it checked nothing.
    """
    import warnings

    def unused_warning_text(primitive_kind, primitive_name):
        # Exact message expected from DFS (wording is matched verbatim).
        return (
            "Some specified primitives were not used during DFS:\n"
            "  %s: ['%s']\n"
            "This may be caused by a using a value of max_depth that is too small, not setting interesting values, "
            "or it may indicate no compatible variable types for the primitive were found in the data."
        ) % (primitive_kind, primitive_name)

    def above_ten(column):
        return column > 10

    AboveTen = make_trans_primitive(function=above_ten,
                                    input_types=[Numeric],
                                    return_type=Numeric)

    trans_primitives = [AboveTen]

    with pytest.warns(UnusedPrimitiveWarning) as record:
        dfs(entityset=pd_es,
            target_entity='sessions',
            trans_primitives=trans_primitives,
            max_depth=1)

    assert record[0].message.args[0] == unused_warning_text(
        'trans_primitives', 'above_ten')

    # Should not raise a warning
    with warnings.catch_warnings():
        warnings.simplefilter("error", UnusedPrimitiveWarning)
        dfs(entityset=pd_es,
            target_entity='customers',
            trans_primitives=trans_primitives,
            max_depth=1)

    def max_above_ten(column):
        return max(column) > 10

    MaxAboveTen = make_agg_primitive(function=max_above_ten,
                                     input_types=[Numeric],
                                     return_type=Numeric)

    agg_primitives = [MaxAboveTen]

    with pytest.warns(UnusedPrimitiveWarning) as record:
        dfs(entityset=pd_es,
            target_entity='stores',
            agg_primitives=agg_primitives,
            max_depth=1)

    assert record[0].message.args[0] == unused_warning_text(
        'agg_primitives', 'max_above_ten')

    # Should not raise a warning
    with warnings.catch_warnings():
        warnings.simplefilter("error", UnusedPrimitiveWarning)
        dfs(entityset=pd_es,
            target_entity='sessions',
            agg_primitives=agg_primitives,
            max_depth=1)
Ejemplo n.º 14
0
def test_agg_same_method_name(es):
    """
        Two primitives built from functions that share a function name must
        still produce separate columns.  Pandas keys aggregations on the
        function name, so it cannot tell such primitives apart on the same
        column; the work around relies on the primitive's name property,
        which is what this test exercises.
    """

    def columns_match(first, second):
        # Both features must come back as their own, correctly named column.
        matrix = ft.calculate_feature_matrix([first, second], entityset=es)
        return matrix.columns.tolist() == [first.get_name(),
                                           second.get_name()]

    # test with normally defined functions; both are deliberately called
    # ``custom_primitive``
    def custom_primitive(x):
        return x.sum()

    NamedSum = make_agg_primitive(custom_primitive,
                                  input_types=[Numeric],
                                  return_type=Numeric,
                                  name="sum")

    def custom_primitive(x):
        return x.max()

    NamedMax = make_agg_primitive(custom_primitive,
                                  input_types=[Numeric],
                                  return_type=Numeric,
                                  name="max")

    named_sum_feature = NamedSum(es["log"]["value"], es["customers"])
    named_max_feature = NamedMax(es["log"]["value"], es["customers"])
    assert columns_match(named_sum_feature, named_max_feature)

    # test with lambdas
    LambdaSum = make_agg_primitive(lambda x: x.sum(),
                                   input_types=[Numeric],
                                   return_type=Numeric,
                                   name="sum")
    LambdaMax = make_agg_primitive(lambda x: x.max(),
                                   input_types=[Numeric],
                                   return_type=Numeric,
                                   name="max")

    lambda_sum_feature = LambdaSum(es["log"]["value"], es["customers"])
    lambda_max_feature = LambdaMax(es["log"]["value"], es["customers"])
    assert columns_match(lambda_sum_feature, lambda_max_feature)
Ejemplo n.º 15
0
def test_pickle_features_with_custom_primitive(es):
    """Build features with a custom ``NewMax`` primitive and hand them to the pickle helper.

    Feature definitions are generated for ``sessions``; at least one must
    use ``NewMax`` before ``pickle_features_test_helper`` is called with
    ``asizeof(es)`` and the features.
    """
    # NOTE(review): the description mentions means although the function
    # takes a maximum -- looks copy-pasted; confirm before changing it.
    NewMax = make_agg_primitive(
        lambda x: max(x),
        name="NewMax",
        input_types=[Numeric],
        return_type=Numeric,
        description="Calculate means ignoring nan values",
    )

    features_original = ft.dfs(
        target_entity='sessions',
        entityset=es,
        agg_primitives=["Last", "Mean", NewMax],
        features_only=True,
    )

    uses_new_max = [isinstance(feat.primitive, NewMax)
                    for feat in features_original]
    assert any(uses_new_max)

    entityset_size = asizeof(es)
    pickle_features_test_helper(entityset_size, features_original)
Ejemplo n.º 16
0
def test_pickle_features_with_custom_primitive(pd_es, tmpdir):
    """Build features with a custom ``NewMax`` primitive and hand them to the pickle helper.

    Feature definitions are generated for the ``sessions`` dataframe; at
    least one must use ``NewMax`` before ``pickle_features_test_helper`` is
    called with ``asizeof(pd_es)``, the features and ``str(tmpdir)``.
    """
    # NOTE(review): the description mentions means although the function
    # takes a maximum -- looks copy-pasted; confirm before changing it.
    NewMax = make_agg_primitive(
        lambda x: max(x),
        name="NewMax",
        input_types=[ColumnSchema(semantic_tags={'numeric'})],
        return_type=ColumnSchema(semantic_tags={'numeric'}),
        description="Calculate means ignoring nan values",
    )

    features_original = ft.dfs(
        target_dataframe_name='sessions',
        entityset=pd_es,
        agg_primitives=["Last", "Mean", NewMax],
        features_only=True,
    )

    uses_new_max = [isinstance(feat.primitive, NewMax)
                    for feat in features_original]
    assert any(uses_new_max)

    entityset_size = asizeof(pd_es)
    pickle_features_test_helper(entityset_size, features_original, str(tmpdir))
Ejemplo n.º 17
0
def test_custom_primitive_default_kwargs(es):
    """Check that keyword arguments given to a custom primitive are kept on the feature.

    Two ``SumNTimes`` features are built with different ``n`` values; each
    must expose its own base feature and its own ``n`` through
    ``base_features`` and ``kwargs``.
    """
    def sum_n_times(numeric, n=1):
        # ``np.float`` was only an alias of the builtin ``float`` and has
        # been removed from NumPy; the builtin gives the same dtype.
        return np.nan_to_num(numeric).sum(dtype=float) * n

    SumNTimes = make_agg_primitive(function=sum_n_times,
                                   input_types=[Numeric],
                                   return_type=Numeric)

    sum_n_1_n = 1
    sum_n_1_base_f = Feature(es['log']['value'])
    sum_n_1 = SumNTimes([sum_n_1_base_f], es['sessions'], n=sum_n_1_n)
    sum_n_2_n = 2
    sum_n_2_base_f = Feature(es['log']['value_2'])
    sum_n_2 = SumNTimes([sum_n_2_base_f], es['sessions'], n=sum_n_2_n)
    assert sum_n_1_base_f == sum_n_1.base_features[0]
    assert sum_n_1_n == sum_n_1.kwargs['n']
    assert sum_n_2_base_f == sum_n_2.base_features[0]
    assert sum_n_2_n == sum_n_2.kwargs['n']
Ejemplo n.º 18
0
def test_custom_primitive_default_kwargs(es):
    """Check that keyword arguments given to a custom primitive are kept on the feature.

    Two ``SumNTimes`` features are built with different ``n`` values; each
    must expose its own base feature and its own ``n`` through
    ``base_features`` and ``kwargs``.
    """
    def sum_n_times(numeric, n=1):
        # ``np.float`` was only an alias of the builtin ``float`` and has
        # been removed from NumPy; the builtin gives the same dtype.
        return np.nan_to_num(numeric).sum(dtype=float) * n

    SumNTimes = make_agg_primitive(function=sum_n_times,
                                   input_types=[Numeric],
                                   return_type=Numeric)

    sum_n_1_n = 1
    sum_n_1_base_f = Feature(es['log']['value'])
    sum_n_1 = SumNTimes([sum_n_1_base_f], es['sessions'], n=sum_n_1_n)
    sum_n_2_n = 2
    sum_n_2_base_f = Feature(es['log']['value_2'])
    sum_n_2 = SumNTimes([sum_n_2_base_f], es['sessions'], n=sum_n_2_n)
    assert sum_n_1_base_f == sum_n_1.base_features[0]
    assert sum_n_1_n == sum_n_1.kwargs['n']
    assert sum_n_2_base_f == sum_n_2.base_features[0]
    assert sum_n_2_n == sum_n_2.kwargs['n']
Ejemplo n.º 19
0
def test_count_null_and_make_agg_primitive(es):
    """Build a ``count`` primitive with a ``count_null`` option and check its output.

    The primitive is created through ``make_agg_primitive`` with a custom
    ``generate_name`` and applied to ``log.value`` per session with
    ``count_null=True``; the resulting column is compared to known counts.
    """
    def count_func(values, count_null=False):
        if not len(values):
            return 0

        # With count_null, nulls are filled first so they are counted too.
        counted = values.fillna(0) if count_null else values
        return counted.count()

    def count_generate_name(self):
        where_part = self._where_str()
        use_prev_part = self._use_prev_str()
        name_parts = (self.child_entity.name, where_part, use_prev_part)
        return u"COUNT(%s%s%s)" % name_parts

    Count = make_agg_primitive(
        count_func,
        [[Index], [Variable]],
        Numeric,
        name="count",
        stack_on_self=False,
        cls_attributes={"generate_name": count_generate_name},
    )
    null_count_feature = Count(es['log']['value'],
                               es['sessions'],
                               count_null=True)
    matrix = calculate_feature_matrix([null_count_feature], entityset=es)

    expected_counts = [5, 4, 1, 2, 3, 2]
    observed = matrix[null_count_feature.get_name()]
    assert (expected_counts == observed).all()
Ejemplo n.º 20
0
#
# feature_matrix2, feature_defs2 = ft.dfs(entityset=es, target_entity="customers", agg_primitives=["count"],
#                                         trans_primitives=["month"], max_depth=1)
"""
自定义agg_primitives:
改写time since last,原函数为秒,现在改为小时输出
"""


def time_since_last_by_hour(values, time=None):
    # Hours elapsed between the last entry of ``values`` and ``time``.
    # ``values`` is indexed positionally (``.iloc[-1]``) and the difference
    # must support ``total_seconds()``.
    seconds_per_hour = 3600
    most_recent = values.iloc[-1]
    elapsed = time - most_recent
    return elapsed.total_seconds() / seconds_per_hour


# Aggregation primitive wrapping ``time_since_last_by_hour``: takes a
# DatetimeTimeIndex input and yields a Numeric value (hours).
Time_since_last_by_hour = make_agg_primitive(
    function=time_since_last_by_hour,
    input_types=[DatetimeTimeIndex],
    return_type=Numeric,
    uses_calc_time=True,
)
"""
自定义trans_primitives:
添加log e 的自然对数
"""
import numpy as np


def log(vals):
    # Natural (base-e) logarithm of ``vals``, computed with ``np.log``.
    natural_log = np.log(vals)
    return natural_log


# def generate_name(self, base_feature_names):
#     return "-(%s)" % (base_feature_names[0])
log = make_trans_primitive(
Ejemplo n.º 21
0
        end_flag = length // n * end
        # print(start_flag, end_flag)
        piece = new_s.iloc[start_flag:end_flag]
        # print(sum(piece))
        # print()
        if (sum(piece) > 0):
            count += 1
        start += 1
        end += 1
    return count


# Rebind ``rise_count`` from the plain function to the aggregation
# primitive that wraps it (Numeric input, Numeric result).
rise_count = make_agg_primitive(
    name="rise_count",
    description="Calculates the rise_count max of the value.",
    function=rise_count,
    input_types=[Numeric],
    return_type=Numeric,
)

# %%
"""
# 生成新的特征融合矩阵
# 可以根据target_entity的不同生成不同的融合特征矩阵
"""
feature_matrix, feature_defs = ft.dfs(
    entityset=es,
    target_entity="customers",
    #   agg_primitives=["median", "count", "num_unique", "max","avg_time_between", "n_most_common", max2nd, max3rd],
    agg_primitives=[rise_count],
    trans_primitives=["month"],
Ejemplo n.º 22
0
from woodwork.column_schema import ColumnSchema

from featuretools.primitives import make_agg_primitive

# Aggregation primitive returning the largest value of a numeric column.
CustomMax = make_agg_primitive(
    function=lambda x: max(x),
    input_types=[ColumnSchema(semantic_tags={'numeric'})],
    return_type=ColumnSchema(semantic_tags={'numeric'}),
    name="CustomMax",
)

# Aggregation primitive returning the total of a numeric column.
CustomSum = make_agg_primitive(
    function=lambda x: sum(x),
    input_types=[ColumnSchema(semantic_tags={'numeric'})],
    return_type=ColumnSchema(semantic_tags={'numeric'}),
    name="CustomSum",
)
Ejemplo n.º 23
0
def get_results(request):
    """Run deep feature synthesis over the CSV tables in ``demo_data`` and render a preview.

    The table/column/type configuration is read from cookies and the DFS
    settings from the POST form.  Every file in ``./demo_data`` whose name
    contains ``csv`` is loaded, the tables are merged into one frame, an
    entity set is built from it, ``ft.dfs`` is run, and the resulting
    matrix is written to ``./demo_data/result/all_features.csv``.

    Args:
        request: Request object exposing ``COOKIES`` (keys ``types_dict``,
            ``columns_dict``, ``target``) and ``POST`` (key ``max_depth``
            plus the lists ``agg_pri``, ``agg_pri_customer``,
            ``trans_pri`` and ``trans_pri_customer``).

    Returns:
        The rendered ``get_results.html`` response with a ``target_id``
        cookie set, or the rendered ``erro.html`` response carrying the
        exception if anything in the pipeline raises.
    """
    try:
        import os
        import re

        import featuretools as ft
        import numpy as np
        import pandas as pd
        from featuretools.primitives import make_trans_primitive, make_agg_primitive

        # Data-source parameters.
        # SECURITY NOTE(review): eval() on cookie values executes whatever
        # the client sent.  The type names below are also eval()'d, so this
        # cannot simply be swapped for a literal parser here -- replace
        # with an explicit name -> type lookup before exposing this view.
        types_dict = eval(request.COOKIES['types_dict'])
        columns_dict = eval(request.COOKIES['columns_dict'])
        target = request.COOKIES['target']

        # Pick the base entity: the table with the most Id-typed columns.
        # Tables are also ranked by that count (descending) to fix the
        # order in which they are merged later on.
        base_entity = ''
        base_index = ''
        max_count = 0
        id_counts = {}
        for table_name, type_names in types_dict.items():
            count = 0
            index = ''
            for type_name in type_names:
                if '.Id' in str(type_name):
                    count += 1
                if '.Index' in str(type_name):
                    index = type_name
            id_counts[table_name] = count
            if count > max_count:
                base_entity = table_name
                base_index = index
                max_count = count
        # sorted() is stable, so ties keep their original order.
        sorted_table_name = sorted(id_counts,
                                   key=id_counts.get,
                                   reverse=True)

        print("sorted_table_name\n", sorted_table_name)

        # Pair each table's columns with their types (positionally) and
        # look up the base entity's index column name.
        types_dict_list = []
        for key, values1, values2 in zip(columns_dict.keys(),
                                         columns_dict.values(),
                                         types_dict.values()):
            # NOTE(review): eval() resolves each type name and runs inside
            # the comprehension's scope -- confirm the names are reachable.
            types_dict_list.append(
                {k: eval(v)
                 for k, v in zip(values1, values2)})
            if key == base_entity:
                for k, v in zip(values2, values1):
                    if '.Index' in k:
                        base_index = v

        # Merge all per-table type dicts and collect every Index column.
        index_list = []
        total_type_dict = {}
        for each_dict in types_dict_list:
            total_type_dict.update(each_dict)
            for k, v in each_dict.items():
                if '.Index' in str(v):
                    index_list.append(k)
        print(index_list)

        # Load every CSV table.  Paths are built explicitly instead of
        # chdir()-ing into the directory, so a failed read cannot leave the
        # process in the wrong working directory.
        data_dir = os.path.join(os.getcwd(), "demo_data")
        os.makedirs(data_dir, exist_ok=True)
        csv_pattern = re.compile("csv")
        data = {}
        for file_name in os.listdir(data_dir):
            if csv_pattern.search(file_name):
                table_key = file_name.split(".")[0]
                data[table_key] = pd.read_csv(
                    os.path.join(data_dir, file_name))

        # TODO: the merge logic is simplistic; revisit how tables are joined.
        if not data:
            raise Exception("数据源为空,请检查数据源文件")
        if len(data) > 1:
            data_df = data.pop(sorted_table_name.pop(0))
            for table_name in sorted_table_name:
                data_df = data_df.merge(data[table_name])
        else:
            data_df = next(iter(data.values()))

        # Build the base entity from the merged frame.
        es = ft.EntitySet()
        es = es.entity_from_dataframe(
            entity_id=base_entity,
            dataframe=data_df,
            index=base_index,
            variable_types=total_type_dict)

        # Split every other table off the base entity.  Its Index column is
        # removed from the column list and passed as ``index``; the
        # remaining columns go along as additional variables.
        for k, v in columns_dict.items():
            if k == base_entity:
                continue
            index = ''
            for i in index_list:
                if i in v:
                    v.remove(i)
                    index = i
            es = es.normalize_entity(
                base_entity_id=base_entity,
                new_entity_id=k,
                index=index,
                additional_variables=v)

        # Custom aggregation primitive: time since the last value, reported
        # in hours instead of seconds.
        def time_since_last_by_hour(values, time=None):
            time_since = time - values.iloc[-1]
            return time_since.total_seconds() / 3600

        Time_since_last_by_hour = make_agg_primitive(
            function=time_since_last_by_hour,
            input_types=[ft.variable_types.DatetimeTimeIndex],
            return_type=ft.variable_types.Numeric,
            uses_calc_time=True)

        # Custom transform primitive: natural logarithm.
        def log(vals):
            return np.log(vals)

        log_primitive = make_trans_primitive(
            function=log,
            input_types=[ft.variable_types.Numeric],
            return_type=ft.variable_types.Numeric,
            description="Calculates the log of the value.",
            name="log")

        # Custom transform primitive: whether the value is positive.
        def is_positive(vals):
            return vals > 0

        is_positive_primitive = make_trans_primitive(
            function=is_positive,
            input_types=[ft.variable_types.Numeric],
            return_type=ft.variable_types.Boolean,
            description="Calculates if the value positive.",
            name="is_positive")

        # Model parameters submitted by the form.
        max_depth = request.POST['max_depth']
        agg_pri = request.POST.getlist('agg_pri')
        agg_pri_customer = request.POST.getlist('agg_pri_customer')
        trans_pri_customer = request.POST.getlist('trans_pri_customer')
        trans_pri = request.POST.getlist('trans_pri')

        pd.set_option('display.max_columns', 20)

        print(trans_pri_customer)
        # Append the custom primitives that were ticked in the form.
        if 'Time_since_last_by_hour' in agg_pri_customer:
            agg_pri.append(Time_since_last_by_hour)
        if 'log_e' in trans_pri_customer:
            trans_pri.append(log_primitive)
        if 'is_positive' in trans_pri_customer:
            trans_pri.append(is_positive_primitive)
        print("+++++++++++++++++++++++++++++")

        print(trans_pri)
        print("+++++++++++++++++++++++++++++")

        # Generate the feature matrix.
        feature_matrix, feature_defs = ft.dfs(entityset=es,
                                              target_entity=target,
                                              agg_primitives=agg_pri,
                                              trans_primitives=trans_pri,
                                              max_depth=int(max_depth))

        # Turn the index into the first column of the matrix.
        feature_matrix = feature_matrix.reset_index()

        # Save the matrix.  The index column is stored too, although the
        # feature-selection page does not offer it as a choice.
        result_dir = os.path.join(os.getcwd(), "demo_data", "result")
        os.makedirs(result_dir, exist_ok=True)
        feature_matrix.to_csv(os.path.join(result_dir, "all_features.csv"),
                              index=False)

        # ``res`` holds the column names; ``nlp`` holds what columns2NLP
        # returns for each of them.
        from .columns2NLP import columns2NLP
        res = [str(column) for column in feature_matrix.columns]
        nlp = [columns2NLP(column_name) for column_name in res]

        # Preview of up to five rows with floats rounded to two decimals.
        # Rows the matrix does not have are rendered as empty lists instead
        # of failing the whole request.
        template_context = {'res': res, 'nlp': nlp}
        for row_number in range(5):
            if row_number < len(feature_matrix):
                sample_row = [
                    round(value, 2) if isinstance(value, float) else value
                    for value in feature_matrix.iloc[row_number]
                ]
            else:
                sample_row = []
            template_context['sample_data%d' % (row_number + 1)] = sample_row

        response = render(request, 'get_results.html', template_context)
        response.set_cookie('target_id', res[0])
        return response

    except Exception as e:
        # Top-level boundary of the view: any failure is shown on the
        # error page rather than propagating.
        response = render(request, 'erro.html', {'erro': e})
        return response
Ejemplo n.º 24
0
def get_results(request):
    """Run Deep Feature Synthesis on the mock-customer demo data and render it.

    POST fields read from ``request``:
        max_depth: DFS depth; must be convertible with ``int``.
        agg_pri / trans_pri: lists of aggregation / transform primitives
            passed straight to ``ft.dfs``.
        agg_pri_customer / trans_pri_customer: lists of opt-in flags for the
            custom primitives defined below ('Time_since_last_by_hour' and
            'log_e').

    Returns:
        The rendered 'get_results.html' response whose context holds ``res``
        (string form of every generated feature definition) and
        ``sample_data`` (the first row of the feature matrix).

    Raises:
        KeyError: if 'max_depth' is missing from the POST data.
        ValueError: if 'max_depth' is not an integer string.
    """
    # Convert up front so a malformed depth fails before the entity set is built.
    max_depth = int(request.POST['max_depth'])
    agg_pri = request.POST.getlist('agg_pri')
    agg_pri_customer = request.POST.getlist('agg_pri_customer')
    trans_pri_customer = request.POST.getlist('trans_pri_customer')
    trans_pri = request.POST.getlist('trans_pri')

    import featuretools as ft
    import pandas as pd
    import numpy as np
    from featuretools.primitives import make_trans_primitive, make_agg_primitive
    from featuretools.variable_types import DatetimeTimeIndex, Numeric

    pd.set_option('display.max_columns', 20)
    data = ft.demo.load_mock_customer()
    transactions_df = data["transactions"].merge(data["sessions"]).merge(data["customers"])
    products_df = data["products"]

    # Build the entity set: transactions -> products, then split sessions and
    # customers out of the denormalised transactions table.
    es = ft.EntitySet()
    es = es.entity_from_dataframe(entity_id="transactions", dataframe=transactions_df, index="transaction_id",
                                  time_index="transaction_time",
                                  variable_types={"product_id": ft.variable_types.Categorical,
                                                  "zip_code": ft.variable_types.ZIPCode})

    es = es.entity_from_dataframe(entity_id="products", dataframe=products_df, index="product_id")

    new_relationship = ft.Relationship(es["products"]["product_id"], es["transactions"]["product_id"])

    es = es.add_relationship(new_relationship)

    es = es.normalize_entity(base_entity_id="transactions",
                             new_entity_id="sessions",
                             index="session_id",
                             make_time_index="session_start",
                             additional_variables=["device", "customer_id", "zip_code", "session_start", "join_date"])

    es = es.normalize_entity(base_entity_id="sessions",
                             new_entity_id="customers",
                             index="customer_id",
                             make_time_index="join_date",
                             additional_variables=["zip_code", "join_date"])

    # Custom aggregation primitive: a "time since last" variant that reports
    # hours instead of seconds.
    def time_since_last_by_hour(values, time=None):
        time_since = time - values.iloc[-1]
        return time_since.total_seconds() / 3600

    time_since_last_by_hour_primitive = make_agg_primitive(function=time_since_last_by_hour,
                                                           input_types=[DatetimeTimeIndex],
                                                           return_type=Numeric,
                                                           uses_calc_time=True)

    # Custom transform primitive: natural logarithm (base e).
    def log(vals):
        return np.log(vals)

    log_primitive = make_trans_primitive(function=log,
                                         input_types=[Numeric],
                                         return_type=Numeric,
                                         description="Calculates the log of the value.",
                                         name="log")

    # Append the custom primitives only when they were ticked in the form.
    if 'Time_since_last_by_hour' in agg_pri_customer:
        agg_pri.append(time_since_last_by_hour_primitive)
    if 'log_e' in trans_pri_customer:
        trans_pri.append(log_primitive)

    # Generate the feature matrix for the customers entity.
    feature_matrix3, feature_defs3 = ft.dfs(entityset=es, target_entity="customers",
                                            agg_primitives=agg_pri,
                                            trans_primitives=trans_pri,
                                            max_depth=max_depth)
    res = [str(feature) for feature in feature_defs3]

    sample_data = list(feature_matrix3.iloc[0])
    return render(request, 'get_results.html', {'res': res, 'sample_data': sample_data})
Ejemplo n.º 25
0
def absolute(column):
    """Return the absolute value of ``column`` using the built-in ``abs``."""
    magnitude = abs(column)
    return magnitude


# Wrap `absolute` as a transform primitive: one Numeric input, Numeric output.
Absolute = make_trans_primitive(function=absolute,
                                input_types=[Numeric],
                                return_type=Numeric)


def maximum(columns):
    """Return the largest item of ``columns`` using the built-in ``max``."""
    largest = max(columns)
    return largest


# Wrap `maximum` as an aggregation primitive: one Numeric input, Numeric output.
Maximum = make_agg_primitive(function=maximum,
                             input_types=[Numeric],
                             return_type=Numeric)


#Multiple Input Types
def mean_numeric(num1, num2):
    """Return the arithmetic mean of ``num1`` and ``num2``."""
    total = num1 + num2
    return total / 2


# Wrap `mean_numeric` as a transform primitive taking two Numeric inputs
# and returning a Numeric value.
Meanval = make_trans_primitive(function=mean_numeric,
                               input_types=[Numeric, Numeric],
                               return_type=Numeric)
feature_matrix, feature_defs = ft.dfs(
    entityset=es,
    target_entity='transactions',
    trans_primitives=[
Ejemplo n.º 26
0
                sec.add(i)
        full.add(i)
    return repeat / len(set(column))


def repeat_percent(column):
    """Return the number of distinct values in ``column`` divided by its length."""
    distinct_count = len(set(column))
    total_count = len(column)
    return distinct_count / total_count


def count_set_length(column):
    """Return how many distinct values ``column`` contains."""
    return len(set(column))


# Aggregation primitives built from the helper functions above.
# `count_repeat` over a Categorical input, Numeric output.
cunt_rpt = make_agg_primitive(function=count_repeat, input_types=[Categorical], return_type=Numeric)
# Number of distinct values of a Datetime input (via `count_set_length`).
CountDay = make_agg_primitive(function=count_set_length, input_types=[ft.variable_types.Datetime], return_type=Numeric)
# Distinct-values-to-length ratio of a Categorical input (via `repeat_percent`).
RepeatPercent = make_agg_primitive(function=repeat_percent, input_types=[Categorical], return_type=Numeric)

# Load the training log and keep only the rows whose action_type equals 2.
log_df = get_train_log(None)
log_df = log_df.loc[log_df['action_type'] == 2]
# Composite "<user_id>_<seller_id>" key built by string concatenation.
log_df["user_seller"] = np.add(np.array(log_df["user_id"].map(lambda x: str(x) + "_")),
                               np.array(log_df["seller_id"].map(lambda x: str(x))))
# time_stamp is split as <month * 100 + day> (presumably an MMDD integer —
# verify against the data source). The day is the remainder modulo 100; the
# previous code used `x // 100` for the day as well, which repeated the month.
log_df['data'] = log_df["time_stamp"].map(lambda x: '2016-' + str(int(x / 100)) + '-' + str(int(x % 100)))
log_df["month"] = log_df["time_stamp"].map(lambda x: int(x / 100))
# Attach the user attributes, then drop the columns not used as features.
user_df = get_user_info()
log_df = log_df.merge(user_df, on="user_id", how="inner")
log_df.drop(labels=['user_id', 'seller_id', 'action_type', 'age_range', 'gender'], axis=1, inplace=True)
# Explicit index column for the entity defined below.
log_df["index"] = log_df.index
es = ft.EntitySet(id="logs")
es = es.entity_from_dataframe(entity_id="logs",
def my_primitives():
    
    def gmean(x):
        return stats.gmean(np.absolute(list(filter(lambda a: a != 0, x))))

    def hmean(x):
        return stats.hmean(np.absolute(list(filter(lambda a: a != 0, x))))
    
    def kstatvar1(x):
        return stats.kstatvar(x, 1)
    
    def kstat2(x):
        return stats.kstat(x, 2)
    
    def kstatvar2(x):
        return stats.kstatvar(x, 2)
    
    def kstat3(x):
        return stats.kstat(x, 3)
    
    def kstat4(x):
        return stats.kstat(x, 4)
    
    def avg_change(x):
        return np.mean(np.diff(x))
    
    def avg_change_rate(x):
        # Forms the element-wise ratio diff(x) / x[:-1], then takes
        # np.nonzero(...)[0], i.e. the positions where that ratio is non-zero,
        # and returns the mean of those positions.
        # NOTE(review): despite the name, the ratios themselves are never
        # averaged — confirm whether the mean of the rates was intended.
        return np.mean(np.nonzero((np.diff(x) / x[:-1]))[0])
    
    def range_func(x):
        return max(x)-min(x)
    
    def std_first_50000(x):
        return x[:50000].std()
    
    def std_last_50000(x):
        return x[-50000:].std()
    
    def std_first_10000(x):
        return x[:10000].std()
    
    def std_last_10000(x):
        return x[-10000:].std()
    
    def avg_first_50000(x):
        return x[:50000].mean()
    
    def avg_last_50000(x):
        return x[-50000:].mean()
    
    def avg_first_10000(x):
        return x[:10000].mean()
    
    def avg_last_10000(x):
        return x[-10000:].mean()
    
    def min_first_50000(x):
        return x[:50000].min()
    
    def min_last_50000(x):
        return x[-50000:].min()
    
    def min_first_10000(x):
        return x[:10000].min()
    
    def min_last_10000(x):
        return x[-10000:].min()
    
    def max_first_50000(x):
        return x[:50000].max()
    
    def max_last_50000(x):
        return x[-50000:].max()
    
    def max_first_10000(x):
        return x[:10000].max()
    
    def max_last_10000(x):
        return x[-10000:].max()
    
    def max_to_min(x):
        return x.max() / np.abs(x.min())
    
    def count_big(x):
        return len(x[np.abs(x) > 500])
    
    def sum_func(x):
        return x.sum()
    
    def avg_change_rate_first_50000(x):
        return np.mean(np.nonzero((np.diff(x[:50000]) / x[:50000][:-1]))[0])
    
    def avg_change_rate_last_50000(x):
        return np.mean(np.nonzero((np.diff(x[-50000:]) / x[-50000:][:-1]))[0])
    
    def avg_change_rate_first_10000(x):
        return np.mean(np.nonzero((np.diff(x[:10000]) / x[:10000][:-1]))[0])
    
    def avg_change_rate_last_10000(x):
        return np.mean(np.nonzero((np.diff(x[-10000:]) / x[-10000:][:-1]))[0])
    
    def q95(x):
        return np.quantile(x, 0.95)
    
    def q99(x):
        return np.quantile(x, 0.99)
    
    def q05(x):
        return np.quantile(x, 0.05)
    
    def q01(x):
        return np.quantile(x, 0.01)
    
    def abs_q95(x):
        return np.quantile(np.abs(x), 0.95)
    
    def abs_q99(x):
        return np.quantile(np.abs(x), 0.99)
    
    def add_trend_feature(arr, abs_values=False):
        """Return the slope of a linear fit of ``arr`` against its positions.

        Args:
            arr: sequence of numeric values; regressed on 0..len(arr)-1.
            abs_values: when True, fit the absolute values of ``arr``.
                The flag was previously accepted but ignored.

        Returns:
            The first (only) coefficient of the fitted ``LinearRegression``.
        """
        if abs_values:
            arr = np.abs(arr)
        idx = np.arange(len(arr))
        lr = LinearRegression()
        # LinearRegression expects a 2-D feature matrix, hence the reshape.
        lr.fit(idx.reshape(-1, 1), arr)
        return lr.coef_[0]
    
    def add_trend_feature_abs(arr):
        idx = np.array(range(len(arr)))
        lr = LinearRegression()
        lr.fit(idx.reshape(-1, 1), np.abs(arr))
        return lr.coef_[0]

    def abs_mean(x):
        return np.abs(x).mean()
    
    def abs_std(x):
        return np.abs(x).std()
    
    def mad(x):
        """Return the mean absolute deviation of the pandas Series ``x``.

        Computed explicitly as mean(|x - mean(x)|) because ``Series.mad()``
        was removed in pandas 2.0. NaN values are skipped, matching the
        default behaviour of the removed method.
        """
        return (x - x.mean()).abs().mean()
    
    def kurt(x):
        return x.kurtosis()
    
    def skew(x):
        return x.skew()
    
    def med(x):
        return x.median()
    
    def Hilbert_mean(x):
        return np.abs(hilbert(x)).mean()
    
    def Hann_window_mean(x):
        return (np.convolve(x, hann(150), mode='same') / sum(hann(150))).mean()
    
    def classic_sta_lta(x, length_sta, length_lta):
        """Return the ratio of short-window to long-window average energy.

        Args:
            x: numeric array-like signal.
            length_sta: short-window length in samples.
            length_lta: long-window length in samples.

        Returns:
            A float64 array of the same length as ``x`` holding STA / LTA;
            the first ``length_lta - 1`` entries are zero.
        """
        sta = np.cumsum(x ** 2)
        # Convert to float. `np.float` was removed in NumPy 1.24; use the
        # explicit float64 dtype it used to alias.
        sta = np.require(sta, dtype=np.float64)
        # Copy for LTA
        lta = sta.copy()
        # Turn the cumulative sums into windowed sums, then into averages.
        sta[length_sta:] = sta[length_sta:] - sta[:-length_sta]
        sta /= length_sta
        lta[length_lta:] = lta[length_lta:] - lta[:-length_lta]
        lta /= length_lta
        # Zero the STA until the long window has filled.
        sta[:length_lta - 1] = 0
        # Avoid division by zero by setting zero values to tiny float
        dtiny = np.finfo(0.0).tiny
        idx = lta < dtiny
        lta[idx] = dtiny
        return sta / lta
    
    def classic_sta_lta1_mean(x):
        return  classic_sta_lta(x, 500, 10000).mean()

    def classic_sta_lta2_mean(x):
        return classic_sta_lta(x, 5000, 100000).mean()
    
    def classic_sta_lta3_mean(x):
        return classic_sta_lta(x, 3333, 6666).mean()
    
    def classic_sta_lta4_mean(x):
        return classic_sta_lta(x, 10000, 25000).mean()
    
    def Moving_average_700_mean(x):
        return x.rolling(window=700).mean().mean(skipna=True)
    
    def Moving_average_1500_mean(x):
        return x.rolling(window=1500).mean().mean(skipna=True)
    
    def Moving_average_3000_mean(x):
        return x.rolling(window=3000).mean().mean(skipna=True)
    
    def Moving_average_6000_mean(x):
        return x.rolling(window=6000).mean().mean(skipna=True)
    
    def exp_Moving_average_300_mean(x):
        return (pd.Series.ewm(x, span=300).mean()).mean(skipna=True)
    
    def exp_Moving_average_3000_mean(x):
        return (pd.Series.ewm(x, span=3000).mean()).mean(skipna=True)
    
    def exp_Moving_average_30000_mean(x):
        return (pd.Series.ewm(x, span=30000).mean()).mean(skipna=True)

    def iqr(x):
        return np.subtract(*np.percentile(x, [75, 25]))
    
    def q999(x):
        return np.quantile(x, 0.999)
    
    def q001(x):
        return np.quantile(x, 0.001)
    
    def ave10(x):
        return  stats.trim_mean(x, 0.1)  
    
    def ave_roll_std_10(x):
        x_roll_std = x.rolling(10).std().dropna().values
        return x_roll_std.mean()
    
    def std_roll_std_10(x):
        x_roll_std = x.rolling(10).std().dropna().values
        return x_roll_std.std()
    
    def max_roll_std_10(x):
        x_roll_std = x.rolling(10).std().dropna().values
        return x_roll_std.max()
    
    def min_roll_std_10(x):
        x_roll_std = x.rolling(10).std().dropna().values
        return x_roll_std.min()

    def q01_roll_std_10(x):
        x_roll_std = x.rolling(10).std().dropna().values
        return np.quantile(x_roll_std, 0.01)
    
    def q05_roll_std_10(x):
        x_roll_std = x.rolling(10).std().dropna().values
        return np.quantile(x_roll_std, 0.05)
    
    def q95_roll_std_10(x):
        x_roll_std = x.rolling(10).std().dropna().values
        return np.quantile(x_roll_std, 0.95)
    
    def q99_roll_std_10(x):
        x_roll_std = x.rolling(10).std().dropna().values
        return np.quantile(x_roll_std, 0.99)

    def av_change_abs_roll_std_10(x):
        x_roll_std = x.rolling(10).std().dropna().values
        return np.mean(np.diff(x_roll_std))
    
    def av_change_rate_roll_std_10(x):
        x_roll_std = x.rolling(10).std().dropna().values
        return np.mean(np.nonzero((np.diff(x_roll_std) / x_roll_std[:-1]))[0])
    
    def abs_max_roll_std_10(x):
        x_roll_std = x.rolling(10).std().dropna().values
        return  np.abs(x_roll_std).max()
    
    def std_roll_mean_10(x):
        x_roll_mean = x.rolling(10).mean().dropna().values
        return x_roll_mean.std()

    def max_roll_mean_10(x):
        x_roll_mean = x.rolling(10).mean().dropna().values
        return x_roll_mean.max()
    
    def min_roll_mean_10(x):
        x_roll_mean = x.rolling(10).mean().dropna().values
        return x_roll_mean.min()
    
    def q01_roll_mean_10(x):
        x_roll_mean = x.rolling(10).mean().dropna().values
        return np.quantile(x_roll_mean, 0.01)
    
    def q05_roll_mean_10(x):
        x_roll_mean = x.rolling(10).mean().dropna().values
        return np.quantile(x_roll_mean, 0.05)

    def q95_roll_mean_10(x):
        x_roll_mean = x.rolling(10).mean().dropna().values
        return np.quantile(x_roll_mean, 0.95)
    
    def q99_roll_mean_10(x):
        x_roll_mean = x.rolling(10).mean().dropna().values
        return np.quantile(x_roll_mean, 0.99)
    
    def av_change_abs_roll_mean_10(x):
        x_roll_mean = x.rolling(10).mean().dropna().values
        return np.mean(np.diff(x_roll_mean))
    
    def av_change_rate_roll_mean_10(x):
        x_roll_mean = x.rolling(10).mean().dropna().values
        return np.mean(np.nonzero((np.diff(x_roll_mean) / x_roll_mean[:-1]))[0])

    def abs_max_roll_mean_10(x):
        x_roll_mean = x.rolling(10).mean().dropna().values
        return np.abs(x_roll_mean).max()
    
    def ave_roll_std_100(x):
        x_roll_std = x.rolling(100).std().dropna().values
        return x_roll_std.mean()
    
    def std_roll_std_100(x):
        x_roll_std = x.rolling(100).std().dropna().values
        return x_roll_std.std()
    
    def max_roll_std_100(x):
        x_roll_std = x.rolling(100).std().dropna().values
        return x_roll_std.max()
    
    def min_roll_std_100(x):
        x_roll_std = x.rolling(100).std().dropna().values
        return x_roll_std.min()

    def q01_roll_std_100(x):
        x_roll_std = x.rolling(100).std().dropna().values
        return np.quantile(x_roll_std, 0.01)
    
    def q05_roll_std_100(x):
        x_roll_std = x.rolling(100).std().dropna().values
        return np.quantile(x_roll_std, 0.05)
    
    def q95_roll_std_100(x):
        x_roll_std = x.rolling(100).std().dropna().values
        return np.quantile(x_roll_std, 0.95)
    
    def q99_roll_std_100(x):
        x_roll_std = x.rolling(100).std().dropna().values
        return np.quantile(x_roll_std, 0.99)

    def av_change_abs_roll_std_100(x):
        x_roll_std = x.rolling(100).std().dropna().values
        return np.mean(np.diff(x_roll_std))
    
    def av_change_rate_roll_std_100(x):
        x_roll_std = x.rolling(100).std().dropna().values
        return np.mean(np.nonzero((np.diff(x_roll_std) / x_roll_std[:-1]))[0])
    
    def abs_max_roll_std_100(x):
        x_roll_std = x.rolling(100).std().dropna().values
        return  np.abs(x_roll_std).max()
    
    def std_roll_mean_100(x):
        x_roll_mean = x.rolling(100).mean().dropna().values
        return x_roll_mean.std()

    def max_roll_mean_100(x):
        x_roll_mean = x.rolling(100).mean().dropna().values
        return x_roll_mean.max()
    
    def min_roll_mean_100(x):
        x_roll_mean = x.rolling(100).mean().dropna().values
        return x_roll_mean.min()
    
    def q01_roll_mean_100(x):
        x_roll_mean = x.rolling(100).mean().dropna().values
        return np.quantile(x_roll_mean, 0.01)
    
    def q05_roll_mean_100(x):
        x_roll_mean = x.rolling(100).mean().dropna().values
        return np.quantile(x_roll_mean, 0.05)

    def q95_roll_mean_100(x):
        x_roll_mean = x.rolling(100).mean().dropna().values
        return np.quantile(x_roll_mean, 0.95)
    
    def q99_roll_mean_100(x):
        x_roll_mean = x.rolling(100).mean().dropna().values
        return np.quantile(x_roll_mean, 0.99)
    
    def av_change_abs_roll_mean_100(x):
        x_roll_mean = x.rolling(100).mean().dropna().values
        return np.mean(np.diff(x_roll_mean))
    
    def av_change_rate_roll_mean_100(x):
        x_roll_mean = x.rolling(100).mean().dropna().values
        return np.mean(np.nonzero((np.diff(x_roll_mean) / x_roll_mean[:-1]))[0])

    def abs_max_roll_mean_100(x):
        x_roll_mean = x.rolling(100).mean().dropna().values
        return np.abs(x_roll_mean).max()
    
    def ave_roll_std_1000(x):
        x_roll_std = x.rolling(1000).std().dropna().values
        return x_roll_std.mean()
    
    def std_roll_std_1000(x):
        x_roll_std = x.rolling(1000).std().dropna().values
        return x_roll_std.std()
    
    def max_roll_std_1000(x):
        x_roll_std = x.rolling(1000).std().dropna().values
        return x_roll_std.max()
    
    def min_roll_std_1000(x):
        x_roll_std = x.rolling(1000).std().dropna().values
        return x_roll_std.min()

    def q01_roll_std_1000(x):
        x_roll_std = x.rolling(1000).std().dropna().values
        return np.quantile(x_roll_std, 0.01)
    
    def q05_roll_std_1000(x):
        x_roll_std = x.rolling(1000).std().dropna().values
        return np.quantile(x_roll_std, 0.05)
    
    def q95_roll_std_1000(x):
        x_roll_std = x.rolling(1000).std().dropna().values
        return np.quantile(x_roll_std, 0.95)
    
    def q99_roll_std_1000(x):
        x_roll_std = x.rolling(1000).std().dropna().values
        return np.quantile(x_roll_std, 0.99)

    def av_change_abs_roll_std_1000(x):
        x_roll_std = x.rolling(1000).std().dropna().values
        return np.mean(np.diff(x_roll_std))
    
    def av_change_rate_roll_std_1000(x):
        x_roll_std = x.rolling(1000).std().dropna().values
        return np.mean(np.nonzero((np.diff(x_roll_std) / x_roll_std[:-1]))[0])
    
    def abs_max_roll_std_1000(x):
        x_roll_std = x.rolling(1000).std().dropna().values
        return  np.abs(x_roll_std).max()
    
    def std_roll_mean_1000(x):
        x_roll_mean = x.rolling(1000).mean().dropna().values
        return x_roll_mean.std()

    def max_roll_mean_1000(x):
        x_roll_mean = x.rolling(1000).mean().dropna().values
        return x_roll_mean.max()
    
    def min_roll_mean_1000(x):
        x_roll_mean = x.rolling(1000).mean().dropna().values
        return x_roll_mean.min()
    
    def q01_roll_mean_1000(x):
        x_roll_mean = x.rolling(1000).mean().dropna().values
        return np.quantile(x_roll_mean, 0.01)
    
    def q05_roll_mean_1000(x):
        x_roll_mean = x.rolling(1000).mean().dropna().values
        return np.quantile(x_roll_mean, 0.05)

    def q95_roll_mean_1000(x):
        x_roll_mean = x.rolling(1000).mean().dropna().values
        return np.quantile(x_roll_mean, 0.95)
    
    def q99_roll_mean_1000(x):
        x_roll_mean = x.rolling(1000).mean().dropna().values
        return np.quantile(x_roll_mean, 0.99)
    
    def av_change_abs_roll_mean_1000(x):
        x_roll_mean = x.rolling(1000).mean().dropna().values
        return np.mean(np.diff(x_roll_mean))
    
    def av_change_rate_roll_mean_1000(x):
        x_roll_mean = x.rolling(1000).mean().dropna().values
        return np.mean(np.nonzero((np.diff(x_roll_mean) / x_roll_mean[:-1]))[0])

    def abs_max_roll_mean_1000(x):
        x_roll_mean = x.rolling(1000).mean().dropna().values
        return np.abs(x_roll_mean).max()

    def ave_roll_std_10000(x):
        x_roll_std = x.rolling(10000).std().dropna().values
        return x_roll_std.mean()
    
    def std_roll_std_10000(x):
        x_roll_std = x.rolling(10000).std().dropna().values
        return x_roll_std.std()
    
    def max_roll_std_10000(x):
        x_roll_std = x.rolling(10000).std().dropna().values
        return x_roll_std.max()
    
    def min_roll_std_10000(x):
        x_roll_std = x.rolling(10000).std().dropna().values
        return x_roll_std.min()

    def q01_roll_std_10000(x):
        x_roll_std = x.rolling(10000).std().dropna().values
        return np.quantile(x_roll_std, 0.01)
    
    def q05_roll_std_10000(x):
        x_roll_std = x.rolling(10000).std().dropna().values
        return np.quantile(x_roll_std, 0.05)
    
    def q95_roll_std_10000(x):
        x_roll_std = x.rolling(10000).std().dropna().values
        return np.quantile(x_roll_std, 0.95)
    
    def q99_roll_std_10000(x):
        x_roll_std = x.rolling(10000).std().dropna().values
        return np.quantile(x_roll_std, 0.99)

    def av_change_abs_roll_std_10000(x):
        x_roll_std = x.rolling(10000).std().dropna().values
        return np.mean(np.diff(x_roll_std))
    
    def av_change_rate_roll_std_10000(x):
        x_roll_std = x.rolling(10000).std().dropna().values
        return np.mean(np.nonzero((np.diff(x_roll_std) / x_roll_std[:-1]))[0])
    
    def abs_max_roll_std_10000(x):
        x_roll_std = x.rolling(10000).std().dropna().values
        return  np.abs(x_roll_std).max()
    
    def std_roll_mean_10000(x):
        x_roll_mean = x.rolling(10000).mean().dropna().values
        return x_roll_mean.std()

    def max_roll_mean_10000(x):
        x_roll_mean = x.rolling(10000).mean().dropna().values
        return x_roll_mean.max()
    
    def min_roll_mean_10000(x):
        x_roll_mean = x.rolling(10000).mean().dropna().values
        return x_roll_mean.min()
    
    def q01_roll_mean_10000(x):
        x_roll_mean = x.rolling(10000).mean().dropna().values
        return np.quantile(x_roll_mean, 0.01)
    
    def q05_roll_mean_10000(x):
        x_roll_mean = x.rolling(10000).mean().dropna().values
        return np.quantile(x_roll_mean, 0.05)

    def q95_roll_mean_10000(x):
        x_roll_mean = x.rolling(10000).mean().dropna().values
        return np.quantile(x_roll_mean, 0.95)
    
    def q99_roll_mean_10000(x):
        x_roll_mean = x.rolling(10000).mean().dropna().values
        return np.quantile(x_roll_mean, 0.99)
    
    def av_change_abs_roll_mean_10000(x):
        x_roll_mean = x.rolling(10000).mean().dropna().values
        return np.mean(np.diff(x_roll_mean))
    
    def av_change_rate_roll_mean_10000(x):
        x_roll_mean = x.rolling(10000).mean().dropna().values
        return np.mean(np.nonzero((np.diff(x_roll_mean) / x_roll_mean[:-1]))[0])

    def abs_max_roll_mean_10000(x):
        x_roll_mean = x.rolling(10000).mean().dropna().values
        return np.abs(x_roll_mean).max()
    
    kstat2_pr = make_agg_primitive(function = kstat2,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    kstatvar1_pr = make_agg_primitive(function = kstatvar1,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    kstatvar2_pr = make_agg_primitive(function = kstatvar2,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    kstat3_pr = make_agg_primitive(function = kstat3,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    kstat4_pr = make_agg_primitive(function = kstat4,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    gmean_pr = make_agg_primitive(function = gmean,
                              input_types = [Numeric],
                              return_type = Numeric)  
    
    hmean_pr = make_agg_primitive(function = hmean,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    avg_change_pr = make_agg_primitive(function = avg_change,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    avg_change_rate_pr = make_agg_primitive(function = avg_change_rate,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    range_pr = make_agg_primitive(function = range_func,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    std_first_50000_pr = make_agg_primitive(function = std_first_50000,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    std_last_50000_pr = make_agg_primitive(function = std_last_50000,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    std_first_10000_pr = make_agg_primitive(function = std_first_10000,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    std_last_10000_pr = make_agg_primitive(function = std_last_10000,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    avg_first_50000_pr = make_agg_primitive(function = avg_first_50000,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    avg_last_50000_pr = make_agg_primitive(function = avg_last_50000,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    avg_first_10000_pr = make_agg_primitive(function = avg_first_10000,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    avg_last_10000_pr = make_agg_primitive(function = avg_last_10000,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    min_first_50000_pr = make_agg_primitive(function = min_first_50000,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    min_last_50000_pr = make_agg_primitive(function = min_last_50000,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    min_first_10000_pr = make_agg_primitive(function = min_first_10000,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    min_last_10000_pr = make_agg_primitive(function = min_last_10000,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    max_first_50000_pr = make_agg_primitive(function = max_first_50000,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    max_last_50000_pr = make_agg_primitive(function = max_last_50000,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    max_first_10000_pr = make_agg_primitive(function = max_first_10000,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    max_last_10000_pr = make_agg_primitive(function = max_last_10000,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    max_to_min_pr = make_agg_primitive(function = max_to_min,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    count_big_pr = make_agg_primitive(function = count_big,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    sum_func_pr = make_agg_primitive(function = sum_func,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    avg_change_rate_first_50000_pr = make_agg_primitive(function = avg_change_rate_first_50000,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    avg_change_rate_last_50000_pr = make_agg_primitive(function = avg_change_rate_last_50000,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    avg_change_rate_first_10000_pr = make_agg_primitive(function = avg_change_rate_first_10000,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    avg_change_rate_last_10000_pr = make_agg_primitive(function = avg_change_rate_last_10000,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    q95_pr = make_agg_primitive(function = q95,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    q99_pr = make_agg_primitive(function = q99,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    q05_pr = make_agg_primitive(function = q05,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    q01_pr = make_agg_primitive(function = q01,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    abs_q95_pr = make_agg_primitive(function = abs_q95,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    abs_q99_pr = make_agg_primitive(function = abs_q99,
                              input_types = [Numeric],
                              return_type = Numeric)

    trend_pr = make_agg_primitive(function = add_trend_feature,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    abs_trend_pr = make_agg_primitive(function = add_trend_feature_abs,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    abs_mean_pr = make_agg_primitive(function = abs_mean,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    abs_std_pr = make_agg_primitive(function = abs_std,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    mad_pr = make_agg_primitive(function = mad,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    kurt_pr = make_agg_primitive(function = kurt,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    skew_pr = make_agg_primitive(function = skew,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    med_pr = make_agg_primitive(function = med,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    Hilbert_mean_pr = make_agg_primitive(function = Hilbert_mean,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    Hann_window_mean_pr = make_agg_primitive(function = Hann_window_mean,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    classic_sta_lta1_mean_pr = make_agg_primitive(function = classic_sta_lta1_mean,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    classic_sta_lta2_mean_pr = make_agg_primitive(function = classic_sta_lta2_mean,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    classic_sta_lta3_mean_pr = make_agg_primitive(function = classic_sta_lta3_mean,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    classic_sta_lta4_mean_pr = make_agg_primitive(function = classic_sta_lta4_mean,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    Moving_average_700_mean_pr = make_agg_primitive(function = Moving_average_700_mean,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    Moving_average_1500_mean_pr = make_agg_primitive(function = Moving_average_1500_mean,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    Moving_average_3000_mean_pr = make_agg_primitive(function = Moving_average_3000_mean,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    Moving_average_6000_mean_pr = make_agg_primitive(function = Moving_average_6000_mean,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    exp_Moving_average_300_mean_pr = make_agg_primitive(function = exp_Moving_average_300_mean,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    exp_Moving_average_3000_mean_pr = make_agg_primitive(function = exp_Moving_average_3000_mean,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    exp_Moving_average_30000_mean_pr = make_agg_primitive(function = exp_Moving_average_30000_mean,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    iqr_pr = make_agg_primitive(function = iqr,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    q999_pr = make_agg_primitive(function = q999,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    q001 = make_agg_primitive(function = q001,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    ave10_pr = make_agg_primitive(function = ave10,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    ave_roll_std_10_pr = make_agg_primitive(function = ave_roll_std_10,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    std_roll_std_10_pr = make_agg_primitive(function = std_roll_std_10,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    max_roll_std_10_pr = make_agg_primitive(function = max_roll_std_10,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    min_roll_std_10_pr = make_agg_primitive(function = min_roll_std_10,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    q01_roll_std_10_pr = make_agg_primitive(function = q01_roll_std_10,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    q05_roll_std_10_pr = make_agg_primitive(function = q05_roll_std_10,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    q95_roll_std_10_pr = make_agg_primitive(function = q95_roll_std_10,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    q99_roll_std_10_pr = make_agg_primitive(function = q99_roll_std_10,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    av_change_abs_roll_std_10_pr = make_agg_primitive(function = av_change_abs_roll_std_10,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    av_change_rate_roll_std_10_pr = make_agg_primitive(function = av_change_rate_roll_std_10,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    abs_max_roll_std_10_pr = make_agg_primitive(function = abs_max_roll_std_10,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    std_roll_mean_10_pr = make_agg_primitive(function = std_roll_mean_10,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    max_roll_mean_10_pr = make_agg_primitive(function = max_roll_mean_10,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    min_roll_mean_10_pr = make_agg_primitive(function = min_roll_mean_10,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    q01_roll_mean_10_pr = make_agg_primitive(function = q01_roll_mean_10,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    q05_roll_mean_10_pr = make_agg_primitive(function = q05_roll_mean_10,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    q95_roll_mean_10_pr = make_agg_primitive(function = q95_roll_mean_10,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    q99_roll_mean_10_pr = make_agg_primitive(function = q99_roll_mean_10,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    av_change_abs_roll_mean_10_pr = make_agg_primitive(function = av_change_abs_roll_mean_10,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    av_change_rate_roll_mean_10_pr = make_agg_primitive(function = av_change_rate_roll_mean_10,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    abs_max_roll_mean_10_pr = make_agg_primitive(function = abs_max_roll_mean_10,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    ave_roll_std_100_pr = make_agg_primitive(function = ave_roll_std_100,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    std_roll_std_100_pr = make_agg_primitive(function = std_roll_std_100,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    max_roll_std_100_pr = make_agg_primitive(function = max_roll_std_100,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    min_roll_std_100_pr = make_agg_primitive(function = min_roll_std_100,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    q01_roll_std_100_pr = make_agg_primitive(function = q01_roll_std_100,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    q05_roll_std_100_pr = make_agg_primitive(function = q05_roll_std_100,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    q95_roll_std_100_pr = make_agg_primitive(function = q95_roll_std_100,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    q99_roll_std_100_pr = make_agg_primitive(function = q99_roll_std_100,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    av_change_abs_roll_std_100_pr = make_agg_primitive(function = av_change_abs_roll_std_100,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    av_change_rate_roll_std_100_pr = make_agg_primitive(function = av_change_rate_roll_std_100,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    abs_max_roll_std_100_pr = make_agg_primitive(function = abs_max_roll_std_100,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    std_roll_mean_100_pr = make_agg_primitive(function = std_roll_mean_100,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    max_roll_mean_100_pr = make_agg_primitive(function = max_roll_mean_100,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    min_roll_mean_100_pr = make_agg_primitive(function = min_roll_mean_100,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    q01_roll_mean_100_pr = make_agg_primitive(function = q01_roll_mean_100,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    q05_roll_mean_100_pr = make_agg_primitive(function = q05_roll_mean_100,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    q95_roll_mean_100_pr = make_agg_primitive(function = q95_roll_mean_100,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    q99_roll_mean_100_pr = make_agg_primitive(function = q99_roll_mean_100,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    av_change_abs_roll_mean_100_pr = make_agg_primitive(function = av_change_abs_roll_mean_100,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    av_change_rate_roll_mean_100_pr = make_agg_primitive(function = av_change_rate_roll_mean_100,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    abs_max_roll_mean_100_pr = make_agg_primitive(function = abs_max_roll_mean_100,
                              input_types = [Numeric],
                              return_type = Numeric)

    ave_roll_std_1000_pr = make_agg_primitive(function = ave_roll_std_1000,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    std_roll_std_1000_pr = make_agg_primitive(function = std_roll_std_1000,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    max_roll_std_1000_pr = make_agg_primitive(function = max_roll_std_1000,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    min_roll_std_1000_pr = make_agg_primitive(function = min_roll_std_1000,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    q01_roll_std_1000_pr = make_agg_primitive(function = q01_roll_std_1000,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    q05_roll_std_1000_pr = make_agg_primitive(function = q05_roll_std_1000,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    q95_roll_std_1000_pr = make_agg_primitive(function = q95_roll_std_1000,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    q99_roll_std_1000_pr = make_agg_primitive(function = q99_roll_std_1000,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    av_change_abs_roll_std_1000_pr = make_agg_primitive(function = av_change_abs_roll_std_1000,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    av_change_rate_roll_std_1000_pr = make_agg_primitive(function = av_change_rate_roll_std_1000,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    abs_max_roll_std_1000_pr = make_agg_primitive(function = abs_max_roll_std_1000,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    std_roll_mean_1000_pr = make_agg_primitive(function = std_roll_mean_1000,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    max_roll_mean_1000_pr = make_agg_primitive(function = max_roll_mean_1000,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    min_roll_mean_1000_pr = make_agg_primitive(function = min_roll_mean_1000,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    q01_roll_mean_1000_pr = make_agg_primitive(function = q01_roll_mean_1000,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    q05_roll_mean_1000_pr = make_agg_primitive(function = q05_roll_mean_1000,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    q95_roll_mean_1000_pr = make_agg_primitive(function = q95_roll_mean_1000,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    q99_roll_mean_1000_pr = make_agg_primitive(function = q99_roll_mean_1000,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    av_change_abs_roll_mean_1000_pr = make_agg_primitive(function = av_change_abs_roll_mean_1000,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    av_change_rate_roll_mean_1000_pr = make_agg_primitive(function = av_change_rate_roll_mean_1000,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    abs_max_roll_mean_1000_pr = make_agg_primitive(function = abs_max_roll_mean_1000,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    ave_roll_std_10000_pr = make_agg_primitive(function = ave_roll_std_10000,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    std_roll_std_10000_pr = make_agg_primitive(function = std_roll_std_10000,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    max_roll_std_10000_pr = make_agg_primitive(function = max_roll_std_10000,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    min_roll_std_10000_pr = make_agg_primitive(function = min_roll_std_10000,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    q01_roll_std_10000_pr = make_agg_primitive(function = q01_roll_std_10000,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    q05_roll_std_10000_pr = make_agg_primitive(function = q05_roll_std_10000,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    q95_roll_std_10000_pr = make_agg_primitive(function = q95_roll_std_10000,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    q99_roll_std_10000_pr = make_agg_primitive(function = q99_roll_std_10000,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    av_change_abs_roll_std_10000_pr = make_agg_primitive(function = av_change_abs_roll_std_10000,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    av_change_rate_roll_std_10000_pr = make_agg_primitive(function = av_change_rate_roll_std_10000,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    abs_max_roll_std_10000_pr = make_agg_primitive(function = abs_max_roll_std_10000,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    std_roll_mean_10000_pr = make_agg_primitive(function = std_roll_mean_10000,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    max_roll_mean_10000_pr = make_agg_primitive(function = max_roll_mean_10000,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    min_roll_mean_10000_pr = make_agg_primitive(function = min_roll_mean_10000,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    q01_roll_mean_10000_pr = make_agg_primitive(function = q01_roll_mean_10000,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    q05_roll_mean_10000_pr = make_agg_primitive(function = q05_roll_mean_10000,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    q95_roll_mean_10000_pr = make_agg_primitive(function = q95_roll_mean_10000,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    q99_roll_mean_10000_pr = make_agg_primitive(function = q99_roll_mean_10000,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    av_change_abs_roll_mean_10000_pr = make_agg_primitive(function = av_change_abs_roll_mean_10000,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    av_change_rate_roll_mean_10000_pr = make_agg_primitive(function = av_change_rate_roll_mean_10000,
                              input_types = [Numeric],
                              return_type = Numeric)
    
    abs_max_roll_mean_10000_pr = make_agg_primitive(function = abs_max_roll_mean_10000,
                              input_types = [Numeric],
                              return_type = Numeric)

    return gmean_pr, hmean_pr, kstatvar1_pr, kstat2_pr, kstatvar2_pr, kstat3_pr, kstat4_pr, \
            avg_change_pr, avg_change_rate_pr, range_pr, std_first_50000_pr, \
            std_last_50000_pr, std_first_10000_pr, std_last_10000_pr, avg_first_50000_pr, \
            avg_last_50000_pr, avg_first_10000_pr, avg_last_10000_pr, min_first_50000_pr, \
            min_last_50000_pr, min_first_10000_pr, min_last_10000_pr, max_first_50000_pr, \
            max_last_50000_pr, max_first_10000_pr, max_last_10000_pr, max_to_min_pr, \
            count_big_pr, sum_func_pr, avg_change_rate_first_50000_pr, avg_change_rate_last_50000_pr, \
            avg_change_rate_first_10000_pr, avg_change_rate_last_10000_pr, q95_pr, \
            q99_pr, q05_pr, q01_pr, abs_q95_pr, abs_q99_pr, trend_pr, abs_trend_pr, \
            abs_mean_pr, abs_std_pr, mad_pr, kurt_pr, skew_pr, med_pr, Hilbert_mean_pr, \
            Hann_window_mean_pr, classic_sta_lta1_mean_pr, classic_sta_lta2_mean_pr, \
            classic_sta_lta3_mean_pr, classic_sta_lta4_mean_pr, Moving_average_700_mean_pr, \
            Moving_average_1500_mean_pr, Moving_average_3000_mean_pr, Moving_average_6000_mean_pr, \
            exp_Moving_average_300_mean_pr, exp_Moving_average_3000_mean_pr, \
            exp_Moving_average_30000_mean_pr, iqr_pr, q999_pr, q001, ave10_pr, \
            ave_roll_std_10_pr, std_roll_std_10_pr, max_roll_std_10_pr, min_roll_std_10_pr, \
            q01_roll_std_10_pr, q05_roll_std_10_pr, q95_roll_std_10_pr, q99_roll_std_10_pr, \
            av_change_abs_roll_std_10_pr, av_change_rate_roll_std_10_pr, abs_max_roll_std_10_pr, \
            std_roll_mean_10_pr, max_roll_mean_10_pr, min_roll_mean_10_pr, q01_roll_mean_10_pr, \
            q05_roll_mean_10_pr, q95_roll_mean_10_pr, q99_roll_mean_10_pr, \
            av_change_abs_roll_mean_10_pr, av_change_rate_roll_mean_10_pr, \
            abs_max_roll_mean_10_pr, ave_roll_std_100_pr, std_roll_std_100_pr, \
            max_roll_std_100_pr, min_roll_std_100_pr, q01_roll_std_100_pr, \
            q05_roll_std_100_pr, q95_roll_std_100_pr, q99_roll_std_100_pr, \
            av_change_abs_roll_std_100_pr, av_change_rate_roll_std_100_pr, \
            abs_max_roll_std_100_pr, std_roll_mean_100_pr, max_roll_mean_100_pr, \
            min_roll_mean_100_pr, q01_roll_mean_100_pr, q05_roll_mean_100_pr, \
            q95_roll_mean_100_pr, q99_roll_mean_100_pr, av_change_abs_roll_mean_100_pr, \
            av_change_rate_roll_mean_100_pr, abs_max_roll_mean_100_pr, ave_roll_std_1000_pr, \
            std_roll_std_1000_pr, max_roll_std_1000_pr, min_roll_std_1000_pr, \
            q01_roll_std_1000_pr, q05_roll_std_1000_pr, q95_roll_std_1000_pr, \
            q99_roll_std_1000_pr, av_change_abs_roll_std_1000_pr, \
            av_change_rate_roll_std_1000_pr, abs_max_roll_std_1000_pr, \
            std_roll_mean_1000_pr, max_roll_mean_1000_pr, min_roll_mean_1000_pr, \
            q01_roll_mean_1000_pr, q05_roll_mean_1000_pr, q95_roll_mean_1000_pr, \
            q99_roll_mean_1000_pr, av_change_abs_roll_mean_1000_pr, \
            av_change_rate_roll_mean_1000_pr, abs_max_roll_mean_1000_pr, \
            ave_roll_std_10000_pr, std_roll_std_10000_pr, max_roll_std_10000_pr, \
            min_roll_std_10000_pr, q01_roll_std_10000_pr, q05_roll_std_10000_pr, \
            q95_roll_std_10000_pr, q99_roll_std_10000_pr, av_change_abs_roll_std_10000_pr, \
            av_change_rate_roll_std_10000_pr, abs_max_roll_std_10000_pr, \
            std_roll_mean_10000_pr, max_roll_mean_10000_pr, min_roll_mean_10000_pr, \
            q01_roll_mean_10000_pr, q05_roll_mean_10000_pr, q95_roll_mean_10000_pr, \
            q99_roll_mean_10000_pr, av_change_abs_roll_mean_10000_pr, \
            av_change_rate_roll_mean_10000_pr, abs_max_roll_mean_10000_pr
Ejemplo n.º 28
0
        end_flag = length // n * end
        # print(start_flag, end_flag)
        piece = new_s.iloc[start_flag:end_flag]
        # print(sum(piece))
        # print()
        if (sum(piece) > 0):
            count += 1
        start += 1
        end += 1
    return count


# Wrap the function currently bound to `rise_count` as an aggregation
# primitive. The name is rebound by this statement, so afterwards `rise_count`
# refers to the primitive rather than to the plain function.
# uses_calc_time is not passed here, so it keeps the library default.
rise_count = make_agg_primitive(
    name="rise_count",
    description="Calculates the rise_count max of the value.",
    function=rise_count,
    return_type=Numeric,
    input_types=[Numeric],
)

# %%
"""
# Generate the new fused feature matrix.
# Different fused feature matrices can be generated depending on target_entity.
"""
feature_matrix, feature_defs = ft.dfs(
    entityset=es,
    target_entity="customers",
    #   agg_primitives=["median", "count", "num_unique", "max","avg_time_between", "n_most_common", max2nd, max3rd],
    agg_primitives=[rise_count],
    trans_primitives=["month"],
Ejemplo n.º 29
0
# Split a separate 'target' entity out of 'data', keyed on
# RECHCT_USE_ITEPD_ID. The columns listed in target_col, plus 'target' itself,
# are passed as additional_variables; make_time_index is switched off.
es.normalize_entity(
    base_entity_id='data',
    new_entity_id='target',
    index='RECHCT_USE_ITEPD_ID',
    additional_variables=target_col + ['target'],
    make_time_index=False,
)
from featuretools.variable_types import Numeric, PandasTypes
from featuretools.primitives import make_agg_primitive


def range_calc(numeric):
    """Return the spread of `numeric`: its maximum minus its minimum."""
    highest = np.max(numeric)
    lowest = np.min(numeric)
    return highest - lowest


# Aggregation primitive wrapping range_calc.
# input_types/return_type use Numeric (a Variable type) rather than
# PandasTypes: PandasTypes only holds pandas dtype names and is not a Variable
# subclass, so no entity column could ever match it as a primitive input.
range_ = make_agg_primitive(function=range_calc,
                            input_types=[Numeric],
                            return_type=Numeric)


def p_corr_calc(numeric1, numeric2):
    """Return the correlation coefficient between `numeric1` and `numeric2`.

    The value is read from the matrix produced by ``np.corrcoef``.
    """
    corr_matrix = np.corrcoef(numeric1, numeric2)
    # Row 0 / column 1 is the entry relating the first input to the second.
    return corr_matrix[0, 1]


# Two-input aggregation primitive wrapping p_corr_calc.
# Both inputs and the return type are declared as Numeric (a Variable type)
# rather than PandasTypes: PandasTypes only holds pandas dtype names and is
# not a Variable subclass, so no entity column could ever match it.
pcorr_ = make_agg_primitive(function=p_corr_calc,
                            input_types=[Numeric, Numeric],
                            return_type=Numeric)


def s_corr_calc(numeric1, numeric2):
    """Return the first element of ``spearmanr(numeric1, numeric2)``.

    That element is the correlation statistic; the rest of the result is
    discarded.
    """
    outcome = spearmanr(numeric1, numeric2)
    return outcome[0]
Ejemplo n.º 30
0
from featuretools.primitives import make_agg_primitive
from featuretools.variable_types import Numeric

# Aggregation primitive named "CustomMax": applies the built-in max() to a
# Numeric input and declares a Numeric result. The lambda wrapper is kept
# (rather than passing max directly) so the wrapped callable stays the same.
CustomMax = make_agg_primitive(
    function=lambda x: max(x),
    input_types=[Numeric],
    return_type=Numeric,
    name="CustomMax",
)

# Aggregation primitive named "CustomSum": applies the built-in sum() to a
# Numeric input and declares a Numeric result. The lambda wrapper is kept
# (rather than passing sum directly) so the wrapped callable stays the same.
CustomSum = make_agg_primitive(
    function=lambda x: sum(x),
    input_types=[Numeric],
    return_type=Numeric,
    name="CustomSum",
)
    x = x.to_frame()
    for i in range(10):
        clf.fit(x)
        frst = np.argmin(clf.covariances_, axis=0)
        scnd = abs(frst - 1)
        est = pd.DataFrame(
            data={
                'aic': [clf.aic(x)],
                'b': [clf.means_[frst][0]],
                'c': [clf.means_[scnd][0]],
                'd': [clf.covariances_[frst]],
                'e': [clf.covariances_[scnd]],
                'f': [clf.weights_[frst]],
                'g': [clf.weights_[scnd]]
            })
        if i == 0:
            features = est
        else:
            features = pd.concat([features, est])
    features = features.reset_index(drop=True)
    min_index = features['aic'].idxmin()
    features = features.iloc[min_index]
    return features[0], features[1], features[2], features[3], features[
        4], features[5], features[6]


# Aggregation primitive built on GM_fit: Numeric in, Numeric out, declared as
# producing seven output features.
# NOTE(review): presumably this 7 mirrors the seven values returned by the
# function body directly above - confirm that body belongs to GM_fit.
GM_pr = make_agg_primitive(
    function=GM_fit,
    number_output_features=7,
    input_types=[Numeric],
    return_type=Numeric,
)
Ejemplo n.º 32
0
    in an array of ['A', 'A', 'A', 'B', 'B'], the 
    function will return 0.6."""

    if x.mode().shape[0] == 0:
        return np.nan

    # Count occurence of each value
    counts = dict(Counter(x.values))
    # Find the mode
    mode = x.mode().iloc[0]
    # Divide the occurences of mode by the total occurrences
    return counts[mode] / np.sum(list(counts.values()))


# Aggregation primitive wrapping normalized_mode_count: takes one Discrete
# input and declares a Numeric return value.
NormalizedModeCount = make_agg_primitive(
    return_type=Numeric,
    input_types=[Discrete],
    function=normalized_mode_count,
)


# Function from https://codereview.stackexchange.com/a/15095
def longest_repetition(x):
    """
    Returns the item with most consecutive occurrences in `x`. 
    If there are multiple items with the same number of conseqcutive occurrences,
    it will return the first one. If `x` is empty, returns None. 
    """

    x = x.dropna()

    if x.shape[0] < 1:
        return None
Ejemplo n.º 33
0
    name = "mode"
    input_types = [Discrete]
    return_type = None

    def get_function(self):
        """Return the callable that computes this primitive's value.

        The returned function takes a pandas object `x` and gives back the
        first entry of ``x.mode()``, or ``np.nan`` when there is no mode.
        """
        def pd_mode(x):
            modes = x.mode()
            # An empty mode result means there is nothing to report.
            return modes.iloc[0] if modes.shape[0] != 0 else np.nan

        return pd_mode


# Aggregation primitive "min", backed by np.min over a single Numeric input.
# return_type is passed as None and stacking on itself is disabled.
Min = make_agg_primitive(
    function=np.min,
    input_types=[Numeric],
    return_type=None,
    name="min",
    description="Finds the minimum non-null value of a numeric feature.",
    stack_on_self=False,
)

# class Min(AggregationPrimitive):
#     """Finds the minimum non-null value of a numeric feature."""
#     name = "min"
#     input_types =  [Numeric]
#     return_type = None
#     # max_stack_depth = 1
#     stack_on_self = False

#     def get_function(self):
#         return np.min

Ejemplo n.º 34
0
def FT_process(tables, config):
    """Build a featuretools EntitySet from ``tables`` and run deep feature synthesis.

    Every table becomes an entity, the relationships listed in the config
    are added, and ``ft.dfs`` is run with the main table as target.

    Args:
        tables: Mapping of table name to DataFrame. The main table
            (``CONSTANT.MAIN_TABLE_NAME``) receives an extra index column
            holding its DataFrame index, so the caller's DataFrame is
            modified in place.
        config: Dict with a ``'relations'`` entry: an iterable of dicts
            carrying ``'table_A'``, ``'table_B'`` and ``'key'``. Only the
            first element of ``'key'`` is used.

    Returns:
        The feature matrix returned by ``ft.dfs``. The generated feature
        definitions are printed to stdout.
    """
    es = ft.EntitySet()
    relation_config = config['relations']
    # Columns starting with either prefix are registered as Categorical.
    categorical_prefixes = (CONSTANT.MULTI_CAT_PREFIX,
                            CONSTANT.CATEGORY_PREFIX)
    # Set when the main table has many categorical columns; the custom
    # unique-count primitive is then left out of the synthesis.
    skip_nunique = False

    for table in tables:
        frame = tables[table]
        index_col = f'{table}_id'  # primary key
        make_index = True
        if len(table.split("_")) > 2:  # intermediate (link) table
            # The index column name is the table name minus its first six
            # characters; that column must already exist in the frame.
            index_col = table[6:]
            make_index = False
        if table == CONSTANT.MAIN_TABLE_NAME:
            # Materialise the DataFrame index as the entity's index column.
            frame[index_col] = frame.index
            n_cat_cols = sum(
                1 for col in frame.columns
                if col.startswith("c_") and not col.startswith("c_0"))
            if n_cat_cols > 10:
                skip_nunique = True
            make_index = False

        variable_types = {
            col: ft.variable_types.Categorical
            for col in frame.columns
            if col.startswith(categorical_prefixes)
        }
        # NOTE: an earlier variant passed time_index=config['time_col'] for
        # the main table; it was disabled and every entity is created
        # without a time index.
        es = es.entity_from_dataframe(entity_id=table,
                                      dataframe=frame,
                                      make_index=make_index,
                                      index=index_col,
                                      variable_types=variable_types)

    for relation in relation_config:
        key = relation['key'][0]
        # ft.Relationship takes the parent variable first: table_B is the
        # parent side and table_A the child side.
        parent_variable = es[relation['table_B']][key]
        child_variable = es[relation['table_A']][key]
        es = es.add_relationship(
            ft.Relationship(parent_variable, child_variable))

    # NOTE: a cutoff-time / training-window variant of the dfs call below
    # (with "hour" and "weekday" transform primitives) existed previously
    # and was disabled.

    # Keep the function name ``n_unique``: no explicit name is passed to
    # make_agg_primitive, so the primitive's name is derived from it.
    def n_unique(column):
        return len(set(column))

    nunique = make_agg_primitive(function=n_unique,
                                 input_types=[Categorical],
                                 return_type=Numeric)

    # The built-in "num_unique" primitive was judged too time-consuming,
    # hence the custom one above.
    agg_primitives = ["mean", "sum", "count"]
    if not skip_nunique:
        agg_primitives.append(nunique)

    feature_matrix, feature_defs = ft.dfs(
        entityset=es,
        # Same name the main entity was registered under in the loop above.
        target_entity=CONSTANT.MAIN_TABLE_NAME,
        agg_primitives=agg_primitives,
        trans_primitives=[],
        max_depth=2)
    print(feature_defs)
    return feature_matrix