Example #1
0
    def _generate_features(self, input_df):
        """Derive calendar/time features from ``input_df`` with featuretools DFS.

        A copy of ``input_df`` receives a 1-based ``id`` column
        (``index + 1``) that is used as the entity index, while
        ``self.dt_col`` is used as the time index. Two custom flags,
        ``is_awake`` and ``is_busy_hours``, are added to the built-in
        transform primitives.

        :param input_df: source dataframe; it is copied rather than modified.
        :return: the ``(feature_matrix, feature_defs)`` pair produced by
            ``ft.dfs``.
        """
        def is_awake(column):
            # 1 for hours 6..23 and for hour 0, otherwise 0.
            hour_of_day = column.dt.hour
            daytime = (hour_of_day >= 6) & (hour_of_day <= 23)
            midnight = hour_of_day == 0
            return (daytime | midnight).astype(int)

        def is_busy_hours(column):
            # 1 for hours 7..9 or 16..19, otherwise 0.
            hour_of_day = column.dt.hour
            morning_peak = (hour_of_day >= 7) & (hour_of_day <= 9)
            evening_peak = (hour_of_day >= 16) & (hour_of_day <= 19)
            return (morning_peak | evening_peak).astype(int)

        IsAwake = make_trans_primitive(
            function=is_awake,
            input_types=[DatetimeTimeIndex],
            return_type=Numeric)
        IsBusyHours = make_trans_primitive(
            function=is_busy_hours,
            input_types=[DatetimeTimeIndex],
            return_type=Numeric)

        frame = input_df.copy()
        frame["id"] = frame.index + 1

        entity_set = ft.EntitySet(id="data")
        entity_set = entity_set.entity_from_dataframe(
            entity_id="time_seq",
            dataframe=frame,
            index="id",
            time_index=self.dt_col)

        transform_primitives = [
            "month", "weekday", "day", "hour", "is_weekend",
            IsAwake, IsBusyHours,
        ]
        feature_matrix, feature_defs = ft.dfs(
            entityset=entity_set,
            target_entity="time_seq",
            agg_primitives=["count"],
            trans_primitives=transform_primitives)
        return feature_matrix, feature_defs
def test_make_transform_restricts_time_arg():
    """A ``time`` argument is accepted with ``uses_calc_time=True`` and rejected without it."""
    # Accepted: the primitive opts in to receiving the calculation time.
    accepted_options = {
        "name": "AllowedPrimitive",
        "description": "This primitive should be accepted",
        "uses_calc_time": True,
    }
    make_trans_primitive(lambda time: time, [Datetime], Numeric,
                         **accepted_options)

    # Rejected: same signature, but ``uses_calc_time`` is not set.
    rejected_options = {
        "name": "BadPrimitive",
        "description": "This primitive should erorr",
    }
    with pytest.raises(ValueError):
        make_trans_primitive(lambda time: time, [Datetime], Numeric,
                             **rejected_options)
Example #3
0
def test_make_transform_restricts_time_arg():
    """A ``time`` argument is accepted with ``uses_calc_time=True`` and rejected without it.

    The rejection must raise ``ValueError`` carrying the restricted-keyword
    message.
    """
    import re

    make_trans_primitive(lambda time: time, [Datetime],
                         Numeric,
                         name="AllowedPrimitive",
                         description="This primitive should be accepted",
                         uses_calc_time=True)

    error_text = "'time' is a restricted keyword.  Please use a different keyword."
    # ``match`` is a regular expression applied with ``re.search``; escape the
    # literal message so that '.' only matches a real period.
    with pytest.raises(ValueError, match=re.escape(error_text)):
        make_trans_primitive(lambda time: time, [Datetime],
                             Numeric,
                             name="BadPrimitive",
                             description="This primitive should erorr")
def test_make_transform_sets_kwargs_correctly(es):
    """Each instance of a custom primitive keeps its own base feature and kwargs."""
    def pd_is_in(array, list_of_outputs=None):
        # Treat a missing list as "match nothing".
        candidates = [] if list_of_outputs is None else list_of_outputs
        return pd.Series(array).isin(candidates)

    def isin_generate_name(self):
        base_name = self.base_features[0].get_name()
        outputs_text = str(self.kwargs['list_of_outputs'])
        return u"%s.isin(%s)" % (base_name, outputs_text)

    IsIn = make_trans_primitive(
        pd_is_in,
        [Variable],
        Boolean,
        name="is_in",
        description="For each value of the base feature, checks whether it is "
        "in a list that is provided.",
        cls_attributes={"generate_name": isin_generate_name})

    # Two instances over different variables with different keyword values.
    first_outputs = ["toothpaste", "coke_zero"]
    first_base = Feature(es['log']['product_id'])
    first_isin = IsIn(first_base, list_of_outputs=first_outputs)

    second_outputs = ["coke_zero"]
    second_base = Feature(es['log']['session_id'])
    second_isin = IsIn(second_base, list_of_outputs=second_outputs)

    expectations = (
        (first_base, first_outputs, first_isin),
        (second_base, second_outputs, second_isin),
    )
    for base_feature, outputs, isin_feature in expectations:
        assert base_feature == isin_feature.base_features[0]
        assert outputs == isin_feature.kwargs['list_of_outputs']
def test_make_transform_restricts_time_arg():
    """Using ``time`` as an argument name requires ``uses_calc_time=True``."""
    # Positional arguments are identical for both calls; only the keyword
    # options differ.
    allowed = dict(name="AllowedPrimitive",
                   description="This primitive should be accepted",
                   uses_calc_time=True)
    make_trans_primitive(lambda time: time, [Datetime], Numeric, **allowed)

    disallowed = dict(name="BadPrimitive",
                      description="This primitive should erorr")
    with pytest.raises(ValueError):
        make_trans_primitive(lambda time: time, [Datetime], Numeric,
                             **disallowed)
def test_make_transform_sets_kwargs_correctly(es):
    """Keyword arguments given to a custom primitive are stored on that instance only."""
    def pd_is_in(array, list_of_outputs=None):
        # ``None`` means no candidate values were supplied.
        if list_of_outputs is None:
            return pd.Series(array).isin([])
        return pd.Series(array).isin(list_of_outputs)

    def isin_generate_name(self):
        name_parts = (self.base_features[0].get_name(),
                      str(self.kwargs['list_of_outputs']))
        return u"%s.isin(%s)" % name_parts

    primitive_options = {
        "name": "is_in",
        "description": "For each value of the base feature, checks whether it is "
                       "in a list that is provided.",
        "cls_attributes": {"generate_name": isin_generate_name},
    }
    IsIn = make_trans_primitive(pd_is_in, [Variable], Boolean,
                                **primitive_options)

    product_outputs = ["toothpaste", "coke_zero"]
    product_feature = Feature(es['log']['product_id'])
    product_isin = IsIn(product_feature, list_of_outputs=product_outputs)

    session_outputs = ["coke_zero"]
    session_feature = Feature(es['log']['session_id'])
    session_isin = IsIn(session_feature, list_of_outputs=session_outputs)

    # Base feature and kwargs of the first instance.
    assert product_feature == product_isin.base_features[0]
    assert product_outputs == product_isin.kwargs['list_of_outputs']
    # Base feature and kwargs of the second instance.
    assert session_feature == session_isin.base_features[0]
    assert session_outputs == session_isin.kwargs['list_of_outputs']
def test_warns_with_unused_custom_primitives(pd_es):
    """Check when DFS emits ``UnusedPrimitiveWarning`` for custom primitives.

    For one custom transform primitive and one custom aggregation primitive
    the test runs ``dfs`` twice: once on a target entity where the warning is
    expected (with an exact message) and once on a target entity where no
    ``UnusedPrimitiveWarning`` may be emitted.
    """
    import warnings

    def expected_warning(kind, primitive_name):
        # Full message text expected for a single unused primitive.
        return "Some specified primitives were not used during DFS:\n" + \
            "  {}: ['{}']\n".format(kind, primitive_name) + \
            "This may be caused by a using a value of max_depth that is too small, not setting interesting values, " + \
            "or it may indicate no compatible variable types for the primitive were found in the data."

    def assert_no_unused_primitive_warning(**dfs_kwargs):
        # ``pytest.warns(None)`` only recorded warnings and asserted nothing;
        # record them explicitly and fail if the unused-primitive warning
        # shows up. Unrelated warning categories are tolerated.
        with warnings.catch_warnings(record=True) as caught:
            warnings.simplefilter("always")
            dfs(entityset=pd_es, max_depth=1, **dfs_kwargs)
        unused = [w for w in caught
                  if issubclass(w.category, UnusedPrimitiveWarning)]
        assert not unused

    # --- transform primitive -------------------------------------------
    def above_ten(column):
        return column > 10

    AboveTen = make_trans_primitive(function=above_ten,
                                    input_types=[Numeric],
                                    return_type=Numeric)

    trans_primitives = [AboveTen]

    warning_text = expected_warning("trans_primitives", "above_ten")

    with pytest.warns(UnusedPrimitiveWarning) as record:
        dfs(entityset=pd_es,
            target_entity='sessions',
            trans_primitives=trans_primitives,
            max_depth=1)

    assert record[0].message.args[0] == warning_text

    # Should not raise a warning
    assert_no_unused_primitive_warning(target_entity='customers',
                                       trans_primitives=trans_primitives)

    # --- aggregation primitive -----------------------------------------
    def max_above_ten(column):
        return max(column) > 10

    MaxAboveTen = make_agg_primitive(function=max_above_ten,
                                     input_types=[Numeric],
                                     return_type=Numeric)

    agg_primitives = [MaxAboveTen]

    warning_text = expected_warning("agg_primitives", "max_above_ten")

    with pytest.warns(UnusedPrimitiveWarning) as record:
        dfs(entityset=pd_es,
            target_entity='stores',
            agg_primitives=agg_primitives,
            max_depth=1)

    assert record[0].message.args[0] == warning_text

    # Should not raise a warning
    assert_no_unused_primitive_warning(target_entity='sessions',
                                       agg_primitives=agg_primitives)
def test_isin_feat_custom(es):
    """A custom ``is_in`` primitive and ``Feature.isin`` produce the expected flags."""
    def pd_is_in(array, list_of_outputs=None):
        # Treat a missing list as "match nothing".
        candidates = [] if list_of_outputs is None else list_of_outputs
        return pd.Series(array).isin(candidates)

    def isin_generate_name(self):
        base_name = self.base_features[0].get_name()
        return u"%s.isin(%s)" % (base_name,
                                 str(self.kwargs['list_of_outputs']))

    IsIn = make_trans_primitive(
        pd_is_in,
        [Variable],
        Boolean,
        name="is_in",
        description="For each value of the base feature, checks whether it is "
        "in a list that is provided.",
        cls_attributes={"generate_name": isin_generate_name})

    def calculated_values(feature):
        # Evaluate one feature for instance ids 0..7 and return its column
        # as a plain list.
        backend = PandasBackend(es, [feature])
        frame = backend.calculate_all_features(range(8), None)
        return frame[feature.get_name()].values.tolist()

    # Custom primitive over product ids.
    custom_isin = IsIn(es['log']['product_id'],
                       list_of_outputs=["toothpaste", "coke zero"])
    expected = [True, True, True, False, False, True, True, True]
    assert expected == calculated_values(custom_isin)

    # ``Feature.isin`` over the same variable and values.
    product_isin = Feature(es['log']['product_id']).isin(
        ["toothpaste", "coke zero"])
    expected = [True, True, True, False, False, True, True, True]
    assert expected == calculated_values(product_isin)

    # ``Feature.isin`` over a numeric variable.
    value_isin = Feature(es['log']['value']).isin([5, 10])
    expected = [False, True, True, False, False, False, False, False]
    assert expected == calculated_values(value_isin)
def test_isin_feat_custom(es):
    """Compare a hand-made ``is_in`` primitive and ``Feature.isin`` against fixed results."""
    def pd_is_in(array, list_of_outputs=None):
        # ``None`` means no candidate values were supplied.
        if list_of_outputs is None:
            return pd.Series(array).isin([])
        return pd.Series(array).isin(list_of_outputs)

    def isin_generate_name(self):
        name_parts = (self.base_features[0].get_name(),
                      str(self.kwargs['list_of_outputs']))
        return u"%s.isin(%s)" % name_parts

    IsIn = make_trans_primitive(
        pd_is_in,
        [Variable],
        Boolean,
        name="is_in",
        description="For each value of the base feature, checks whether it is "
        "in a list that is provided.",
        cls_attributes={"generate_name": isin_generate_name})

    def check(feature, expected_flags):
        # Calculate ``feature`` for instance ids 0..7 and compare the
        # resulting column with ``expected_flags``.
        pandas_backend = PandasBackend(es, [feature])
        result = pandas_backend.calculate_all_features(range(8), None)
        actual_flags = result[feature.get_name()].values.tolist()
        assert expected_flags == actual_flags

    check(IsIn(es['log']['product_id'],
               list_of_outputs=["toothpaste", "coke zero"]),
          [True, True, True, False, False, True, True, True])

    check(Feature(es['log']['product_id']).isin(["toothpaste", "coke zero"]),
          [True, True, True, False, False, True, True, True])

    check(Feature(es['log']['value']).isin([5, 10]),
          [False, True, True, False, False, False, False, False])
Example #10
0
    '''
    assert string is not None, "string to count needs to be defined"
    counts = [element.lower().count(string) for element in column]
    return counts


# %%

def string_count_get_name(self):
    """Return the feature name ``STRING_COUNT(<base feature>, "<string>")``.

    ``<base feature>`` is the name of the first base feature and ``<string>``
    is the ``string`` keyword argument of the primitive.
    """
    # Convert the keyword value to text *before* appending the closing quote;
    # the previous ``str(value + '"')`` raised TypeError for non-string values.
    quoted = '"' + str(self.kwargs['string']) + '"'
    return u"STRING_COUNT(%s, %s)" % (self.base_features[0].get_name(),
                                      quoted)


# %%
# Wrap string_count as a Text -> Numeric transform primitive whose feature
# names are produced by string_count_get_name.
StringCount = make_trans_primitive(function=string_count,
                                   input_types=[Text],
                                   return_type=Numeric,
                                   cls_attributes={"get_name": string_count_get_name})

# %%
from featuretools.tests.testing_utils import make_ecommerce_entityset

es = make_ecommerce_entityset()
# Feature that applies StringCount with string="the" to the log entity's
# comments variable.
count_the_feat = StringCount(es['log']['comments'], string="the")


# Raw log data
# %%
es['log'].df.head()
# %% md
# Compute the sum, mean and standard deviation of the number of times "the" appears in the comments field of the log table
# %%
def get_results(request):
    """Run Deep Feature Synthesis on the mock customer data and render the result.

    Reads ``max_depth`` and the ``agg_pri``, ``trans_pri``,
    ``agg_pri_customer`` and ``trans_pri_customer`` lists from
    ``request.POST``, builds an entity set from
    ``ft.demo.load_mock_customer()`` and runs ``ft.dfs`` with ``customers``
    as the target entity.

    :param request: request object providing ``POST['max_depth']`` and
        ``POST.getlist(...)``.
    :return: ``get_results.html`` rendered with ``res`` (the feature
        definitions as strings) and ``sample_data`` (the first row of the
        feature matrix).
    """
    max_depth = request.POST['max_depth']
    agg_pri = request.POST.getlist('agg_pri')
    agg_pri_customer = request.POST.getlist('agg_pri_customer')
    trans_pri_customer = request.POST.getlist('trans_pri_customer')
    trans_pri = request.POST.getlist('trans_pri')

    import featuretools as ft
    import pandas as pd
    import numpy as np
    from featuretools.primitives import make_trans_primitive, make_agg_primitive
    from featuretools.variable_types import DatetimeTimeIndex, Numeric

    pd.set_option('display.max_columns', 20)
    data = ft.demo.load_mock_customer()
    transactions_df = data["transactions"].merge(data["sessions"]).merge(data["customers"])
    products_df = data["products"]

    # Base entities: the denormalised transactions table and the products table.
    es = ft.EntitySet()
    es = es.entity_from_dataframe(entity_id="transactions", dataframe=transactions_df, index="transaction_id",
                                  time_index="transaction_time",
                                  variable_types={"product_id": ft.variable_types.Categorical,
                                                  "zip_code": ft.variable_types.ZIPCode})
    es = es.entity_from_dataframe(entity_id="products", dataframe=products_df, index="product_id")

    new_relationship = ft.Relationship(es["products"]["product_id"], es["transactions"]["product_id"])
    es = es.add_relationship(new_relationship)

    # Split sessions out of transactions, then customers out of sessions.
    es = es.normalize_entity(base_entity_id="transactions",
                             new_entity_id="sessions",
                             index="session_id",
                             make_time_index="session_start",
                             additional_variables=["device", "customer_id", "zip_code", "session_start", "join_date"])
    es = es.normalize_entity(base_entity_id="sessions",
                             new_entity_id="customers",
                             index="customer_id",
                             make_time_index="join_date",
                             additional_variables=["zip_code", "join_date"])

    # Custom aggregation primitive: time since the last value, expressed in
    # hours (total seconds / 3600).
    def time_since_last_by_hour(values, time=None):
        time_since = time - values.iloc[-1]
        return time_since.total_seconds() / 3600

    Time_since_last_by_hour = make_agg_primitive(function=time_since_last_by_hour,
                                                 input_types=[DatetimeTimeIndex],
                                                 return_type=Numeric,
                                                 uses_calc_time=True)

    # Custom transform primitive: natural logarithm.
    def log(vals):
        return np.log(vals)

    log_primitive = make_trans_primitive(function=log,
                                         input_types=[Numeric],
                                         return_type=Numeric,
                                         description="Calculates the log of the value.",
                                         name="log")

    # Append the custom primitives the user ticked on the form.
    if 'Time_since_last_by_hour' in agg_pri_customer:
        agg_pri.append(Time_since_last_by_hour)
    if 'log_e' in trans_pri_customer:
        trans_pri.append(log_primitive)

    # Generate the feature matrix for the customers entity.
    feature_matrix3, feature_defs3 = ft.dfs(entityset=es, target_entity="customers",
                                            agg_primitives=agg_pri,
                                            trans_primitives=trans_pri,
                                            max_depth=int(max_depth))

    res = [str(feature) for feature in feature_defs3]
    sample_data = list(feature_matrix3.iloc[0])
    return render(request, 'get_results.html', {'res': res, 'sample_data': sample_data})
Example #12
0
        n_collinear, correlation_threshold))

    total_removed = n_missing_cols + n_zero_variance_cols + n_collinear

    print('Total columns removed: ', total_removed)
    print('Shape after feature selection: {}.'.format(feature_matrix.shape))
    return feature_matrix


# before we get into things, let's do all the featuretools definitions
def log_plus_one(column):
    """Return ``log(column - min(column) + 1)``.

    The column is shifted so that its smallest value becomes 0 before adding
    1, which keeps the logarithm's argument at 1 or above even for negative
    inputs.

    :param column: array-like of numbers supporting element-wise arithmetic.
    :return: the element-wise natural logarithm of the shifted column.
    """
    # The shift must subtract the minimum: adding it (as the code did before)
    # pushes negative columns further below zero and yields NaN.
    return np.log(column - min(column) + 1)


# Numeric -> Numeric transform primitive backed by log_plus_one.
lpo = make_trans_primitive(function=log_plus_one,
                           input_types=[Numeric],
                           return_type=Numeric)


def abs_log(column):
    """Return ``log(|column| + 1)`` element-wise."""
    magnitude = np.abs(column)
    return np.log(magnitude + 1)


# Numeric -> Numeric transform primitive backed by abs_log.
al = make_trans_primitive(function=abs_log,
                          input_types=[Numeric],
                          return_type=Numeric)


def squared(column):
    """Return the element-wise square of ``column``."""
    result = np.square(column)
    return result
# Print feature_enc (defined outside this excerpt).
print(feature_enc)

# Show the first rows of the table of available featuretools primitives.
print('-----------list primitives---------------------')
print(ft.list_primitives().head())

# Section banner for the custom primitive definitions that follow.
print('----------custom primitives----------------------')
from featuretools.primitives import make_agg_primitive, make_trans_primitive
from featuretools.variable_types import Text, Numeric


def absolute(column):
    """Return the absolute value of ``column`` via the built-in ``abs``."""
    magnitude = abs(column)
    return magnitude


# Numeric -> Numeric transform primitive backed by absolute.
Absolute = make_trans_primitive(function=absolute,
                                input_types=[Numeric],
                                return_type=Numeric)


def maximum(columns):
    """Return the largest item of ``columns`` using the built-in ``max``."""
    largest = max(columns)
    return largest


# Numeric -> Numeric aggregation primitive backed by maximum.
Maximum = make_agg_primitive(function=maximum,
                             input_types=[Numeric],
                             return_type=Numeric)


#Multiple Input Types
def mean_numeric(num1, num2):
    """Return the arithmetic mean of ``num1`` and ``num2``."""
    total = num1 + num2
    return total / 2
Example #14
0
def get_results(request):
    """Build an entity set from the CSV files in ``demo_data``, run DFS and render a preview.

    Steps, all inside one ``try`` so that any failure renders ``erro.html``:

    1. Read ``types_dict``, ``columns_dict`` and ``target`` from the request
       cookies.
    2. Choose the base entity (the table with the most ``.Id`` typed columns)
       and its index column.
    3. Load every file in ``<cwd>/demo_data`` whose name contains ``csv``,
       merge the tables and build the entity set.
    4. Run ``ft.dfs`` with the primitives selected in ``request.POST``.
    5. Save the feature matrix to ``./demo_data/result/all_features.csv`` and
       render its header plus the first five rows.

    :param request: request object providing ``COOKIES`` and ``POST``.
    :return: the rendered ``get_results.html`` response (with a ``target_id``
        cookie set), or the rendered ``erro.html`` response when any
        exception is raised.
    """
    try:
        import os

        import featuretools as ft
        import pandas as pd
        import numpy as np
        from featuretools.primitives import make_trans_primitive, make_agg_primitive

        # Parameters describing the data source.
        # SECURITY: cookies are controlled by the client, so eval() on them
        # executes arbitrary code. Flagged for review rather than replaced,
        # because the expected cookie format is not visible from here.
        types_dict = eval(request.COOKIES['types_dict'])
        columns_dict = eval(request.COOKIES['columns_dict'])
        target = request.COOKIES['target']

        # The base entity is the table with the most ``.Id`` typed columns.
        # Tables are then ordered by that count (descending) and merged in
        # that order.
        base_entity = ''
        base_index = ''
        max_count = 0
        id_counts = {}
        for table_name, type_names in types_dict.items():
            id_count = 0
            index_type = ''
            for type_name in type_names:
                if '.Id' in str(type_name):
                    id_count += 1
                if '.Index' in str(type_name):
                    index_type = type_name
            id_counts[table_name] = id_count
            if id_count > max_count:
                base_entity = table_name
                base_index = index_type
                max_count = id_count
        sorted_list = sorted(id_counts.items(),
                             key=lambda item: item[1],
                             reverse=True)
        sorted_table_name = [item[0] for item in sorted_list]

        print("sorted_table_name\n", sorted_table_name)

        # Build one {column: variable type} dict per table and look up the
        # index column of the base entity.
        types_dict_list = []
        for key, values1, values2 in zip(columns_dict.keys(),
                                         columns_dict.values(),
                                         types_dict.values()):
            # SECURITY: the type names also originate from the cookie; see the
            # note above. Kept as a comprehension so that eval() resolves
            # names exactly as before.
            types_dict_list.append(
                {k: eval(v)
                 for k, v in zip(values1, values2)})
            if key == base_entity:
                for type_name, column_name in zip(values2, values1):
                    if '.Index' in type_name:
                        base_index = column_name

        # Collect every column typed as Index (used below as the ``index``
        # argument when entities are extracted) and merge all type dicts into
        # a single one.
        index_list = []
        total_type_dict = {}
        for each_dict in types_dict_list:
            total_type_dict.update(each_dict)
            index_list.extend(column_name
                              for column_name, var_type in each_dict.items()
                              if '.Index' in str(var_type))
        print(index_list)

        # Load the source tables from CSV files. Paths are built explicitly
        # instead of calling os.chdir(): changing the process-wide working
        # directory was not undone when reading a file raised.
        data_dir = os.path.join(os.getcwd(), "demo_data")
        if not os.path.isdir(data_dir):
            os.mkdir(data_dir)
        data = {}
        for file_name in os.listdir(data_dir):
            if "csv" in file_name:
                table_key = file_name.split(".")[0]
                data[table_key] = pd.read_csv(
                    os.path.join(data_dir, file_name))

        # TODO: the merge logic is simplistic; how should the join really be
        # performed?
        if not data:
            raise Exception("数据源为空,请检查数据源文件")
        if len(data) > 1:
            data_df = data.pop(sorted_table_name.pop(0))
            for table_name in sorted_table_name:
                data_df = data_df.merge(data[table_name])
        else:
            data_df = list(data.values())[0]

        # Build the base entity from the merged table.
        es = ft.EntitySet()
        es = es.entity_from_dataframe(
            entity_id=base_entity,
            dataframe=data_df,
            index=base_index,
            variable_types=total_type_dict)

        # Extract every other table as its own entity: its Index column is
        # removed from the column list and passed as ``index``; the remaining
        # columns become ``additional_variables``.
        for table_name, column_names in columns_dict.items():
            if table_name == base_entity:
                continue
            index = ''
            for index_column in index_list:
                if index_column in column_names:
                    column_names.remove(index_column)
                    index = index_column
            es = es.normalize_entity(
                base_entity_id=base_entity,
                new_entity_id=table_name,
                index=index,
                additional_variables=column_names)

        # Custom aggregation primitive: time since the last value, expressed
        # in hours (total seconds / 3600).
        def time_since_last_by_hour(values, time=None):
            time_since = time - values.iloc[-1]
            return time_since.total_seconds() / 3600

        Time_since_last_by_hour = make_agg_primitive(
            function=time_since_last_by_hour,
            input_types=[ft.variable_types.DatetimeTimeIndex],
            return_type=ft.variable_types.Numeric,
            uses_calc_time=True)

        # Custom transform primitive: natural logarithm.
        def log(vals):
            return np.log(vals)

        log_primitive = make_trans_primitive(
            function=log,
            input_types=[ft.variable_types.Numeric],
            return_type=ft.variable_types.Numeric,
            description="Calculates the log of the value.",
            name="log")

        # Custom transform primitive: whether the value is positive.
        def is_positive(vals):
            return vals > 0

        is_positive_primitive = make_trans_primitive(
            function=is_positive,
            input_types=[ft.variable_types.Numeric],
            return_type=ft.variable_types.Boolean,
            description="Calculates if the value positive.",
            name="is_positive")

        # Model parameters submitted by the front-end form.
        max_depth = request.POST['max_depth']
        agg_pri = request.POST.getlist('agg_pri')
        agg_pri_customer = request.POST.getlist('agg_pri_customer')
        trans_pri_customer = request.POST.getlist('trans_pri_customer')
        trans_pri = request.POST.getlist('trans_pri')

        pd.set_option('display.max_columns', 20)

        print(trans_pri_customer)
        # Append the custom primitives the user ticked on the form.
        if 'Time_since_last_by_hour' in agg_pri_customer:
            agg_pri.append(Time_since_last_by_hour)
        if 'log_e' in trans_pri_customer:
            trans_pri.append(log_primitive)
        if 'is_positive' in trans_pri_customer:
            trans_pri.append(is_positive_primitive)
        print("+++++++++++++++++++++++++++++")

        print(trans_pri)
        print("+++++++++++++++++++++++++++++")

        # Generate the feature matrix.
        feature_matrix, feature_defs = ft.dfs(entityset=es,
                                              target_entity=target,
                                              agg_primitives=agg_pri,
                                              trans_primitives=trans_pri,
                                              max_depth=int(max_depth))

        # Turn the index into the first column of the matrix.
        feature_matrix = feature_matrix.reset_index()
        new_columns = feature_matrix.columns

        # Save the matrix. ``nlp`` holds the header produced by columns2NLP
        # for each column; it is displayed as a second header row.
        result_dir = os.getcwd() + "/demo_data/result"
        if not os.path.isdir(result_dir):
            os.mkdir(result_dir)
        feature_matrix.to_csv("./demo_data/result/all_features.csv",
                              index=False)

        from .columns2NLP import columns2NLP
        res = [str(column) for column in new_columns]
        nlp = [columns2NLP(str(column)) for column in new_columns]

        def rounded_row(position):
            # One row of the matrix with floats rounded to two decimals.
            return [
                round(value, 2) if isinstance(value, float) else value
                for value in feature_matrix.iloc[position]
            ]

        # The template expects the first five rows as sample_data1..5.
        template_context = {'res': res, 'nlp': nlp}
        for position in range(5):
            context_key = 'sample_data{}'.format(position + 1)
            template_context[context_key] = rounded_row(position)

        response = render(request, 'get_results.html', template_context)
        response.set_cookie('target_id', res[0])
        return response

    except Exception as e:
        # Top-level boundary of the view: show the error page instead of
        # propagating the exception.
        response = render(request, 'erro.html', {'erro': e})
        return response
# finally let's import the data
# Load the credit-card dataset from the working directory.
df = pd.read_csv("creditcard.csv")
# Drop only the 'Time' column; the trailing comment lists further columns
# that are currently kept.
df = df.drop(
    ['Time'], axis=1
)  #,'V28','V27','V26','V25','V24','V23','V22','V20','V15','V13','V8'], axis =1)
# Discard rows that contain missing values.
df = df.dropna()


# before we get into things, let's do all the featuretools definitions
def abs_log(column):
    """Return the natural log of ``|column| + 1`` element-wise."""
    shifted = np.abs(column) + 1
    return np.log(shifted)


# Numeric -> Numeric transform primitive backed by abs_log.
al = make_trans_primitive(function=abs_log,
                          input_types=[Numeric],
                          return_type=Numeric)


def squared(column):
    """Return ``column`` squared element-wise (``np.square``)."""
    squares = np.square(column)
    return squares


# Numeric -> Numeric transform primitive backed by squared.
sq = make_trans_primitive(function=squared,
                          input_types=[Numeric],
                          return_type=Numeric)


def bins_5(column):
    temp = preprocessing.KBinsDiscretizer(n_bins=5,
                                          encode='ordinal',
Example #16
0
# %%
# custom function so the name of the feature prints out correctly
def make_name(self):
    """Return ``"<Qty1>_goal_last_<Qty2>"`` built from ``self.kwargs``.

    ``Qty1`` is formatted with ``%s`` and ``Qty2`` with ``%d``.
    """
    options = self.kwargs
    first_qty = options['Qty1']
    second_qty = options['Qty2']
    return "%s_goal_last_%d" % (first_qty, second_qty)


# %%



# %%
def compare_Qty(Qty1, Qty2):
    """Return the result of ``Qty1 > Qty2``."""
    is_greater = Qty1 > Qty2
    return is_greater
# Boolean transform primitive over two Numeric inputs, backed by compare_Qty.
CompareMove = make_trans_primitive(function=compare_Qty,
                                          input_types=[Numeric, Numeric],
                                          return_type=Boolean,
                                          description="compare_Qty"
                                          #cls_attributes={"generate_name": make_name, "uses_full_entity":True}
                                          )

# The two "Machine" variables intended as inputs for CompareMove (its usage
# further down is currently commented out).
input_vars = [es["Machine"]["MOVE_QTY"], es["Machine"]["WIP_QTY"]]


# Compare_Move = CompareMove(*input_vars)
# #Compare_Move = CompareMove(Qty1=es["Machine"]["MOVE_QTY"], Qty2=es["Machine"]["WIP_QTY"])

# features = [Compare_Move]


# fm = ft.calculate_feature_matrix(entityset=es, features=features) 

Example #17
0
自定义trans_primitives:
添加log e 的自然对数
"""
import numpy as np


def log(vals):
    """Return the natural logarithm of ``vals`` (``np.log``)."""
    natural_log = np.log(vals)
    return natural_log


# def generate_name(self, base_feature_names):
#     return "-(%s)" % (base_feature_names[0])
# Rebind ``log`` from the plain function to a Numeric -> Numeric transform
# primitive named "log".
log = make_trans_primitive(
    function=log,
    input_types=[Numeric],
    return_type=Numeric,
    # uses_calc_time=True,
    description="Calculates the log of the value.",
    name="log")

# Generate the new feature matrix.
# NOTE: only the built-in 'month' transform is requested here; the custom
# ``log`` primitive defined above is not passed to this call.
feature_matrix3, feature_defs3 = ft.dfs(
    entityset=es,
    target_entity="customers",
    agg_primitives=['count', 'mean', 'sum', 'min', 'max'],
    trans_primitives=['month'],
    max_depth=3)

print(feature_matrix3)
# Insert the index into the data matrix as its first column.
feature_matrix3 = feature_matrix3.reset_index()
Example #18
0
自定义trans_primitives:
添加log e 的自然对数
"""
import numpy as np


def log(vals):
    """Return ``np.log(vals)``, the natural logarithm of ``vals``."""
    result = np.log(vals)
    return result


# def generate_name(self, base_feature_names):
#     return "-(%s)" % (base_feature_names[0])
# Rebind ``log`` from the plain function to a Numeric -> Numeric transform
# primitive named "log".
log = make_trans_primitive(
    function=log,
    input_types=[ft.variable_types.Numeric],
    return_type=ft.variable_types.Numeric,
    # uses_calc_time=True,
    description="Calculates the log of the value.",
    name="log")
"""
自定义trans_primitives:
判断是否为正数
"""
import numpy as np


def is_positive(vals):
    """Return the result of ``vals > 0``."""
    positive = vals > 0
    return positive


# def generate_name(self, base_feature_names):