Code example #1
def gen_auto_feats():
    action["userid"] = action["userid"].astype("str")
    future["userid"] = future["userid"].astype("str")
    history["orderTime"] = history["orderTime"].apply(lambda x:get_date(int(x)))
    history["userid"] = history["userid"].astype("str")
    history["orderid"] = history["orderid"].astype("str")
    history["orderType"] = history["orderType"].astype("str")
    comment["userid"] = comment["userid"].astype("str")
    comment["orderid"] = comment["orderid"].astype("str")
    profile["userid"] = profile["userid"].astype("str")

    es = ft.EntitySet(id="train")
    es = es.entity_from_dataframe(entity_id="userProfile", dataframe=profile, index="userid")
    es = es.entity_from_dataframe(entity_id="userComment", dataframe=comment, index="userid")
    es = es.entity_from_dataframe(entity_id="orderHistory", dataframe=history, index="orderid", time_index="orderTime",
                                  variable_types={"orderType" : ft.variable_types.Categorical})

    relationship_1 = ft.Relationship(es["userProfile"]["userid"], es["userComment"]["userid"])
    es = es.add_relationship(relationship_1)
    relationship_2 = ft.Relationship(es["orderHistory"]["orderid"], es["userComment"]["orderid"])
    es = es.add_relationship(relationship_2)

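    # Run deep feature synthesis with userProfile as the target entity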
    feature, _ = ft.dfs(entityset=es, target_entity="userProfile")
    feature = feature.T.drop_duplicates(keep='first').T.reset_index()
    feature = pd.merge(future, feature, how='left', on="userid")
    feature = auto_feats_process(feature)
    feature = feature.fillna(0)
    return feature
Code example #2
def engineer_features_by_featuretools(model_params, race_master_df, race_table_result_df, race_past_x_result_df):
    es = ft.EntitySet(id='netkeiba')
    es.entity_from_dataframe(entity_id='race_master',
                             dataframe=race_master_df[model_params['FEATURETOOLS_PARAMS']['INDEX_COL']['RACE_MASTER'] +
                                                      model_params['FEATURETOOLS_PARAMS']['FEATURE_COL'][
                                                          'RACE_MASTER']],
                             index='race_id')
    es.entity_from_dataframe(entity_id='race_table',
                             dataframe=race_table_result_df[
                                 model_params['FEATURETOOLS_PARAMS']['INDEX_COL']['RACE_TABLE_RESULT'] +
                                 model_params['FEATURETOOLS_PARAMS']['FEATURE_COL']['RACE_TABLE_RESULT']],
                             index='race_horse_id')
    es.entity_from_dataframe(entity_id='race_past_x',
                             dataframe=race_past_x_result_df[
                                 model_params['FEATURETOOLS_PARAMS']['INDEX_COL']['RACE_PAST_X_RESULT'] +
                                 model_params['FEATURETOOLS_PARAMS']['FEATURE_COL']['RACE_PAST_X_RESULT']],
                             index='race_horse_past_x_id')

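    # Link race_master -> race_table and race_table -> race_past_x on their shared keys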
    r_master_table = ft.Relationship(es['race_master']['race_id'], es['race_table']['race_id'])
    r_table_past_x = ft.Relationship(es['race_table']['race_horse_id'], es['race_past_x']['race_horse_id'])

    es.add_relationships(relationships=[r_master_table])
    es.add_relationships(relationships=[r_table_past_x])

    feature_matrix_df, _ = ft.dfs(
        entityset=es,
        target_entity='race_table',
        agg_primitives=model_params['FEATURETOOLS_PARAMS']['PRIMITIVES']['aggregation'],
        trans_primitives=model_params['FEATURETOOLS_PARAMS']['PRIMITIVES']['transform'],
        max_depth=2
    )
    feature_matrix_df = feature_matrix_df.fillna(0)
    table_index_df = decode_race_horse_id(feature_matrix_df)
    feature_matrix_df = feature_matrix_df.reset_index(drop=True)
    return feature_matrix_df, table_index_df
Code example #3
def create_es(entities, relationships, target, entityset_name="Demo"):

    # Create the entity set with the supplied name
    es = ft.EntitySet(id=entityset_name)

    for entity_name, entity_values in entities.items():

        if entity_values["index_col"] is not None:
            es = es.entity_from_dataframe(
                entity_id=entity_name,
                dataframe=entity_values["df"],
                index=entity_values["index_col"],
                variable_types=entity_values["df_type"])
        else:
            es = es.entity_from_dataframe(
                entity_id=entity_name,
                dataframe=entity_values["df"],
                make_index=True,
                index=entity_name + "_id",
                variable_types=entity_values["df_type"])

    relationship_list = []
    for r in relationships:
        r_parent_df, r_parent_col, r_child_df, r_child_col = r
        relationship = ft.Relationship(es[r_parent_df][r_parent_col],
                                       es[r_child_df][r_child_col])
        relationship_list.append(relationship)
    es = es.add_relationships(relationship_list)

    return es
Code example #4
def add_relationship(entityset, parent, parent_column, child, child_column):
    parent_variable = entityset[parent][parent_column]
    child_variable = entityset[child][child_column]
    relationship = ft.Relationship(parent_variable, child_variable)
    entityset.add_relationship(relationship)

    return entityset
Code example #5
def merge_featuretools(df_parent, df_related, parent_column, related_column,
                       date_column):
    """Automated feature engineering

    More info:

    https://www.featuretools.com
    https://github.com/featuretools/featuretools
    https://docs.featuretools.com
    http://www.jmaxkanter.com/static/papers/DSAA_DSM_2015.pdf
    """

    # Create the entityset
    es = ft.EntitySet('parent')

    # Add the entities to the entityset
    es = es.entity_from_dataframe('parent', df_parent, index=parent_column)
    es = es.entity_from_dataframe('relate',
                                  df_related,
                                  make_index=True,
                                  time_index=date_column,
                                  index='related_id')

    # Define the relationships
    relationship = ft.Relationship(es['parent'][parent_column],
                                   es['relate'][related_column])

    # Add the relationships
    es = es.add_relationships([relationship])

    # Deep feature synthesis
    feature_matrix, feature_defs = ft.dfs(entityset=es, target_entity='parent')

    return feature_matrix.reset_index()
Code example #6
def test_where_clause_empty_dataframe():
    parent_df = pd.DataFrame({"id": [1]})
    child_df = pd.DataFrame({
        "id": [1, 2, 3],
        "parent_id": [1, 1, 1],
        "time_index":
        pd.date_range(start='1/1/2018', periods=3),
        "value": [10, 5, 2]
    })

    es = ft.EntitySet(id="blah")
    es.entity_from_dataframe(entity_id="parent",
                             dataframe=parent_df,
                             index="id")
    es.entity_from_dataframe(entity_id="child",
                             dataframe=child_df,
                             index="id",
                             time_index="time_index")
    es.add_relationship(
        ft.Relationship(es["parent"]["id"], es["child"]["parent_id"]))

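    # Count is featuretools' count aggregation primitive (featuretools.primitives);
    # the where clause below matches no rows, since value is never 1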
    where = ft.Feature(es["child"]["value"]) == 1
    count = Count(es["child"]['id'], es["parent"], where=where)

    # cutoff time before all rows
    ft.calculate_feature_matrix(entityset=es,
                                features=[count],
                                cutoff_time=pd.Timestamp("12/31/2017"))

    # cutoff time after all rows, but where clause filters all rows
    ft.calculate_feature_matrix(entityset=es,
                                features=[count],
                                cutoff_time=pd.Timestamp("1/4/2018"))
Code example #7
def test_empty_child_dataframe():
    parent_df = pd.DataFrame({"id": [1]})
    child_df = pd.DataFrame({"id": [1, 2, 3],
                             "parent_id": [1, 1, 1],
                             "time_index": pd.date_range(start='1/1/2018', periods=3),
                             "value": [10, 5, 2]})

    es = ft.EntitySet(id="blah")
    es.entity_from_dataframe(entity_id="parent", dataframe=parent_df, index="id")
    es.entity_from_dataframe(entity_id="child", dataframe=child_df, index="id", time_index="time_index")
    es.add_relationship(ft.Relationship(es["parent"]["id"], es["child"]["parent_id"]))

    # create regular agg
    count = ft.Feature(es["child"]['id'], parent_entity=es["parent"], primitive=Count)

    # create agg feature that requires multiple arguments
    trend = ft.Feature([es["child"]['value'], es["child"]['time_index']], parent_entity=es["parent"], primitive=Trend)

    # create aggs with where
    where = ft.Feature(es["child"]["value"]) == 1
    count_where = ft.Feature(es["child"]['id'], parent_entity=es["parent"], where=where, primitive=Count)
    trend_where = ft.Feature([es["child"]['value'], es["child"]['time_index']], parent_entity=es["parent"], where=where, primitive=Trend)

    # cutoff time before all rows
    fm = ft.calculate_feature_matrix(entityset=es, features=[count, count_where, trend, trend_where], cutoff_time=pd.Timestamp("12/31/2017"))
    names = [count.get_name(), count_where.get_name(), trend.get_name(), trend_where.get_name()]
    assert_array_equal(fm[names], [[0, 0, np.nan, np.nan]])

    # cutoff time after all rows, but where clause filters all rows
    fm2 = ft.calculate_feature_matrix(entityset=es, features=[count_where, trend_where], cutoff_time=pd.Timestamp("1/4/2018"))
    names = [count_where.get_name(), trend_where.get_name()]
    assert_array_equal(fm2[names], [[0, np.nan]])
Code example #8
    def generate_target_label(self, es):
        """Generates target labels in the case of having missing label in the entityset.

        Args:
            es: fhir entityset.

        Returns:
            Updated entityset with the generated label.

        Raises:
            ValueError: An error occurs if the target label cannot be generated.
        """
        generate_from = 'Period'
        start = self.cutoff_time_label
        end = 'end'
        label_name = self.target_label_column_name
        if (DataLoader().check_column_existence(es, generate_from, start)
                and DataLoader().check_column_existence(
                    es, generate_from, end)):

            if (not DataLoader().check_for_missing_values(
                    es, generate_from, start)
                    and not DataLoader().check_for_missing_values(
                        es, generate_from, end)):

                es[generate_from].df[start] = pd.to_datetime(
                    es[generate_from].df[start])
                es[generate_from].df[end] = pd.to_datetime(
                    es[generate_from].df[end])
                duration = (es[generate_from].df[end] -
                            es[generate_from].df[start]).dt.days
                duration = duration.tolist()
                es[self.target_entity].df[label_name] = duration
                updated_target_entity = es[self.target_entity].df
                duration_df = pd.DataFrame({'object_id': duration})

                es = es.entity_from_dataframe(entity_id='Duration',
                                              dataframe=duration_df,
                                              index='object_id')

                es = es.entity_from_dataframe(entity_id=self.target_entity,
                                              dataframe=updated_target_entity,
                                              index='identifier')
                new_relationship = ft.Relationship(
                    es['Duration']['object_id'],
                    es[self.target_entity][label_name])
                es = es.add_relationship(new_relationship)

                return es

            else:
                raise ValueError(
                    'Cannot generate target label {} in table {} because start or '
                    'end labels in table {} contain missing values.'.format(
                        label_name, self.target_entity, generate_from))

        else:
            raise ValueError(
                'Cannot generate target label {} in table {}.'.format(
                    label_name, self.target_entity))
Code example #9
def create_es(interactions_train, course_df):
    """ создание представления сущностей для featuretools """
    es = ft.EntitySet('user_events')
    es = es.entity_from_dataframe(entity_id="events",
                                  dataframe=interactions_train.copy(),
                                  make_index=True,
                                  index='id',
                                  time_index='date',
                                  variable_types=events_vtypes)
    es = es.entity_from_dataframe(entity_id="steps",
                                  dataframe=course_df.copy(),
                                  index='step_id',
                                  variable_types=course_vtypes)

    es.normalize_entity('events', 'users', 'user_id', make_time_index=False)
    es = es.add_relationship(ft.Relationship(es['steps']['step_id'], es['events']['step_id']))

    lesson_additional_variables = ['lesson_abuse_count', 'lesson_discussions_count', 'lesson_epic_count',
                                   'lesson_passed_by', 'lesson_time_to_complete', 'lesson_title',
                                   'lesson_viewed_by', 'lesson_vote_delta',
                                   'section_id', 'section_position', 'section_title']
    es.normalize_entity('steps', 'lessons', 'lesson_id',
                        additional_variables=lesson_additional_variables,
                        make_time_index=False)

    sections_additional_variables = ['section_position', 'section_title']
    es.normalize_entity('lessons', 'sections', 'section_id',
                        additional_variables=sections_additional_variables,
                        make_time_index=False)

    es["events"]["action"].interesting_values = interactions_train.action.unique().categories
    es["steps"]["step_block.name"].interesting_values = course_df['step_block.name'].unique()

    return es
Code example #10
def load_train_data():
    print('Loading CSV data...')
    applications_df = pd.read_csv(C_PATH + 'application_train.csv')
    previous_df = pd.read_csv(C_PATH + 'previous_application.csv')
    # bureau_df = pd.read_csv(C_PATH + 'bureau.csv')

    print("Creating entityset...")
    es = ft.EntitySet(id="home-credit")

    print("Loading applications entity...")
    es = es.entity_from_dataframe(entity_id="applications",
                                  dataframe=applications_df,
                                  index="SK_ID_CURR")
    print("Loading previous entity...")
    es = es.entity_from_dataframe(entity_id="previous",
                                  dataframe=previous_df,
                                  index="SK_ID_PREV")
    # print("Loading bureau data...")
    # es = es.entity_from_dataframe(entity_id="bureau", dataframe=bureau_df, index="SK_ID_BUREAU")

    print("Adding relationships...")
    applications_previous = ft.Relationship(es["applications"]["SK_ID_CURR"],
                                            es["previous"]["SK_ID_CURR"])
    es = es.add_relationship(applications_previous)
    # applications_bureau = ft.Relationship(es["applications"]["SK_ID_CURR"], es["bureau"]["SK_ID_CURR"])
    # es = es.add_relationship(applications_bureau)

    print("Generating DFS...")
    feature_matrix, feature_defs = ft.dfs(entityset=es,
                                          target_entity="applications",
                                          verbose=True)
    fm_encoded, defs_encoded = ft.encode_features(feature_matrix, feature_defs)
    return fm_encoded, defs_encoded
Code example #11
File: generator.py Project: ankostas/AML_Risk
    def create_relations(self):
        """
        Add relations to EntitySet object.
        """
        list_relations_in_ft_format = list()
        for rel in self.relations:

            table_0 = rel[0][0]
            col_0 = rel[0][1]
            table_1 = rel[1][0]
            col_1 = rel[1][1]

            if check_col_in_df(self.dict_dataframes[table_0], col_0) and \
                    check_col_in_df(self.dict_dataframes[table_1], col_1):

                rel_ = ft.Relationship(self.entities[table_0][col_0],
                                       self.entities[table_1][col_1])

                list_relations_in_ft_format.append(rel_)

            else:
                raise ValueError(f"Check key column in {table_0} or {table_1}")

        self.entities = self.entities.add_relationships(
            list_relations_in_ft_format)
        logging.info(f"Entityset is created \n {self.entities}")
Code example #12
def get_entityset(holding_data, price_data, trans_data, company_data):
    ''' Construct an entityset data model from different data frames '''

    company_data = company_data.drop(
        ['listing_date', 'delisting_date', 'last_trade_date', 'indices'],
        axis=1)

    es = ft.EntitySet(id="trading")
    es = es.entity_from_dataframe(
        entity_id="prices",
        dataframe=price_data,
        time_index="Date",
        index='index',
        variable_types={"Tick": ft.variable_types.Categorical})
    es = es.entity_from_dataframe(
        entity_id="holdings",
        dataframe=holding_data,
        index='Tick',
        time_index="Date",
        variable_types={"Tick": ft.variable_types.Categorical})
    es = es.entity_from_dataframe(
        entity_id="companies",
        dataframe=company_data,
        index='index',
        time_index="Date",
        variable_types={"Tick": ft.variable_types.Categorical})
    es = es.entity_from_dataframe(entity_id="transactions",
                                  dataframe=trans_data,
                                  index='index',
                                  time_index="Date",
                                  variable_types={
                                      "Tick": ft.variable_types.Categorical,
                                      "Type": ft.variable_types.Categorical
                                  })
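    # All three relationships are parented on holdings, keyed by the Tick symbol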
    holdings_trans = ft.Relationship(es["holdings"]["Tick"],
                                     es["transactions"]["Tick"])
    es = es.add_relationship(holdings_trans)
    holdings_companies = ft.Relationship(es["holdings"]["Tick"],
                                         es["companies"]["Tick"])
    es = es.add_relationship(holdings_companies)
    holdings_prices = ft.Relationship(es["holdings"]["Tick"],
                                      es["prices"]["Tick"])
    es = es.add_relationship(holdings_prices)
    return es
Code example #13
def agg(train_df, hist_df, new_trans_df, isTrain, x_save_path, y_save_path):
    train_df = train_df.copy(deep=True)
    if isTrain:
        target = train_df['target']
        del train_df['target']
    else:
        target = None

    es_train = ft.EntitySet(id='es_train')
    # NOTE: index='' is not a valid index; train is keyed by card_id, and the
    # two transaction tables get synthetic indexes (names here are arbitrary)
    es_train = es_train.entity_from_dataframe(entity_id='train',
                                              dataframe=train_df,
                                              index='card_id',
                                              time_index='first_active_month')
    es_train = es_train.entity_from_dataframe(entity_id='history',
                                              dataframe=hist_df,
                                              make_index=True,
                                              index='hist_id',
                                              time_index='purchase_date')
    es_train = es_train.entity_from_dataframe(entity_id='new_trans',
                                              dataframe=new_trans_df,
                                              make_index=True,
                                              index='new_trans_id',
                                              time_index='purchase_date')
    # Relationship between clients and previous loans
    r_client_previous = ft.Relationship(es_train['train']['card_id'],
                                        es_train['history']['card_id'])

    # Add the relationship to the entity set
    es_train = es_train.add_relationship(r_client_previous)
    r_client_previous = ft.Relationship(es_train['train']['card_id'],
                                        es_train['new_trans']['card_id'])

    # Add the relationship to the entity set
    es_train = es_train.add_relationship(r_client_previous)
    print(" dfs ing ... ")
    x_train, _ = ft.dfs(entityset=es_train, target_entity='train', max_depth=2)
    send_msg("dfs done! ")
    print("saving...")
    if target is not None:
        target.to_csv(y_save_path)
        x_train['index'] = target.index
        x_train = x_train.set_index('index')
    x_train.to_csv(x_save_path)

    return x_train, target
Code example #14
def test_serialization(es):
    relationship = ft.Relationship(es['sessions']['id'], es['log']['session_id'])

    dictionary = {
        'parent_entity_id': 'sessions',
        'parent_variable_id': 'id',
        'child_entity_id': 'log',
        'child_variable_id': 'session_id',
    }
    assert relationship.to_dictionary() == dictionary
    assert ft.Relationship.from_dictionary(dictionary, es) == relationship
Code example #15
def multitable_d3m_to_entityset(inpath):
    with open(os.path.join(inpath, 'data/dataSchema.json'), 'rb') as f:
        raw_json = json.load(f)

    es = ft.EntitySet(raw_json['dataSchema']['datasetId'])

    relationships = []
    entities = [
        key[8:] for key in raw_json['dataSchema'] if key.startswith("rawData/")
    ]

    d3m_var_to_ft_var = {
        'boolean': Boolean,
        'float': Numeric,
        'zeroToOneFloat': Numeric,
        'integer': Numeric,
        'text': Text,
        'categorical': Categorical,
        'ordinal': Ordinal,
        'dateTime': Datetime
    }

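    # One entity per raw-data table; varReference fields become Id variables
    # and relationships (Index, Id, and the mapped types come from
    # featuretools.variable_types)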
    for entity in entities:
        index = None
        csv_path = os.path.join(inpath, 'data/raw_data/%s.csv' % (entity))
        entityData = raw_json['dataSchema']['rawData/%s' %
                                            (entity)]['rawData/%s' % (entity)]
        var_types = {}
        for varData in entityData:
            if varData['varRole'] == 'index':
                index = varData['varName']
                var_types[varData['varName']] = Index
            elif 'varReference' in varData:
                var_types[varData['varName']] = Id
                parent_entity = varData['varReference']['references'][8:]
                parent_var_id = varData['varReference']['reference_id']
                relationships.append(
                    (parent_entity, parent_var_id, entity, varData['varName']))
            else:
                var_types[varData['varName']] = d3m_var_to_ft_var[
                    varData['varType']]

        es.entity_from_csv(entity,
                           csv_path,
                           index=index,
                           variable_types=var_types)

    for parent_entity, par_var_id, child_entity, child_var_id in relationships:
        relationship = ft.Relationship(es[parent_entity][par_var_id],
                                       es[child_entity][child_var_id])
        es.add_relationship(relationship)

    return es
Code example #16
File: utils.py Project: thbeh/msc-thesis-bux
def create_entity_set(entityset_name, entityset_quads, entity_relationships):

    es = ft.EntitySet(entityset_name)

    for es_quad in entityset_quads:
        es.entity_from_dataframe(entity_id=es_quad[0],
                                 dataframe=es_quad[1],
                                 index=es_quad[2],
                                 time_index=es_quad[3])

    # if cohorts entity is included
    if len(entityset_quads) > 2:
        for rel in entity_relationships:
            es.add_relationship(
                ft.Relationship(es[rel[0]][rel[2]], es[rel[1]][rel[2]]))
    # if cohorts entity is not included
    elif len(entityset_quads) == 2:
        er = entity_relationships
        es.add_relationship(ft.Relationship(es[er[0]][er[2]],
                                            es[er[1]][er[2]]))
    return es
Code example #17
def create_entity_set(dp: str,
                      sp: list,
                      esc: list,
                      rls: list,
                      od: Any,
                      mt: str,
                      oge: bool = False) -> Any:
    """
    创建实体集

    :param dp: 数据文件所在目录
    :param sp:  跳过文件列表
    :param esc: 定制实体列表
    :param rls: 定制关系列表
    :param od: 读取分块文件后的处理
    :param mt: 主表
    :param oge: 是否仅返回实体
    :return: 返回的实体集
    """

    if os.path.exists(os.path.join(dp, Data_Val.feature_matrix_part_file)):
        Log.debug('Skipping entityset creation for {}'.format(dp))
        return None

    data = DealDataFile.get_data_dict_by_path(dp, sp)
    es = ft.EntitySet(id='clients')
    data = od(data)
    # Custom entities
    for x in esc:
        if len(x) != 2:
            return None
        if x[1] in data[x[0]]:
            es = es.entity_from_dataframe(entity_id=x[0],
                                          dataframe=data[x[0]],
                                          index=x[1])
        else:
            es = es.entity_from_dataframe(entity_id=x[0],
                                          dataframe=data[x[0]],
                                          make_index=True,
                                          index=x[1])
    # Custom relationships
    r = []
    for x in rls:
        if len(x) != 4:
            return None
        r.append(ft.Relationship(es[x[0]][x[1]], es[x[2]][x[3]]))
    es = es.add_relationships(r)
    if oge:
        return es
    else:
        feature_matrix_from_entity_set(es, dp, mt)
        return None
Code example #18
def get_ft_features(askprice, bidprice, askvolume, bidvolume, others):
    es_train = ft.EntitySet(id='stock')

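    # make_index=True gives every frame a fresh integer stock_id index, so the
    # relationships below effectively join the tables row by row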
    es_train = es_train.entity_from_dataframe(entity_id='askprices', dataframe=askprice, index='stock_id', make_index=True)
    es_train = es_train.entity_from_dataframe(entity_id='bidprices', dataframe=bidprice, index='stock_id', make_index=True)
    es_train = es_train.entity_from_dataframe(entity_id='askvolumes', dataframe=askvolume, index='stock_id', make_index=True)
    es_train = es_train.entity_from_dataframe(entity_id='bidvolumes', dataframe=bidvolume, index='stock_id', make_index=True)
    es_train = es_train.entity_from_dataframe(entity_id='otherprices', dataframe=others, index='stock_id', make_index=True)

    r1 = ft.Relationship(es_train['askprices']['stock_id'], es_train['askvolumes']['stock_id'])
    r2 = ft.Relationship(es_train['bidprices']['stock_id'], es_train['bidvolumes']['stock_id'])
    r3 = ft.Relationship(es_train['askprices']['stock_id'], es_train['otherprices']['stock_id'])

    es_train = es_train.add_relationship(r1)
    es_train = es_train.add_relationship(r2)
    es_train = es_train.add_relationship(r3)
    print(es_train)

    features, feature_names = ft.dfs(entityset=es_train, target_entity='askprices')
    print(features)

    return np.array(features)
Code example #19
 def set_es(self):
     self.__es = ft.EntitySet(id="customers")
     self.__es = self.__es.entity_from_dataframe(
         entity_id="customers",
         index="customer_id",
         dataframe=self.__customers_df)
     self.__es = self.__es.entity_from_dataframe(
         entity_id="sessions",
         index="session_id",
         dataframe=self.__sessions_df,
         variable_types={"device": ft.variable_types.Categorical})
     self.__es = self.__es.add_relationship(
         ft.Relationship(self.__es["customers"]["customer_id"],
                         self.__es["sessions"]["customer_id"]))
Code example #20
File: AutoFeat.py Project: Jie-Yuan/tql-Python
 def add_relationship(self, parent_variable_name, child_variable_name):
     """
         relation = ft.Relationship(es['t1']['id'], es['t2']['id'])
         es.add_relationship(relation)
     :param parent_variable_name: (entity_id, id)
     :param child_variable_name: (entity_id, id)
     :return:
     """
     relation = ft.Relationship(
         parent_variable=self.es[parent_variable_name[0]][parent_variable_name[1]],
         child_variable=self.es[child_variable_name[0]][child_variable_name[1]]
     )
     self.es.add_relationship(relation)
Code example #21
 def _build_child_entity(self, entity_set, child_entitys_info):
     for entity_name, entity_name_info in child_entitys_info.items():
         parent_entitys = entity_name_info['parent_entity']
         for parent_entity in parent_entitys:
             parent_entity_name = parent_entity['entity_name']
             join_column = parent_entity.get('join_column', None)
             parent_entity_index = self.entity_set_info[parent_entity_name][
                 'index'][0]
             if join_column is None:
                 join_column = parent_entity_index
             entity_set.add_relationship(
                 ft.Relationship(
                     entity_set[parent_entity_name][parent_entity_index],
                     entity_set[entity_name][join_column]))
     return entity_set
Code example #22
def create_entity_set(data: pd.DataFrame, train_table: str,
                      test_table: str) -> ft.EntitySet:
    print(f"\nCreating entity set based on client data")
    start = time.monotonic()
    es = ft.EntitySet(id='clients')

    es = es.entity_from_dataframe(entity_id='combined_train_test',
                                  dataframe=data['combined_train_test'],
                                  index='SK_ID_CURR')

    es = es.entity_from_dataframe(entity_id='bureau',
                                  dataframe=data['bureau'],
                                  index='SK_ID_BUREAU')

    es = es.entity_from_dataframe(entity_id='bureau_balance',
                                  dataframe=data['bureau_balance'],
                                  make_index=True,
                                  index='bureaubalance_index')

    es = es.entity_from_dataframe(entity_id='previous_application',
                                  dataframe=data['previous_application'],
                                  index='SK_ID_PREV')

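    # Parent -> child links: combined_train_test -> bureau -> bureau_balance,
    # plus combined_train_test -> previous_application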
    es = es.add_relationships([
        ft.Relationship(es['combined_train_test']['SK_ID_CURR'],
                        es['bureau']['SK_ID_CURR']),
        ft.Relationship(es['bureau']['SK_ID_BUREAU'],
                        es['bureau_balance']['SK_ID_BUREAU']),
        ft.Relationship(es['combined_train_test']['SK_ID_CURR'],
                        es['previous_application']['SK_ID_CURR'])
    ])
    end = time.monotonic()

    print(f"  Entity set creation completed in {round(end - start)} seconds")

    return es
Code example #23
def main():
    formatter = '%(asctime)s %(message)s'
    logging.basicConfig(filename='../logs/02_featuretools.log', level=logging.INFO, format=formatter)

    datas = read_all()
    app_train = datas['application_train']
    app_test = datas['application_test']
    bureau = datas['bureau']
    bureau_balance = datas['bureau_balance']
    cash = datas['POS_CASH_balance']
    previous = datas['previous_application']
    installments = datas['installments_payments']
    credit = datas['credit_card_balance']

    app_test["TARGET"] = np.nan
    app = app_train.append(app_test, ignore_index=True, sort=False)

    # Entity set with id applications
    entity_set = ft.EntitySet(id='HomeCredit')

    # Entities with a unique index
    entity_set = entity_set.entity_from_dataframe(entity_id='app', dataframe=app, index='SK_ID_CURR')
    entity_set = entity_set.entity_from_dataframe(entity_id='bureau', dataframe=bureau, index='SK_ID_BUREAU')
    entity_set = entity_set.entity_from_dataframe(entity_id='previous', dataframe=previous, index='SK_ID_PREV')

    # Entities that do not have a unique index
    entity_set = entity_set.entity_from_dataframe(
        entity_id='bureau_balance', dataframe=bureau_balance, make_index=True, index='bureaubalance_index'
    )
    entity_set = entity_set.entity_from_dataframe(
        entity_id='cash', dataframe=cash, make_index=True, index='cash_index'
    )
    entity_set = entity_set.entity_from_dataframe(
        entity_id='installments', dataframe=installments, make_index=True, index='installments_index'
    )
    entity_set = entity_set.entity_from_dataframe(
        entity_id='credit', dataframe=credit, make_index=True, index='credit_index'
    )

    # Add in the defined relationships
    entity_set = entity_set.add_relationships([
        ft.Relationship(entity_set['app']['SK_ID_CURR'],      entity_set['bureau']['SK_ID_CURR']),
        ft.Relationship(entity_set['bureau']['SK_ID_BUREAU'], entity_set['bureau_balance']['SK_ID_BUREAU']),
        ft.Relationship(entity_set['app']['SK_ID_CURR'],      entity_set['previous']['SK_ID_CURR']),
        ft.Relationship(entity_set['previous']['SK_ID_PREV'], entity_set['cash']['SK_ID_PREV']),
        ft.Relationship(entity_set['previous']['SK_ID_PREV'], entity_set['installments']['SK_ID_PREV']),
        ft.Relationship(entity_set['previous']['SK_ID_PREV'], entity_set['credit']['SK_ID_PREV'])
    ])

    agg_primitives = ['sum', 'count', 'min', 'max', 'mean', 'mode']
    feature_matrix, _ = ft.dfs(
        entityset=entity_set, target_entity='app', agg_primitives=agg_primitives, max_depth=2, features_only=False, verbose=True
    )

    feature_matrix = feature_matrix.reset_index()
    dump(feature_matrix, '../data/02_featuretools/feature_matrix.joblib')
Code example #24
 def add_relation(self, relationships: list):
     '''
     Add entity relations for auto_create.

     Parameters
     --------
     relationships : parent-to-child entity relations as a flat list, e.g.
                     ['entity1.key1','entity2.key1','entity2.key2','entity3.key2']
     '''
     relationships = [item.split('.') for item in relationships]
     trans_relationships = [
         ft.Relationship(self.auto_create[parent[0]][parent[1]],
                         self.auto_create[child[0]][child[1]])
         for parent, child in zip(relationships[::2], relationships[1::2])
     ]
     self.auto_create = self.auto_create.add_relationships(
         trans_relationships)
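
A minimal sketch of how that flat list pairs up (the entity and key names here are hypothetical, not from the project):

relationships = ['users.user_id', 'orders.user_id', 'orders.order_id', 'items.order_id']
pairs = [item.split('.') for item in relationships]
# even positions are parents, odd positions are children:
for parent, child in zip(pairs[::2], pairs[1::2]):
    print(parent, '->', child)
# ['users', 'user_id'] -> ['orders', 'user_id']
# ['orders', 'order_id'] -> ['items', 'order_id']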
Code example #25
def dask_es(make_es):
    es = ft.EntitySet(id=make_es.id)
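    # Rebuild each entity from make_es on top of a 4-partition Dask dataframe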
    for entity in make_es.entities:
        es.entity_from_dataframe(
            entity.id,
            dd.from_pandas(entity.df.reset_index(drop=True), npartitions=4),
            index=entity.index,
            time_index=entity.time_index,
            variable_types=entity.variable_types,
            secondary_time_index=entity.secondary_time_index)

    for rel in make_es.relationships:
        es.add_relationship(
            ft.Relationship(es[rel.parent_entity.id][rel.parent_variable.id],
                            es[rel.child_entity.id][rel.child_variable.id]))
    return es
Code example #26
    def _feature_summary_data(self):
        raceuma_df = self.base_df[[
            "RACE_KEY", "UMABAN", "激走指数", "馬スタート指数", "馬出遅率", "IDM", "騎手指数",
            "テン指数", "ペース指数", "上がり指数", "位置指数", "テンF指数", "中間F指数", "終いF指数",
            "コーナー順位3_1", "コーナー順位4_1", "前3F先頭差_1", "後3F先頭差_1", "レース脚質_1",
            "テン指数結果_1", "上がり指数結果_1", "ペース指数結果_1", "レースP指数結果_1", "追込率_1",
            "コーナー順位3_2", "コーナー順位4_2", "前3F先頭差_2", "後3F先頭差_2", "レース脚質_2",
            "テン指数結果_2", "上がり指数結果_2", "ペース指数結果_2", "レースP指数結果_2", "追込率_2",
            "コーナー順位3_3", "コーナー順位4_3", "前3F先頭差_3", "後3F先頭差_3", "レース脚質_3",
            "テン指数結果_3", "上がり指数結果_3", "ペース指数結果_3", "レースP指数結果_3", "追込率_3",
            "コーナー順位3_4", "コーナー順位4_4", "前3F先頭差_4", "後3F先頭差_4", "レース脚質_4",
            "テン指数結果_4", "上がり指数結果_4", "ペース指数結果_4", "レースP指数結果_4", "追込率_4",
            "コーナー順位3_5", "コーナー順位4_5", "前3F先頭差_5", "後3F先頭差_5", "レース脚質_5",
            "テン指数結果_5", "上がり指数結果_5", "ペース指数結果_5", "レースP指数結果_5", "追込率_5"
        ]]
        raceuma_df.loc[:, "RACE_UMA_KEY"] = raceuma_df["RACE_KEY"].astype(
            str).str.cat(raceuma_df["UMABAN"].astype(str))
        raceuma_df.drop("UMABAN", axis=1, inplace=True)
        es = ft.EntitySet(id="race")

        es.entity_from_dataframe(
            entity_id='race',
            dataframe=self.ld.race_df[["RACE_KEY", "target_date"]],
            index="RACE_KEY")
        es.entity_from_dataframe(entity_id='raceuma',
                                 dataframe=raceuma_df,
                                 index="RACE_UMA_KEY")
        relationship = ft.Relationship(es['race']["RACE_KEY"],
                                       es['raceuma']["RACE_KEY"])
        es = es.add_relationship(relationship)
        print(es)
        # Aggregation primitives
        aggregation_list = ['mean', 'skew']
        transform_list = []
        # run dfs
        print("run dfs")
        feature_matrix, features_dfs = ft.dfs(entityset=es,
                                              target_entity='race',
                                              agg_primitives=aggregation_list,
                                              trans_primitives=transform_list,
                                              max_depth=2)
        # dfs returns RACE_KEY as the index, so restore it as a column before merging
        feature_summary_df = pd.merge(feature_matrix.reset_index(),
                                      self.ld.race_df,
                                      on=["RACE_KEY", "target_date"])
        print("_create_feature: feature_summary_df", feature_summary_df.shape)
        return feature_summary_df
Code example #27
File: entityset_loader.py Project: MLBazaar/Cardea
    def create_relationships(self, relationships, entity_set):
        """Binds entities in the entityset.

        Args:
            relationships: A dataframe of the relationships in fhir.
            entity_set: The global entityset that the entity will be added to.
        """

        for i, relation in relationships.iterrows():
            # parent table: 0, field: 1
            # child table: 2, field: 3

            new_relationship = ft.Relationship(
                entity_set[relation['parent_entity']][relation['parent_variable']],
                entity_set[relation['child_entity']][relation['child_variable']])

            entity_set.add_relationship(new_relationship)
Code example #28
File: jra_race_haito.py Project: ikem55/HRsystem
    def _create_feature(self):
        """ マージしたデータから特徴量を生成する """
        print("_create_feature")
        raceuma_df = self.base_df[["RACE_KEY", "UMABAN", "脚質", "距離適性", "父馬産駒連対平均距離", "母父馬産駒連対平均距離", "IDM", "テン指数",
                                   "ペース指数", "上がり指数", "位置指数", "IDM結果_1", "テン指数結果_1", "上がり指数結果_1", "ペース指数結果_1", "レースP指数結果_1",
                                   "先行率_1", "追込率_1", "fa_1_1", "fa_2_1", "fa_3_1", "fa_4_1", "fa_5_1"]]
        raceuma_df.loc[:, "RACE_UMA_KEY"] = raceuma_df["RACE_KEY"] + raceuma_df["UMABAN"]
        raceuma_df.drop("UMABAN", axis=1, inplace=True)
        # https://qiita.com/daigomiyoshi/items/d6799cc70b2c1d901fb5
        es = ft.EntitySet(id="race")
        es.entity_from_dataframe(entity_id='race', dataframe=self.ld.race_df.drop("NENGAPPI", axis=1), index="RACE_KEY")
        es.entity_from_dataframe(entity_id='raceuma', dataframe=raceuma_df, index="RACE_UMA_KEY")
        relationship = ft.Relationship(es['race']["RACE_KEY"], es['raceuma']["RACE_KEY"])
        es = es.add_relationship(relationship)
        print(es)
        # Aggregation primitives
        aggregation_list = ['min', 'max', 'mean', 'skew', 'percent_true']
        transform_list = []
        # run dfs
        print("run dfs")
        feature_matrix, features_dfs = ft.dfs(entityset=es, target_entity='race', agg_primitives=aggregation_list,
                                              trans_primitives=transform_list, max_depth=2)
        print("_create_feature: feature_matrix", feature_matrix.shape)

        # Get the predicted first-favorite horse's data
        ninki_df = self.base_df.query("基準人気順位==1")[["RACE_KEY", "脚質", "距離適性", "上昇度", "激走指数", "蹄コード", "見習い区分", "枠番", "総合印", "IDM印", "情報印", "騎手印",
                                                  "厩舎印", "調教印", "激走印", "展開記号", "輸送区分", "騎手期待単勝率", "騎手期待3着内率", "激走タイプ", "休養理由分類コード", "芝ダ障害フラグ",
                                                    "距離フラグ", "クラスフラグ", "転厩フラグ", "去勢フラグ", "乗替フラグ", "放牧先ランク", "厩舎ランク", "調教量評価", "仕上指数変化", "調教評価",
                                                    "IDM", "騎手指数", "情報指数", "総合指数", "人気指数", "調教指数", "厩舎指数", "テン指数", "ペース指数", "上がり指数", "位置指数", "追切指数", "仕上指数",
                                                    "IDM結果_1", "IDM結果_2"]].add_prefix("人気_").rename(columns={"人気_RACE_KEY":"RACE_KEY"})
        # Get the predicted front-runner's data
        nige_df = self.base_df.query("展開記号=='1'")[["RACE_KEY", "脚質", "距離適性", "上昇度", "激走指数", "蹄コード", "見習い区分", "枠番", "総合印", "IDM印", "基準人気順位", "輸送区分", "激走タイプ", "休養理由分類コード", "芝ダ障害フラグ",
                                                    "距離フラグ", "クラスフラグ", "転厩フラグ", "去勢フラグ", "乗替フラグ", "IDM", "騎手指数", "テン指数", "ペース指数", "上がり指数", "位置指数", "追切指数", "仕上指数",
                                                    "斤量_1", "テン指数結果_1", "上がり指数結果_1", "ペース指数結果_1", "レースP指数結果_1", "斤量_2", "テン指数結果_2", "上がり指数結果_2", "ペース指数結果_2", "レースP指数結果_2",
                                                    "先行率_1", "先行率_2"]].add_prefix("逃げ_").rename(columns={"逃げ_RACE_KEY":"RACE_KEY"})
        # Get the data for the horse predicted to close fastest
        agari_df = self.base_df.query("展開記号=='2'")[["RACE_KEY", "脚質", "距離適性", "上昇度", "激走指数", "蹄コード", "見習い区分", "枠番", "総合印", "IDM印", "基準人気順位", "輸送区分", "激走タイプ", "休養理由分類コード", "芝ダ障害フラグ",
                                                    "距離フラグ", "クラスフラグ", "転厩フラグ", "去勢フラグ", "乗替フラグ", "IDM", "騎手指数", "テン指数", "ペース指数", "上がり指数", "位置指数", "追切指数", "仕上指数",
                                                    "斤量_1", "テン指数結果_1", "上がり指数結果_1", "ペース指数結果_1", "レースP指数結果_1", "斤量_2", "テン指数結果_2", "上がり指数結果_2", "ペース指数結果_2", "レースP指数結果_2",
                                                    "先行率_1", "先行率_2"]].add_prefix("上り_").rename(columns={"上り_RACE_KEY":"RACE_KEY"})

        # dfs returns RACE_KEY as the index, so restore it as a column before merging
        self.base_df = pd.merge(feature_matrix.reset_index(), nige_df, on="RACE_KEY", how="left")
        self.base_df = pd.merge(self.base_df, agari_df, on="RACE_KEY", how="left")
        self.base_df = pd.merge(self.base_df, ninki_df, on="RACE_KEY")
        self.base_df = pd.merge(self.base_df, self.ld.race_df[["RACE_KEY", "NENGAPPI"]], on="RACE_KEY")
Code example #29
def ks_es(make_es):
    ks = pytest.importorskip('databricks.koalas',
                             reason="Koalas not installed, skipping")
    es = ft.EntitySet(id=make_es.id)
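    # Mirror make_es with Koalas-backed dataframes (requires databricks.koalas)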
    for entity in make_es.entities:
        cleaned_df = pd_to_ks_clean(entity.df).reset_index(drop=True)
        es.entity_from_dataframe(
            entity.id,
            ks.from_pandas(cleaned_df),
            index=entity.index,
            time_index=entity.time_index,
            variable_types=entity.variable_types,
            secondary_time_index=entity.secondary_time_index)

    for rel in make_es.relationships:
        es.add_relationship(
            ft.Relationship(es[rel.parent_entity.id][rel.parent_variable.id],
                            es[rel.child_entity.id][rel.child_variable.id]))
    return es
Code example #30
 def _to_entityset(self, dataset):
     es = ft.EntitySet()
     for table_name, df in dataset.tables.items():
         if len(df.columns) == 1:
             continue  # skipping single column tables
         table = dataset.metadata.get_table(table_name)
         primary_key = table[
             "primary_key"] if "primary_key" in table else None
         if isinstance(primary_key, str):
             es = es.entity_from_dataframe(entity_id=table_name,
                                           dataframe=df.copy(),
                                           index=primary_key)
         else:
             es = es.entity_from_dataframe(entity_id=table_name,
                                           dataframe=df.copy(),
                                           make_index=True,
                                           index="_ft_id")
             if not primary_key:
                 logger.warning("Table %s has no primary key.", table_name)
             else:
                 logger.warning(
                     "Table %s has a composite primary key, it will be ignored.",
                     table_name)
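     # Turn the dataset's foreign keys into relationships, skipping missing
     # tables and composite keys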
     for foreign_key in dataset.metadata.get_foreign_keys():
         if foreign_key["table"] not in es.entity_dict:
             continue
         if foreign_key["ref_table"] not in es.entity_dict:
             continue
         if not isinstance(foreign_key["ref_field"], str):
             logger.warning(
                 "Tables %s and %s have a composite foreign key, it will be ignored.",
                 foreign_key["ref_table"], foreign_key["table"])
             continue
         try:
             es = es.add_relationship(
                 ft.Relationship(
                     es[foreign_key["ref_table"]][foreign_key["ref_field"]],
                     es[foreign_key["table"]][foreign_key["field"]]))
         except ValueError as err:
             logger.warning(err)
     return es