Example #1
def valid_dfs(es, aggregations, transforms, feature_substrings,
              target_entity='log', multi_output=False, max_depth=3,
              max_features=-1, instance_ids=[0, 1, 2, 3]):
    if not isinstance(feature_substrings, list):
        feature_substrings = [feature_substrings]

    features = dfs(entityset=es, target_entity=target_entity,
                   agg_primitives=aggregations,
                   trans_primitives=transforms,
                   max_features=max_features,
                   max_depth=max_depth, features_only=True)

    applicable_features = []
    for feat in features:
        for x in feature_substrings:
            if x in feat.get_name():
                applicable_features.append(feat)
    if len(applicable_features) == 0:
        raise ValueError('No feature names with %s; verify the name attribute '
                         'is defined and/or generate_name() is defined to '
                         'return %s' % (feature_substrings, feature_substrings))
    df = ft.calculate_feature_matrix(entityset=es,
                                     features=applicable_features,
                                     instance_ids=instance_ids)

    ft.encode_features(df, applicable_features)

    # TODO: check the multi_output shape by checking
    # feature.number_output_features for each feature
    # and comparing it with the matrix shape
    if not multi_output:
        assert len(applicable_features) == df.shape[1]
    return
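A hypothetical invocation of the helper above, assuming an entityset es with a 'log' entity; the primitive names and the 'MEAN(' substring are made up for illustration:

# Sketch: check that dfs produces features whose names contain 'MEAN('
valid_dfs(es, ['mean', 'sum'], ['hour'], 'MEAN(', target_entity='log')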
Example #2
def feature_encoder(feature_matrix,
                    features,
                    top_n=10,
                    include_unknown=True,
                    to_encode=None,
                    inplace=False,
                    drop_first=False,
                    verbose=False):
    """
    :param feature_matrix: features in DataFrame format
    :param features: feature definitions of the feature matrix
    :param top_n: number of top values to include; an int, or a dict whose
                  keys are feature names and whose values are the top-n
                  count for each feature, defaults to 10
    :param include_unknown: add a feature encoding the unknown class,
                            defaults to True
    :param to_encode: list of names of features to encode, e.g. ["name1", "name2"]
    :param inplace: whether to encode the feature matrix in place,
                    defaults to False
    :param drop_first: whether to get k-1 dummies out of k categorical levels
                       by removing the first level, defaults to False
    :param verbose: print progress information
    :return: the encoded feature matrix and the encoded feature definitions
    """
    fm_encoded, f_encoded = ft.encode_features(
        feature_matrix=feature_matrix,
        features=features,
        top_n=top_n,
        include_unknown=include_unknown,
        to_encode=to_encode,
        inplace=inplace,
        drop_first=drop_first,
        verbose=verbose)
    return fm_encoded, f_encoded
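A hypothetical call of this wrapper, assuming fm and features come from a prior ft.dfs run; the dict form of top_n maps feature names to per-feature top-n counts as described in the docstring:

# Sketch: encode only two named features with different top-n cutoffs
# ('product_id' and 'country' are assumed feature names)
fm_enc, f_enc = feature_encoder(fm, features,
                                top_n={'product_id': 5, 'country': 3},
                                to_encode=['product_id', 'country'],
                                include_unknown=True)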
Example #3
    def generate_feature_matrix(self, es, target, cutoff, verbose=True):
        """Calculates a feature matrix and features given in Featurization object.
            Args:
            es: A featuretools entityset that holds injested data.
            target: A string of the target entity name.
            cutoff: A pandas dataframe that indicates cutoff_time for each instance.
            verbose: A boolean indicator of verbose option.
            Returns:
            A pandas dataframe of the calculated matrix.
            """

        feature_matrix, features_defs = ft.dfs(
            entityset=es,
            target_entity=target,
            agg_primitives=self.agg_prim(),
            trans_primitives=self.trans_prim(),
            cutoff_time=cutoff,
            n_jobs=self.n_jobs(),
            max_depth=self.max_depth(),
            verbose=verbose)

        # encode categorical values
        fm_encoded, features_encoded = ft.encode_features(
            feature_matrix, features_defs)

        return fm_encoded, features_encoded
Example #4
def load_train_data():
    print('Loading CSV data...')
    applications_df = pd.read_csv(C_PATH + 'application_train.csv')
    previous_df = pd.read_csv(C_PATH + 'previous_application.csv')
    # bureau_df = pd.read_csv(C_PATH + 'bureau.csv')

    print("Creating entityset...")
    es = ft.EntitySet(id="home-credit")

    print("Loading applications entity...")
    es = es.entity_from_dataframe(entity_id="applications",
                                  dataframe=applications_df,
                                  index="SK_ID_CURR")
    print("Loading previous entity...")
    es = es.entity_from_dataframe(entity_id="previous",
                                  dataframe=previous_df,
                                  index="SK_ID_PREV")
    # print("Loading bureau data...")
    # es = es.entity_from_dataframe(entity_id="bureau", dataframe=bureau_df, index="SK_ID_BUREAU")

    print("Adding relationships...")
    applications_previous = ft.Relationship(es["applications"]["SK_ID_CURR"],
                                            es["previous"]["SK_ID_CURR"])
    es = es.add_relationship(applications_previous)
    # applications_bureau = ft.Relationship(es["applications"]["SK_ID_CURR"], es["bureau"]["SK_ID_CURR"])
    # es = es.add_relationship(applications_bureau)

    # return es

    print("Generating DFS...")
    feature_matrix, feature_defs = ft.dfs(entityset=es,
                                          target_entity="applications",
                                          verbose=True)
    fm_encoded, defs_encoded = ft.encode_features(feature_matrix, feature_defs)
    return fm_encoded, defs_encoded
Example #5
    def generate_feature_matrix(self, es, target, cutoff, verbose=True):
        """Calculates a feature matrix and features given in Featurization object.

          Args:
            es (featuretools.EntitySet):
              An already initialized entityset.
            target (str):
              A string of the target entity name.
            cutoff (pandas.DataFrame):
              Specified times at which to calculate the features for each instance.
            verbose (bool):
              An indicator of verbose option.

          Returns:
            pandas.DataFrame, list:
              * The generated feature matrix.
              * List of feature definitions in the feature matrix.
        """

        feature_matrix, features_defs = ft.dfs(
            entityset=es,
            target_entity=target,
            agg_primitives=self.agg_prim(),
            trans_primitives=self.trans_prim(),
            cutoff_time=cutoff,
            n_jobs=self.n_jobs(),
            max_depth=self.max_depth(),
            verbose=verbose)

        # encode categorical values
        fm_encoded, features_encoded = ft.encode_features(
            feature_matrix, features_defs)

        return fm_encoded, features_encoded
Example #6
def test_retail_binary(ftens_file='retail_binary_files/ftens.csv',
                       labels_file='retail_binary_files/labels.csv',
                       fl_file='retail_binary_files/fl.p'):
    ftens, labels, fl = construct_retail_example(ftens_file, labels_file,
                                                 fl_file)
    baseline_ftens = (ftens.reset_index(
        'customer_id',
        drop=False).drop_duplicates('customer_id',
                                    keep='last').set_index('customer_id'))
    baseline_ftens, baseline_fl = ft.encode_features(baseline_ftens, fl)
    baseline_ftens, baseline_fl = remove_low_information_features(
        baseline_ftens, baseline_fl)
    train_customers, test_customers = train_test_split(
        baseline_ftens.index.values, shuffle=True, test_size=0.1)
    train_labels = labels.loc[train_customers]
    test_labels = labels.loc[test_customers]
    train_ftens = ftens.loc[(train_customers, slice(None)), :]
    test_ftens = ftens.loc[(test_customers, slice(None)), :]
    baseline_train_fm = baseline_ftens.loc[train_customers, :]
    baseline_test_fm = baseline_ftens.loc[test_customers, :]

    dl_model = DLDB(regression=False,
                    classes=[False, True],
                    recurrent_layer_sizes=(32, ),
                    dense_layer_sizes=(32, 32),
                    categorical_max_vocab=10)
    dl_model.fit(train_ftens, train_labels, fl=fl, epochs=1, batch_size=32)
    predictions = dl_model.predict(test_ftens)
    score = roc_auc_score(test_labels, predictions)

    baseline_scores = score_baseline_pipeline(baseline_train_fm, train_labels,
                                              baseline_test_fm, test_labels)
    return score, baseline_scores
Example #7
File: T.py Project: lokcyi/AI
def autoFeatureEngineering(es, target_entityName):
    from featuretools.selection import remove_low_information_features

    fm, features = ft.dfs(
        entityset=es,
        target_entity=target_entityName,
        #agg_primitives=['Sum', 'Mean', 'Percent_True'],
        trans_primitives=['divide_numeric',
                          'multiply_numeric'],  #'add_numeric',
        # trans_primitives=['Hour'],
        max_depth=1,
        # approximate='2m',
        #cutoff_time=cutoff_times[1000:],
        ignore_variables={'toolgkpi': ['MFG_DATE', targetColumn]},  # targetColumn: assumed defined at module level
        verbose=True)

    # ------------------------- one-hot encode the feature matrix -------------------------
    fm_enc, f_enc = ft.encode_features(fm, features)
    #print("one-hot encoded feature matrix: Number of features %s" % len(fm_enc))
    # ------------------------- replace NaN with 0 -------------------------
    fm_enc = fm_enc.fillna(0)
    #print("fillna: Number of features %s" % len(fm_enc))
    # ------------------------- remove low-information features -------------------------
    fm_enc = remove_low_information_features(fm_enc)
    # print("removed low-information features: Number of features %s" % len(fm_enc))
    # -------------------------

    # feature = feature_names[14]
    # ft.graph_feature(feature)
    # ft.describe_feature(feature)
    fm_enc = fm_enc.replace([np.inf, -np.inf], np.nan)  # replace +/-np.inf with np.nan
    print(fm_enc.isnull().sum())
    # print(fm_enc,f_enc)
    print(fm_enc.columns)
    return fm_enc
Example #8
    def _fit_and_return_result(self,
                               *,
                               timeout: float = None,
                               iterations: int = None):

        if self._entityset is None:
            raise ValueError(
                'Must call .set_training_data() before calling .fit()')

        ignore_variables = {self._target_entity: [self._target]}
        time_index = self._entityset[self._target_entity].time_index
        index = self._entityset[self._target_entity].index
        cutoff_time = None
        if time_index:
            target_df = self._entityset[self._target_entity].df
            cutoff_time = target_df[[index, time_index]]
            ignore_variables = None

        features_only = (not self.hyperparams['encode']
                         and not self.hyperparams['remove_low_information'])

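        # Hyperparameters like 'aggregation_sum' or 'transform_hour' toggle
        # primitives; slicing off the 12- and 10-character prefixes below
        # recovers the plain primitive names.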
        agg_primitives = [
            name[12:] for name, value in self.hyperparams.items()
            if name.startswith('aggregation_') and value
        ]
        trans_primitives = [
            name[10:] for name, value in self.hyperparams.items()
            if name.startswith('transform_') and value
        ]

        res = ft.dfs(entityset=self._entityset,
                     target_entity=self._target_entity,
                     cutoff_time=cutoff_time,
                     cutoff_time_in_index=False,
                     features_only=features_only,
                     ignore_variables=ignore_variables,
                     max_depth=self.hyperparams['max_depth'],
                     agg_primitives=agg_primitives,
                     trans_primitives=trans_primitives)

        if not features_only:
            # unpack so `fm` is defined even when encoding is disabled
            fm, self._features = res

            if self.hyperparams['encode']:
                fm, self._features = ft.encode_features(
                    fm,
                    self._features,
                    top_n=self.hyperparams['top_n'],
                    include_unknown=self.hyperparams['include_unknown'])

            if self.hyperparams['remove_low_information']:
                fm, self._features = remove_low_information_features(
                    fm, self._features)

            self._fitted = True

            return fm

        else:
            self._fitted = True
            self._features = res
Example #9
    def dfs(self,
            X=None,
            target_entity=None,
            entityset=None,
            entities=None,
            relationships=None):
        if not entities and not entityset:
            target_entity = 'X'
        else:
            target_entity = target_entity or self.target_entity

        if entityset is None:
            entityset = self._get_entityset(X, target_entity, entities,
                                            relationships)

        if self.training_window is not None:
            entityset.add_last_time_indexes()

        cutoff_time = None
        if self.time_index:
            cutoff_time = X[[self.index, self.time_index]]

        self.features = ft.dfs(
            cutoff_time=cutoff_time,
            max_depth=self.max_depth,
            entityset=entityset,
            target_entity=target_entity,
            features_only=True,
            agg_primitives=self.agg_primitives,
            trans_primitives=self.trans_primitives,
            max_features=self.max_features,
            training_window=self.training_window,
            n_jobs=self.n_jobs,
            verbose=self.verbose,
        )

        if self.encode or self.remove_low_information:
            X = ft.calculate_feature_matrix(
                self.features,
                entityset=entityset,
                cutoff_time=cutoff_time,
                training_window=self.training_window,
                n_jobs=self.n_jobs,
                verbose=self.verbose,
            )

            if self.encode:
                X, self.features = ft.encode_features(X, self.features)

            if self.remove_low_information:
                X, self.features = remove_low_information_features(
                    X, self.features)
Example #10
def compute_features(features, cutoff_time):
    # shuffle so encoded features don't all end up at the front or back

    np.random.shuffle(features)
    feature_matrix = ft.calculate_feature_matrix(features,
                                                 cutoff_time=cutoff_time,
                                                 approximate='36d',
                                                 verbose=True)
    print "Finishing computing..."
    feature_matrix, features = ft.encode_features(feature_matrix, features,
                                                  to_encode=["pickup_neighborhood", "dropoff_neighborhood"],
                                                  include_unknown=False)
    return feature_matrix
Example #11
def create_features(es, label='Outcome', custom_agg=[]):
    cutoff_times = es['transactions'].df[['Transaction Id', 'End Time', label]]
    fm, features = ft.dfs(entityset=es,
                          target_entity='transactions',
                          agg_primitives=[Sum, Mean] + custom_agg,
                          trans_primitives=[Hour],
                          max_depth=3,
                          approximate='2m',
                          cutoff_time=cutoff_times,
                          verbose=True)
    fm_enc, _ = ft.encode_features(fm, features)
    fm_enc = fm_enc.fillna(0)
    fm_enc = remove_low_information_features(fm_enc)
    labels = fm.pop(label)
    return (fm_enc, labels)
Example #12
def calculate_feature_matrix(es, target_entity, trans_primitives,
                             agg_primitives, max_depth):

    feature_matrix, features = ft.dfs(entityset=es,
                                      target_entity=target_entity,
                                      trans_primitives=trans_primitives,
                                      agg_primitives=agg_primitives,
                                      max_depth=max_depth,
                                      verbose=True)

    print("{} features generated".format(len(features)))

    fm_encoded, features_encoded = ft.encode_features(feature_matrix, features)
    fm_encoded = fm_encoded.fillna(0)

    return fm_encoded, features_encoded
Example #13
    def generate_features(self):
        """ 06. Run deep feature synthesis. """

        # Create new features using specified primitives
        self.feature_matrix, self.feature_defs = ft.dfs(
            entityset=self.es,
            target_entity='users',
            trans_primitives=self.trans_primitives,
            agg_primitives=self.agg_primitives,
            verbose=1,
            max_depth=self.max_feature_depth)

        # encode categorical features
        self.feature_matrix_enc, self.features_enc = ft.encode_features(
            self.feature_matrix, self.feature_defs)
        self.next(self.split_training_data)
Example #14
def get_train_data(project,
                   train_file,
                   prediction_key,
                   prediction_target,
                   variable_types={},
                   drop_columns=None):

    # Read the training data
    print("==========Reading the training file {}".format(train_file))
    train_data = pd.read_csv(train_file)
    train_data.head(5)

    print("==========Preparing training labels for target {}".format(
        prediction_target))
    train_labels = train_data[prediction_target].values
    train_data = train_data.drop(prediction_target, axis=1)

    if drop_columns is not None:
        print("==========dropping columns {}".format(drop_columns))
        train_data = train_data.drop(drop_columns, axis=1)

    print("==========Generating the feature with featuretools")

    es = ft.EntitySet(project)

    entities = get_ft_entities(es=es,
                               project=project,
                               prediction_key=prediction_key,
                               data=train_data,
                               variable_types=variable_types)

    print("==========entities are:")
    print(entities)

    feature_matrix, feature_defs = ft.dfs(entityset=entities,
                                          target_entity=project)

    feature_matrix_enc, features_enc = ft.encode_features(
        feature_matrix, feature_defs)
    print("==========columns are:")
    print(feature_matrix_enc.columns)

    print("==========saving features to {}".format(project))
    ft.save_features(feature_defs, "data/{}/ft_features".format(project))

    return feature_matrix_enc, train_labels
Example #15
def build_card_one_hot():
    """ Reads in the raw data from train.csv and creates
        one-hot encodings for the feature and date fields.

        :return: Data frame with one-hot encoding
    """

    logger = logging.getLogger(__name__)
    logger.info("Reading in data.")
    df = pd.read_csv('data/raw/train.csv')
    df['first_active_month'] = pd.to_datetime(df['first_active_month'] + "-01")

    logger.info("Creating entity set")
    es_train = ft.EntitySet()
    es_train = es_train.entity_from_dataframe(entity_id='transactions',
                                              dataframe=df,
                                              index='card_id',
                                              time_index="first_active_month",
                                              variable_types=CARD_TYPES)

    feature_matrix, feature_defs = ft.dfs(entityset=es_train,
                                          target_entity="transactions")

    logger.info("Creating one-hot training data")
    train_feature_matrix_enc, features_enc = ft.encode_features(
        feature_matrix, feature_defs)

    ft.save_features(features_enc, "feature_definitions")
    saved_features = ft.load_features('feature_definitions')

    logger.info("Creating one-hot test data")
    df = pd.read_csv('data/raw/test.csv')
    df['first_active_month'] = pd.to_datetime(df['first_active_month'] + "-01")
    df['target'] = 0
    es_test = ft.EntitySet()
    es_test = es_test.entity_from_dataframe(entity_id='transactions',
                                            dataframe=df,
                                            index='card_id',
                                            time_index="first_active_month",
                                            variable_types=CARD_TYPES)

    test_feature_matrix_enc = ft.calculate_feature_matrix(
        saved_features, es_test)
    test_feature_matrix_enc.drop(columns='target', inplace=True)

    return train_feature_matrix_enc, test_feature_matrix_enc
Example #16
def get_final_data(or_df: pd.DataFrame, features_def, **kwds):
    '''
    Check the data types; only numeric/categorical/boolean are supported.
    Returns numeric data:
    1. drop unsupported columns
    2. encode categorical/boolean columns
    '''
    # drop non-numeric / non-categorical columns
    unnum = ['bool', 'category']
    numeric_and_boolean_dtypes = vtypes.PandasTypes._pandas_numerics + unnum
    clean_df = or_df.select_dtypes(include=numeric_and_boolean_dtypes)
    unuse_col = set(or_df.columns) - set(clean_df.columns)
    features_def = [
        item for item in features_def if item.get_name() not in unuse_col
    ]
    warnings.warn(f'{unuse_col} columns will be dropped because of their dtype')
    # categorical/boolean columns are one-hot encoded to numbers
    clean_df, features_def = ft.encode_features(clean_df, features_def,
                                                **kwds)
    return clean_df, features_def
Example #17
    def dfs(self, X=None, target_entity=None, entityset=None, entities=None, relationships=None):
        if not entities and not entityset:
            target_entity = 'X'
        else:
            target_entity = target_entity or self.target_entity

        if entityset is None:
            entityset = self._get_entityset(X, target_entity, entities, relationships)

        instance_ids = None
        cutoff_time = None
        if self.time_index:
            cutoff_time = X[[self.index, self.time_index]]
        elif self.index:
            instance_ids = X[self.index]
        else:
            instance_ids = X.index.values

        self.features = ft.dfs(
            cutoff_time=cutoff_time,
            instance_ids=instance_ids,
            max_depth=self.max_depth,
            entityset=entityset,
            target_entity=target_entity,
            features_only=True,
            agg_primitives=self.agg_primitives,
            trans_primitives=self.trans_primitives
        )

        X = ft.calculate_feature_matrix(
            self.features,
            entityset=entityset,
            cutoff_time=cutoff_time,
            instance_ids=instance_ids,
        )

        if self.encode:
            X, self.features = ft.encode_features(X, self.features)

        if self.remove_low_information:
            X, self.features = remove_low_information_features(X, self.features)
Example #18
    def produce(self,
                X,
                instance_ids=None,
                include_unknown=True,
                remove_low_information=True,
                **kwargs):

        if instance_ids is not None:
            feature_matrix = ft.calculate_feature_matrix(
                self.features, instance_ids=instance_ids, **kwargs)

            feature_matrix = (feature_matrix.reset_index('time').loc[
                instance_ids, :].set_index('time', append=True))

        else:
            feature_matrix = ft.calculate_feature_matrix(self.features,
                                                         cutoff_time=X,
                                                         **kwargs)

        for f in self.features:
            if issubclass(f.variable_type, vtypes.Discrete):
                feature_matrix[f.get_name()] = feature_matrix[
                    f.get_name()].astype(object)
            elif issubclass(f.variable_type, vtypes.Numeric):
                feature_matrix[f.get_name()] = pd.to_numeric(
                    feature_matrix[f.get_name()])
            elif issubclass(f.variable_type, vtypes.Datetime):
                feature_matrix[f.get_name()] = pd.to_datetime(
                    feature_matrix[f.get_name()])

        encoded_fm, encoded_fl = ft.encode_features(feature_matrix,
                                                    self.features)

        if remove_low_information:
            encoded_fm, encoded_fl = remove_low_information_features(
                encoded_fm, encoded_fl)

        encoded_fm.reset_index('time', drop=True, inplace=True)

        return encoded_fm.fillna(0)
Example #19
def get_test_data(project,
                  testfile,
                  prediction_key,
                  prediction_target,
                  variable_types={},
                  drop_columns=None):

    print("==========Reading test data file {}".format(testfile))
    test_data = pd.read_csv(testfile)
    print(test_data.describe())

    if drop_columns is not None:
        print("==========dropping columns {}".format(drop_columns))
        test_data = test_data.drop(drop_columns, axis=1)

    es = ft.EntitySet(project)

    entities = get_ft_entities(es=es,
                               project=project,
                               prediction_key=prediction_key,
                               data=test_data,
                               variable_types=variable_types)

    print("==========entities are:")
    print(entities)

    print("==========Reading features from {}".format(project))
    saved_features = ft.load_features("data/{}/ft_features".format(project))

    print("==========saved_features are:")
    print(saved_features)

    feature_matrix = ft.calculate_feature_matrix(saved_features, entities)

    feature_matrix_enc, _ = ft.encode_features(feature_matrix, saved_features)

    index_column = test_data[prediction_key]

    return feature_matrix_enc, index_column
Example #20
    def compute_features(self, df, cutoff_strategy, feature_window):
        assert cutoff_strategy.entity_col == self.entity_col

        cutoffs = cutoff_strategy.generate_cutoffs(df)

        cutoffs_ft = []

        for _id, row in cutoffs.iterrows():
            cutoffs_ft.append((row[self.entity_col], row['cutoff_st'] - timedelta(days=1)))

        cutoffs_ft = pd.DataFrame(cutoffs_ft, columns=['instance_id', 'time'])

        feature_matrix, features = ft.dfs(target_entity=self.entity_col,
                                          cutoff_time=cutoffs_ft,
                                          training_window="%dday" % feature_window,  # same as above
                                          entityset=self.es,
                                          cutoff_time_in_index=True,
                                          verbose=True)
        # encode categorical values
        fm_encoded, features_encoded = ft.encode_features(feature_matrix,
                                                          features)

        self.features = fm_encoded.fillna(0)
Example #21
    def dfs(self,
            X=None,
            target_entity='X',
            entityset=None,
            entities=None,
            relationships=None):
        if entityset is None:
            entityset = self._get_entityset(X, target_entity, entities,
                                            relationships)

        target = entityset[target_entity]
        time_index = target.time_index
        index = target.index

        cutoff_time = None
        if time_index:
            cutoff_time = target.df[[index, time_index]]

        instance_ids = X[index].values.copy()

        self.features = ft.dfs(cutoff_time=cutoff_time,
                               max_depth=self.max_depth,
                               entityset=entityset,
                               target_entity=target_entity,
                               features_only=True,
                               instance_ids=instance_ids)

        X = ft.calculate_feature_matrix(self.features,
                                        entityset=entityset,
                                        instance_ids=instance_ids)

        if self.encode:
            X, self.features = ft.encode_features(X, self.features)

        if self.remove_low_information:
            X, self.features = remove_low_information_features(
                X, self.features)
        "GarageFinish": ft.variable_types.Categorical,
        "GarageQual": ft.variable_types.Categorical,
        "GarageCond": ft.variable_types.Categorical,
        "PavedDrive": ft.variable_types.Categorical,
        "PoolQC": ft.variable_types.Categorical,
        "Fence": ft.variable_types.Categorical,
        "MiscFeature": ft.variable_types.Categorical,
        "MoSold": ft.variable_types.Categorical,
        "YrSold": ft.variable_types.Categorical,
        "SaleType": ft.variable_types.Categorical,
        "SaleCondition": ft.variable_types.Categorical
    })

# The training set
feature_matrix, feature_defs = ft.dfs(entityset=es, target_entity="HousingSet")
fm_encoded, features_encoded = ft.encode_features(feature_matrix, feature_defs)
#Need to normalize the housing prices
house_prices = np.log1p(house_prices)

# Let's see labels for each
X, y = fm_encoded, house_prices

###Now testing set
feature_matrix_test, feature_defs_test = ft.dfs(entityset=es,
                                                target_entity="HousingTest")
fm_encoded_test, features_encoded_test = ft.encode_features(
    feature_matrix_test, feature_defs_test)
Actual_test = fm_encoded_test

##Fixing the alignment
X, Actual_test = X.align(Actual_test, join='left', axis=1, fill_value=0)
Example #23
def build_transaction_data():
    """ Builds a data set from raw card and transaction data
        using the featuretools package.

        The resulting data set will be strictly concerned
        with transactions shown in the historical transactions CSV,
        and linking them to the proper card.

        :return:    training, testing feature matrices
    """

    logger = logging.getLogger(__name__)
    logger.info("Reading in card data")
    customer_df = pd.read_csv("data/raw/train.csv")
    customer_df['first_active_month'] = pd.to_datetime(
        customer_df['first_active_month'] + "-01")

    customer_df.drop(columns='target', inplace=True)

    logger.info("Reading in transactions")
    transactions_df = pd.read_csv("data/raw/historical_transactions.csv",
                                  dtype=TRANSACTION_LOAD_DTYPES)
    transactions_df['authorized_flag'] = np.where(
        transactions_df['authorized_flag'] == 'Y', 1, 0)
    transactions_df.reset_index(inplace=True)

    logger.info("Creating training entity set")
    es_train = ft.EntitySet()
    es_train = es_train.entity_from_dataframe(entity_id='customer',
                                              dataframe=customer_df,
                                              index='card_id',
                                              time_index='first_active_month',
                                              variable_types=CARD_TYPES)

    es_train = es_train.entity_from_dataframe(entity_id='transactions',
                                              dataframe=transactions_df,
                                              index='index',
                                              variable_types=TRANSACTION_TYPES)

    del customer_df
    gc.collect()

    logger.info("Defining relationships")
    relationship = ft.Relationship(es_train['customer']['card_id'],
                                   es_train['transactions']['card_id'])

    es_train = es_train.add_relationship(relationship)

    feature_matrix, feature_defs = ft.dfs(entityset=es_train,
                                          target_entity='customer')

    train_feature_matrix_enc, features_enc = ft.encode_features(
        feature_matrix, feature_defs)

    ft.save_features(features_enc, "feature_definitions")
    saved_features = ft.load_features('feature_definitions')

    logger.info("Loading test data")
    customer_df = pd.read_csv("data/raw/test.csv")
    customer_df['first_active_month'] = pd.to_datetime(
        customer_df['first_active_month'] + "-01")

    logger.info("Creating testing entity set")
    es_test = ft.EntitySet()
    es_test = es_test.entity_from_dataframe(entity_id='customer',
                                            dataframe=customer_df,
                                            index='card_id',
                                            time_index='first_active_month',
                                            variable_types=CARD_TYPES)

    es_test = es_test.entity_from_dataframe(entity_id='transactions',
                                            dataframe=transactions_df,
                                            index='index',
                                            variable_types=TRANSACTION_TYPES)

    es_test = es_test.add_relationship(relationship)

    test_feature_matrix_enc = ft.calculate_feature_matrix(
        saved_features, es_test)

    for col in train_feature_matrix_enc.columns:
        logger.debug(f"Normalizing feature [{col}]")
        old_min, old_max = train_feature_matrix_enc[col].agg(['min', 'max'])

        if (old_min == old_max):
            logger.debug(f"Droping feature [{col}] due to lack of variation")
            train_feature_matrix_enc.drop(columns=col, inplace=True)
            test_feature_matrix_enc.drop(columns=col, inplace=True)

            continue

        train_feature_matrix_enc[col] = normalize_series(
            series=train_feature_matrix_enc[col], min_max=(old_min, old_max))

        assert col in test_feature_matrix_enc.columns

        test_feature_matrix_enc[col] = normalize_series(
            series=test_feature_matrix_enc[col], min_max=(old_min, old_max))

    logger.info("Dropping SKEW features.")
    # TODO: Determine why these have lower counts than other features
    drop_cols = [c for c in train_feature_matrix_enc.columns if "SKEW" in c]
    train_feature_matrix_enc.drop(columns=drop_cols, inplace=True)
    test_feature_matrix_enc.drop(columns=drop_cols, inplace=True)

    return train_feature_matrix_enc, test_feature_matrix_enc
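normalize_series is not shown in this snippet; a plausible min-max implementation consistent with how it is called above (zero-variation columns are dropped first, so the denominator is nonzero):

def normalize_series(series, min_max):
    # Hypothetical helper: min-max scale a pandas Series to [0, 1].
    old_min, old_max = min_max
    return (series - old_min) / (old_max - old_min)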
Example #24
    def generate_feature_matrix(self,
                                es,
                                target,
                                label_times,
                                instance_ids=None,
                                agg_primitives=AGG_PRIMITIVES,
                                trans_primitives=TRANS_PRIMITIVES,
                                max_depth=2,
                                ignore_entities=None,
                                ignore_variables=None,
                                seed_features=None,
                                drop_contains=None,
                                drop_exact=None,
                                max_features=-1,
                                training_window=None,
                                n_jobs=1,
                                verbose=False,
                                include_cutoff_time=True,
                                encode=False):
        """Calculates a feature matrix and features given in Featurization object.

        Args:
            es (featuretools.EntitySet):
                An already initialized entityset.
            target (str):
                Name of the entity (entity id) on which to make predictions.
            label_times (pandas.DataFrame):
                A data frame that specifies the times at which to calculate the features
                for each instance. This data frame contains three columns ``instance_id``,
                ``time``, ``label``. The ``instance_id`` specifies the instances for
                which to calculate features over. The ``time`` column specifies the cutoff
                time for each instance. Data before the cutoff time will be used for
                calculating the feature matrix. The ``label`` column specifies the ground
                truth label (value we want to predict) for each instance.
            instance_ids (list):
                List of instances on which to calculate features.
            agg_primitives (list):
                List of Aggregation Feature types to apply.
            trans_primitives (list):
                List of Transform Feature functions to apply.
            max_depth (int):
                Maximum allowed depth of features.
            ignore_entities (list):
                List of entities to blacklist when creating features.
            ignore_variables (dict):
                List of specific variables within each entity to blacklist when creating features.
            seed_features (list):
                List of manually defined features to use.
            drop_contains (list):
                Drop features that contains these strings in name.
            drop_exact (list):
                Drop features that exactly match these strings in name.
            max_features (int):
                Cap the number of generated features to this number. If -1, no limit.
            training_window (ft.Timedelta or str):
                Window defining how much time before the cutoff time data can be used
                when calculating features. If ``None``, all data before cutoff time is used.
                Defaults to ``None``. Month and year units are not relative when Pandas
                Timedeltas are used. Relative units should be passed as a Featuretools
                Timedelta or a string.
            n_jobs (int):
                Number of parallel processes to use when calculating feature matrix.
            verbose (bool):
                An indicator of verbose option.
            include_cutoff_time (bool):
                Include data at cutoff times in feature calculations. Defaults to ``True``.
            encode (bool):
                Whether or not to encode categorical into one-hot features.

        Returns:
            pandas.DataFrame, list:
                * The generated feature matrix.
                * List of feature definitions in the feature matrix.
        """

        feature_matrix, features_defs = ft.dfs(
            entityset=es,
            target_entity=target,
            cutoff_time=label_times,
            instance_ids=instance_ids,
            agg_primitives=agg_primitives,
            trans_primitives=trans_primitives,
            max_depth=max_depth,
            ignore_entities=ignore_entities,
            ignore_variables=ignore_variables,
            seed_features=seed_features,
            drop_contains=drop_contains,
            drop_exact=drop_exact,
            max_features=max_features,
            training_window=training_window,
            n_jobs=n_jobs,
            verbose=verbose,
            include_cutoff_time=include_cutoff_time)

        if encode:
            # encode categorical values
            return ft.encode_features(feature_matrix, features_defs)

        return feature_matrix, features_defs
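For reference, a minimal sketch of the label_times frame this method expects, with the three columns named in the docstring (values are invented):

import pandas as pd

label_times = pd.DataFrame({
    'instance_id': [101, 102, 103],
    'time': pd.to_datetime(['2014-01-01', '2014-01-15', '2014-02-01']),
    'label': [True, False, True],
})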
Example #25
es['transactions']['date_of_birth'].interesting_values = [
    '1986-08-18', '1986-08-19'
]  # 'where_primitives' selects which agg primitives are applied with where clauses
feature_matrix, feature_defs = ft.dfs(
    entityset=es,
    target_entity='products',
    where_primitives=['count'],
    agg_primitives=['count', 'mean'],  # specified, otherwise default primitives would be used
    max_depth=1)
print(feature_matrix.columns.tolist())
print(feature_matrix.head())
print(feature_defs)

print('-----------encode category feature-----------')
feature_matrix_enc, feature_enc = ft.encode_features(feature_matrix,
                                                     feature_defs)
print(feature_matrix_enc.columns.tolist())
print(feature_matrix_enc.head())
print(feature_enc)

print('-----------list primitives---------------------')
print(ft.list_primitives().head())

print('----------custom primitives----------------------')
from featuretools.primitives import make_agg_primitive, make_trans_primitive
from featuretools.variable_types import Text, Numeric


def absolute(column):
    return abs(column)
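The snippet presumably continues by registering absolute with the helpers imported above; a minimal sketch, assuming the pre-1.0 featuretools make_trans_primitive API:

# Sketch: wrap the plain function as a transform primitive usable in dfs()
Absolute = make_trans_primitive(function=absolute,
                                input_types=[Numeric],
                                return_type=Numeric)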
Example #26
def gen_feature_matrix(entityset,
                       features_only=False,
                       feature_matrix_encode=False,
                       saved_features=None):
    '''Compute and return (feature_matrix, feature_defs) from a featuretools EntitySet.

    entityset: the EntitySet to compute features from
    features_only: only return feature_defs, do not actually compute the feature_matrix
    feature_matrix_encode: whether to return the encoded feature_matrix (categorical variables one-hot)
    saved_features: load a pre-defined feature file and compute the feature_matrix based on it
    '''

    if 'goldstandard' in entityset.entity_dict.keys():
        goldstandard_exist = True
        goldstandard_id = 'goldstandard'
    else:
        goldstandard_exist = False
        goldstandard_id = None
    ##FIX manual partition by person_id does NOT improve Dask computing performance
    # ignore 'partition' columns in every entity when building features
    # ignore_variables = dict()
    # for entity in entityset.entities:
    #     if 'partition' in [v.name for v in entity.variables]:
    #         ignore_variables[entity.id] = ['partition']

    ##CAUTION when the entityset is backed by Dask dataframes, only limited set of primitives are supported
    # agg_primitives_all=['avg_time_between', 'count', 'all', 'entropy', 'last', 'num_unique', 'n_most_common',
    #             'min', 'std', 'median', 'mean', 'percent_true', 'trend', 'sum', 'time_since_last', 'any',
    #             'num_true', 'time_since_first', 'first', 'max', 'mode', 'skew']
    # agg_primitives_dask=['count', 'all', 'num_unique', #'n_most_common',
    #               'min', 'std', 'mean', 'percent_true', 'sum', 'any',
    #               'num_true', 'max']

    ## define features per entity(table)
    agg_primitives = [
        'mean', 'max', 'min', 'std', 'last', 'skew', 'time_since_last'
    ]  # 'trend' # trend takes extremely long time to compute
    include_variables = {
        'measurement':
        ['measurement_datetime', 'value_as_number', 'measurement_concept_id'],
        'observation':
        ['observation_concept_id', 'observation_datetime', 'value_as_number']
    }
    agg_primitives_device_exposure = [
        'count', 'avg_time_between', 'time_since_first'
    ]
    include_entities_device_exposure = ['device_exposure']

    trans_primitives = ['age']
    groupby_trans_primitives = []
    include_entities = ['person']
    primitive_options = {
        tuple(trans_primitives): {
            'include_entities': include_entities
        },
        tuple(agg_primitives): {
            'include_variables': include_variables
        },
        tuple(agg_primitives_device_exposure): {
            'include_entities': include_entities_device_exposure
        },
    }
    ignore_entities = [
        goldstandard_id, 'condition_occurrence', 'drug_exposure',
        'observation_period', 'procedure_occurrence', 'visit_occurrence'
    ]
    ignore_variables = {}
    where_primitives = agg_primitives
    entityset['measurement'][
        'measurement_concept_id'].interesting_values = entityset[
            'measurement'].df['measurement_concept_id'].unique()
    entityset['observation'][
        'observation_concept_id'].interesting_values = entityset[
            'observation'].df['observation_concept_id'].unique()
    # if isinstance(entityset.entities[0].df, pandas.DataFrame):
    #     agg_primitives = agg_primitives_all
    # else:
    #     agg_primitives = agg_primitives_dask

    # build features
    if saved_features is None:
        with yaspin(color="yellow") as spinner:
            spinner.write(
                "No features definition file specified, calculating feature matrix from ground zero ... "
            )
            feature_defs = ft.dfs(
                entityset=entityset,
                target_entity="person",
                features_only=True,
                agg_primitives=agg_primitives + agg_primitives_device_exposure,
                trans_primitives=trans_primitives,
                groupby_trans_primitives=groupby_trans_primitives,
                primitive_options=primitive_options,
                ignore_entities=ignore_entities,
                ignore_variables=ignore_variables,
                where_primitives=where_primitives,
                max_depth=2)
            spinner.write("> generated {} features".format(len(feature_defs)))
            if features_only:
                return feature_defs

            tic = time.perf_counter()
            feature_matrix = ft.calculate_feature_matrix(
                feature_defs, entityset)
            if isinstance(entityset.entities[0].df, dd.DataFrame):
                feature_matrix = feature_matrix.compute()
            toc = time.perf_counter()
            spinner.write(
                f"> feature matrix calculate completed in {toc - tic:0.4f} seconds"
            )
            if feature_matrix_encode:
                feature_matrix_enc, features_enc = ft.encode_features(
                    feature_matrix, feature_defs)
                spinner.write(
                    "> generated {} encoded features and the feature matrix".
                    format(len(features_enc)))
            spinner.ok("Done")
    else:
        with yaspin(color="yellow") as spinner:
            spinner.write(
                "Using saved features from {} ... ".format(saved_features))
            feature_defs = ft.load_features(saved_features)
            spinner.write("> {} features loaded from {}".format(
                len(feature_defs), saved_features))

            tic = time.perf_counter()
            feature_matrix = ft.calculate_feature_matrix(
                feature_defs, entityset)
            if isinstance(entityset.entities[0].df, dd.DataFrame):
                feature_matrix = feature_matrix.compute()
            toc = time.perf_counter()
            spinner.write(
                f"> feature matrix calculate complete in {toc - tic:0.4f} seconds"
            )
            spinner.ok("Done")

    if goldstandard_exist:
        if isinstance(entityset.entities[0].df, dd.DataFrame):
            goldstandard = entityset['goldstandard'].df.compute()
        else:
            goldstandard = entityset['goldstandard'].df
    if feature_matrix_encode:
        feature_matrix = feature_matrix_enc
    if goldstandard_exist:
        feature_matrix = feature_matrix.merge(goldstandard,
                                              on='person_id',
                                              how='right')

    return feature_matrix, feature_defs
Example #27
import featuretools as ft
import pandas as pd
import utils, os
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

es = utils.load_entityset("./featuretools_part_1/")
print(es)
label_times = utils.make_labels(es=es,
                                product_name="Banana",
                                cutoff_time=pd.Timestamp('March 15, 2015'),
                                prediction_window=ft.Timedelta("4 weeks"),
                                training_window=ft.Timedelta("60 days"))

feature_matrix, features = ft.dfs(
    target_entity="users",
    cutoff_time=label_times,
    training_window=ft.Timedelta("60 days"),  # same as above
    entityset=es,
    verbose=True)

# Encode categorical values
fm_encoded, features_encoded = ft.encode_features(feature_matrix, features)

print("Number of features %s" % len(features_encoded))
print(features_encoded)

# Sample the feature by user input

# Train the classifier
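The snippet stops before the training step; a plausible continuation using the classes already imported at the top of this example (the 'label' column name is an assumption about what utils.make_labels returns):

# Sketch, not the original code
X = fm_encoded.fillna(0)
y = label_times['label']  # assumed label column name
clf = RandomForestClassifier(n_estimators=100)
scores = cross_val_score(clf, X, y, cv=3, scoring='roc_auc')
print("AUC: %.3f +/- %.3f" % (scores.mean(), scores.std()))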
Example #28
    def run_featuretools(self,
                         read_in_data_if_needed=True,
                         export_to_csv=False):

        # TODO: This should eventually be dynamic.
        dataset_filenames = ['POS_CASH_balance.csv', 'application_test.csv', 'application_train.csv', 'bureau.csv',\
        'bureau_balance.csv', 'credit_card_balance.csv', 'installments_payments.csv', 'previous_application.csv']

        if self.datasets == []:
            self.read_all_data(dataset_filenames=dataset_filenames)
        for data in self.datasets:
            if data.name == 'POS_CASH_balance':
                pos = data.data
            elif data.name == 'application_test':
                test = data.data
            elif data.name == 'application_train':
                train_full = data.data
            elif data.name == 'bureau':
                bureau = data.data
            elif data.name == 'bureau_balance':
                bureau_balance = data.data
            elif data.name == 'credit_card_balance':
                cc_bal = data.data
            elif data.name == 'installments_payments':
                inst = data.data
            elif data.name == 'previous_application':
                prev_app = data.data

        train = train_full.drop('TARGET', axis=1)
        train_y = train_full['TARGET']

        print('Creating entity set.')

        # Create new entityset
        es = ft.EntitySet(id='train')
        print('Creating train entity.')
        print(str(pd.Timestamp.now()))
        es = es.entity_from_dataframe(entity_id='train',
                                      dataframe=train,
                                      index='SK_ID_CURR')
        print('Creating bureau entity.')
        print(str(pd.Timestamp.now()))
        es = es.entity_from_dataframe(entity_id='bureau',
                                      dataframe=bureau,
                                      index='SK_ID_BUREAU')
        print('Creating bureau_bal entity.')
        print(str(pd.Timestamp.now()))
        es = es.entity_from_dataframe(entity_id='bureau_bal',
                                      dataframe=bureau_balance,
                                      make_index=True,
                                      index='bureau_bal_id')
        print('Creating pos entity.')
        print(str(pd.Timestamp.now()))
        es = es.entity_from_dataframe(entity_id='pos',
                                      dataframe=pos,
                                      make_index=True,
                                      index='pos_id')
        print('Creating cc_bal entity.')
        print(str(pd.Timestamp.now()))
        es = es.entity_from_dataframe(entity_id='cc_bal',
                                      dataframe=cc_bal,
                                      make_index=True,
                                      index='cc_bal_id')
        print('Creating inst entity.')
        print(str(pd.Timestamp.now()))
        es = es.entity_from_dataframe(entity_id='inst',
                                      dataframe=inst,
                                      make_index=True,
                                      index='inst_id')
        print('Creating prev_app entity.')
        print(str(pd.Timestamp.now()))
        es = es.entity_from_dataframe(entity_id='prev_app',
                                      dataframe=prev_app,
                                      index='SK_ID_PREV')

        print('Creating relationships.')
        print(str(pd.Timestamp.now()))

        # Create relationships
        print('Creating r_train_bureau.')
        print(str(pd.Timestamp.now()))
        r_train_bureau = ft.Relationship(es['train']['SK_ID_CURR'],
                                         es['bureau']['SK_ID_CURR'])
        es = es.add_relationship(r_train_bureau)

        print('Creating r_bureau_bureau_bal.')
        print(str(pd.Timestamp.now()))
        r_bureau_bureau_bal = ft.Relationship(es['bureau']['SK_ID_BUREAU'],
                                              es['bureau_bal']['SK_ID_BUREAU'])
        es = es.add_relationship(r_bureau_bureau_bal)

        print('Creating r_train_pos.')
        print(str(pd.Timestamp.now()))
        r_train_pos = ft.Relationship(es['train']['SK_ID_CURR'],
                                      es['pos']['SK_ID_CURR'])
        es = es.add_relationship(r_train_pos)

        print('Creating r_train_cc_bal.')
        print(str(pd.Timestamp.now()))
        r_train_cc_bal = ft.Relationship(es['train']['SK_ID_CURR'],
                                         es['cc_bal']['SK_ID_CURR'])
        es = es.add_relationship(r_train_cc_bal)

        print('Creating r_train_inst.')
        print(str(pd.Timestamp.now()))
        r_train_inst = ft.Relationship(es['train']['SK_ID_CURR'],
                                       es['inst']['SK_ID_CURR'])
        es = es.add_relationship(r_train_inst)

        print('Creating r_train_prev_app.')
        print(str(pd.Timestamp.now()))
        r_train_prev_app = ft.Relationship(es['train']['SK_ID_CURR'],
                                           es['prev_app']['SK_ID_CURR'])
        es = es.add_relationship(r_train_prev_app)

        print('Creating r_prev_app_pos.')
        print(str(pd.Timestamp.now()))
        r_prev_app_pos = ft.Relationship(es['prev_app']['SK_ID_PREV'],
                                         es['pos']['SK_ID_PREV'])
        es = es.add_relationship(r_prev_app_pos)

        print('Creating r_prev_app_inst.')
        print(str(pd.Timestamp.now()))
        r_prev_app_inst = ft.Relationship(es['prev_app']['SK_ID_PREV'],
                                          es['inst']['SK_ID_PREV'])
        es = es.add_relationship(r_prev_app_inst)

        print('Creating r_prev_app_cc_bal.')
        print(str(pd.Timestamp.now()))
        r_prev_app_cc_bal = ft.Relationship(es['prev_app']['SK_ID_PREV'],
                                            es['cc_bal']['SK_ID_PREV'])
        es = es.add_relationship(r_prev_app_cc_bal)

        # Create new features using specified primitives
        # Documentation: https://docs.featuretools.com/generated/featuretools.dfs.html

        print('Creating actual features.')
        print(str(pd.Timestamp.now()))
        feature_matrix, feature_defs = ft.dfs(
            entityset=es,
            target_entity='train',
            agg_primitives=['mean', 'max', 'last'],
            # trans_primitives=['years', 'month', 'subtract', 'divide'],
        )

        self.featuretools_feature_set = feature_matrix
        self.featuretools_feature_names = feature_defs

        # One hot encode categorical features
        feature_matrix_enc, feature_defs_enc = ft.encode_features(
            feature_matrix, feature_defs)

        # Create entity set for test
        print('Creating test entity')
        ts = ft.EntitySet(id='test')
        print('Creating test entity.')
        print(str(pd.Timestamp.now()))
        ts = ts.entity_from_dataframe(entity_id='test',
                                      dataframe=test,
                                      index='SK_ID_CURR')
        print('Creating bureau entity.')
        print(str(pd.Timestamp.now()))
        ts = ts.entity_from_dataframe(entity_id='bureau',
                                      dataframe=bureau,
                                      index='SK_ID_BUREAU')
        print('Creating bureau_bal entity.')
        print(str(pd.Timestamp.now()))
        ts = ts.entity_from_dataframe(entity_id='bureau_bal',
                                      dataframe=bureau_balance,
                                      make_index=True,
                                      index='bureau_bal_id')
        print('Creating pos entity.')
        print(str(pd.Timestamp.now()))
        ts = ts.entity_from_dataframe(entity_id='pos',
                                      dataframe=pos,
                                      make_index=True,
                                      index='pos_id')
        print('Creating cc_bal entity.')
        print(str(pd.Timestamp.now()))
        ts = ts.entity_from_dataframe(entity_id='cc_bal',
                                      dataframe=cc_bal,
                                      make_index=True,
                                      index='cc_bal_id')
        print('Creating inst entity.')
        print(str(pd.Timestamp.now()))
        ts = ts.entity_from_dataframe(entity_id='inst',
                                      dataframe=inst,
                                      make_index=True,
                                      index='inst_id')
        print('Creating prev_app entity.')
        print(str(pd.Timestamp.now()))
        ts = ts.entity_from_dataframe(entity_id='prev_app',
                                      dataframe=prev_app,
                                      index='SK_ID_PREV')

        print('Creating relationships.')
        print(str(pd.Timestamp.now()))

        # Create relationships
        print('Creating r_test_bureau.')
        print(str(pd.Timestamp.now()))
        r_test_bureau = ft.Relationship(ts['test']['SK_ID_CURR'],
                                        ts['bureau']['SK_ID_CURR'])
        ts = ts.add_relationship(r_test_bureau)

        print('Creating r_bureau_bureau_bal.')
        print(str(pd.Timestamp.now()))
        r_bureau_bureau_bal = ft.Relationship(ts['bureau']['SK_ID_BUREAU'],
                                              ts['bureau_bal']['SK_ID_BUREAU'])
        ts = ts.add_relationship(r_bureau_bureau_bal)

        print('Creating r_test_pos.')
        print(str(pd.Timestamp.now()))
        r_test_pos = ft.Relationship(ts['test']['SK_ID_CURR'],
                                     ts['pos']['SK_ID_CURR'])
        ts = ts.add_relationship(r_test_pos)

        print('Creating r_test_cc_bal.')
        print(str(pd.Timestamp.now()))
        r_test_cc_bal = ft.Relationship(ts['test']['SK_ID_CURR'],
                                        ts['cc_bal']['SK_ID_CURR'])
        ts = ts.add_relationship(r_test_cc_bal)

        print('Creating r_test_inst.')
        print(str(pd.Timestamp.now()))
        r_test_inst = ft.Relationship(ts['test']['SK_ID_CURR'],
                                      ts['inst']['SK_ID_CURR'])
        ts = ts.add_relationship(r_test_inst)

        print('Creating r_test_prev_app.')
        print(str(pd.Timestamp.now()))
        r_test_prev_app = ft.Relationship(ts['test']['SK_ID_CURR'],
                                          ts['prev_app']['SK_ID_CURR'])
        ts = ts.add_relationship(r_test_prev_app)

        print('Creating r_prev_app_pos.')
        print(str(pd.Timestamp.now()))
        r_prev_app_pos = ft.Relationship(ts['prev_app']['SK_ID_PREV'],
                                         ts['pos']['SK_ID_PREV'])
        ts = ts.add_relationship(r_prev_app_pos)

        print('Creating r_prev_app_inst.')
        print(str(pd.Timestamp.now()))
        r_prev_app_inst = ft.Relationship(ts['prev_app']['SK_ID_PREV'],
                                          ts['inst']['SK_ID_PREV'])
        ts = ts.add_relationship(r_prev_app_inst)

        print('Creating r_prev_app_cc_bal.')
        print(str(pd.Timestamp.now()))
        r_prev_app_cc_bal = ft.Relationship(ts['prev_app']['SK_ID_PREV'],
                                            ts['cc_bal']['SK_ID_PREV'])
        ts = ts.add_relationship(r_prev_app_cc_bal)

        # Calculate the train feature definitions on the test entityset
        # Documentation: https://docs.featuretools.com/generated/featuretools.dfs.html

        print('Creating actual features.')
        print(str(pd.Timestamp.now()))
        feature_matrix_test = ft.calculate_feature_matrix(
            features=feature_defs_enc,  # reuse the encoded train definitions
            entityset=ts)  # pass the EntitySet object, not its id string

        # The encoded definitions already yield one-hot columns, so the
        # test matrix needs no further encoding and its columns stay
        # aligned with the train matrix
        feature_matrix_test_enc = feature_matrix_test

        print('Done running featuretools!')

        if export_to_csv:
            print('Exporting features to CSV.')
            pd.DataFrame(feature_matrix_enc).to_csv('featuretools_feature.csv')
            train_y.to_csv('train_y.csv')
            pd.DataFrame(feature_matrix_test_enc).to_csv(
                'featuretools_features_test.csv')
Beispiel #29
0
        def auto_build(self, model_description):
            columns = []
            if isinstance(model_description["factors"], dict):
                factors_dict = model_description["factors"]
                for k, v in factors_dict.items():
                    columns.extend(v)
            else:
                columns.extend(model_description["factors"])

            # print("Data Columns: ", self.data.columns.tolist())
            # print("Extract Columns: ", columns)
            factors_df = self.data[columns].copy()  # copy to avoid SettingWithCopyWarning
            factors_df["customer_id"] = list(range(self.data.shape[0]))
            es = ft.EntitySet(id='customer_experience_entity')
            es = es.entity_from_dataframe(entity_id='c_id',
                                          dataframe=factors_df,
                                          index='customer_id')
            features, feature_names = ft.dfs(entityset=es,
                                             target_entity='c_id',
                                             max_depth=2,
                                             verbose=True)
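            # With only one entity and no relationships, dfs applies just
            # transform primitives, so these are per-row transforms of the
            # raw columns rather than aggregations.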
            feature_matrix_enc, features_enc = ft.encode_features(
                features, feature_names)

            original_factors = set(feature_matrix_enc.columns.tolist())

            feature_matrix_enc = feature_matrix_enc.dropna(axis=1)

            after_naelimination = set(feature_matrix_enc.columns.tolist())

            print("Dropped columns with na: ",
                  list(original_factors - after_naelimination))

            feature_matrix_enc = feature_matrix_enc.loc[:, (
                feature_matrix_enc != 0).any(axis=0)]
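            # The filter above keeps only columns with at least one
            # non-zero value, dropping constant all-zero features.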

            after_allzeros = set(feature_matrix_enc.columns.tolist())

            print("Dropped columns with all zeros: ",
                  after_naelimination - after_allzeros)

            # print(feature_matrix_enc.head())
            # print("Original Columns: ", columns)
            # print("Generated Columns: ", feature_matrix_enc.columns.tolist())
            corr_matrix = feature_matrix_enc.corr()
            corr_matrix = corr_matrix.dropna(axis=1, how='all')
            corr_matrix = corr_matrix.dropna(axis=0, how='all')

            print(
                "Dropped columns with na in correlation matrix: ",
                list(after_naelimination - set(corr_matrix.columns.tolist())))
            feature_matrix_enc = feature_matrix_enc[
                corr_matrix.columns.tolist()]

            for it in range(10):
                willdropped = set()
                corr_matrix = feature_matrix_enc.corr()
                cols = corr_matrix.columns.tolist()
                for i in range(len(cols)):
                    row = cols[i]
                    if row in willdropped:
                        continue  # already marked for dropping
                    for j in range(i + 1, len(cols)):
                        col = cols[j]
                        if col in willdropped:
                            continue  # already marked for dropping
                        val = corr_matrix.loc[row, col]
                        if np.abs(val) > 0.95:
                            print("{} , {} = {}".format(row, col, val))
                            willdropped.add(col)
                if not willdropped:
                    break
                print("Iteration: ", it + 1,
                      " Highly correlated columns have been dropped!: ",
                      list(willdropped))
                feature_matrix_enc = feature_matrix_enc.drop(
                    columns=list(willdropped))

            correlation_matrix = feature_matrix_enc.corr()
            covariance_matrix = feature_matrix_enc.cov()
            cond_number = np.linalg.cond(correlation_matrix.values)
            print("Condition number: {}".format(cond_number))

            copy_model = copy.deepcopy(model_description)
            current_columns = feature_matrix_enc.columns.tolist()

            def replace_marks(s):
                s = s.replace("=", "equals")
                s = s.replace(".", "dot")
                s = s.replace(",", "comma")
                return s
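            # e.g. replace_marks("cat_id = 5.0") -> "cat_id equals 5dot0";
            # the "_".join(...) below then yields "cat_id_equals_5dot0"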

            current_columns = [
                "_".join(replace_marks(c).split(" ")) for c in current_columns
            ]
            feature_matrix_enc.columns = current_columns
            print("Cols: ", current_columns)

            if isinstance(copy_model["factors"], dict):
                factors_dict = copy_model["factors"]
                new_factors_dict = {}
                for k, v in factors_dict.items():
                    newv = []
                    for c in v:
                        replace = list(
                            filter(
                                lambda x: x.startswith("_".join(
                                    replace_marks(c).split(" "))),
                                current_columns))
                        newv.extend(replace)
                    if len(newv) > 0:
                        new_factors_dict[k] = newv
                    else:
                        raise Exception(
                            "Latent variable {} has been dropped! Rearrange your initial model description."
                            .format(k))
                copy_model["factors"] = new_factors_dict
            else:
                newv = []
                for c in copy_model["factors"]:
                    replace = list(
                        filter(
                            lambda x: x.startswith("_".join(
                                replace_marks(c).split(" "))),
                            current_columns))
                    newv.extend(replace)
                if len(newv) > 0:
                    copy_model["factors"] = newv
                else:
                    raise Exception(
                        "All loading factors have been dropped! Rearrange your initial model description."
                    )

            others = []

            others.extend(copy_model["observations"])
            copy_model["observations"] = [
                "_".join(replace_marks(c).split(" "))
                for c in copy_model["observations"]
            ]
            if isinstance(copy_model["kpis"], dict):
                kpis_dict = copy_model["kpis"]
                for k, v in kpis_dict.items():
                    others.extend(v)
                    copy_model["kpis"][k] = [
                        "_".join(replace_marks(c).split(" ")) for c in v
                    ]
            else:
                others.extend(copy_model["kpis"])
                copy_model["kpis"] = [
                    "_".join(replace_marks(c).split(" "))
                    for c in copy_model["kpis"]
                ]

            feature_matrix_enc = feature_matrix_enc.reset_index(
                inplace=False).drop("customer_id", axis=1)

            others_df = self.data[others].copy()  # copy before renaming columns
            current_columns = [
                "_".join(replace_marks(c).split(" "))
                for c in others_df.columns
            ]
            others_df.columns = current_columns

            feature_matrix_enc = pd.concat([feature_matrix_enc, others_df],
                                           axis=1)
            feature_matrix_enc.to_csv("/tmp/autodata.csv",
                                      sep="\t",
                                      index=False)
            print(feature_matrix_enc.head())

            model = sem.build_model(copy_model, "auto_model")
            result = sem.fit_model("/tmp/autodata.csv",
                                   model,
                                   "auto_model",
                                   verbose="FALSE")
            return result
Beispiel #30
0
def extraction(entity: str,
               action_type: List[int],
               name_to_save: str,
               interesting_value: dict,
               agg_pre: list,
               depth: int,
               variable_type: dict = None,
               drop_list: list = [],
               sub_entity_list: list = [],
               trans_pre: list = []):
    log_df = get_train_log(None)
    # keep only the requested action types
    log_df = log_df.loc[log_df['action_type'].isin(action_type)]

    # keep only logs whose entity values appear in train and test
    log_df = choose_logs_in_train_and_test(log_df, entity=entity)
    log_df = log_df.reset_index(drop=True)
    log_df["index"] = log_df.index  # required by featuretools

    log_df["month"] = log_df["time_stamp"].map(lambda x: int(x / 100))
    log_df['data'] = log_df["time_stamp"].map(
        lambda x: '2016-' + str(int(x / 100)) + '-' + str(int(x // 100)))
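    # e.g. time_stamp 1111 -> month 11, day 11 -> '2016-11-11'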
    user_df = get_user_info()
    log_df = log_df.merge(user_df, on="user_id", how="inner")
    log_df["before_pro"] = log_df["time_stamp"].map(lambda x: (1101 < x) and
                                                    (x < 1111))
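    # flags Nov 2-10, presumably the run-up to the Nov 11 ("Double 11")
    # promotion around which this dataset is organized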

    # drop columns the caller listed in drop_list
    log_df.drop(labels=drop_list, axis=1, inplace=True)

    es = ft.EntitySet(id="logs")

    # build the entityset and variable types for the chosen target entity
    if entity == "user_id":
        log_df.drop(labels=["gender", "age_range"], axis=1, inplace=True)
        es = es.entity_from_dataframe(
            entity_id="logs",
            dataframe=log_df,
            index="index",
            time_index="data",
            variable_types=variable_type if variable_type is not None else {
                "user_id": ft.variable_types.Categorical,
                "item_id": ft.variable_types.Categorical,
                "cat_id": ft.variable_types.Categorical,
                "seller_id": ft.variable_types.Categorical,
                "brand_id": ft.variable_types.Categorical,
                "month": ft.variable_types.Categorical,
                "time_stamp": ft.variable_types.Categorical,
                "data": ft.variable_types.Datetime,
                'action_type': ft.variable_types.Categorical,
                "before_pro": ft.variable_types.Boolean,
            })
        es = es.normalize_entity(base_entity_id="logs",
                                 new_entity_id="user_id",
                                 index="user_id")
        es = es.normalize_entity(base_entity_id="logs",
                                 new_entity_id="seller_id",
                                 index="seller_id")
    elif entity == "user_seller":
        log_df["user_seller"] = np.add(
            np.array(log_df["user_id"].map(lambda x: str(x) + "_")),
            np.array(log_df["seller_id"].map(lambda x: str(x))))
        log_df.drop(labels=['user_id', 'seller_id', 'age_range', 'gender'],
                    axis=1,
                    inplace=True)
        es = es.entity_from_dataframe(
            entity_id="logs",
            dataframe=log_df,
            index="index",
            time_index="data",
            variable_types=variable_type if variable_type is not None else {
                "item_id": ft.variable_types.Categorical,
                "cat_id": ft.variable_types.Categorical,
                "brand_id": ft.variable_types.Categorical,
                "data": ft.variable_types.Datetime,
                "user_seller": ft.variable_types.Categorical,
                "time_stamp": ft.variable_types.Categorical,
                'action_type': ft.variable_types.Categorical,
                "month": ft.variable_types.Categorical,
                "before_pro": ft.variable_types.Boolean
            })
        es = es.normalize_entity(base_entity_id="logs",
                                 new_entity_id="user_seller",
                                 index="user_seller")
    elif entity == "seller_id":
        es = es.entity_from_dataframe(
            entity_id="logs",
            dataframe=log_df,
            index="index",
            time_index="data",
            variable_types=variable_type if variable_type is not None else {
                "user_id": ft.variable_types.Categorical,
                "item_id": ft.variable_types.Categorical,
                "cat_id": ft.variable_types.Categorical,
                "seller_id": ft.variable_types.Categorical,
                "brand_id": ft.variable_types.Categorical,
                "time_stamp": ft.variable_types.Categorical,
                "month": ft.variable_types.Categorical,
                'action_type': ft.variable_types.Categorical,
                "before_pro": ft.variable_types.Boolean,
                "age_range": ft.variable_types.Categorical,
                "gender": ft.variable_types.Categorical
            })
        if "user_id" not in drop_list:
            es = es.normalize_entity(base_entity_id="logs",
                                     new_entity_id="user_id",
                                     index="user_id")
        es = es.normalize_entity(base_entity_id="logs",
                                 new_entity_id="seller_id",
                                 index="seller_id")

    for key in interesting_value.keys():
        es["logs"][key].interesting_values = interesting_value[key]

    for sub_entity in sub_entity_list:
        es = es.normalize_entity(base_entity_id="logs",
                                 new_entity_id=sub_entity,
                                 index=sub_entity)

    print("start")
    feature_defs = ft.dfs(entityset=es,
                          target_entity=entity,
                          agg_primitives=agg_pre,
                          max_depth=depth,
                          where_primitives=agg_pre,
                          trans_primitives=trans_pre,
                          features_only=True)
    print(feature_defs)

    feature_matrix, feature_defs = ft.dfs(entityset=es,
                                          target_entity=entity,
                                          agg_primitives=agg_pre,
                                          max_depth=depth,
                                          where_primitives=agg_pre,
                                          trans_primitives=trans_pre,
                                          n_jobs=1,
                                          verbose=True)

    print(feature_defs)
    feature_matrix_enc, features_enc = ft.encode_features(
        feature_matrix, feature_defs)
    print(features_enc)
    feature_matrix_enc.to_csv(os.path.join(get_root_path(), "feature_vectors",
                                           name_to_save),
                              float_format='%.4f',
                              index_label="index")