Example #1
# Assumed imports: pandas, featuretools and scikit-learn's f1_score.
# MLPipeline, MLHyperparam and make_entity_set come from the surrounding project.
import featuretools as ft
import pandas as pd
from sklearn.metrics import f1_score


def run():

    print("============================================")
    print("Testing Multi Table Pipeline")
    print("============================================")

    orders = pd.read_csv("data/Retail/orders.csv")
    order_products = pd.read_csv("data/Retail/order_products.csv")
    label_times = pd.read_csv("data/Retail/label_times.csv")

    X_train = label_times.sample(frac=0.8)
    X_test = label_times.drop(X_train.index)
    y_train = X_train["label"]
    y_test = X_test["label"]

    entity_set = make_entity_set(orders, order_products)

    multitable = MLPipeline(['dfs', 'random_forest_classifier'])

    updated_hyperparam = MLHyperparam('max_depth', 'int', [1, 10])
    updated_hyperparam.block_name = 'dfs'
    # multitable.update_tunable_hyperparams([updated_hyperparam])

    # Check that the hyperparameters are correct.
    for hyperparam in multitable.get_tunable_hyperparams():
        print(hyperparam)

    # Check that the blocks are correct.
    expected_blocks = {'dfs', 'rf_classifier'}
    blocks = set(multitable.blocks.keys())
    assert expected_blocks == blocks

    # Check that we can score properly.
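    # Both parameter dicts below are keyed by (block name, argument name)
    # tuples, all addressed to the 'dfs' block.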
    produce_params = {
        ('dfs', 'entityset'): entity_set,
        ('dfs', 'cutoff_time_in_index'): True
    }
    print("\nFitting pipeline...")
    fit_params = {
        ('dfs', 'entityset'): entity_set,
        ('dfs', 'target_entity'): "users",
        ('dfs', 'training_window'): ft.Timedelta("60 days")
    }
    multitable.fit(X_train,
                   y_train,
                   fit_params=fit_params,
                   produce_params=produce_params)
    print("\nFit pipeline.")

    print("\nScoring pipeline...")
    predicted_y_val = multitable.predict(X_test, predict_params=produce_params)
    score = f1_score(y_test, predicted_y_val, average='micro')
    print("\nf1 micro score: %f" % score)

    return score
Example #2
    def __init__(self,
                 customer_entity,
                 customer_entity_index,
                 training_window_unit,
                 training_window,
                 agg_primitives=None,
                 trans_primitives=None,
                 ignore_entities=None,
                 ignore_variables=None,
                 n_jobs=1,
                 chunk_size=0.1,
                 drop_contains=None,
                 drop_exact=None,
                 entity_set_drop_index_list=None,
                 auto_max_values=None,
                 manual_interesting_values_info=None,
                 where_primitives=None,
                 default_time_col_name='time',
                 str_id_col_threshold=0.9):
        # Use None as the default to avoid sharing a mutable default list
        # across instances, then normalize it here.
        if entity_set_drop_index_list is None:
            entity_set_drop_index_list = []
        self.customer_entity = customer_entity
        self.customer_entity_index = customer_entity_index
        self.agg_primitives = agg_primitives
        self.trans_primitives = trans_primitives
        self.ignore_entities = ignore_entities
        self.ignore_variables = ignore_variables
        self.training_window_unit = training_window_unit
        self.training_window = training_window
        self.n_jobs = n_jobs
        self.chunk_size = chunk_size
        self.drop_contains = drop_contains
        self.drop_exact = drop_exact
        self.entity_set_drop_index_list = entity_set_drop_index_list
        self.auto_max_values = auto_max_values
        self.manual_interesting_values_info = manual_interesting_values_info
        self.where_primitives = where_primitives
        self.default_time_col_name = default_time_col_name

        self.str_id_col_threshold = str_id_col_threshold

        self.feature_engineering_class = self.get_feature_engineering_class()

        # Further processing: convert the numeric window into a featuretools
        # Timedelta using the class's unit mapping.
        self.training_window = ft.Timedelta(
            self.training_window,
            unit=self._timedelta_mapper[self.training_window_unit])

        if self.drop_contains is None:
            self.drop_contains = entity_set_drop_index_list
        else:
            self.drop_contains += entity_set_drop_index_list
Example #3
def test_serialization(es):
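    # Pre-1.0 featuretools API: columns are accessed as es['log']['value'] and
    # the parent is passed as an Entity object (es['customers']).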
    primitives_deserializer = PrimitivesDeserializer()
    value = ft.IdentityFeature(es['log']['value'])
    primitive = ft.primitives.Max()
    max1 = ft.AggregationFeature(value, es['customers'], primitive)

    path = next(es.find_backward_paths('customers', 'log'))
    dictionary = {
        'name': None,
        'base_features': [value.unique_name()],
        'relationship_path': [r.to_dictionary() for r in path],
        'primitive': serialize_primitive(primitive),
        'where': None,
        'use_previous': None,
    }

    assert dictionary == max1.get_arguments()
    assert max1 == \
        ft.AggregationFeature.from_dictionary(dictionary, es,
                                              {value.unique_name(): value},
                                              primitives_deserializer)

    is_purchased = ft.IdentityFeature(es['log']['purchased'])
    use_previous = ft.Timedelta(3, 'd')
    max2 = ft.AggregationFeature(value,
                                 es['customers'],
                                 primitive,
                                 where=is_purchased,
                                 use_previous=use_previous)

    dictionary = {
        'name': None,
        'base_features': [value.unique_name()],
        'relationship_path': [r.to_dictionary() for r in path],
        'primitive': serialize_primitive(primitive),
        'where': is_purchased.unique_name(),
        'use_previous': use_previous.get_arguments(),
    }

    assert dictionary == max2.get_arguments()
    dependencies = {
        value.unique_name(): value,
        is_purchased.unique_name(): is_purchased
    }
    assert max2 == \
        ft.AggregationFeature.from_dictionary(dictionary, es, dependencies,
                                              primitives_deserializer)
Example #4
def test_serialization(es):
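    # Same test against the featuretools 1.x API: columns are accessed through
    # the Woodwork accessor (es["log"].ww["value"]) and the target dataframe is
    # passed by name ("customers").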
    primitives_deserializer = PrimitivesDeserializer()
    value = ft.IdentityFeature(es["log"].ww["value"])
    primitive = ft.primitives.Max()
    max1 = ft.AggregationFeature(value, "customers", primitive)

    path = next(es.find_backward_paths("customers", "log"))
    dictionary = {
        "name": None,
        "base_features": [value.unique_name()],
        "relationship_path": [r.to_dictionary() for r in path],
        "primitive": serialize_primitive(primitive),
        "where": None,
        "use_previous": None,
    }

    assert dictionary == max1.get_arguments()
    deserialized = ft.AggregationFeature.from_dictionary(
        dictionary, es, {value.unique_name(): value}, primitives_deserializer
    )
    _assert_agg_feats_equal(max1, deserialized)

    is_purchased = ft.IdentityFeature(es["log"].ww["purchased"])
    use_previous = ft.Timedelta(3, "d")
    max2 = ft.AggregationFeature(
        value, "customers", primitive, where=is_purchased, use_previous=use_previous
    )

    dictionary = {
        "name": None,
        "base_features": [value.unique_name()],
        "relationship_path": [r.to_dictionary() for r in path],
        "primitive": serialize_primitive(primitive),
        "where": is_purchased.unique_name(),
        "use_previous": use_previous.get_arguments(),
    }

    assert dictionary == max2.get_arguments()
    dependencies = {
        value.unique_name(): value,
        is_purchased.unique_name(): is_purchased,
    }
    deserialized = ft.AggregationFeature.from_dictionary(
        dictionary, es, dependencies, primitives_deserializer
    )
    _assert_agg_feats_equal(max2, deserialized)
Example #5
def test_serialization(es):
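    # Older variant of the same test: the serialized dictionary identifies the
    # parent by 'parent_entity_id' instead of a 'relationship_path'.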
    primitives_deserializer = PrimitivesDeserializer()
    value = ft.IdentityFeature(es['log']['value'])
    primitive = ft.primitives.Max()
    max1 = ft.AggregationFeature(value, es['sessions'], primitive)

    dictionary = {
        'base_features': [value.unique_name()],
        'parent_entity_id': 'sessions',
        'primitive': serialize_primitive(primitive),
        'where': None,
        'use_previous': None,
    }

    assert dictionary == max1.get_arguments()
    assert max1 == \
        ft.AggregationFeature.from_dictionary(dictionary, es,
                                              {value.unique_name(): value},
                                              primitives_deserializer)

    is_purchased = ft.IdentityFeature(es['log']['purchased'])
    use_previous = ft.Timedelta(3, 'd')
    max2 = ft.AggregationFeature(value,
                                 es['sessions'],
                                 primitive,
                                 where=is_purchased,
                                 use_previous=use_previous)

    dictionary = {
        'base_features': [value.unique_name()],
        'parent_entity_id': 'sessions',
        'primitive': serialize_primitive(primitive),
        'where': is_purchased.unique_name(),
        'use_previous': use_previous.get_arguments(),
    }

    assert dictionary == max2.get_arguments()
    dependencies = {
        value.unique_name(): value,
        is_purchased.unique_name(): is_purchased
    }
    assert max2 == \
        ft.AggregationFeature.from_dictionary(dictionary, es, dependencies,
                                              primitives_deserializer)
Example #6
# Assumed imports; create_entityset and create_features are project helpers
# defined elsewhere in the repository.
import gc
import logging
from glob import glob

import featuretools as ft
import pandas as pd
from dask import bag
from dask.diagnostics import ProgressBar


def main():
    logger = logging.getLogger(__name__)
    logger.info('creating a bunch of features')
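    # For each target entity, build one EntitySet per partition file with dask,
    # then compute features for every training file and training window.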

    pbar = ProgressBar()
    pbar.register()

    target_entities = ['ip', 'app', 'device', 'os', 'channel']
    filenames_train = sorted(glob('../data/interim/train_2017-11-*00.csv'))
    training_windows = ['1 hours', '3 hours', '1 day']

    for target_entity in target_entities:
        filenames = glob(
            f"../data/interim/partitioned/{target_entity}/train_*.csv")
        b = bag.from_sequence(filenames)
        entity_sets = b.map(create_entityset, target_entity).compute()
        gc.collect()

        for filename in filenames_train:
            logger.info(f"Processing: {filename}")
            df = pd.read_csv(filename,
                             usecols=['click_time'],
                             parse_dates=['click_time'])
            cutoff_time = df['click_time'].min()
            del df
            for training_window in training_windows:
                create_features(filename,
                                entity_sets,
                                target_entity=target_entity,
                                cutoff_time=cutoff_time,
                                training_window=ft.Timedelta(training_window))

        del entity_sets, b
        gc.collect()

    logger.info('finished')
Example #7
    def dfsWindow(self,
                  target_entity,
                  time_scope=None,
                  training_window=None,
                  cutoff_times=None,
                  max_depth=1,
                  chunk_size=None,
                  n_jobs=1):
        '''Runs dfs on the target_entity and outputs a feature matrix with
        features based on the training_window and time_scope relative to cutoff
        times. If no training_window, time_scope, or cutoff_times are specified,
        regular dfs will run without using cutoff times.

        target_entity: str. Name of the target_entity in the entity set to run
        dfs on. The index of the target_entity must match the instance_id column
        in the cutoff_times table.

        time_scope: 'daily', 'weekly' or 'monthly'. Assumes 7 days in a week and
        30 days in a month.

        training_window: list of integers that refer to the number of months or
        weeks, depending on the time_scope. Ex. [1, 2] for time_scope='monthly'
        returns features based on the last month and the last 2 months from the
        cutoff date.

        cutoff_times: pandas DataFrame with instance_id, cutoff_dates, and label
        (label is optional). Any columns after instance_id and cutoff_dates will
        not be used for feature synthesis. The instance_id column must match the
        index of the target entity.

        max_depth: integer, defines how many levels of dfs to run. For example,
        if max_depth = 2 on a transactions table, returned features include avg.
        transactions and avg. of avg. transactions.

        chunk_size: integer, float, None, or "cutoff time". Number of rows of the
        output feature matrix to calculate at a time. If passed an integer greater
        than 0, that many rows are used per chunk. If passed a float between 0
        and 1, the chunk size is set to that percentage of all instances. If
        passed "cutoff time", rows are split per cutoff time.

        n_jobs: integer. The number of parallel processes to use when creating
        the feature matrix.
        '''
        orig_window = training_window
        if (time_scope is None) or (training_window is None) or (cutoff_times
                                                                 is None):
            self.df, feature_defs = ft.dfs(
                entityset=self.es,
                target_entity=target_entity,
                agg_primitives=self.agg_primitives,
                trans_primitives=self.trans_primitives,
                where_primitives=self.where_primitives,
                max_depth=max_depth,
                features_only=False,
                verbose=1,
                chunk_size=chunk_size,
                n_jobs=n_jobs)

        else:
            self.df, feature_defs = ft.dfs(
                entityset=self.es,
                target_entity=target_entity,
                cutoff_time=cutoff_times,
                agg_primitives=self.agg_primitives,
                trans_primitives=self.trans_primitives,
                where_primitives=self.where_primitives,
                max_depth=max_depth,
                features_only=False,
                verbose=1,
                chunk_size=chunk_size,
                n_jobs=n_jobs,
                cutoff_time_in_index=True)
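            # For each requested window, recompute the feature matrix restricted
            # to that window and append its columns with a window-specific suffix.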
            if time_scope == 'daily':
                training_window = [int(x) for x in orig_window]
                for i in range(len(training_window)):
                    feature_matrix = ft.calculate_feature_matrix(
                        entityset=self.es,
                        features=feature_defs,
                        cutoff_time=cutoff_times,
                        chunk_size=chunk_size,
                        cutoff_time_in_index=True,
                        n_jobs=n_jobs,
                        training_window=ft.Timedelta(training_window[i], "d"))

                    suffix = '_' + str(orig_window[i]) + 'day'
                    feature_matrix = feature_matrix.add_suffix(suffix)
                    self.df = pd.concat([self.df, feature_matrix],
                                        axis=1,
                                        join='inner')

            elif time_scope == 'monthly':
                training_window = [x * 30 for x in orig_window]
                for i in range(len(training_window)):
                    feature_matrix = ft.calculate_feature_matrix(
                        entityset=self.es,
                        features=feature_defs,
                        cutoff_time=cutoff_times,
                        chunk_size=chunk_size,
                        cutoff_time_in_index=True,
                        n_jobs=n_jobs,
                        training_window=ft.Timedelta(training_window[i], "d"))

                    suffix = '_' + str(orig_window[i]) + 'mos'
                    feature_matrix = feature_matrix.add_suffix(suffix)
                    self.df = pd.concat([self.df, feature_matrix],
                                        axis=1,
                                        join='inner')

            elif time_scope == 'weekly':
                training_window = [x * 7 for x in orig_window]
                for i in range(len(training_window)):
                    feature_matrix, feature_defs = ft.dfs(
                        entityset=self.es,
                        target_entity=target_entity,
                        cutoff_time=cutoff_times,
                        agg_primitives=self.agg_primitives,
                        trans_primitives=self.trans_primitives,
                        where_primitives=self.where_primitives,
                        max_depth=max_depth,
                        features_only=False,
                        verbose=1,
                        chunk_size=chunk_size,
                        cutoff_time_in_index=True,
                        n_jobs=n_jobs,
                        training_window=ft.Timedelta(training_window[i], "d"))

                    suffix = '_' + str(orig_window[i]) + 'wks'
                    feature_matrix = feature_matrix.add_suffix(suffix)
                    self.df = pd.concat([self.df, feature_matrix],
                                        axis=1,
                                        join='inner')

            else:
                print("ERROR: time_scope entered is not one of the options.")

        drop_duplicates = DropDuplicate()
        self.df = drop_duplicates.fit_transform(self.df)

        for i in self.df.columns:
            self.feature_defs.append(i)

        return self.df
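The docstring above spells out how time_scope, training_window and cutoff_times interact; the snippet below is a minimal usage sketch, assuming a wrapper instance fe that already holds the EntitySet and primitive lists, a hypothetical "customers" target entity, and a hand-built cutoff_times frame.

import pandas as pd

# Hypothetical cutoff-times table: one row per instance with a cutoff date and label.
cutoff_times = pd.DataFrame({
    "instance_id": [1, 2, 3],
    "cutoff_dates": pd.to_datetime(["2015-03-01", "2015-03-08", "2015-03-15"]),
    "label": [0, 1, 0],
})

# fe is assumed to be an instance of the wrapper class above. With
# time_scope='weekly' and training_window=[1, 2], the returned frame also
# carries columns suffixed '_1wks' and '_2wks' for the 1- and 2-week windows.
feature_matrix = fe.dfsWindow(
    target_entity="customers",
    time_scope="weekly",
    training_window=[1, 2],
    cutoff_times=cutoff_times,
    max_depth=2,
)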
Example #8
import featuretools as ft
import pandas as pd
import utils, os
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

es = utils.load_entityset("./featuretools_part_1/")
print(es)
label_times = utils.make_labels(es=es,
                                product_name="Banana",
                                cutoff_time=pd.Timestamp('March 15, 2015'),
                                prediction_window=ft.Timedelta("4 weeks"),
                                training_window=ft.Timedelta("60 days"))

feature_matrix, features = ft.dfs(
    target_entity="users",
    cutoff_time=label_times,
    training_window=ft.Timedelta("60 days"),  # same as above
    entityset=es,
    verbose=True)

# Encode categorical values
fm_encoded, features_encoded = ft.encode_features(feature_matrix, features)

print("Number of features %s" % len(features_encoded))
print(features_encoded)

# Sample the feature by user input

# Train the classifier
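The script stops at the two placeholder comments above. What follows is a hedged sketch of the classifier step, assuming the label column from label_times is passed through into the feature matrix by dfs and is named "label".

# Sketch only: column names and CV settings are assumptions, not part of the source.
X = fm_encoded.copy()
y = X.pop("label")   # assumes dfs carried the label through as a pass-through column
X = X.fillna(0)      # simple imputation for any missing feature values

clf = RandomForestClassifier(n_estimators=100, random_state=0)
scores = cross_val_score(clf, X, y, cv=3, scoring="roc_auc")
print("Mean AUC over 3 folds: %.3f" % scores.mean())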