Example #1
def pickle_features_test_helper(es_size, features_original, dir_path):
    filepath = os.path.join(dir_path, "test_feature")

    ft.save_features(features_original, filepath)
    features_deserializedA = ft.load_features(filepath)
    assert os.path.getsize(filepath) < es_size
    os.remove(filepath)

    with open(filepath, "w") as f:
        ft.save_features(features_original, f)
    features_deserializedB = ft.load_features(open(filepath))
    assert os.path.getsize(filepath) < es_size
    os.remove(filepath)

    features = ft.save_features(features_original)
    features_deserializedC = ft.load_features(features)
    assert asizeof(features) < es_size

    features_deserialized_options = [
        features_deserializedA,
        features_deserializedB,
        features_deserializedC,
    ]
    for features_deserialized in features_deserialized_options:
        assert_features(features_original, features_deserialized)
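For context, the helper above exercises the three locations that ft.save_features and ft.load_features accept: a file path, an open file object, and, when no location is given, an in-memory string. A minimal round-trip sketch using the demo data that ships with featuretools (argument names follow newer releases, which use target_dataframe_name; older releases use target_entity):

import featuretools as ft

# demo EntitySet bundled with featuretools
es = ft.demo.load_mock_customer(return_entityset=True)
features = ft.dfs(entityset=es, target_dataframe_name="sessions", features_only=True)

# 1. file path
ft.save_features(features, "features.json")
restored = ft.load_features("features.json")

# 2. open file object
with open("features.json", "w") as f:
    ft.save_features(features, f)
with open("features.json") as f:
    restored = ft.load_features(f)

# 3. in-memory string: save_features returns the serialized features
#    when no location is passed
serialized = ft.save_features(features)
restored = ft.load_features(serialized)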
def pickle_features_test_helper(es_size, features_original):
    dir_path = os.path.dirname(os.path.realpath(__file__))
    filepath = os.path.join(dir_path, 'test_feature')

    ft.save_features(features_original, filepath)
    features_deserializedA = ft.load_features(filepath)
    assert os.path.getsize(filepath) < es_size
    os.remove(filepath)

    with open(filepath, "w") as f:
        ft.save_features(features_original, f)
    features_deserializedB = ft.load_features(open(filepath))
    assert os.path.getsize(filepath) < es_size
    os.remove(filepath)

    features = ft.save_features(features_original)
    features_deserializedC = ft.load_features(features)
    assert asizeof(features) < es_size

    features_deserialized_options = [
        features_deserializedA, features_deserializedB, features_deserializedC
    ]
    for features_deserialized in features_deserialized_options:
        for feat_1, feat_2 in zip(features_original, features_deserialized):
            assert feat_1.unique_name() == feat_2.unique_name()
            assert feat_1.entityset == feat_2.entityset
def test_pickle_features(es):
    features_no_pickle = ft.dfs(target_entity='sessions',
                                entityset=es,
                                features_only=True)

    dir_path = os.path.dirname(os.path.realpath(__file__))
    filepath = os.path.join(dir_path, 'test_feature')
    es_filepath = os.path.join(dir_path, 'test_entityset')

    # pickle entityset
    save_obj_pickle(es, es_filepath)

    ft.save_features(features_no_pickle, filepath)
    features_pickle = ft.load_features(filepath)
    for feat_1, feat_2 in zip(features_no_pickle, features_pickle):
        assert feat_1.hash() == feat_2.hash()
        assert feat_1.entityset == feat_2.entityset

    # file is smaller than entityset in memory
    assert os.path.getsize(filepath) < asizeof(es)

    # file is smaller than entityset pickled
    assert os.path.getsize(filepath) < os.path.getsize(es_filepath)
    os.remove(filepath)
    os.remove(es_filepath)
def test_pickle_features_with_custom_primitive(es):
    NewMax = make_agg_primitive(
        lambda x: max(x),
        name="NewMax",
        input_types=[Numeric],
        return_type=Numeric,
        description="Calculate the maximum value")

    features_no_pickle = ft.dfs(target_entity='sessions',
                                entityset=es,
                                agg_primitives=["Last", "Mean", NewMax],
                                features_only=True)

    assert any(
        [isinstance(feat.primitive, NewMax) for feat in features_no_pickle])
    dir_path = os.path.dirname(os.path.realpath(__file__))
    filepath = os.path.join(dir_path, 'test_feature')
    es_filepath = os.path.join(dir_path, 'test_entityset')

    # pickle entityset
    save_obj_pickle(es, es_filepath)

    ft.save_features(features_no_pickle, filepath)
    features_pickle = ft.load_features(filepath)
    for feat_1, feat_2 in zip(features_no_pickle, features_pickle):
        assert feat_1.hash() == feat_2.hash()
        assert feat_1.entityset == feat_2.entityset

    # file is smaller than entityset in memory
    assert os.path.getsize(filepath) < asizeof(es)

    # file is smaller than entityset pickled
    assert os.path.getsize(filepath) < os.path.getsize(es_filepath)
    os.remove(filepath)
    os.remove(es_filepath)
def test_pickle_features(es):
    dfs_obj = DeepFeatureSynthesis(target_entity_id='sessions',
                                   entityset=es,
                                   filters=[],
                                   agg_primitives=[Last, Mean],
                                   trans_primitives=[],
                                   max_features=20)

    features_no_pickle = dfs_obj.build_features()

    dir_path = os.path.dirname(os.path.realpath(__file__))
    filepath = os.path.join(dir_path, 'test_feature')
    es_filepath = os.path.join(dir_path, 'test_entityset')

    # pickle entityset
    save_obj_pickle(features_no_pickle[0].entityset, es_filepath)

    ft.save_features(features_no_pickle, filepath)
    features_pickle = ft.load_features(filepath, es)
    for feat_1, feat_2 in zip(features_no_pickle, features_pickle):
        assert feat_1.hash() == feat_2.hash()
        assert feat_1.entityset == feat_2.entityset

    # file is smaller than entityset in memory
    assert os.path.getsize(filepath) < getsize(feat_1.entityset)

    # file is smaller than entityset pickled
    assert os.path.getsize(filepath) < os.path.getsize(es_filepath)
    os.remove(filepath)
    os.remove(es_filepath)
Example #6
def test_deserializer_uses_common_primitive_instances_no_args(es, tmp_path):
    features = ft.dfs(
        entityset=es,
        target_dataframe_name="products",
        features_only=True,
        agg_primitives=["sum"],
        trans_primitives=["is_null"],
    )

    is_null_features = [f for f in features if f.primitive.name == "is_null"]
    sum_features = [f for f in features if f.primitive.name == "sum"]

    # Make sure we have multiple features of each type
    assert len(is_null_features) > 1
    assert len(sum_features) > 1

    # DFS should use the same primitive instance for all features that share a primitive
    is_null_primitive = is_null_features[0].primitive
    sum_primitive = sum_features[0].primitive
    assert all([f.primitive is is_null_primitive for f in is_null_features])
    assert all([f.primitive is sum_primitive for f in sum_features])

    file = os.path.join(tmp_path, "features.json")
    ft.save_features(features, file)
    deserialized_features = ft.load_features(file)
    new_is_null_features = [
        f for f in deserialized_features if f.primitive.name == "is_null"
    ]
    new_sum_features = [f for f in deserialized_features if f.primitive.name == "sum"]

    # After deserialization all features that share a primitive should use the same primitive instance
    new_is_null_primitive = new_is_null_features[0].primitive
    new_sum_primitive = new_sum_features[0].primitive
    assert all([f.primitive is new_is_null_primitive for f in new_is_null_features])
    assert all([f.primitive is new_sum_primitive for f in new_sum_features])
def test_pickle_features_with_custom_primitive(es):
    NewMean = make_agg_primitive(
        np.nanmean,
        name="NewMean",
        input_types=[Numeric],
        return_type=Numeric,
        description="Calculate means ignoring nan values")
    dfs_obj = DeepFeatureSynthesis(target_entity_id='sessions',
                                   entityset=es,
                                   agg_primitives=[Last, Mean, NewMean],
                                   trans_primitives=[],
                                   max_features=20)

    features_no_pickle = dfs_obj.build_features()
    assert any([isinstance(feat, NewMean) for feat in features_no_pickle])
    dir_path = os.path.dirname(os.path.realpath(__file__))
    filepath = os.path.join(dir_path, 'test_feature')
    es_filepath = os.path.join(dir_path, 'test_entityset')

    # pickle entityset
    save_obj_pickle(es, es_filepath)

    ft.save_features(features_no_pickle, filepath)
    features_pickle = ft.load_features(filepath)
    for feat_1, feat_2 in zip(features_no_pickle, features_pickle):
        assert feat_1.hash() == feat_2.hash()
        assert feat_1.entityset == feat_2.entityset

    # file is smaller than entityset in memory
    assert os.path.getsize(filepath) < asizeof(es)

    # file is smaller than entityset pickled
    assert os.path.getsize(filepath) < os.path.getsize(es_filepath)
    os.remove(filepath)
    os.remove(es_filepath)
Example #8
def test_deserialize_features_s3(pd_es, url, profile_name):
    agg_primitives = [
        Sum,
        Std,
        Max,
        Skew,
        Min,
        Mean,
        Count,
        PercentTrue,
        NumUnique,
        Mode,
    ]

    trans_primitives = [Day, Year, Month, Weekday, Haversine, NumWords, NumCharacters]

    features_original = ft.dfs(
        target_dataframe_name="sessions",
        entityset=pd_es,
        features_only=True,
        agg_primitives=agg_primitives,
        trans_primitives=trans_primitives,
    )
    features_deserialized = ft.load_features(url, profile_name=profile_name)
    assert_features(features_original, features_deserialized)
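The S3 tests above rely on save_features and load_features accepting s3:// URLs directly. A hedged sketch of that usage, where the bucket and key are hypothetical and features is any list of feature definitions (for example from ft.dfs(..., features_only=True)):

import featuretools as ft

url = "s3://my-bucket/features.json"  # hypothetical bucket and key

# profile_name selects the AWS credentials: a named profile string,
# False for anonymous access, or omit it to use the default profile
ft.save_features(features, url, profile_name="my-profile")
features_from_s3 = ft.load_features(url, profile_name="my-profile")

# anonymous read of a publicly readable object
public_features = ft.load_features(url, profile_name=False)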
Example #9
def test_pickle_features_with_custom_primitive(es):
    NewMean = make_agg_primitive(
        np.nanmean,
        name="NewMean",
        input_types=[Numeric],
        return_type=Numeric,
        description="Calculate means ignoring nan values")
    dfs_obj = DeepFeatureSynthesis(target_entity_id='sessions',
                                   entityset=es,
                                   agg_primitives=[Last, Mean, NewMean],
                                   trans_primitives=[],
                                   max_features=20)

    features_no_pickle = dfs_obj.build_features()
    assert any([isinstance(feat, NewMean) for feat in features_no_pickle])
    dir_path = os.path.dirname(os.path.realpath(__file__))
    filepath = os.path.join(dir_path, 'test_feature')
    es_filepath = os.path.join(dir_path, 'test_entityset')

    # pickle entityset
    save_obj_pickle(es, es_filepath)

    ft.save_features(features_no_pickle, filepath)
    features_pickle = ft.load_features(filepath)
    for feat_1, feat_2 in zip(features_no_pickle, features_pickle):
        assert feat_1.hash() == feat_2.hash()
        assert feat_1.entityset == feat_2.entityset

    # file is smaller than entityset in memory
    assert os.path.getsize(filepath) < getsize(es)

    # file is smaller than entityset pickled
    assert os.path.getsize(filepath) < os.path.getsize(es_filepath)
    os.remove(filepath)
    os.remove(es_filepath)
def test_feature_serialization(universal_sentence_encoder, tmpdir):
    sentences = pd.Series([
        "",
        "I like to eat pizza",
        "The roller coaster was built in 1885.",
        "When will humans go to mars?",
        "Mitochondria is the powerhouse of the cell",
    ])

    es = ft.EntitySet("es")
    df = pd.DataFrame({"id": [0, 1, 2, 3, 4], "sentences": sentences})
    es.add_dataframe(
        dataframe=df,
        dataframe_name="dataframe",
        index="id",
        logical_types={"sentences": NaturalLanguage},
    )
    fm, features = ft.dfs(
        entityset=es,
        target_dataframe_name="dataframe",
        trans_primitives=[universal_sentence_encoder],
    )

    filename = str(tmpdir.join("features.txt"))
    ft.save_features(features, filename)
    loaded_features = ft.load_features(filename)
    fm_serialized = ft.calculate_feature_matrix(loaded_features, entityset=es)

    pd.testing.assert_frame_equal(fm, fm_serialized)
Example #11
    def test_serialize(self, es):
        features = dfs(
            entityset=es,
            target_dataframe_name="log",
            trans_primitives=[self.primitive],
            max_features=-1,
            max_depth=3,
            features_only=True,
        )

        feat_to_serialize = None
        for feature in features:
            if feature.primitive.__class__ == self.primitive:
                feat_to_serialize = feature
                break
            for base_feature in feature.get_dependencies(deep=True):
                if base_feature.primitive.__class__ == self.primitive:
                    feat_to_serialize = base_feature
                    break
        assert feat_to_serialize is not None

        # Skip calculating the feature matrix for long-running primitives
        skip_primitives = ["elmo"]

        if self.primitive.name not in skip_primitives:
            df1 = calculate_feature_matrix([feat_to_serialize], entityset=es)

        new_feat = load_features(save_features([feat_to_serialize]))[0]
        assert isinstance(new_feat, ft.FeatureBase)

        if self.primitive.name not in skip_primitives:
            df2 = calculate_feature_matrix([new_feat], entityset=es)
            assert df1.equals(df2)
def test_pickle_features(es):
    dfs_obj = DeepFeatureSynthesis(target_entity_id='sessions',
                                   entityset=es,
                                   agg_primitives=[Last, Mean],
                                   trans_primitives=[],
                                   max_features=20)

    features_no_pickle = dfs_obj.build_features()

    dir_path = os.path.dirname(os.path.realpath(__file__))
    filepath = os.path.join(dir_path, 'test_feature')
    es_filepath = os.path.join(dir_path, 'test_entityset')

    # pickle entityset
    save_obj_pickle(es, es_filepath)

    ft.save_features(features_no_pickle, filepath)
    features_pickle = ft.load_features(filepath)
    for feat_1, feat_2 in zip(features_no_pickle, features_pickle):
        assert feat_1.hash() == feat_2.hash()
        assert feat_1.entityset == feat_2.entityset

    # file is smaller than entityset in memory
    assert os.path.getsize(filepath) < asizeof(es)

    # file is smaller than entityset pickled
    assert os.path.getsize(filepath) < os.path.getsize(es_filepath)
    os.remove(filepath)
    os.remove(es_filepath)
 def _load_features(features, profile_name=None):
     """
     :param features: str or file object; the location of the saved feature definitions
     :param profile_name: str or bool
     :return: list
     """
     feature = ft.load_features(features=features,
                                profile_name=profile_name)
     return feature
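A short usage sketch of the wrapper above; the file name and profile name are hypothetical:

# local file written earlier by ft.save_features
feature_defs = _load_features("feature_definitions.json")

# or an S3 object read with a named AWS profile
feature_defs = _load_features("s3://my-bucket/features.json", profile_name="my-profile")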
Example #14
 def load_features_create(model_path):
     '''
     Load previously saved feature definitions (features_enc).

     Example
     -------
     1. features = load_features_create('feature_definitions.json')
     2. feature_matrix = make_features(features)
     '''
     return ft.load_features(model_path)
def test_deserialize_features_s3(es, url, profile_name):
    features_original = sorted(ft.dfs(target_entity='sessions',
                                      entityset=es,
                                      features_only=True),
                               key=lambda x: x.unique_name())
    features_deserialized = sorted(ft.load_features(url,
                                                    profile_name=profile_name),
                                   key=lambda x: x.unique_name())
    assert_features(features_original, features_deserialized)
Example #16
    def select_features(self):
        all_features = ft.load_features(
            os.path.join(self.path, self.feature_names_file))
        features_from_selector = list(
            pd.read_csv(self.file_name_with_selected_features,
                        sep=self.sep).columns)

        for feature in all_features:
            if feature.get_name() in features_from_selector:
                self.selected_features.append(feature)
def test_s3_test_profile(es, s3_client, s3_bucket, setup_test_profile):
    features_original = ft.dfs(target_entity='sessions', entityset=es, features_only=True)

    ft.save_features(features_original, TEST_S3_URL, profile_name='test')

    obj = list(s3_bucket.objects.all())[0].key
    s3_client.ObjectAcl(BUCKET_NAME, obj).put(ACL='public-read-write')

    features_deserialized = ft.load_features(TEST_S3_URL, profile_name='test')
    assert_features(features_original, features_deserialized)
Example #18
def test_serialize_features_mock_anon_s3(es, s3_client, s3_bucket):
    features_original = ft.dfs(
        target_dataframe_name="sessions", entityset=es, features_only=True
    )

    ft.save_features(features_original, TEST_S3_URL, profile_name=False)

    obj = list(s3_bucket.objects.all())[0].key
    s3_client.ObjectAcl(BUCKET_NAME, obj).put(ACL="public-read-write")

    features_deserialized = ft.load_features(TEST_S3_URL, profile_name=False)
    assert_features(features_original, features_deserialized)
Example #19
def construct_retail_example(ftens_file='retail_binary_files/ftens.csv',
                             labels_file='retail_binary_files/labels.csv',
                             fl_file='retail_binary_files/fl.p'):
    es = ft.demo.load_retail()
    if os.path.exists(ftens_file):
        ftens = pd.read_csv(ftens_file,
                            index_col=['customer_id', 'time'],
                            parse_dates=['time'])
        labels = pd.read_csv(labels_file, index_col='customer_id')['label']
        fl = ft.load_features(fl_file, es)
    else:
        labels = create_labels(es,
                               min_training_data='8 days',
                               lead='7 days',
                               window='30 days',
                               reduce='sum',
                               binarize=None,
                               iterate_by=None)
        labels_binary = labels.copy()
        labels_binary['label'] = labels_binary['label'] > 300
        sampled = sample_labels(labels_binary, n=1)
        sampled = sampled[['customer_id', 'time', 'label']]
        sampled = sampled.sample(300)

        ftens, fl = ft.tdfs(target_entity='customers',
                            entityset=es,
                            cutoffs=sampled,
                            window_size='30d',
                            num_windows=5,
                            verbose=True)

        ftens = (ftens.reset_index(
            'customer_id', drop=False).reset_index(drop=False).merge(
                sampled[['customer_id', 'label']],
                on='customer_id',
                how='left').set_index('customer_id').set_index('time',
                                                               append=True))

        labels = (ftens['label'].reset_index(
            'customer_id',
            drop=False).drop_duplicates('customer_id').set_index('customer_id')
                  )
        del ftens['label']
        ftens.to_csv(ftens_file)
        labels.to_csv(labels_file)
        labels = labels['label']
        ft.save_features(fl, fl_file)
    return ftens, labels, fl
def test_pickle_features(es):
    features_original = ft.dfs(target_entity='sessions', entityset=es, features_only=True)

    dir_path = os.path.dirname(os.path.realpath(__file__))
    filepath = os.path.join(dir_path, 'test_feature')

    ft.save_features(features_original, filepath)
    features_deserialized = ft.load_features(filepath)
    for feat_1, feat_2 in zip(features_original, features_deserialized):
        assert feat_1.unique_name() == feat_2.unique_name()
        assert feat_1.entityset == feat_2.entityset

    # file is smaller than entityset in memory
    assert os.path.getsize(filepath) < asizeof(es)

    os.remove(filepath)
def feature_matrix_from_entity_set(es: ft.EntitySet, dp: str, mt: str) -> None:
    """
    计算特征矩阵,并保存到指定目录

    :param es: 实体集
    :param dp: 路径
    :param mt: 主表
    :return: 无
    """
    feature_defs = Data_Val.feature_defs
    feature_defs = ft.load_features(open(feature_defs, 'rb'))
    feature_matrix = ft.calculate_feature_matrix(feature_defs,
                                                 entityset=es,
                                                 n_jobs=1,
                                                 verbose=0)
    feature_matrix.to_csv(os.path.join(dp, 'p.csv'), index=True)
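A hedged sketch of calling the helper above, assuming Data_Val.feature_defs points at a file written by ft.save_features and the entity set was serialized earlier with EntitySet.to_pickle or similar (paths are hypothetical):

es = ft.read_entityset("my_entityset_dir")   # previously serialized entity set
feature_matrix_from_entity_set(es, dp="output", mt="main_table")
# writes the computed feature matrix to output/p.csv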
Example #22
def load(entity_set_folder_name: str,
         features_file_name: str,
         verbose: bool = True) -> pd.DataFrame:
    """
    Load dataframe from featuretools params.
    
    Args:
        entity_set_folder_name: Entity set folder path.
        features_file_name: Features file path.
        verbose: Verbosity.

    Returns:
        Dataframe.
    """
    es = ft.read_entityset(entity_set_folder_name)
    features = ft.load_features(features_file_name)
    return ft.calculate_feature_matrix(features, es, verbose=verbose)
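Usage mirrors the save side; a minimal sketch, assuming the entity set was persisted with EntitySet.to_pickle (or to_csv/to_parquet) and the features with ft.save_features, with hypothetical paths:

es.to_pickle("entityset_dir")
ft.save_features(features, "features.json")

# later, rebuild the feature matrix from the persisted artifacts
df = load("entityset_dir", "features.json", verbose=False)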
Example #23
def build_card_one_hot():
    """ Reads in the raw data from train.csv and creates
        one-hot encodings for the feature and date fields.

        :return: Data frame with one-hot encoding
    """

    logger = logging.getLogger(__name__)
    logger.info("Reading in data.")
    df = pd.read_csv('data/raw/train.csv')
    df['first_active_month'] = pd.to_datetime(df['first_active_month'] + "-01")

    logger.info("Creating entity set")
    es_train = ft.EntitySet()
    es_train = es_train.entity_from_dataframe(entity_id='transactions',
                                              dataframe=df,
                                              index='card_id',
                                              time_index="first_active_month",
                                              variable_types=CARD_TYPES)

    feature_matrix, feature_defs = ft.dfs(entityset=es_train,
                                          target_entity="transactions")

    logger.info("Creating one-hot training data")
    train_feature_matrix_enc, features_enc = ft.encode_features(
        feature_matrix, feature_defs)

    ft.save_features(features_enc, "feature_definitions")
    saved_features = ft.load_features('feature_definitions')

    logger.info("Creating one-hot test data")
    df = pd.read_csv('data/raw/test.csv')
    df['first_active_month'] = pd.to_datetime(df['first_active_month'] + "-01")
    df['target'] = 0
    es_test = ft.EntitySet()
    es_test = es_test.entity_from_dataframe(entity_id='transactions',
                                            dataframe=df,
                                            index='card_id',
                                            time_index="first_active_month",
                                            variable_types=CARD_TYPES)

    test_feature_matrix_enc = ft.calculate_feature_matrix(
        saved_features, es_test)
    test_feature_matrix_enc.drop(columns='target', inplace=True)

    return train_feature_matrix_enc, test_feature_matrix_enc
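The reason build_card_one_hot saves and immediately reloads the encoded feature definitions is to apply the exact training-time definitions to the test EntitySet, so both matrices end up with the same one-hot columns. A condensed sketch of that fit/transform pattern, with es_train and es_test built as in the example above:

# fit: build and encode features on the training entity set
feature_matrix, feature_defs = ft.dfs(entityset=es_train, target_entity="transactions")
train_enc, features_enc = ft.encode_features(feature_matrix, feature_defs)
ft.save_features(features_enc, "feature_definitions")

# transform: reuse the saved definitions on the test entity set
saved_features = ft.load_features("feature_definitions")
test_enc = ft.calculate_feature_matrix(saved_features, es_test)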
Example #24
def test_custom_feature_names_retained_during_serialization(pd_es, tmpdir):
    class MultiCumulative(TransformPrimitive):
        name = "multi_cum_sum"
        input_types = [ColumnSchema(semantic_tags={"numeric"})]
        return_type = ColumnSchema(semantic_tags={"numeric"})
        number_output_features = 3

    multi_output_trans_feat = ft.Feature(
        pd_es["log"].ww["value"], primitive=MultiCumulative
    )
    groupby_trans_feat = ft.GroupByTransformFeature(
        pd_es["log"].ww["value"],
        primitive=MultiCumulative,
        groupby=pd_es["log"].ww["product_id"],
    )
    multi_output_agg_feat = ft.Feature(
        pd_es["log"].ww["product_id"],
        parent_dataframe_name="customers",
        primitive=NMostCommon(n=2),
    )
    slice = FeatureOutputSlice(multi_output_trans_feat, 1)
    stacked_feat = ft.Feature(slice, primitive=Negate)

    trans_names = ["cumulative_sum", "cumulative_max", "cumulative_min"]
    multi_output_trans_feat.set_feature_names(trans_names)
    groupby_trans_names = ["grouped_sum", "grouped_max", "grouped_min"]
    groupby_trans_feat.set_feature_names(groupby_trans_names)
    agg_names = ["first_most_common", "second_most_common"]
    multi_output_agg_feat.set_feature_names(agg_names)

    features = [
        multi_output_trans_feat,
        multi_output_agg_feat,
        groupby_trans_feat,
        stacked_feat,
    ]
    file = os.path.join(tmpdir, "features.json")
    ft.save_features(features, file)
    deserialized_features = ft.load_features(file)

    new_trans, new_agg, new_groupby, new_stacked = deserialized_features
    assert new_trans.get_feature_names() == trans_names
    assert new_agg.get_feature_names() == agg_names
    assert new_groupby.get_feature_names() == groupby_trans_names
    assert new_stacked.get_feature_names() == ["-(cumulative_max)"]
Example #25
def test_deserialize_features_s3(es, url, profile_name):
    agg_primitives = [
        Sum, Std, Max, Skew, Min, Mean, Count, PercentTrue, NumUnique, Mode
    ]

    trans_primitives = [
        Day, Year, Month, Weekday, Haversine, NumWords, NumCharacters
    ]

    features_original = sorted(ft.dfs(target_entity='sessions',
                                      entityset=es,
                                      features_only=True,
                                      agg_primitives=agg_primitives,
                                      trans_primitives=trans_primitives),
                               key=lambda x: x.unique_name())
    features_deserialized = sorted(ft.load_features(url,
                                                    profile_name=profile_name),
                                   key=lambda x: x.unique_name())
    assert_features(features_original, features_deserialized)
Example #26
def get_test_data(project,
                  testfile,
                  prediction_key,
                  prediction_target,
                  variable_types={},
                  drop_columns=None):

    print("==========Reading test data file {}".format(testfile))
    test_data = pd.read_csv(testfile)
    print(test_data.describe())

    if drop_columns is not None:
        print("==========dropping columns {}".format(drop_columns))
        test_data = test_data.drop(drop_columns, axis=1)

    es = ft.EntitySet(project)

    entities = get_ft_entities(es=es,
                               project=project,
                               prediction_key=prediction_key,
                               data=test_data,
                               variable_types=variable_types)

    print("==========entities are:")
    print(entities)

    print("==========Reading features from {}".format(project))
    saved_features = ft.load_features("data/{}/ft_features".format(project))

    print("==========saved_features are:")
    print(saved_features)

    feature_matrix = ft.calculate_feature_matrix(saved_features, entities)

    feature_matrix_enc, _ = ft.encode_features(feature_matrix, saved_features)

    index_column = test_data[prediction_key]

    return feature_matrix_enc, index_column
Example #27
def main(users_from, users_till):
    
    # ### DEFINE PIPELINE PARAMETERS

    # In[11]:

    load_to_database = False
    save_as_csv = False

    # the timeframe of extracted users
    # users_from = '2018-04-01'
    # users_till = '2018-04-30'

    # include all users in each of the cohorts
    cohort_size = 1000000000

    # the timeframe of extracted behavioral data
    interval = '3 weeks'

    # the type of the prediction problem
    # 'regression', 'binary classification', 'multiclass classification'
    prediction_problem_type = 'binary classification'

    print("Pipeline parameters defined")
    print("Extraction of scoring for users from", users_from, "till", users_till)


    # ### CONNECT TO THE DATABASE

    # In[12]:

    conn, cur = utils.connect_to_db()


    # ### BUILD ENTITIES

    # #### Cohorts entity

    # In[13]:

    cohorts = utils_bux.build_cohorts_entity(cur=cur,
                                             users_from=users_from,
                                             users_till=users_till)


    # #### Users entity

    # In[14]:

    users = utils_bux.build_users_entity(cur=cur,
                                         users_from=users_from,
                                         users_till=users_till,
                                         interval=interval,
                                         cohorts=cohorts,
                                         cohort_size=cohort_size)


    # #### Transactions entity

    # In[15]:

    transactions = utils_bux.build_transactions_entity(cur=cur,
                                                       interval=interval)


    # ### CREATE THE ENTITY SET

    # In[16]:

    es = utils_bux.create_bux_entity_set(cohorts, users, transactions)
    es


    # ### FEATURE ENGINEERING (DFS)

    # In[17]:

    top_features = ft.load_features("top_features", es)
    fm = utils.calculate_feature_matrix_top_features(es, top_features)
    X = fm.reset_index(drop=True).fillna(0)
    print("Features built:\n", list(fm.columns))


    # ### LOADING THE MODEL

    # In[18]:

    model = joblib.load('models/model.pkl')
    print("Model loaded")


    # ### SCORING

    # In[19]:

    y_pred = utils.rf_predict(model, X, prediction_problem_type)
    print("Prediction done")


    # In[20]:

    # save predictions in a csv
    predictions = pd.DataFrame()
    predictions["user_id"] = user_details["user_id"]
    predictions["topic_type"] = "clv_prediction"
    predictions['report_date'] = pd.to_datetime('today').strftime("%Y-%m-%d")
    predictions["model_type"] = "randomforest"
    predictions["class_prediction"] = y_pred
    predictions["prob"] = 0
    predictions = predictions[["topic_type", "report_date", "model_type", "user_id", "class_prediction", "prob"]]
    predictions.head()


    # ### SAVE AS CSV AND/OR LOAD RESULTS INTO THE DATABASE

    # In[21]:

    if save_as_csv:
        predictions.to_csv("scoring/results" + users_from + "-" + users_till, index=False)


    # In[22]:

    if load_to_database:
        utils_bux.copy_to_database(predictions, 'db_table_name', conn)
Example #28
def build_transaction_data():
    """ Builds a data set from raw card and transaction data
        using the featuretools package.

        The resulting data set is strictly concerned with the
        transactions shown in the historical transactions CSV,
        linking each one to the proper card.

        :return:    training, testing feature matrices
    """

    logger = logging.getLogger(__name__)
    logger.info("Reading in card data")
    customer_df = pd.read_csv("data/raw/train.csv")
    customer_df['first_active_month'] = pd.to_datetime(
        customer_df['first_active_month'] + "-01")

    customer_df.drop(columns='target', inplace=True)

    logger.info("Reading in transactions")
    transactions_df = pd.read_csv("data/raw/historical_transactions.csv",
                                  dtype=TRANSACTION_LOAD_DTYPES)
    transactions_df['authorized_flag'] = np.where(
        transactions_df['authorized_flag'] == 'Y', 1, 0)
    transactions_df.reset_index(inplace=True)

    logger.info("Creating training entity set")
    es_train = ft.EntitySet()
    es_train = es_train.entity_from_dataframe(entity_id='customer',
                                              dataframe=customer_df,
                                              index='card_id',
                                              time_index='first_active_month',
                                              variable_types=CARD_TYPES)

    es_train = es_train.entity_from_dataframe(entity_id='transactions',
                                              dataframe=transactions_df,
                                              index='index',
                                              variable_types=TRANSACTION_TYPES)

    del customer_df
    gc.collect()

    logger.info("Defining relationships")
    relationship = ft.Relationship(es_train['customer']['card_id'],
                                   es_train['transactions']['card_id'])

    es_train = es_train.add_relationship(relationship)

    feature_matrix, feature_defs = ft.dfs(entityset=es_train,
                                          target_entity='customer')

    train_feature_matrix_enc, features_enc = ft.encode_features(
        feature_matrix, feature_defs)

    ft.save_features(features_enc, "feature_definitions")
    saved_features = ft.load_features('feature_definitions')

    logger.info("Loading test data")
    customer_df = pd.read_csv("data/raw/test.csv")
    customer_df['first_active_month'] = pd.to_datetime(
        customer_df['first_active_month'] + "-01")

    logger.info("Creating testing entity set")
    es_test = ft.EntitySet()
    es_test = es_test.entity_from_dataframe(entity_id='customer',
                                            dataframe=customer_df,
                                            index='card_id',
                                            time_index='first_active_month',
                                            variable_types=CARD_TYPES)

    es_test = es_test.entity_from_dataframe(entity_id='transactions',
                                            dataframe=transactions_df,
                                            index='index',
                                            variable_types=TRANSACTION_TYPES)

    es_test = es_test.add_relationship(relationship)

    test_feature_matrix_enc = ft.calculate_feature_matrix(
        saved_features, es_test)

    for col in train_feature_matrix_enc.columns:
        logger.debug(f"Normalizing feature [{col}]")
        old_min, old_max = train_feature_matrix_enc[col].agg(['min', 'max'])

        if (old_min == old_max):
            logger.debug(f"Droping feature [{col}] due to lack of variation")
            train_feature_matrix_enc.drop(columns=col, inplace=True)
            test_feature_matrix_enc.drop(columns=col, inplace=True)

            continue

        train_feature_matrix_enc[col] = normalize_series(
            series=train_feature_matrix_enc[col], min_max=(old_min, old_max))

        assert col in test_feature_matrix_enc.columns

        test_feature_matrix_enc[col] = normalize_series(
            series=test_feature_matrix_enc[col], min_max=(old_min, old_max))

    logger.info("Dropping SKEW features.")
    # TODO: Determine why these have lower counts than other features
    drop_cols = [c for c in train_feature_matrix_enc.columns if "SKEW" in c]
    train_feature_matrix_enc.drop(columns=drop_cols, inplace=True)
    test_feature_matrix_enc.drop(columns=drop_cols, inplace=True)

    return train_feature_matrix_enc, test_feature_matrix_enc
Example #29
def test_deserializer_uses_common_primitive_instances_with_args(es, tmp_path):
    # Single argument
    scalar1 = MultiplyNumericScalar(value=1)
    scalar5 = MultiplyNumericScalar(value=5)
    features = ft.dfs(
        entityset=es,
        target_dataframe_name="products",
        features_only=True,
        agg_primitives=["sum"],
        trans_primitives=[scalar1, scalar5],
    )

    scalar1_features = [
        f
        for f in features
        if f.primitive.name == "multiply_numeric_scalar" and " * 1" in f.get_name()
    ]
    scalar5_features = [
        f
        for f in features
        if f.primitive.name == "multiply_numeric_scalar" and " * 5" in f.get_name()
    ]

    # Make sure we have multiple features of each type
    assert len(scalar1_features) > 1
    assert len(scalar5_features) > 1

    # DFS should use the passed-in primitive instance for all features
    assert all([f.primitive is scalar1 for f in scalar1_features])
    assert all([f.primitive is scalar5 for f in scalar5_features])

    file = os.path.join(tmp_path, "features.json")
    ft.save_features(features, file)
    deserialized_features = ft.load_features(file)

    new_scalar1_features = [
        f
        for f in deserialized_features
        if f.primitive.name == "multiply_numeric_scalar" and " * 1" in f.get_name()
    ]
    new_scalar5_features = [
        f
        for f in deserialized_features
        if f.primitive.name == "multiply_numeric_scalar" and " * 5" in f.get_name()
    ]

    # After deserialization all features that share a primitive should use the same primitive instance
    new_scalar1_primitive = new_scalar1_features[0].primitive
    new_scalar5_primitive = new_scalar5_features[0].primitive
    assert all([f.primitive is new_scalar1_primitive for f in new_scalar1_features])
    assert all([f.primitive is new_scalar5_primitive for f in new_scalar5_features])
    assert new_scalar1_primitive.value == 1
    assert new_scalar5_primitive.value == 5

    # Test primitive with multiple args - pandas only due to primitive compatibility
    if es.dataframe_type == Library.PANDAS.value:
        distance_to_holiday = DistanceToHoliday(
            holiday="Victoria Day", country="Canada"
        )
        features = ft.dfs(
            entityset=es,
            target_dataframe_name="customers",
            features_only=True,
            agg_primitives=[],
            trans_primitives=[distance_to_holiday],
        )

        distance_features = [
            f for f in features if f.primitive.name == "distance_to_holiday"
        ]

        assert len(distance_features) > 1

        # DFS should use the passed-in primitive instance for all features
        assert all([f.primitive is distance_to_holiday for f in distance_features])

        file = os.path.join(tmp_path, "distance_features.json")
        ft.save_features(distance_features, file)
        new_distance_features = ft.load_features(file)

        # After deserialization all features that share a primitive should use the same primitive instance
        new_distance_primitive = new_distance_features[0].primitive
        assert all(
            [f.primitive is new_distance_primitive for f in new_distance_features]
        )
        assert new_distance_primitive.holiday == "Victoria Day"
        assert new_distance_primitive.country == "Canada"

    # Test primitive with list arg
    is_in = IsIn(list_of_outputs=[5, True, "coke zero"])
    features = ft.dfs(
        entityset=es,
        target_dataframe_name="customers",
        features_only=True,
        agg_primitives=[],
        trans_primitives=[is_in],
    )

    is_in_features = [f for f in features if f.primitive.name == "isin"]
    assert len(is_in_features) > 1

    # DFS should use the passed-in primitive instance for all features
    assert all([f.primitive is is_in for f in is_in_features])

    file = os.path.join(tmp_path, "distance_features.json")
    ft.save_features(is_in_features, file)
    new_is_in_features = ft.load_features(file)

    # After deserialization all features that share a primitive should use the same primitive instance
    new_is_in_primitive = new_is_in_features[0].primitive
    assert all([f.primitive is new_is_in_primitive for f in new_is_in_features])
    assert new_is_in_primitive.list_of_outputs == [5, True, "coke zero"]
Example #30
# pandas and numpy for data manipulation
import pandas as pd
import numpy as np

import json

# featuretools for automated feature engineering
import featuretools as ft
import featuretools.variable_types as vtypes


featurenames = ft.load_features('../input/features.txt')
print('Number of features: {}'.format(len(featurenames)))

print('Reading in data')
# Read in the datasets and replace the anomalous values
app_train = pd.read_csv('../input/application_train.csv').replace({365243: np.nan})
app_test = pd.read_csv('../input/application_test.csv').replace({365243: np.nan})
bureau = pd.read_csv('../input/bureau.csv').replace({365243: np.nan})
bureau_balance = pd.read_csv('../input/bureau_balance.csv').replace({365243: np.nan})
cash = pd.read_csv('../input/POS_CASH_balance.csv').replace({365243: np.nan})
credit = pd.read_csv('../input/credit_card_balance.csv').replace({365243: np.nan})
previous = pd.read_csv('../input/previous_application.csv').replace({365243: np.nan})
installments = pd.read_csv('../input/installments_payments.csv').replace({365243: np.nan})


app_test['TARGET'] = np.nan
# Join together training and testing
app = app_train.append(app_test, ignore_index = True, sort = True)

def convert_types(df):
def gen_feature_matrix(entityset,
                       features_only=False,
                       feature_matrix_encode=False,
                       saved_features=None):
    '''Compute and return (feature_matrix, feature_defs) from a featuretools EntitySet.

    entityset: the EntitySet to compute features from
    features_only: only return feature_defs; do not compute the feature_matrix
    feature_matrix_encode: whether to return the encoded feature_matrix (categorical variables one-hot encoded)
    saved_features: load a predefined feature definition file and compute the feature_matrix from it
    '''

    if 'goldstandard' in entityset.entity_dict.keys():
        goldstandard_exist = True
        goldstandard_id = 'goldstandard'
    else:
        goldstandard_exist = False
        goldstandard_id = None
    ##FIX manual partition by person_id does NOT improve Dask computing performance
    # ignore 'partition' columns in every entity when building features
    # ignore_variables = dict()
    # for entity in entityset.entities:
    #     if 'partition' in [v.name for v in entity.variables]:
    #         ignore_variables[entity.id] = ['partition']

    ##CAUTION when the entityset is backed by Dask dataframes, only a limited set of primitives is supported
    # agg_primitives_all=['avg_time_between', 'count', 'all', 'entropy', 'last', 'num_unique', 'n_most_common',
    #             'min', 'std', 'median', 'mean', 'percent_true', 'trend', 'sum', 'time_since_last', 'any',
    #             'num_true', 'time_since_first', 'first', 'max', 'mode', 'skew']
    # agg_primitives_dask=['count', 'all', 'num_unique', #'n_most_common',
    #               'min', 'std', 'mean', 'percent_true', 'sum', 'any',
    #               'num_true', 'max']

    ## define features per entity(table)
    agg_primitives = [
        'mean', 'max', 'min', 'std', 'last', 'skew', 'time_since_last'
    ]  # 'trend' omitted: it takes an extremely long time to compute
    include_variables = {
        'measurement':
        ['measurement_datetime', 'value_as_number', 'measurement_concept_id'],
        'observation':
        ['observation_concept_id', 'observation_datetime', 'value_as_number']
    }
    agg_primitives_device_exposure = [
        'count', 'avg_time_between', 'time_since_first'
    ]
    include_entities_device_exposure = ['device_exposure']

    trans_primitives = ['age']
    groupby_trans_primitives = []
    include_entities = ['person']
    primitive_options = {
        tuple(trans_primitives): {
            'include_entities': include_entities
        },
        tuple(agg_primitives): {
            'include_variables': include_variables
        },
        tuple(agg_primitives_device_exposure): {
            'include_entities': include_entities_device_exposure
        },
    }
    ignore_entities = [
        goldstandard_id, 'condition_occurrence', 'drug_exposure',
        'observation_period', 'procedure_occurrence', 'visit_occurrence'
    ]
    ignore_variables = {}
    where_primitives = agg_primitives
    entityset['measurement'][
        'measurement_concept_id'].interesting_values = entityset[
            'measurement'].df['measurement_concept_id'].unique()
    entityset['observation'][
        'observation_concept_id'].interesting_values = entityset[
            'observation'].df['observation_concept_id'].unique()
    # if isinstance(entityset.entities[0].df, pandas.DataFrame):
    #     agg_primitives = agg_primitives_all
    # else:
    #     agg_primitives = agg_primitives_dask

    # build features
    if saved_features is None:
        with yaspin(color="yellow") as spinner:
            spinner.write(
                "No features definition file specified, calculating feature matrix from ground zero ... "
            )
            feature_defs = ft.dfs(
                entityset=entityset,
                target_entity="person",
                features_only=True,
                agg_primitives=agg_primitives + agg_primitives_device_exposure,
                trans_primitives=trans_primitives,
                groupby_trans_primitives=groupby_trans_primitives,
                primitive_options=primitive_options,
                ignore_entities=ignore_entities,
                ignore_variables=ignore_variables,
                where_primitives=where_primitives,
                max_depth=2)
            spinner.write("> generated {} features".format(len(feature_defs)))
            if features_only:
                return feature_defs

            tic = time.perf_counter()
            feature_matrix = ft.calculate_feature_matrix(
                feature_defs, entityset)
            if isinstance(entityset.entities[0].df, dd.DataFrame):
                feature_matrix = feature_matrix.compute()
            toc = time.perf_counter()
            spinner.write(
                f"> feature matrix calculate completed in {toc - tic:0.4f} seconds"
            )
            if feature_matrix_encode:
                feature_matrix_enc, features_enc = ft.encode_features(
                    feature_matrix, feature_defs)
                spinner.write(
                    "> generated {} encoded features and the feature matrix".
                    format(len(features_enc)))
            spinner.ok("Done")
    else:
        with yaspin(color="yellow") as spinner:
            spinner.write(
                "Using saved features from {} ... ".format(saved_features))
            feature_defs = ft.load_features(saved_features)
            spinner.write("> {} features loaded from {}".format(
                len(feature_defs), saved_features))

            tic = time.perf_counter()
            feature_matrix = ft.calculate_feature_matrix(
                feature_defs, entityset)
            if isinstance(entityset.entities[0].df, dd.DataFrame):
                feature_matrix = feature_matrix.compute()
            toc = time.perf_counter()
            spinner.write(
                f"> feature matrix calculate complete in {toc - tic:0.4f} seconds"
            )
            spinner.ok("Done")

    if goldstandard_exist:
        if isinstance(entityset.entities[0].df, dd.DataFrame):
            goldstandard = entityset['goldstandard'].df.compute()
        else:
            goldstandard = entityset['goldstandard'].df
    if feature_matrix_encode:
        feature_matrix = feature_matrix_enc
    if goldstandard_exist:
        feature_matrix = feature_matrix.merge(goldstandard,
                                              on='person_id',
                                              how='right')

    return feature_matrix, feature_defs
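A hedged usage sketch tying gen_feature_matrix back to load_features: compute the feature definitions once, persist them, then rebuild an identical matrix for a fresh EntitySet (the file name is hypothetical):

# first run: compute everything and persist the definitions
feature_matrix, feature_defs = gen_feature_matrix(entityset)
ft.save_features(feature_defs, "features.json")

# later run: reuse the saved definitions on new data
new_matrix, _ = gen_feature_matrix(new_entityset, saved_features="features.json")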