Code Example #1
def test_tokenize_entityset(es, int_es):
    dupe = make_ecommerce_entityset()

    # check that identical entitysets hash to the same token
    assert tokenize(es) == tokenize(dupe)

    # not same if product relationship is missing
    productless = make_ecommerce_entityset()
    productless.relationships.pop()
    assert tokenize(es) != tokenize(productless)

    # not same if integer entityset
    assert tokenize(es) != tokenize(int_es)

    # add row to cohorts
    cohorts_df = dupe['cohorts'].df
    new_row = pd.DataFrame(data={
        'cohort': [2],
        'cohort_name': ['On Time Adopters'],
        'cohort_end': [pd.Timestamp('2011-04-08 12:00:00')]
    },
                           columns=['cohort', 'cohort_name', 'cohort_end'],
                           index=[2])
    more_cohorts = cohorts_df.append(new_row, ignore_index=True, sort=True)
    dupe['cohorts'].update_data(more_cohorts)
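    # the token is unchanged: the added row does not affect how the
    # entityset is tokenized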
    assert tokenize(es) == tokenize(dupe)
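Note: `tokenize` is presumably dask's content-addressed hash (dask.base.tokenize; the import is not shown in these snippets). dask lets an object customize its token via the `__dask_tokenize__` protocol, which would explain why the final assertion still holds after a row is added. A minimal sketch of the default DataFrame behavior, assuming tokenize comes from dask.base:

import pandas as pd
from dask.base import tokenize

df = pd.DataFrame({"a": [1, 2, 3]})
# identical content hashes to the same token, even across copies
assert tokenize(df) == tokenize(df.copy())
# changing the content changes the token
assert tokenize(df) != tokenize(df.assign(a=[4, 5, 6]))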
Code Example #2
def test_tokenize_entityset(pd_es, int_es):
    dupe = make_ecommerce_entityset()

    # check that identical entitysets hash to the same token
    assert tokenize(pd_es) == tokenize(dupe)

    # not same if product relationship is missing
    productless = make_ecommerce_entityset()
    productless.relationships.pop()
    assert tokenize(pd_es) != tokenize(productless)

    # not same if integer entityset
    assert tokenize(pd_es) != tokenize(int_es)

    # add row to cohorts
    cohorts_df = dupe["cohorts"]
    new_row = pd.DataFrame(
        data={
            "cohort": [2],
            "cohort_name": None,
            "cohort_end": [pd.Timestamp("2011-04-08 12:00:00")],
        },
        columns=["cohort", "cohort_name", "cohort_end"],
        index=[2],
    )
    more_cohorts = cohorts_df.append(new_row, ignore_index=True, sort=True)
    dupe.replace_dataframe(dataframe_name="cohorts", df=more_cohorts)
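    # as above, the added row does not change the entityset's token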
    assert tokenize(pd_es) == tokenize(dupe)
Code Example #3
def test_eq(es):
    other_es = make_ecommerce_entityset()
    latlong = es['log'].df['latlong'].copy()

    assert es['log'].__eq__(es['log'], deep=True)
    assert es['log'].__eq__(other_es['log'], deep=True)
    assert (es['log'].df['latlong'] == latlong).all()

    other_es['log'].add_interesting_values()
    assert not es['log'].__eq__(other_es['log'], deep=True)

    es['log'].id = 'customers'
    es['log'].index = 'notid'
    assert not es['customers'].__eq__(es['log'], deep=True)

    es['log'].index = 'id'
    assert not es['customers'].__eq__(es['log'], deep=True)

    es['log'].time_index = 'signup_date'
    assert not es['customers'].__eq__(es['log'], deep=True)

    es['log'].secondary_time_index = {
        'cancel_date': ['cancel_reason', 'cancel_date']
    }
    assert not es['customers'].__eq__(es['log'], deep=True)
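The assertions above exercise a shallow/deep equality convention: plain `__eq__` compares identifying metadata, while `deep=True` also compares the underlying data. A hypothetical sketch of that pattern (the `Table` class is illustrative, not featuretools' implementation):

import pandas as pd

class Table:
    # illustrative only; not the featuretools Entity class
    def __init__(self, id, index, df):
        self.id = id
        self.index = index
        self.df = df

    def __eq__(self, other, deep=False):
        # shallow check: identifying metadata must match
        if self.id != other.id or self.index != other.index:
            return False
        # deep check: the underlying data must match as well
        return self.df.equals(other.df) if deep else True

a = Table('log', 'id', pd.DataFrame({'id': [1, 2]}))
b = Table('log', 'id', pd.DataFrame({'id': [1, 2]}))
assert a.__eq__(b, deep=True)
b.df = b.df.head(0)
assert not a.__eq__(b, deep=True)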
Code Example #4
File: test.py Project: world4jason/DL-DB
def test_ecommerce():
    es = make_ecommerce_entityset()
    cutoffs = es['log'].df[['session_id', 'datetime']]
    cutoffs = cutoffs.rename(columns={'session_id': 'id'})
    ftens, fl = ft.dfs(entityset=es,
                       cutoff_time=cutoffs,
                       target_entity="sessions",
                       cutoff_time_in_index=True)
    ftens.sort_index(inplace=True)

    ids = ftens.index.get_level_values('id').drop_duplicates()
    n_instances = ids.shape[0]

    labels_binary = [i % 2 for i in range(n_instances)]
    labels_multiclass = np.random.randint(10, size=(n_instances, ))
    labels_regression = np.random.random(size=(n_instances, ))
    labels = pd.DataFrame(
        {
            'label_binary': labels_binary,
            'label_multiclass': labels_multiclass,
            'label_regression': labels_regression
        },
        index=ids)

    ftens = (ftens.reset_index('id', drop=False)
             .merge(labels, left_on='id', right_index=True, how='left')
             .set_index('id', append=True))

    train_ftens, test_ftens = train_test_split(ftens,
                                               test_size=0.4,
                                               shuffle=False)
    train_labels = train_ftens[labels.columns]
    test_labels = test_ftens[labels.columns]
    for c in labels.columns:
        del train_ftens[c]
        del test_ftens[c]

    scores = {}
    scoring_functions = {
        'label_regression': mean_absolute_error,
        'label_binary': roc_auc_score,
        'label_multiclass': f1_macro
    }
    for label_type in labels.columns:
        classes = labels[label_type].unique()
        dl_model = DLDB(regression=label_type == 'label_regression',
                        classes=classes,
                        categorical_max_vocab=10)
        dl_model.fit(train_ftens,
                     train_labels[label_type].values,
                     fl=fl,
                     epochs=1,
                     batch_size=4)
        predictions = dl_model.predict(test_ftens)
        score = scoring_functions[label_type](test_labels[label_type].values,
                                              predictions)
        scores[label_type] = score
    return scores
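`f1_macro` is not defined in this snippet; a plausible definition, assuming it wraps scikit-learn's `f1_score` with macro averaging:

from sklearn.metrics import f1_score

def f1_macro(y_true, y_pred):
    # unweighted mean of the per-class F1 scores
    return f1_score(y_true, y_pred, average='macro')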
Code Example #5
def test_eq(es):
    other_es = make_ecommerce_entityset()
    latlong = es['log'].df['latlong'].copy()

    assert es['log'].__eq__(es['log'], deep=True)
    assert es['log'].__eq__(other_es['log'], deep=True)
    assert all(to_pandas(es['log'].df['latlong']).eq(to_pandas(latlong)))

    # Test different index
    other_es['log'].index = None
    assert not es['log'].__eq__(other_es['log'])
    other_es['log'].index = 'id'
    assert es['log'].__eq__(other_es['log'])

    # Test different time index
    other_es['log'].time_index = None
    assert not es['log'].__eq__(other_es['log'])
    other_es['log'].time_index = 'datetime'
    assert es['log'].__eq__(other_es['log'])

    # Test different secondary time index
    other_es['customers'].secondary_time_index = {}
    assert not es['customers'].__eq__(other_es['customers'])
    other_es['customers'].secondary_time_index = {
        'cancel_date': ['cancel_reason', 'cancel_date']
    }
    assert es['customers'].__eq__(other_es['customers'])

    original_variables = es['sessions'].variables
    # Test different variable list length
    other_es['sessions'].variables = original_variables[:-1]
    assert not es['sessions'].__eq__(other_es['sessions'])
    # Test different variable list contents
    other_es['sessions'].variables = original_variables[:-1] + [
        original_variables[0]
    ]
    assert not es['sessions'].__eq__(other_es['sessions'])

    # Test different interesting values
    assert es['log'].__eq__(other_es['log'], deep=True)
    other_es['log'].add_interesting_values()
    assert not es['log'].__eq__(other_es['log'], deep=True)

    # Check one with last time index, one without
    other_es['log'].last_time_index = other_es['log'].df['datetime']
    assert not other_es['log'].__eq__(es['log'], deep=True)
    assert not es['log'].__eq__(other_es['log'], deep=True)
    # Both set with different values
    es['log'].last_time_index = other_es['log'].last_time_index + pd.Timedelta(
        '1h')
    assert not other_es['log'].__eq__(es['log'], deep=True)

    # Check different dataframes
    other_es['stores'].df = other_es['stores'].df.head(0)
    assert not other_es['stores'].__eq__(es['stores'], deep=True)
Code Example #6
def create_feature_matrix():
    es = make_ecommerce_entityset()
    f1 = ft.Feature(es["log"]["product_id"])
    f2 = ft.Feature(es["log"]["value"])
    features = [f1, f2]
    ids = [0, 1, 2, 3, 4, 5]
    feature_matrix = ft.calculate_feature_matrix(features,
                                                 es,
                                                 instance_ids=ids)

    return feature_matrix, features, f1, f2, es, ids
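A hypothetical use of this helper in a test (these assertions are illustrative, not from the original suite):

feature_matrix, features, f1, f2, es, ids = create_feature_matrix()
assert feature_matrix.shape[0] == len(ids)  # one row per requested instance
assert set(feature_matrix.columns) == {f.get_name() for f in features}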
Code Example #7
def test_multi_output_selection():
    df1 = pd.DataFrame({"id": [0, 1, 2, 3]})

    df2 = pd.DataFrame({
        "first_id": [0, 1, 1, 3],
        "all_nulls": [None, None, None, None],
        "quarter": ["a", "b", None, "c"],
    })

    dataframes = {
        "first": (df1, "id"),
        "second": (df2, "index"),
    }

    relationships = [("first", "id", "second", "first_id")]
    es = ft.EntitySet("data", dataframes, relationships=relationships)
    es["second"].ww.set_types(logical_types={
        "all_nulls": "categorical",
        "quarter": "categorical"
    })

    fm, features = ft.dfs(
        entityset=es,
        target_dataframe_name="first",
        trans_primitives=[],
        agg_primitives=["n_most_common"],
        max_depth=2,
    )

    multi_output, multi_output_features = ft.selection.remove_single_value_features(
        fm, features)
    assert multi_output.columns == ["N_MOST_COMMON(second.quarter)[0]"]
    assert len(multi_output_features) == 1
    assert multi_output_features[0].get_name() == multi_output.columns[0]

    es = make_ecommerce_entityset()
    fm, features = ft.dfs(
        entityset=es,
        target_dataframe_name="régions",
        trans_primitives=[],
        agg_primitives=["n_most_common"],
        max_depth=2,
    )

    matrix_with_slices, unsliced_features = ft.selection.remove_highly_null_features(
        fm, features)
    assert len(matrix_with_slices.columns) == 18
    assert len(unsliced_features) == 14

    matrix_columns = set(matrix_with_slices.columns)
    for f in unsliced_features:
        for f_name in f.get_feature_names():
            assert f_name in matrix_columns
Code Example #8
def test_multi_output_selection():
    df1 = pd.DataFrame({'id': [0, 1, 2, 3]})

    df2 = pd.DataFrame({
        'first_id': [0, 1, 1, 3],
        "all_nulls": [None, None, None, None],
        'quarter': ['a', 'b', None, 'c']
    })

    dataframes = {
        "first": (df1, 'id'),
        "second": (df2, 'index'),
    }

    relationships = [("first", 'id', 'second', 'first_id')]
    es = ft.EntitySet("data", dataframes, relationships=relationships)
    es['second'].ww.set_types(logical_types={
        'all_nulls': 'categorical',
        'quarter': 'categorical'
    })

    fm, features = ft.dfs(entityset=es,
                          target_dataframe_name="first",
                          trans_primitives=[],
                          agg_primitives=['n_most_common'],
                          max_depth=2)

    multi_output, multi_output_features = ft.selection.remove_single_value_features(
        fm, features)
    assert multi_output.columns == ['N_MOST_COMMON(second.quarter)[0]']
    assert len(multi_output_features) == 1
    assert multi_output_features[0].get_name() == multi_output.columns[0]

    es = make_ecommerce_entityset()
    fm, features = ft.dfs(entityset=es,
                          target_dataframe_name="régions",
                          trans_primitives=[],
                          agg_primitives=['n_most_common'],
                          max_depth=2)

    matrix_with_slices, unsliced_features = ft.selection.remove_highly_null_features(
        fm, features)
    assert len(matrix_with_slices.columns) == 18
    assert len(unsliced_features) == 14

    matrix_columns = set(matrix_with_slices.columns)
    for f in unsliced_features:
        for f_name in f.get_feature_names():
            assert f_name in matrix_columns
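A standalone illustration of the two selection helpers used above, assuming the featuretools 1.x API where the `features` argument is optional and a plain feature matrix can be passed:

import pandas as pd
import featuretools as ft

fm = pd.DataFrame({
    'mostly_null': [None, None, None, 1.0],  # 75% null
    'constant': [7, 7, 7, 7],                # a single unique value
    'varied': [1, 2, 3, 4],
})

# drop columns whose fraction of nulls reaches the threshold
fm = ft.selection.remove_highly_null_features(fm, pct_null_threshold=0.5)
# drop columns that contain only one unique value
fm = ft.selection.remove_single_value_features(fm)
print(fm.columns.tolist())  # expected: ['varied']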
Code Example #9
File: conftest.py Project: youko70s/featuretools
@pytest.fixture
def es():
    return make_ecommerce_entityset()
Code Example #10
File: conftest.py Project: youko70s/featuretools
@pytest.fixture
def int_es():
    return make_ecommerce_entityset(with_integer_time_index=True)
Code Example #11
File: test_utils.py Project: RomaKoks/nlp_primitives
@pytest.fixture
def es(self):  # defined as a method on a test class
    es = make_ecommerce_entityset()
    return es
Code Example #12
# %%
# assumed imports and string_count helper (not shown in the original snippet);
# they follow the custom-primitive pattern from the featuretools documentation
import featuretools as ft
from featuretools.primitives import make_trans_primitive, Sum, Mean, Std
from featuretools.variable_types import Text, Numeric


def string_count(column, string=None):
    # count occurrences of `string` in each text value of the column
    assert string is not None, "string to count needs to be defined"
    return [str(element).lower().count(string) for element in column]


def string_count_get_name(self):
    return u"STRING_COUNT(%s, %s)" % (self.base_features[0].get_name(),
                                      '"' + str(self.kwargs['string']) + '"')


# %%
StringCount = make_trans_primitive(function=string_count,
                                   input_types=[Text],
                                   return_type=Numeric,
                                   cls_attributes={"get_name": string_count_get_name})

# %%
from featuretools.tests.testing_utils import make_ecommerce_entityset

es = make_ecommerce_entityset()
count_the_feat = StringCount(es['log']['comments'], string="the")


# raw log data
# %%
es['log'].df.head()
# %% md
# Sum, mean, and standard deviation of the number of times "the" appears in the log table's comments column
# %%
feature_matrix, features = ft.dfs(entityset=es,
                                  target_entity="sessions",
                                  agg_primitives=[Sum, Mean, Std],
                                  seed_features=[count_the_feat])
feature_matrix[['STD(log.STRING_COUNT(comments, "the"))', 'SUM(log.STRING_COUNT(comments, "the"))',
                'MEAN(log.STRING_COUNT(comments, "the"))']]
Code Example #13
File: test_selection.py Project: pcuong310/Test
@pytest.fixture
def es(feature_matrix):
    es = make_ecommerce_entityset()
    es.entity_from_dataframe('test', feature_matrix, index='test')
    return es