Esempio n. 1
0
def test_encode_features_handles_pass_columns(entityset):
    """Verify extra cutoff-time columns pass through ``encode_features``.

    The cutoff-time frame carries a ``label`` column; it must be present in
    the calculated feature matrix and still be present after encoding.
    """
    product_feat = IdentityFeature(entityset["log"]["product_id"])
    value_feat = IdentityFeature(entityset["log"]["value"])
    features = [product_feat, value_feat]

    n_rows = 6
    cutoff_columns = ["instance_id", "time", "label"]
    cutoff_time = pd.DataFrame(
        {
            'instance_id': range(n_rows),
            'time': entityset['log'].df['datetime'][0:n_rows],
            'label': [row % 2 for row in range(n_rows)],
        },
        columns=cutoff_columns)
    feature_matrix = calculate_feature_matrix(features, cutoff_time)
    assert 'label' in feature_matrix.columns

    # Reference shape obtained with the default ``to_encode`` setting.
    feature_matrix_encoded, features_encoded = encode_features(
        feature_matrix, features)
    default_shape = feature_matrix_encoded.shape

    # to_encode should keep product_id as a string, and not create 3
    # additional columns, so each restricted selection below must produce a
    # shape that differs from the default encoding.
    for to_encode in ([], ['value']):
        feature_matrix_encoded, features_encoded = encode_features(
            feature_matrix, features, to_encode=to_encode)
        assert default_shape != feature_matrix_encoded.shape

    assert 'label' in feature_matrix_encoded.columns
Esempio n. 2
0
def test_copy_features_does_not_copy_entityset(es):
    """Check that ``copy()`` on a feature shares, rather than duplicates, the entityset.

    Four ``Sum`` aggregations over ``log.value`` grouped by ``sessions`` are
    built (plain, with ``where``, with ``use_previous``, and with both), then
    copied. The test asserts that memory measured over the local namespace
    less than doubles, and that every copy (including its ``where`` feature
    and base features) references the very same entityset object.
    """
    agg = Sum(es['log']['value'], es['sessions'])
    agg_where = Sum(es['log']['value'], es['sessions'],
                    where=IdentityFeature(es['log']['value']) == 2)
    agg_use_previous = Sum(es['log']['value'], es['sessions'],
                           use_previous='4 days')
    agg_use_previous_where = Sum(es['log']['value'], es['sessions'],
                                 where=IdentityFeature(es['log']['value']) == 2,
                                 use_previous='4 days')
    features = [agg, agg_where, agg_use_previous, agg_use_previous_where]
    # NOTE: the size is taken over locals(), so adding or renaming local
    # variables in this function changes what is being measured.
    in_memory_size = asizeof(locals())
    copied = [f.copy() for f in features]
    new_in_memory_size = asizeof(locals())
    # If the copies had duplicated the entityset the footprint would be
    # expected to grow substantially; require it to stay below 2x.
    assert new_in_memory_size < 2 * in_memory_size

    for f, c in zip(features, copied):
        assert f.entityset
        assert c.entityset
        # Identity (not equality): original and copy must share one object.
        assert id(f.entityset) == id(c.entityset)
        if f.where:
            assert c.where
            assert id(f.where.entityset) == id(c.where.entityset)
        # The same sharing must hold one level down, on the base features.
        for bf, bf_c in zip(f.base_features, c.base_features):
            assert id(bf.entityset) == id(bf_c.entityset)
            if bf.where:
                assert bf_c.where
                assert id(bf.where.entityset) == id(bf_c.where.entityset)
def test_cfm_approximate_correct_ordering():
    """Check that ``approximate`` keeps feature-matrix rows correctly ordered.

    Builds a ``flights`` EntitySet from 1000 synthetic trips, computes the
    same features with and without ``approximate``, and asserts that

    * in each matrix the ``(instance, time)`` index agrees with the
      ``trip_id`` / ``flight_time`` identity columns of the same row, and
    * both matrices hold the same values (a null matches a null).
    """
    n_trips = 1000
    trips = {
        'trip_id': list(range(n_trips)),
        'flight_time': ([datetime(1998, 4, 2)] * 350 +
                        [datetime(1997, 4, 3)] * 650),
        'flight_id': [randint(1, 25) for _ in range(n_trips)],
        'trip_duration': [randint(1, 999) for _ in range(n_trips)]
    }
    df = pd.DataFrame.from_dict(trips)
    es = EntitySet('flights')
    es.entity_from_dataframe("trips",
                             dataframe=df,
                             index="trip_id",
                             time_index='flight_time')
    es.normalize_entity(base_entity_id="trips",
                        new_entity_id="flights",
                        index="flight_id",
                        make_time_index=True)
    features = dfs(entityset=es, target_entity='trips', features_only=True)
    # Keep only direct features whose first base feature is an aggregation.
    flight_features = [
        feature for feature in features if isinstance(feature, DirectFeature)
        and isinstance(feature.base_features[0], AggregationPrimitive)
    ]
    property_feature = IdentityFeature(es['trips']['trip_id'])
    time_feature = IdentityFeature(es['trips']['flight_time'])
    all_features = flight_features + [property_feature, time_feature]
    cutoff_time = pd.DataFrame.from_dict({
        'instance_id': df['trip_id'],
        'time': df['flight_time']
    })

    feature_matrix = calculate_feature_matrix(all_features,
                                              es,
                                              cutoff_time_in_index=True,
                                              cutoff_time=cutoff_time)
    feature_matrix.index.names = ['instance', 'time']
    assert (np.all(
        feature_matrix.reset_index('time').reset_index()[['instance', 'time']].
        values == feature_matrix[['trip_id', 'flight_time']].values))

    feature_matrix_2 = calculate_feature_matrix(
        all_features,
        es,
        cutoff_time=cutoff_time,
        cutoff_time_in_index=True,
        approximate=Timedelta(2, 'd'))
    feature_matrix_2.index.names = ['instance', 'time']
    assert (np.all(
        feature_matrix_2.reset_index('time').reset_index()[[
            'instance', 'time'
        ]].values == feature_matrix_2[['trip_id', 'flight_time']].values))

    # Compare value by value; the failure message replaces the interactive
    # debugger breakpoint that used to sit here, so an unattended run fails
    # with context instead of blocking on stdin.
    for column in feature_matrix:
        for x, y in zip(feature_matrix[column], feature_matrix_2[column]):
            assert ((pd.isnull(x) and pd.isnull(y)) or (x == y)), (
                "column %r differs: %r != %r" % (column, x, y))
Esempio n. 4
0
def test_make_identity(entityset, backend):
    """Compute an identity feature on ``log.datetime`` and check instance 0."""
    datetime_feat = IdentityFeature(entityset['log']['datetime'])
    calculator = backend([datetime_feat])

    result = calculator.calculate_all_features(instance_ids=[0],
                                               time_last=None)

    expected = datetime(2011, 4, 9, 10, 30, 0)
    assert result[datetime_feat.get_name()][0] == expected
def test_make_identity(entityset, backend):
    """Identity feature over ``log.datetime`` must return the stored value.

    Only instance 0 is calculated, with no ``time_last`` restriction.
    """
    identity = IdentityFeature(entityset['log']['datetime'])
    column = identity.get_name()

    feature_frame = backend([identity]).calculate_all_features(
        instance_ids=[0], time_last=None)

    first_value = feature_frame[column][0]
    assert (first_value == datetime(2011, 4, 9, 10, 30, 0))
def test_override_cmp_from_variable(es):
    """``IdentityFeature > 1`` must yield a feature computing the comparison.

    The first three log instances are expected to give False, True, True.
    """
    greater_than_one = IdentityFeature(es['log']['value']) > 1
    expected = [False, True, True]

    calculator = PandasBackend(es, [greater_than_one])
    result = calculator.calculate_all_features(instance_ids=[0, 1, 2],
                                               time_last=None)

    actual = result[greater_than_one.get_name()].values.tolist()
    for position, wanted in enumerate(expected):
        assert actual[position] == wanted
def test_override_cmp_from_variable(es):
    """Comparing an identity feature with a scalar builds a boolean feature.

    Calculated on log instances 0, 1 and 2 the result must be
    ``[False, True, True]``.
    """
    to_test = [False, True, True]
    count_lo = IdentityFeature(es['log']['value']) > 1
    column = count_lo.get_name()

    frame = PandasBackend(es, [count_lo]).calculate_all_features(
        instance_ids=[0, 1, 2], time_last=None)

    values = frame[column].values.tolist()
    for index in range(len(to_test)):
        assert values[index] == to_test[index]
def test_integer_time_index_passes_extra_columns(int_es):
    """Extra cutoff columns are carried through with an integer time index.

    The ``labels`` column supplied alongside the cutoff times must come back
    in the feature matrix, aligned with the computed ``value > 10`` feature.
    """
    times = list(range(8, 18)) + list(range(19, 23)) + [25, 24, 23]
    labels = [False] * 3 + [True] * 2 + [False] * 11 + [True]
    # The last three instances are listed in reverse order, matching the
    # reversed tail of ``times``.
    instances = list(range(14)) + [16, 15, 14]

    column_order = ['time', 'instance_id', 'labels']
    cutoff_df = pd.DataFrame({'time': times,
                              'instance_id': instances,
                              'labels': labels})[column_order]
    property_feature = IdentityFeature(int_es['log']['value']) > 10

    fm = calculate_feature_matrix([property_feature],
                                  cutoff_time=cutoff_df,
                                  cutoff_time_in_index=True)

    computed = fm[property_feature.get_name()]
    assert (computed == fm['labels']).all()
def test_dask_persisted_entityset(entityset, capsys):
    """Run the same calculation twice on a cluster and check the reuse message.

    Both runs must produce the expected labels; after the second run the
    captured stdout must mention that the EntitySet persisted on the cluster
    is being used.
    """
    times = []
    times += [datetime(2011, 4, 9, 10, 30, step * 6) for step in range(5)]
    times += [datetime(2011, 4, 9, 10, 31, step * 9) for step in range(4)]
    times += [datetime(2011, 4, 9, 10, 40, 0)]
    times += [datetime(2011, 4, 10, 10, 40, step) for step in range(2)]
    times += [datetime(2011, 4, 10, 10, 41, step * 3) for step in range(3)]
    times += [datetime(2011, 4, 10, 11, 10, step * 3) for step in range(2)]
    labels = [False] * 3 + [True] * 2 + [False] * 9 + [True] + [False] * 2
    cutoff_time = pd.DataFrame({'time': times, 'instance_id': range(17)})
    property_feature = IdentityFeature(entityset['log']['value']) > 10

    with cluster() as (scheduler, [worker_a, worker_b]):
        # Identical arguments are used for both runs.
        call_kwargs = dict(entityset=entityset,
                           cutoff_time=cutoff_time,
                           verbose=True,
                           chunk_size=.13,
                           dask_kwargs={'cluster': scheduler['address']},
                           approximate='1 hour')

        feature_matrix = calculate_feature_matrix([property_feature],
                                                  **call_kwargs)
        assert (feature_matrix == labels).values.all()

        feature_matrix = calculate_feature_matrix([property_feature],
                                                  **call_kwargs)
        stdout_text = capsys.readouterr()[0]
        assert "Using EntitySet persisted on the cluster as dataset " in stdout_text
        assert (feature_matrix == labels).values.all()
Esempio n. 10
0
def test_make_dfeat_of_agg_feat_through_parent(entityset, backend):
    r"""
    The graph looks like this:

        R       C = Customers, the entity we're trying to predict on
       / \      R = Regions, a parent of customers
      S   C     S = Stores, a child of regions
          |
         etc.

    We're trying to calculate a DFeat from C to R on an agg_feat of R on S.
    """
    # The docstring is a raw string because the ASCII diagram contains a
    # backslash, which would otherwise be read as an (invalid) escape sequence.
    store_id_feat = IdentityFeature(entityset['stores']['id'])

    # Aggregation on the parent entity: number of stores per region.
    store_count_feat = Count(store_id_feat,
                             parent_entity=entityset[u'régions'])

    # Direct feature exposing that per-region count on customers.
    num_stores_feat = DirectFeature(store_count_feat,
                                    child_entity=entityset['customers'])

    pandas_backend = backend([num_stores_feat])
    df = pandas_backend.calculate_all_features(instance_ids=[0],
                                               time_last=None)
    v = df[num_stores_feat.get_name()][0]
    assert (v == 3)
def test_datetime_index_mixed_cutoff(entityset):
    """Cutoff times mixing datetimes with other values must be rejected.

    Position 9 of the otherwise all-datetime ``time`` column is replaced in
    turn by an integer, an unparseable string, a date-like string and a
    numeric string. The expected exception type for each case is asserted
    below.
    """
    times = list([datetime(2011, 4, 9, 10, 30, i * 6) for i in range(5)] +
                 [datetime(2011, 4, 9, 10, 31, i * 9) for i in range(4)] +
                 [17] + [datetime(2011, 4, 10, 10, 40, i) for i in range(2)] +
                 [datetime(2011, 4, 10, 10, 41, i * 3) for i in range(3)] +
                 [datetime(2011, 4, 10, 11, 10, i * 3) for i in range(2)])
    labels = [False] * 3 + [True] * 2 + [False] * 9 + [False] * 2 + [True]
    instances = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 16, 15, 14]
    cutoff_df = pd.DataFrame({
        'time': times,
        'instance_id': instances,
        'labels': labels
    })
    cutoff_df = cutoff_df[['time', 'instance_id', 'labels']]
    property_feature = IdentityFeature(entityset['log']['value']) > 10

    # Integer among datetimes.
    with pytest.raises(TypeError):
        calculate_feature_matrix([property_feature], cutoff_time=cutoff_df)

    # Unparseable string among datetimes.
    times[9] = "foobar"
    cutoff_df['time'] = times
    with pytest.raises(ValueError):
        calculate_feature_matrix([property_feature], cutoff_time=cutoff_df)

    # Date-like string among datetimes. Assign through a single ``.loc``
    # call on the frame: the chained form ``cutoff_df['time'].iloc[9] = ...``
    # may write to a temporary object and leave ``cutoff_df`` unchanged.
    cutoff_df.loc[cutoff_df.index[9], 'time'] = '2018-04-02 18:50:45.453216'
    with pytest.raises(TypeError):
        calculate_feature_matrix([property_feature], cutoff_time=cutoff_df)

    # Numeric string among datetimes.
    times[9] = '17'
    cutoff_df['time'] = times
    with pytest.raises(ValueError):
        calculate_feature_matrix([property_feature], cutoff_time=cutoff_df)
def test_integer_time_index_mixed_cutoff(int_es):
    """Integer cutoff times mixed with other value types must raise TypeError.

    Position 9 of the otherwise all-integer ``time`` column is replaced in
    turn by a datetime, an arbitrary string, a date-like string and a numeric
    string; each variant is expected to raise ``TypeError``.
    """
    head = list(range(8, 17))
    tail = [19, 20, 21, 22, 25, 24, 23]

    times_dt = head + [datetime(2011, 1, 1)] + tail
    labels = [False] * 3 + [True] * 2 + [False] * 9 + [False] * 2 + [True]
    instances = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 16, 15, 14]
    cutoff_df = pd.DataFrame({
        'time': times_dt,
        'instance_id': instances,
        'labels': labels
    })
    cutoff_df = cutoff_df[['time', 'instance_id', 'labels']]
    property_feature = IdentityFeature(int_es['log']['value']) > 10

    # Datetime among integers (frame as constructed above).
    with pytest.raises(TypeError):
        calculate_feature_matrix([property_feature], cutoff_time=cutoff_df)

    # Arbitrary string, date-like string, numeric string among integers.
    for bad_value in ("foobar", '2018-04-02', '17'):
        cutoff_df['time'] = head + [bad_value] + tail
        with pytest.raises(TypeError):
            calculate_feature_matrix([property_feature],
                                     cutoff_time=cutoff_df)
Esempio n. 13
0
def test_saveprogress(entityset):
    """Check that ``save_progress`` writes files matching the returned matrix.

    A feature matrix is computed with ``save_progress`` pointing at a fresh
    temporary directory. The CSV files found there are concatenated and
    compared with both the returned matrix and a matrix computed without
    ``save_progress``. The temporary directory is removed even when an
    assertion fails.
    """
    times = list([datetime(2011, 4, 9, 10, 30, i * 6) for i in range(5)] +
                 [datetime(2011, 4, 9, 10, 31, i * 9) for i in range(4)] +
                 [datetime(2011, 4, 9, 10, 40, 0)] +
                 [datetime(2011, 4, 10, 10, 40, i) for i in range(2)] +
                 [datetime(2011, 4, 10, 10, 41, i * 3) for i in range(3)] +
                 [datetime(2011, 4, 10, 11, 10, i * 3) for i in range(2)])
    property_feature = IdentityFeature(entityset['log']['value']) > 10
    save_progress = tempfile.mkdtemp()
    try:
        fm_save = calculate_feature_matrix([property_feature],
                                           entityset,
                                           instance_ids=range(17),
                                           cutoff_time=times,
                                           save_progress=save_progress)
        # Only the files directly inside the directory are considered.
        _, _, filenames = next(os.walk(save_progress))
        paths = [os.path.join(save_progress, name) for name in filenames]
        # 17 cutoff times were passed above and 17 files are expected.
        assert len(paths) == 17

        merged_df = pd.concat(
            [pd.read_csv(path, index_col="id", header=0) for path in paths])
        merged_df.set_index(pd.DatetimeIndex(times), append=True, inplace=True)

        fm_no_save = calculate_feature_matrix([property_feature],
                                              entityset,
                                              instance_ids=range(17),
                                              cutoff_time=times)

        merged_values = merged_df.sort_index().values
        saved_values = fm_save.sort_index().values
        unsaved_values = fm_no_save.sort_index().values
        assert np.all(merged_values == saved_values)
        assert np.all(unsaved_values == saved_values)
        assert np.all(unsaved_values == merged_values)
    finally:
        # Always clean up, otherwise a failing assertion leaks the directory.
        shutil.rmtree(save_progress)
Esempio n. 14
0
def test_calc_feature_matrix(entityset):
    """Exercise ``calculate_feature_matrix`` with valid and invalid arguments.

    A valid call must reproduce the expected labels. A string, an empty list
    or a list of integers passed as features must raise ``AssertionError``,
    and an integer cutoff time must raise ``TypeError``.
    """
    times = []
    times += [datetime(2011, 4, 9, 10, 30, step * 6) for step in range(5)]
    times += [datetime(2011, 4, 9, 10, 31, step * 9) for step in range(4)]
    times += [datetime(2011, 4, 9, 10, 40, 0)]
    times += [datetime(2011, 4, 10, 10, 40, step) for step in range(2)]
    times += [datetime(2011, 4, 10, 10, 41, step * 3) for step in range(3)]
    times += [datetime(2011, 4, 10, 11, 10, step * 3) for step in range(2)]
    labels = [False] * 3 + [True] * 2 + [False] * 9 + [True] + [False] * 2

    property_feature = IdentityFeature(entityset['log']['value']) > 10

    feature_matrix = calculate_feature_matrix([property_feature],
                                              entityset,
                                              instance_ids=range(17),
                                              cutoff_time=times,
                                              verbose=True)
    assert (feature_matrix == labels).values.all()

    # Invalid ``features`` arguments.
    for bad_features in ('features', [], [1, 2, 3]):
        with pytest.raises(AssertionError):
            feature_matrix = calculate_feature_matrix(bad_features,
                                                      entityset,
                                                      instance_ids=range(17),
                                                      cutoff_time=times)

    # Invalid ``cutoff_time`` argument.
    with pytest.raises(TypeError):
        calculate_feature_matrix([property_feature],
                                 entityset,
                                 instance_ids=range(17),
                                 cutoff_time=17)
Esempio n. 15
0
def test_encode_features_catches_features_mismatch(entityset):
    """``encode_features`` must reject features absent from the matrix.

    The matrix is computed from ``product_id`` and ``value``; encoding it
    with ``product_id`` and ``session_id`` must raise ``AssertionError``.
    """
    log = entityset["log"]
    product_feat = IdentityFeature(log["product_id"])
    value_feat = IdentityFeature(log["value"])
    session_feat = IdentityFeature(log["session_id"])

    n_rows = 6
    cutoff_time = pd.DataFrame(
        {
            'instance_id': range(n_rows),
            'time': entityset['log'].df['datetime'][0:n_rows],
            'label': [row % 2 for row in range(n_rows)],
        },
        columns=["instance_id", "time", "label"])
    feature_matrix = calculate_feature_matrix([product_feat, value_feat],
                                              entityset,
                                              cutoff_time)

    assert 'label' in feature_matrix.columns

    mismatched = [product_feat, session_feat]
    with pytest.raises(AssertionError):
        encode_features(feature_matrix, mismatched)
def test_diff(es):
    """Check ``Diff`` of ``log.value`` grouped two different ways.

    ``diff1`` groups by the log's ``session_id`` variable, ``diff2`` by the
    session's ``customer_id`` exposed on ``log`` through a direct feature.
    Results for the first 15 log instances are compared with the expected
    lists, treating NaN as equal to NaN.
    """
    def check_value(actual, expected):
        # NaN never compares equal to itself, so handle it explicitly.
        if np.isnan(actual):
            assert (np.isnan(expected))
        else:
            assert actual == expected

    value = IdentityFeature(es['log']['value'])
    customer_id_feat = DirectFeature(es['sessions']['customer_id'],
                                     child_entity=es['log'])
    diff1 = Diff(value, es['log']['session_id'])
    diff2 = Diff(value, customer_id_feat)

    calculator = PandasBackend(es, [diff1, diff2])
    df = calculator.calculate_all_features(instance_ids=range(15),
                                           time_last=None)

    val1 = df[diff1.get_name()].values.tolist()
    val2 = df[diff2.get_name()].values.tolist()
    correct_vals1 = [
        np.nan, 5, 5, 5, 5, np.nan, 1, 1, 1, np.nan, np.nan, 5, np.nan, 7, 7
    ]
    correct_vals2 = [np.nan, 5, 5, 5, 5, -20, 1, 1, 1, -3, np.nan, 5, -5, 7, 7]

    for i, v1 in enumerate(val1):
        check_value(v1, correct_vals1[i])
        check_value(val2[i], correct_vals2[i])
Esempio n. 17
0
def test_encodes_features(entityset):
    """Check the number of encoded features under different options.

    With default options 6 encoded features are expected; with ``top_n=2``
    or with ``include_unknown=False`` 5 are expected.
    """
    log = entityset["log"]
    features = [IdentityFeature(log[name])
                for name in ("product_id", "purchased", "value")]
    feature_matrix = calculate_feature_matrix(features, entityset,
                                              instance_ids=[0, 1, 2, 3, 4, 5])

    # (extra keyword arguments, expected number of encoded features)
    cases = [
        ({}, 6),
        ({'top_n': 2}, 5),
        ({'include_unknown': False}, 5),
    ]
    for options, expected_count in cases:
        feature_matrix_encoded, features_encoded = encode_features(
            feature_matrix, features, **options)
        assert len(features_encoded) == expected_count
Esempio n. 18
0
def test_direct_squared(entityset, backend):
    """A feature multiplied by itself must equal the square of its values.

    For log instances 0, 1 and 2, the first column of the result, squared,
    must equal the second column.
    """
    feature = IdentityFeature(entityset['log']['value'])
    squared = feature * feature
    pandas_backend = backend([feature, squared])
    df = pandas_backend.calculate_all_features(instance_ids=[0, 1, 2],
                                               time_last=None)
    for _, row in df.iterrows():
        # Use explicit positional access: integer keys passed to ``row[...]``
        # are label lookups on a Series, and the positional fallback for
        # non-integer indexes is deprecated in pandas.
        base_value = row.iloc[0]
        squared_value = row.iloc[1]
        assert (base_value * base_value) == squared_value
Esempio n. 19
0
def test_to_encode_features(entityset):
    """``to_encode`` must change which features get encoded.

    The shape produced with default encoding is compared against the shape
    produced with ``to_encode=[]`` and ``to_encode=['value']``; both must
    differ from the default.
    """
    product_feat = IdentityFeature(entityset["log"]["product_id"])
    value_feat = IdentityFeature(entityset["log"]["value"])
    features = [product_feat, value_feat]

    feature_matrix = calculate_feature_matrix(features, entityset,
                                              instance_ids=[0, 1, 2, 3, 4, 5])

    default_encoded, _ = encode_features(feature_matrix, features)
    default_shape = default_encoded.shape

    # to_encode should keep product_id as a string, and not create 3
    # additional columns.
    for to_encode in ([], ['value']):
        feature_matrix_encoded, features_encoded = encode_features(
            feature_matrix, features, to_encode=to_encode)
        assert default_shape != feature_matrix_encoded.shape
def test_integer_time_index_datetime_cutoffs(int_es):
    """Datetime cutoffs against the ``int_es`` fixture must raise TypeError."""
    now = datetime.now()
    cutoff_df = pd.DataFrame({
        'time': [now] * 17,
        'instance_id': range(17),
    })
    property_feature = IdentityFeature(int_es['log']['value']) > 10

    with pytest.raises(TypeError):
        calculate_feature_matrix([property_feature],
                                 cutoff_time=cutoff_df,
                                 cutoff_time_in_index=True)
Esempio n. 21
0
def test_make_agg_feat_where_count(entityset, backend):
    """Count logs per session restricted to ``product_id == 'coke zero'``.

    Session 0 is expected to have 3 such logs.
    """
    is_coke_zero = IdentityFeature(entityset['log']['product_id']) == 'coke zero'
    agg_feat = Count(entityset['log']['id'],
                     parent_entity=entityset['sessions'],
                     where=is_coke_zero)

    result = backend([agg_feat]).calculate_all_features(instance_ids=[0],
                                                        time_last=None)

    count_for_session_0 = result[agg_feat.get_name()][0]
    assert (count_for_session_0 == 3)
def test_isnull_feat(es):
    """``IsNull`` applied to a ``Diff`` feature flags the null differences.

    Computed for the first 15 log instances and compared with the expected
    list of booleans.
    """
    value = IdentityFeature(es['log']['value'])
    diff = Diff(value, es['log']['session_id'])
    isnull = IsNull(diff)

    calculator = PandasBackend(es, [isnull])
    df = calculator.calculate_all_features(range(15), None)

    # Underlying diff values for reference:
    #   [nan, 5, 5, 5, 5, nan, 1, 1, 1, nan, nan, 5, nan, 7, 7]
    # so True is expected exactly at these positions.
    null_positions = {0, 5, 9, 10, 12}
    correct_vals = [position in null_positions for position in range(15)]

    values = df[isnull.get_name()].values.tolist()
    assert correct_vals == values
def test_arithmetic_of_transform(es):
    """Arithmetic primitives applied to two ``Diff`` transform features.

    ``Add``, ``Subtract``, ``Multiply`` and ``Divide`` are each built from a
    diff of ``value`` and a diff of ``value_2`` (both grouped by
    ``product_id``) and evaluated on log instances 0, 2, 11 and 13. The first
    result must be NaN; the remaining three must equal the expected values.
    """
    product_id = IdentityFeature(es['log']['product_id'])
    diff1 = Diff(IdentityFeature(es['log']['value']), product_id)
    diff2 = Diff(IdentityFeature(es['log']['value_2']),
                 IdentityFeature(es['log']['product_id']))

    to_test = [
        (Add, [np.nan, 14., -7., 3.]),
        (Subtract, [np.nan, 6., -3., 1.]),
        (Multiply, [np.nan, 40., 10., 2.]),
        (Divide, [np.nan, 2.5, 2.5, 2.]),
    ]
    features = [primitive(diff1, diff2) for primitive, _ in to_test]

    calculator = PandasBackend(es, features)
    df = calculator.calculate_all_features(instance_ids=[0, 2, 11, 13],
                                           time_last=None)

    for feature, (_, expected) in zip(features, to_test):
        actual = df[feature.get_name()].values.tolist()
        # Leading entries are NaN on both sides and cannot be compared with
        # ``==``; check them separately, then compare the remainder.
        assert np.isnan(actual[0])
        assert np.isnan(expected[0])
        assert actual[1:] == expected[1:]
def test_integer_time_index(int_es):
    """Feature matrix from integer cutoff times is ordered and correct.

    With ``cutoff_time_in_index=True`` the second index level must equal the
    cutoff times sorted by ``time`` then ``instance_id``, and the computed
    ``value > 10`` feature must equal the expected labels.
    """
    times = list(range(8, 18)) + list(range(19, 26))
    labels = [False] * 3 + [True] * 2 + [False] * 9 + [True] + [False] * 2
    cutoff_df = pd.DataFrame({'time': times, 'instance_id': range(17)})
    property_feature = IdentityFeature(int_es['log']['value']) > 10

    feature_matrix = calculate_feature_matrix([property_feature],
                                              cutoff_time=cutoff_df,
                                              cutoff_time_in_index=True)

    # Stable sort so ties on ``time`` keep their ``instance_id`` order.
    expected_times = cutoff_df.sort_values(['time', 'instance_id'],
                                           kind='mergesort')['time'].values
    actual_times = feature_matrix.index.get_level_values(1).values
    assert (actual_times == expected_times).all()
    assert (feature_matrix == labels).values.all()
Esempio n. 25
0
def test_inplace_encodes_features(entityset):
    """``encode_features`` must only modify its input when ``inplace=True``.

    Without ``inplace`` the encoded matrix has a different shape and the
    input keeps its shape; with ``inplace=True`` the returned matrix and the
    input end up with the same shape.
    """
    features = [IdentityFeature(entityset["log"]["product_id"])]
    feature_matrix = calculate_feature_matrix(features, entityset,
                                              instance_ids=[0, 1, 2, 3, 4, 5])
    original_shape = feature_matrix.shape

    encoded, _ = encode_features(feature_matrix, features)
    assert encoded.shape != original_shape
    assert feature_matrix.shape == original_shape

    # inplace they should be the same
    encoded, _ = encode_features(feature_matrix, features, inplace=True)
    assert encoded.shape == feature_matrix.shape
def test_verbose_cutoff_time_chunks(entityset):
    """Verbose calculation with ``chunk_size="cutoff time"`` gives the labels."""
    times = []
    times += [datetime(2011, 4, 9, 10, 30, step * 6) for step in range(5)]
    times += [datetime(2011, 4, 9, 10, 31, step * 9) for step in range(4)]
    times += [datetime(2011, 4, 9, 10, 40, 0)]
    times += [datetime(2011, 4, 10, 10, 40, step) for step in range(2)]
    times += [datetime(2011, 4, 10, 10, 41, step * 3) for step in range(3)]
    times += [datetime(2011, 4, 10, 11, 10, step * 3) for step in range(2)]
    labels = [False] * 3 + [True] * 2 + [False] * 9 + [True] + [False] * 2

    cutoff_time = pd.DataFrame({'time': times, 'instance_id': range(17)})
    property_feature = IdentityFeature(entityset['log']['value']) > 10

    feature_matrix = calculate_feature_matrix([property_feature],
                                              entityset,
                                              cutoff_time=cutoff_time,
                                              chunk_size="cutoff time",
                                              verbose=True)

    matches = (feature_matrix == labels).values
    assert matches.all()
def test_parallel_failure_raises_correct_error(entityset):
    """Calling with ``n_jobs=0`` must raise ``AssertionError``."""
    times = []
    times += [datetime(2011, 4, 9, 10, 30, step * 6) for step in range(5)]
    times += [datetime(2011, 4, 9, 10, 31, step * 9) for step in range(4)]
    times += [datetime(2011, 4, 9, 10, 40, 0)]
    times += [datetime(2011, 4, 10, 10, 40, step) for step in range(2)]
    times += [datetime(2011, 4, 10, 10, 41, step * 3) for step in range(3)]
    times += [datetime(2011, 4, 10, 11, 10, step * 3) for step in range(2)]

    cutoff_time = pd.DataFrame({'time': times, 'instance_id': range(17)})
    property_feature = IdentityFeature(entityset['log']['value']) > 10

    call_kwargs = dict(entityset=entityset,
                       cutoff_time=cutoff_time,
                       verbose=True,
                       chunk_size=.13,
                       n_jobs=0,
                       approximate='1 hour')
    with pytest.raises(AssertionError):
        calculate_feature_matrix([property_feature], **call_kwargs)
Esempio n. 28
0
def test_make_agg_feat_multiple_dtypes(entityset, backend):
    """Two aggregations with different result types share one ``where`` clause.

    ``Count`` of log ids and ``Mode`` of product ids, both per session and
    both restricted to ``product_id == 'coke zero'``, are calculated together
    for session 0. Expected results are 3 and ``'coke zero'``.
    """
    log = entityset['log']
    sessions = entityset['sessions']
    compare_prod = IdentityFeature(log['product_id']) == 'coke zero'

    count_feat = Count(log['id'], parent_entity=sessions, where=compare_prod)
    mode_feat = Mode(log['product_id'],
                     parent_entity=sessions,
                     where=compare_prod)

    result = backend([count_feat, mode_feat]).calculate_all_features(
        instance_ids=[0], time_last=None)

    assert (result[count_feat.get_name()][0] == 3)
    assert (result[mode_feat.get_name()][0] == 'coke zero')
Esempio n. 29
0
def test_make_agg_feat_where_count_or_device_type_feat(entityset, backend):
    """
    Feature we're creating is:
    Number of sessions for each customer where the
    number of logs in the session is greater than 1
    OR the session's device_type equals 1.
    Customer 0 is expected to have 3 such sessions.
    """
    Count.max_stack_depth = 2
    sessions = entityset['sessions']
    logs_per_session = Count(entityset['log']['id'], parent_entity=sessions)

    has_multiple_logs = logs_per_session > 1
    is_device_type_1 = IdentityFeature(sessions['device_type']) == 1
    where_clause = has_multiple_logs.OR(is_device_type_1)

    feat = Count(sessions['id'],
                 parent_entity=entityset['customers'],
                 where=where_clause)

    result = backend([feat]).calculate_all_features(instance_ids=[0],
                                                    time_last=None)
    per_customer = result[feat.get_name()]
    assert (per_customer[0] == 3)