Exemple #1
0
def test_handles_datetime_format():
    """Check that an explicit ``format`` is honoured when parsing datetimes.

    The same day-first strings are loaded into two columns: one whose
    variable type carries the format string and one that does not. Only
    the formatted column is expected to equal the original timestamp.
    """
    # Jan 2 rendered day-first gives an ambiguous date string
    datetime_format = "%d-%m-%Y"
    actual = pd.Timestamp('Jan 2, 2011')
    time_strs = [actual.strftime(datetime_format) for _ in range(3)]

    columns = {
        'id': [0, 1, 2],
        'time_format': time_strs,
        'time_no_format': time_strs,
    }
    df = pd.DataFrame(columns)
    vtypes = {
        'id': variable_types.Categorical,
        'time_format': (variable_types.Datetime, {"format": datetime_format}),
        'time_no_format': variable_types.Datetime,
    }

    entityset = EntitySet(id='test')
    entityset.entity_from_dataframe(entity_id='test_entity',
                                    dataframe=df,
                                    index='id',
                                    variable_types=vtypes)

    entity_df = entityset['test_entity'].df
    col_format = entity_df['time_format']
    col_no_format = entity_df['time_no_format']

    # without the format string the parsed values differ from Jan 2
    assert (col_no_format != actual).all()

    # with the format string every value equals Jan 2
    assert (col_format == actual).all()
Exemple #2
0
def test_converts_variable_type_after_init():
    """Convert the string column ``ints`` through several variable types.

    The conversions are applied to an entity that already exists, and the
    variable class (plus, where asserted, the pandas dtype) is checked
    after each step.
    """
    raw = pd.DataFrame({
        'id': [0, 1, 2],
        'category': ['a', 'b', 'a'],
        'ints': ['1', '2', '1'],
    })
    raw["category"] = raw["category"].astype("category")

    entityset = EntitySet(id='test')
    entityset.entity_from_dataframe(entity_id='test_entity',
                                    index='id',
                                    dataframe=raw)
    e = entityset['test_entity']
    # captured once, before any conversion; the dtype asserts below read it
    df = entityset['test_entity'].df

    numeric_dtypes = variable_types.PandasTypes._pandas_numerics
    e.convert_variable_type('ints', variable_types.Numeric)
    assert isinstance(e['ints'], variable_types.Numeric)
    assert df['ints'].dtype.name in numeric_dtypes

    # these two conversions are only checked at the variable-class level
    for vtype in (variable_types.Categorical, variable_types.Ordinal):
        e.convert_variable_type('ints', vtype)
        assert isinstance(e['ints'], vtype)

    e.convert_variable_type('ints',
                            variable_types.Boolean,
                            true_val=1,
                            false_val=2)
    assert isinstance(e['ints'], variable_types.Boolean)
    assert df['ints'].dtype.name == 'bool'
Exemple #3
0
def test_check_variables_and_dataframe():
    """Variable types that match the dataframe columns are stored as given."""
    categorical = variable_types.Categorical
    df = pd.DataFrame({'id': [0, 1, 2], 'category': ['a', 'b', 'a']})
    # every column of the dataframe has a declared type
    vtypes = {'id': categorical, 'category': categorical}

    entityset = EntitySet(id='test')
    entityset.entity_from_dataframe('test_entity',
                                    df,
                                    index='id',
                                    variable_types=vtypes)

    entity = entityset.entity_dict['test_entity']
    assert entity.variable_types['category'] == variable_types.Categorical
Exemple #4
0
def test_none_index():
    """Create an entity without passing ``index``.

    The entity is expected to report ``'category'`` as its index and to
    expose that variable as an ``Index``.
    """
    categorical = variable_types.Categorical
    df = pd.DataFrame({
        'category': [1, 2, 3],
        'category2': ['1', '2', '3'],
    })
    vtypes = {'category': categorical, 'category2': categorical}

    entityset = EntitySet(id='test')
    entityset.entity_from_dataframe(entity_id='test_entity',
                                    dataframe=df,
                                    variable_types=vtypes)

    entity = entityset['test_entity']
    assert entity.index == 'category'
    assert isinstance(entity['category'], variable_types.Index)
Exemple #5
0
def test_bad_time_index_variable():
    """A ``time_index`` that is not a dataframe column raises ``LookupError``."""
    df = pd.DataFrame({'category': ['a', 'b', 'a']})
    # Build the entityset outside the ``raises`` block so that only the
    # call under test can satisfy the expected exception.
    entityset = EntitySet(id='test')

    error_text = "Time index not found in dataframe"
    with pytest.raises(LookupError, match=error_text):
        entityset.entity_from_dataframe(entity_id='test_entity',
                                        index="id",
                                        dataframe=df,
                                        time_index='time')
Exemple #6
0
def test_handles_datetime_mismatch():
    """Strings that are not dates raise ``ValueError`` when typed as ``Datetime``."""
    # can't convert arbitrary strings
    df = pd.DataFrame({'id': [0, 1, 2], 'time': ['a', 'b', 'tomorrow']})
    vtypes = {'id': variable_types.Categorical,
              'time': variable_types.Datetime}
    # Build the entityset outside the ``raises`` block so that only the
    # call under test can satisfy the expected exception.
    entityset = EntitySet(id='test')

    with pytest.raises(ValueError):
        entityset.entity_from_dataframe('test_entity', df, 'id',
                                        time_index='time', variable_types=vtypes)
Exemple #7
0
def test_unknown_index():
    """Request an index name (``'id'``) that is not a dataframe column.

    The entity is expected to report ``'id'`` as its index and to hold an
    ``'id'`` column with the values 0, 1, 2.
    """
    df = pd.DataFrame({'category': ['a', 'b', 'a']})
    vtypes = {'category': variable_types.Categorical}

    entityset = EntitySet(id='test')
    entityset.entity_from_dataframe(entity_id='test_entity',
                                    index='id',
                                    variable_types=vtypes,
                                    dataframe=df)

    entity = entityset['test_entity']
    assert entity.index == 'id'
    expected_ids = list(range(3))
    assert entityset['test_entity'].df['id'].tolist() == expected_ids
Exemple #8
0
def test_doesnt_remake_index():
    """``make_index=True`` with an index column that already exists raises ``RuntimeError``."""
    df = pd.DataFrame({'id': [0, 1, 2], 'category': ['a', 'b', 'a']})
    # Build the entityset outside the ``raises`` block so that only the
    # call under test can satisfy the expected exception.
    entityset = EntitySet(id='test')

    error_text = "Cannot make index: index variable already present"
    with pytest.raises(RuntimeError, match=error_text):
        entityset.entity_from_dataframe(entity_id='test_entity',
                                        index='id',
                                        make_index=True,
                                        dataframe=df)
Exemple #9
0
def test_datetime64_conversion():
    """Smoke test: convert a ``datetime64[ns, UTC]`` column to a time index type.

    There is no assertion; the test passes as long as nothing raises.
    """
    frame = pd.DataFrame({
        'id': [0, 1, 2],
        'ints': ['1', '2', '1'],
    })
    # broadcast the current time to every row, then make the column tz-aware
    frame["time"] = pd.Timestamp.now()
    frame["time"] = frame["time"].astype("datetime64[ns, UTC]")

    entityset = EntitySet(id='test')
    entityset.entity_from_dataframe(entity_id='test_entity',
                                    index='id',
                                    dataframe=frame)

    entity = entityset['test_entity']
    vtype_time_index = variable_types.variable.DatetimeTimeIndex
    entity.convert_variable_type('time', vtype_time_index)
Exemple #10
0
def test_extra_variable_type():
    """A variable type for a column missing from the dataframe raises ``LookupError``."""
    # more variables than dataframe columns: 'category2' does not exist
    df = pd.DataFrame({'id': [0, 1, 2], 'category': ['a', 'b', 'a']})
    vtypes = {'id': variable_types.Categorical,
              'category': variable_types.Categorical,
              'category2': variable_types.Categorical}
    # Build the entityset outside the ``raises`` block so that only the
    # call under test can satisfy the expected exception.
    entityset = EntitySet(id='test')

    with pytest.raises(LookupError):
        entityset.entity_from_dataframe(entity_id='test_entity',
                                        index='id',
                                        variable_types=vtypes, dataframe=df)
Exemple #11
0
def test_make_index_variable_ordering():
    """An index created with ``make_index=True`` is the first dataframe column."""
    categorical = variable_types.Categorical
    df = pd.DataFrame({
        'id': [0, 1, 2],
        'category': ['a', 'b', 'a'],
    })
    vtypes = {'id': categorical, 'category': categorical}

    entityset = EntitySet(id='test')
    # 'id1' is not in the dataframe, so it has to be created
    entityset.entity_from_dataframe(entity_id='test_entity',
                                    dataframe=df,
                                    index='id1',
                                    make_index=True,
                                    variable_types=vtypes)

    entity_columns = entityset.entity_dict['test_entity'].df.columns
    assert entity_columns[0] == 'id1'
Exemple #12
0
def test_make_time_index_keeps_original_sorting():
    """Normalizing with ``make_time_index=True`` leaves the base entity's row order intact."""
    n_rows = 1000
    trips = {
        # ids run 999..0 while every row shares the same flight time
        'trip_id': list(range(n_rows - 1, -1, -1)),
        'flight_time': [datetime(1997, 4, 1)] * n_rows,
        'flight_id': [1] * 350 + [2] * 650,
    }
    order = list(range(n_rows))
    df = pd.DataFrame.from_dict(trips)

    es = EntitySet('flights')
    es.entity_from_dataframe("trips",
                             dataframe=df,
                             index="trip_id",
                             time_index='flight_time')
    # order right after the entity is created
    assert (es['trips'].df['trip_id'] == order).all()

    es.normalize_entity(base_entity_id="trips",
                        new_entity_id="flights",
                        index="flight_id",
                        make_time_index=True)
    # the same order must hold after normalization
    assert (es['trips'].df['trip_id'] == order).all()
Exemple #13
0
def test_converts_datetime():
    """Date strings declared as ``Datetime`` are stored as ``pd.Timestamp`` values."""
    # string converts to datetime correctly
    # NOTE (from the original author): this test fails without defining
    # vtypes, because the time column is then inferred as a numeric type.
    times = pd.date_range('1/1/2011', periods=3, freq='H')
    time_strs = times.strftime('%Y-%m-%d')
    df = pd.DataFrame({'id': [0, 1, 2], 'time': time_strs})
    vtypes = {'id': variable_types.Categorical,
              'time': variable_types.Datetime}

    entityset = EntitySet(id='test')
    entityset.entity_from_dataframe(
        entity_id='test_entity',
        index='id',
        time_index="time",
        variable_types=vtypes,
        dataframe=df)
    pd_col = entityset['test_entity'].df['time']
    # isinstance is the idiomatic type check (an exact type() comparison
    # would reject Timestamp subclasses)
    assert isinstance(pd_col[0], pd.Timestamp)
Exemple #14
0
def test_converts_datetime():
    """Date strings declared as ``Datetime`` are stored as ``pd.Timestamp`` values."""
    # string converts to datetime correctly
    # NOTE (from the original author): this test fails without defining
    # vtypes, because the time column is then inferred as a numeric type.
    times = pd.date_range('1/1/2011', periods=3, freq='H')
    time_strs = times.strftime('%Y-%m-%d')
    df = pd.DataFrame({'id': [0, 1, 2], 'time': time_strs})
    vtypes = {
        'id': variable_types.Categorical,
        'time': variable_types.Datetime
    }

    es = EntitySet(id='test')
    es.entity_from_dataframe(entity_id='test_entity',
                             index='id',
                             time_index="time",
                             variable_types=vtypes,
                             dataframe=df)
    pd_col = es['test_entity'].df['time']
    # isinstance is the idiomatic type check (an exact type() comparison
    # would reject Timestamp subclasses)
    assert isinstance(pd_col[0], pd.Timestamp)
Exemple #15
0
def test_already_sorted_parameter():
    """With ``already_sorted=True`` the time column keeps the input order."""
    # deliberately not in chronological order
    transaction_times = [
        datetime(2014, 4, 6),
        datetime(2012, 4, 8),
        datetime(2012, 4, 8),
        datetime(2013, 4, 8),
        datetime(2015, 4, 8),
        datetime(2016, 4, 9),
    ]
    transactions_df = pd.DataFrame({
        "id": [1, 2, 3, 4, 5, 6],
        "transaction_time": transaction_times,
    })

    es = EntitySet(id='test')
    es.entity_from_dataframe('t',
                             transactions_df,
                             index='id',
                             time_index="transaction_time",
                             already_sorted=True)

    times = es["t"].df.transaction_time.tolist()
    assert times == transactions_df.transaction_time.tolist()
Exemple #16
0
def test_create_entity_from_dask_df(pd_es):
    """An entity built from a dask copy of ``log`` matches the pandas ``log`` dataframe."""
    pandas_log = pd_es["log"]
    log_dask = dd.from_pandas(pandas_log.df, npartitions=2)

    dask_es = EntitySet(id="dask_es")
    dask_es = dask_es.entity_from_dataframe(
        entity_id="log_dask",
        dataframe=log_dask,
        index="id",
        time_index="datetime",
        variable_types=pd_es["log"].variable_types)

    # check_like=True ignores row/column ordering differences
    computed = dask_es["log_dask"].df.compute()
    pd.testing.assert_frame_equal(pd_es["log"].df, computed, check_like=True)
Exemple #17
0
def test_already_sorted_parameter():
    """With ``already_sorted=True`` the time column keeps the input order."""
    # (year, month, day) triples, deliberately not in chronological order
    dates = [
        (2014, 4, 6),
        (2012, 4, 8),
        (2012, 4, 8),
        (2013, 4, 8),
        (2015, 4, 8),
        (2016, 4, 9),
    ]
    transactions_df = pd.DataFrame({
        "id": [1, 2, 3, 4, 5, 6],
        "transaction_time": [datetime(*ymd) for ymd in dates],
    })

    es = EntitySet(id='test')
    es.entity_from_dataframe('t',
                             transactions_df,
                             index='id',
                             time_index="transaction_time",
                             already_sorted=True)

    stored = es["t"].df.transaction_time.tolist()
    assert stored == transactions_df.transaction_time.tolist()
Exemple #18
0
    def test_calculates_statistics_on_init(self):
        """Check the per-variable statistics that exist once an entity is created."""
        timestamps = [datetime(2011, 4, 9, 10, 31, 3 * i) for i in range(3)]
        df = pd.DataFrame({
            'id': [0, 1, 2],
            'time': timestamps,
            'category': ['a', 'b', 'a'],
            'number': [4, 5, 6],
            'boolean': [True, False, True],
            'boolean_with_nan': [True, False, np.nan],
        })
        vtypes = {
            'id': variable_types.Categorical,
            'time': variable_types.Datetime,
            'category': variable_types.Categorical,
            'number': variable_types.Numeric,
            'boolean': variable_types.Boolean,
            'boolean_with_nan': variable_types.Boolean,
        }

        entityset = EntitySet(id='test')
        entityset.entity_from_dataframe('stats_test_entity', df, 'id',
                                        variable_types=vtypes)
        e = entityset["stats_test_entity"]

        # every one of these variables reports a count
        for v in ['time', 'category', 'number']:
            assert e[v].count == 3

        # datetime and numeric variables have neither nunique nor percent_unique
        for v in ['time', 'number']:
            for stat in ('nunique', 'percent_unique'):
                with pytest.raises(AttributeError):
                    getattr(e[v], stat)

        # 'id' column automatically parsed as id
        assert e['id'].count == 3

        # categoricals have nunique and percent_unique defined
        category = e['category']
        assert category.nunique == 2
        assert category.percent_unique == 2. / 3

        # booleans have count and number of true/false labels defined
        assert e['boolean'].count == 3
        assert e['boolean'].num_true == 2
        assert e['boolean'].num_false == 1
def test_single_table_ks_entityset_dates_not_sorted():
    """Compare koalas and pandas feature matrices for one table with unsorted dates."""
    primitives_list = ['absolute', 'is_weekend', 'year', 'day']

    ks_es = EntitySet(id="ks_es")
    # the dates are intentionally out of chronological order
    date_strings = ['2019-01-10', '2019-02-03', '2019-01-01', '2017-08-25']
    df = pd.DataFrame({
        "id": [0, 1, 2, 3],
        "values": [1, 12, -34, 27],
        "dates": [pd.to_datetime(d) for d in date_strings],
    })

    values_ks = ks.from_pandas(df)
    vtypes = {
        "id": ft.variable_types.Id,
        "values": ft.variable_types.Numeric,
        "dates": ft.variable_types.Datetime,
    }
    ks_es.entity_from_dataframe(entity_id="data",
                                dataframe=values_ks,
                                index="id",
                                time_index="dates",
                                variable_types=vtypes)
    ks_fm, _ = ft.dfs(entityset=ks_es,
                      target_entity="data",
                      trans_primitives=primitives_list,
                      max_depth=1)

    # pandas reference built from the same dataframe, without explicit vtypes
    pd_es = ft.EntitySet(id="pd_es")
    pd_es.entity_from_dataframe(entity_id="data",
                                dataframe=df,
                                index="id",
                                time_index="dates")
    fm, _ = ft.dfs(entityset=pd_es,
                   target_entity="data",
                   trans_primitives=primitives_list,
                   max_depth=1)

    # align the koalas result to the pandas row order before comparing
    ks_result = ks_fm.to_pandas().set_index('id').loc[fm.index]
    pd.testing.assert_frame_equal(fm, ks_result)
def test_create_entity_from_ks_df(pd_es):
    """An entity built from a koalas copy of ``log`` matches the cleaned pandas dataframe."""
    cleaned_df = pd_to_ks_clean(pd_es["log"].df)
    log_ks = ks.from_pandas(cleaned_df)

    ks_es = EntitySet(id="ks_es")
    ks_es = ks_es.entity_from_dataframe(entity_id="log_ks",
                                        dataframe=log_ks,
                                        index="id",
                                        time_index="datetime",
                                        variable_types=pd_es["log"].variable_types)

    # check_like=True ignores row/column ordering differences
    round_tripped = ks_es["log_ks"].df.to_pandas()
    pd.testing.assert_frame_equal(cleaned_df, round_tripped, check_like=True)
Exemple #21
0
def test_converts_variable_types_on_init():
    """String columns declared ``Numeric`` get a numeric dtype when the entity is created."""
    df = pd.DataFrame({
        'id': [0, 1, 2],
        'category': ['a', 'b', 'a'],
        'category_int': [1, 2, 3],
        'ints': ['1', '2', '3'],
        'floats': ['1', '2', '3.0'],
    })
    df["category_int"] = df["category_int"].astype("category")

    # 'category' and 'category_int' are left for type inference
    vtypes = {
        'id': variable_types.Categorical,
        'ints': variable_types.Numeric,
        'floats': variable_types.Numeric,
    }
    es = EntitySet(id='test')
    es.entity_from_dataframe(entity_id='test_entity',
                             index='id',
                             variable_types=vtypes,
                             dataframe=df)

    entity_df = es['test_entity'].df
    numeric_names = variable_types.PandasTypes._pandas_numerics
    assert entity_df['ints'].dtype.name in numeric_names
    assert entity_df['floats'].dtype.name in numeric_names

    # this one is inferred from the pandas "category" dtype
    e = es["test_entity"]
    assert isinstance(e['category_int'], variable_types.Categorical)
Exemple #22
0
def test_converts_variable_types_on_init():
    """String columns declared ``Numeric`` get a numeric dtype when the entity is created."""
    columns = {
        'id': [0, 1, 2],
        'category': ['a', 'b', 'a'],
        'category_int': [1, 2, 3],
        'ints': ['1', '2', '3'],
        'floats': ['1', '2', '3.0'],
    }
    df = pd.DataFrame(columns)
    df["category_int"] = df["category_int"].astype("category")

    numeric = variable_types.Numeric
    # 'category' and 'category_int' are left for type inference
    vtypes = {'id': variable_types.Categorical, 'ints': numeric, 'floats': numeric}

    entityset = EntitySet(id='test')
    entityset.entity_from_dataframe(entity_id='test_entity',
                                    index='id',
                                    variable_types=vtypes,
                                    dataframe=df)

    entity_df = entityset['test_entity'].df
    for column in ('ints', 'floats'):
        assert entity_df[column].dtype.name in variable_types.PandasTypes._pandas_numerics

    # this one is inferred from the pandas "category" dtype
    e = entityset["test_entity"]
    assert isinstance(e['category_int'], variable_types.Categorical)
Exemple #23
0
def datetime_es():
    """Build the ``fraud_data`` entityset with ``cards`` and ``transactions`` entities.

    The two entities are linked through ``transactions.card_id`` and last
    time indexes are added before the entityset is returned.
    """
    cards_df = pd.DataFrame({"id": [1, 2, 3, 4, 5]})
    transaction_times = pd.to_datetime([
        '2011-2-28 04:00', '2012-2-28 05:00',
        '2012-2-29 06:00', '2012-3-1 08:00',
        '2014-4-1 10:00'])
    transactions_df = pd.DataFrame({
        "id": [1, 2, 3, 4, 5],
        "card_id": [1, 1, 5, 1, 5],
        "transaction_time": transaction_times,
        "fraud": [True, False, False, False, True],
    })

    es = EntitySet(id="fraud_data")
    es = es.entity_from_dataframe(entity_id="transactions",
                                  dataframe=transactions_df,
                                  index="id",
                                  time_index="transaction_time")
    es = es.entity_from_dataframe(entity_id="cards",
                                  dataframe=cards_df,
                                  index="id")

    # cards.id is the parent variable, transactions.card_id the child
    parent_variable = es["cards"]["id"]
    child_variable = es["transactions"]["card_id"]
    es = es.add_relationship(Relationship(parent_variable, child_variable))
    es.add_last_time_indexes()
    return es
def test_custom_variable_descriptions():
    """Round-trip every variable of an entity that uses a custom variable type.

    Each variable is serialized with ``to_data_description`` and rebuilt
    with ``deserialize.description_to_variable``; the rebuilt variable must
    compare equal to the original.
    """
    class ItemList(Categorical):
        # custom variable type defined only for this test
        type_string = "item_list"
        _default_pandas_dtype = list

    es = EntitySet()
    variables = {
        'item_list': ItemList,
        'time_index': TimeIndex,
        'index': Index
    }
    # empty dataframe: only the column names matter here
    dataframe = pd.DataFrame(columns=list(variables))
    es.entity_from_dataframe('custom_variable',
                             dataframe,
                             index='index',
                             time_index='time_index',
                             variable_types=variables)
    entity = es['custom_variable']
    for variable in entity.variables:
        description = variable.to_data_description()
        _variable = deserialize.description_to_variable(description,
                                                        entity=entity)
        # Compare with the operator: a direct __eq__ call may return
        # NotImplemented, which is truthy and would make the assert pass.
        assert variable == _variable
Exemple #25
0
def test_sets_time_when_adding_entity():
    """Check that the entityset time type is set by the first time index.

    Later entities must not change it, and a time index of a different
    type (datetime, or strings that are not numeric) raises ``TypeError``.
    """
    # local import: only needed to escape the expected error messages
    import re

    transactions_df = pd.DataFrame({
        "id": [1, 2, 3, 4, 5, 6],
        "card_id": [1, 2, 1, 3, 4, 5],
        "transaction_time": [10, 12, 13, 20, 21, 20],
        "fraud": [True, False, False, False, True, True]
    })
    accounts_df = pd.DataFrame({
        "id": [3, 4, 5],
        "signup_date":
        [datetime(2002, 5, 1),
         datetime(2006, 3, 20),
         datetime(2011, 11, 11)]
    })
    accounts_df_string = pd.DataFrame({
        "id": [3, 4, 5],
        "signup_date": ["element", "exporting", "editable"]
    })
    # create empty entityset
    es = EntitySet("fraud")
    # assert it's not set
    assert getattr(es, "time_type", None) is None
    # add entity
    es.entity_from_dataframe("transactions",
                             transactions_df,
                             index="id",
                             time_index="transaction_time")
    # assert time_type is set
    assert es.time_type == variable_types.NumericTimeIndex
    # add another entity
    es.normalize_entity("transactions",
                        "cards",
                        "card_id",
                        make_time_index=True)
    # assert time_type unchanged
    assert es.time_type == variable_types.NumericTimeIndex
    # add wrong time type entity
    # ``match`` is a regular expression, so escape the message: otherwise
    # each "." in the class path would match any character.
    error_text = "accounts time index is <class 'featuretools.variable_types.variable.DatetimeTimeIndex'> type which differs from other entityset time indexes"
    with pytest.raises(TypeError, match=re.escape(error_text)):
        es.entity_from_dataframe("accounts",
                                 accounts_df,
                                 index="id",
                                 time_index="signup_date")
    # add non time type as time index
    error_text = "Attempted to convert all string column signup_date to numeric"
    with pytest.raises(TypeError, match=re.escape(error_text)):
        es.entity_from_dataframe("accounts",
                                 accounts_df_string,
                                 index="id",
                                 time_index="signup_date")
Exemple #26
0
 def test_sets_time_when_adding_entity(self):
     """Check that the entityset time type is set by the first time index.

     Later entities must not change it, and a time index of a different
     type raises ``TypeError``.
     """
     signup_dates = [datetime(2002, 5, 1),
                     datetime(2006, 3, 20),
                     datetime(2011, 11, 11)]
     transactions_df = pd.DataFrame({"id": [1, 2, 3, 4, 5, 6],
                                     "card_id": [1, 2, 1, 3, 4, 5],
                                     "transaction_time": [10, 12, 13, 20, 21, 20],
                                     "fraud": [True, False, False, False, True, True]})
     accounts_df = pd.DataFrame({"id": [3, 4, 5],
                                 "signup_date": signup_dates})
     accounts_df_string = pd.DataFrame({"id": [3, 4, 5],
                                        "signup_date": ["element", "exporting", "editable"]})

     # a fresh entityset has no time type yet
     entityset = EntitySet("fraud")
     assert getattr(entityset, "time_type", None) is None

     # the first entity with a numeric time index fixes the time type
     entityset.entity_from_dataframe("transactions",
                                     transactions_df,
                                     index="id",
                                     time_index="transaction_time")
     assert entityset.time_type == variable_types.NumericTimeIndex

     # normalizing out another entity leaves the time type unchanged
     entityset.normalize_entity("transactions", "cards", "card_id",
                                make_time_index=True)
     assert entityset.time_type == variable_types.NumericTimeIndex

     # first a datetime time index, then a string one: both must be rejected
     for bad_accounts in (accounts_df, accounts_df_string):
         with pytest.raises(TypeError):
             entityset.entity_from_dataframe("accounts",
                                             bad_accounts,
                                             index="id",
                                             time_index="signup_date")
Exemple #27
0
def test_sets_time_when_adding_entity():
    """Check that the entityset time type is set by the first time index.

    Later entities must not change it, and a time index of a different
    type (datetime, or strings that are not numeric) raises ``TypeError``.
    """
    # local import: only needed to escape the expected error messages
    import re

    transactions_df = pd.DataFrame({"id": [1, 2, 3, 4, 5, 6],
                                    "card_id": [1, 2, 1, 3, 4, 5],
                                    "transaction_time": [10, 12, 13, 20, 21, 20],
                                    "fraud": [True, False, False, False, True, True]})
    accounts_df = pd.DataFrame({"id": [3, 4, 5],
                                "signup_date": [datetime(2002, 5, 1),
                                                datetime(2006, 3, 20),
                                                datetime(2011, 11, 11)]})
    accounts_df_string = pd.DataFrame({"id": [3, 4, 5],
                                       "signup_date": ["element",
                                                       "exporting",
                                                       "editable"]})
    # create empty entityset
    entityset = EntitySet("fraud")
    # assert it's not set
    assert getattr(entityset, "time_type", None) is None
    # add entity
    entityset.entity_from_dataframe("transactions",
                                    transactions_df,
                                    index="id",
                                    time_index="transaction_time")
    # assert time_type is set
    assert entityset.time_type == variable_types.NumericTimeIndex
    # add another entity
    entityset.normalize_entity("transactions",
                               "cards",
                               "card_id",
                               make_time_index=True)
    # assert time_type unchanged
    assert entityset.time_type == variable_types.NumericTimeIndex
    # add wrong time type entity
    # ``match`` is a regular expression, so escape the message: otherwise
    # each "." in the class path would match any character.
    error_text = "accounts time index is <class 'featuretools.variable_types.variable.DatetimeTimeIndex'> type which differs from other entityset time indexes"
    with pytest.raises(TypeError, match=re.escape(error_text)):
        entityset.entity_from_dataframe("accounts",
                                        accounts_df,
                                        index="id",
                                        time_index="signup_date")
    # add non time type as time index
    error_text = "Attempted to convert all string column signup_date to numeric"
    with pytest.raises(TypeError, match=re.escape(error_text)):
        entityset.entity_from_dataframe("accounts",
                                        accounts_df_string,
                                        index="id",
                                        time_index="signup_date")
 def test_sets_time_when_adding_entity(self):
     """Check that the entityset time type is set by the first time index.

     Later entities must not change it, and a time index of a different
     type raises ``TypeError``.
     """
     transactions_df = pd.DataFrame({
         "id": [1, 2, 3, 4, 5, 6],
         "card_id": [1, 2, 1, 3, 4, 5],
         "transaction_time": [10, 12, 13, 20, 21, 20],
         "fraud": [True, False, False, False, True, True],
     })
     accounts_df = pd.DataFrame({
         "id": [3, 4, 5],
         "signup_date": [
             datetime(2002, 5, 1),
             datetime(2006, 3, 20),
             datetime(2011, 11, 11),
         ],
     })
     accounts_df_string = pd.DataFrame({
         "id": [3, 4, 5],
         "signup_date": ["element", "exporting", "editable"],
     })
     numeric_time = variable_types.NumericTimeIndex

     # a fresh entityset has no time type yet
     entityset = EntitySet("fraud")
     assert getattr(entityset, "time_type", None) is None

     # the first entity with a numeric time index fixes the time type
     entityset.entity_from_dataframe("transactions", transactions_df,
                                     index="id",
                                     time_index="transaction_time")
     assert entityset.time_type == numeric_time

     # normalizing out another entity leaves the time type unchanged
     entityset.normalize_entity("transactions", "cards", "card_id",
                                make_time_index=True)
     assert entityset.time_type == numeric_time

     # a datetime time index conflicts with the numeric time type
     with pytest.raises(TypeError):
         entityset.entity_from_dataframe("accounts", accounts_df,
                                         index="id",
                                         time_index="signup_date")
     # strings cannot serve as a time index either
     with pytest.raises(TypeError):
         entityset.entity_from_dataframe("accounts", accounts_df_string,
                                         index="id",
                                         time_index="signup_date")
def test_single_table_ks_entityset_cutoff_time_df():
    """Compare koalas and pandas feature matrices computed with a cutoff-time dataframe."""
    primitives_list = ['absolute', 'is_weekend', 'year', 'day',
                       'num_characters', 'num_words']

    ks_es = EntitySet(id="ks_es")
    date_strings = ['2019-01-10', '2019-02-03', '2019-01-01']
    df = pd.DataFrame({
        "id": [0, 1, 2],
        "values": [1, 12, -34],
        "dates": [pd.to_datetime(d) for d in date_strings],
        "strings": ["I am a string", "23", "abcdef ghijk"],
    })
    values_ks = ks.from_pandas(df)
    vtypes = {
        "id": ft.variable_types.Id,
        "values": ft.variable_types.Numeric,
        "dates": ft.variable_types.Datetime,
        "strings": ft.variable_types.NaturalLanguage,
    }
    ks_es.entity_from_dataframe(entity_id="data",
                                dataframe=values_ks,
                                index="id",
                                time_index="dates",
                                variable_types=vtypes)

    # instance 0 appears twice, at two different cutoff times
    early_cutoff = pd.Timestamp("2019-01-05 04:00")
    late_cutoff = pd.Timestamp("2019-01-15 04:00")
    cutoff_data = {
        "id": [0, 1, 2, 0],
        "time": [early_cutoff, early_cutoff, early_cutoff, late_cutoff],
        "labels": [True, False, True, False],
    }
    cutoff_times = pd.DataFrame(cutoff_data, columns=["id", "time", "labels"])

    ks_fm, _ = ft.dfs(entityset=ks_es,
                      target_entity="data",
                      trans_primitives=primitives_list,
                      cutoff_time=cutoff_times)

    # pandas reference: only the natural-language column is typed explicitly
    pd_es = ft.EntitySet(id="pd_es")
    pd_es.entity_from_dataframe(entity_id="data",
                                dataframe=df,
                                index="id",
                                time_index="dates",
                                variable_types={"strings": ft.variable_types.NaturalLanguage})

    fm, _ = ft.dfs(entityset=pd_es,
                   target_entity="data",
                   trans_primitives=primitives_list,
                   cutoff_time=cutoff_times)

    # Because row ordering with koalas is not guaranteed, we need to sort on
    # two columns to make sure that values for instance id 0 are compared
    # correctly. Also, make sure the boolean column has the same dtype.
    sort_keys = ['id', 'labels']
    weekend_col = 'IS_WEEKEND(dates)'
    fm = fm.sort_values(sort_keys)
    ks_fm = ks_fm.to_pandas().set_index('id').sort_values(sort_keys)
    ks_fm[weekend_col] = ks_fm[weekend_col].astype(fm[weekend_col].dtype)
    pd.testing.assert_frame_equal(fm, ks_fm)
Exemple #30
0
def test_add_last_time_indexes():
    """Compare the ``sessions`` last time index of a pandas and a dask entityset.

    Both entitysets get the same ``sessions`` and ``transactions`` data and
    the same sessions -> transactions relationship. After
    ``add_last_time_indexes`` the two ``sessions`` last time indexes are
    compared as series.
    """
    pd_es = EntitySet(id="pd_es")
    dask_es = EntitySet(id="dask_es")

    # parent table; the session times are not in chronological order
    sessions = pd.DataFrame({
        "id": [0, 1, 2, 3],
        "user": [1, 2, 1, 3],
        "time": [
            pd.to_datetime('2019-01-10'),
            pd.to_datetime('2019-02-03'),
            pd.to_datetime('2019-01-01'),
            pd.to_datetime('2017-08-25')
        ],
        "strings": ["I am a string", "23", "abcdef ghijk", ""]
    })
    sessions_dask = dd.from_pandas(sessions, npartitions=2)
    # variable types are passed explicitly for the dask entity only
    sessions_vtypes = {
        "id": ft.variable_types.Id,
        "user": ft.variable_types.Id,
        "time": ft.variable_types.DatetimeTimeIndex,
        "strings": ft.variable_types.NaturalLanguage
    }

    # child table; session_id refers to sessions.id
    transactions = pd.DataFrame({
        "id": [0, 1, 2, 3, 4, 5],
        "session_id": [0, 0, 1, 2, 2, 3],
        "amount": [1.23, 5.24, 123.52, 67.93, 40.34, 50.13],
        "time": [
            pd.to_datetime('2019-01-10 03:53'),
            pd.to_datetime('2019-01-10 04:12'),
            pd.to_datetime('2019-02-03 10:34'),
            pd.to_datetime('2019-01-01 12:35'),
            pd.to_datetime('2019-01-01 12:49'),
            pd.to_datetime('2017-08-25 04:53')
        ]
    })
    transactions_dask = dd.from_pandas(transactions, npartitions=2)
    # variable types are passed explicitly for the dask entity only
    transactions_vtypes = {
        "id": ft.variable_types.Id,
        "session_id": ft.variable_types.Id,
        "amount": ft.variable_types.Numeric,
        "time": ft.variable_types.DatetimeTimeIndex,
    }

    pd_es.entity_from_dataframe(entity_id="sessions",
                                dataframe=sessions,
                                index="id",
                                time_index="time")
    dask_es.entity_from_dataframe(entity_id="sessions",
                                  dataframe=sessions_dask,
                                  index="id",
                                  time_index="time",
                                  variable_types=sessions_vtypes)

    pd_es.entity_from_dataframe(entity_id="transactions",
                                dataframe=transactions,
                                index="id",
                                time_index="time")
    dask_es.entity_from_dataframe(entity_id="transactions",
                                  dataframe=transactions_dask,
                                  index="id",
                                  time_index="time",
                                  variable_types=transactions_vtypes)

    # the same sessions -> transactions relationship in both entitysets
    new_rel = Relationship(pd_es["sessions"]["id"],
                           pd_es["transactions"]["session_id"])
    dask_rel = Relationship(dask_es["sessions"]["id"],
                            dask_es["transactions"]["session_id"])

    pd_es = pd_es.add_relationship(new_rel)
    dask_es = dask_es.add_relationship(dask_rel)

    # no last time index exists until it is explicitly added
    assert pd_es['sessions'].last_time_index is None
    assert dask_es['sessions'].last_time_index is None

    pd_es.add_last_time_indexes()
    dask_es.add_last_time_indexes()

    # the pandas side is sorted by index and the dask side is computed
    # first; series names are not compared
    pd.testing.assert_series_equal(
        pd_es['sessions'].last_time_index.sort_index(),
        dask_es['sessions'].last_time_index.compute(),
        check_names=False)