Example #1
0
def test_add_relationship_errors_on_dtype_mismatch(entityset):
    """Adding a relationship between variables whose underlying pandas
    dtypes do not match must raise ValueError.

    Clones the 'log' entity as 'log2' so it can be linked to an
    unrelated parent variable with an incompatible dtype.
    """
    log_2_df = entityset['log'].df.copy()
    log_variable_types = {
        'id': variable_types.Categorical,
        'session_id': variable_types.Id,
        'product_id': variable_types.Id,
        'datetime': variable_types.Datetime,
        'value': variable_types.Numeric,
        'value_2': variable_types.Numeric,
        'latlong': variable_types.LatLong,
        'latlong2': variable_types.LatLong,
        'value_many_nans': variable_types.Numeric,
        'priority_level': variable_types.Ordinal,
        'purchased': variable_types.Boolean,
        'comments': variable_types.Text
    }
    entityset.entity_from_dataframe(entity_id='log2',
                                    dataframe=log_2_df,
                                    index='id',
                                    variable_types=log_variable_types,
                                    time_index='datetime',
                                    encoding='utf-8')
    # The parent and child variables have mismatched underlying dtypes, so
    # add_relationship must reject the relationship.  (The original repeated
    # this exact assertion block twice; the redundant copy was removed.)
    with pytest.raises(ValueError):
        mismatch = Relationship(entityset[u'régions']['id'],
                                entityset['log2']['session_id'])
        entityset.add_relationship(mismatch)
Example #2
0
def test_add_last_time_indexes():
    """A pandas EntitySet and an equivalent Dask EntitySet should produce
    matching last-time indexes for the 'sessions' entity."""
    pd_es = EntitySet(id="pd_es")
    dask_es = EntitySet(id="dask_es")

    session_times = [pd.to_datetime(d) for d in
                     ('2019-01-10', '2019-02-03', '2019-01-01', '2017-08-25')]
    sessions = pd.DataFrame({
        "id": [0, 1, 2, 3],
        "user": [1, 2, 1, 3],
        "time": session_times,
        "strings": ["I am a string", "23", "abcdef ghijk", ""],
    })
    sessions_dask = dd.from_pandas(sessions, npartitions=2)
    sessions_vtypes = {
        "id": ft.variable_types.Id,
        "user": ft.variable_types.Id,
        "time": ft.variable_types.DatetimeTimeIndex,
        "strings": ft.variable_types.NaturalLanguage,
    }

    transaction_times = [pd.to_datetime(d) for d in
                         ('2019-01-10 03:53', '2019-01-10 04:12',
                          '2019-02-03 10:34', '2019-01-01 12:35',
                          '2019-01-01 12:49', '2017-08-25 04:53')]
    transactions = pd.DataFrame({
        "id": [0, 1, 2, 3, 4, 5],
        "session_id": [0, 0, 1, 2, 2, 3],
        "amount": [1.23, 5.24, 123.52, 67.93, 40.34, 50.13],
        "time": transaction_times,
    })
    transactions_dask = dd.from_pandas(transactions, npartitions=2)
    transactions_vtypes = {
        "id": ft.variable_types.Id,
        "session_id": ft.variable_types.Id,
        "amount": ft.variable_types.Numeric,
        "time": ft.variable_types.DatetimeTimeIndex,
    }

    # Same entities on both sides; the Dask side needs explicit variable types.
    pd_es.entity_from_dataframe(entity_id="sessions", dataframe=sessions,
                                index="id", time_index="time")
    dask_es.entity_from_dataframe(entity_id="sessions", dataframe=sessions_dask,
                                  index="id", time_index="time",
                                  variable_types=sessions_vtypes)
    pd_es.entity_from_dataframe(entity_id="transactions", dataframe=transactions,
                                index="id", time_index="time")
    dask_es.entity_from_dataframe(entity_id="transactions",
                                  dataframe=transactions_dask,
                                  index="id", time_index="time",
                                  variable_types=transactions_vtypes)

    pd_es = pd_es.add_relationship(
        Relationship(pd_es["sessions"]["id"],
                     pd_es["transactions"]["session_id"]))
    dask_es = dask_es.add_relationship(
        Relationship(dask_es["sessions"]["id"],
                     dask_es["transactions"]["session_id"]))

    # No last-time index exists until explicitly requested.
    assert pd_es['sessions'].last_time_index is None
    assert dask_es['sessions'].last_time_index is None

    pd_es.add_last_time_indexes()
    dask_es.add_last_time_indexes()

    pd.testing.assert_series_equal(
        pd_es['sessions'].last_time_index.sort_index(),
        dask_es['sessions'].last_time_index.compute(),
        check_names=False)
Example #3
0
def make_ecommerce_entityset(with_integer_time_index=False):
    """Build an ecommerce EntitySet with the following shape:

          R         Regions
         / \\       .
        S   C       Stores, Customers
            |       .
            S   P   Sessions, Products
             \\ /   .
              L     Log
    """
    dataframes = make_ecommerce_dataframes(
        with_integer_time_index=with_integer_time_index)
    vtypes = make_variable_types(
        with_integer_time_index=with_integer_time_index)
    time_indexes = make_time_indexes(
        with_integer_time_index=with_integer_time_index)

    es_id = 'ecommerce' + ('_int_time_index' if with_integer_time_index else '')
    es = EntitySet(id=es_id)

    for name, df in dataframes.items():
        # Entities without a time index get None for both time-index fields.
        ti = time_indexes.get(name)
        es.entity_from_dataframe(
            name,
            df,
            index='id',
            variable_types=vtypes[name],
            time_index=ti['name'] if ti is not None else None,
            secondary_time_index=ti['secondary'] if ti is not None else None)

    # Split cohort information out of customers into its own entity.
    es.normalize_entity('customers',
                        'cohorts',
                        'cohort',
                        additional_variables=['cohort_name'],
                        make_time_index=True,
                        new_entity_time_index='cohort_end')

    es.add_relationships([
        Relationship(es[u'régions']['id'], es['customers'][u'région_id']),
        Relationship(es[u'régions']['id'], es['stores'][u'région_id']),
        Relationship(es['customers']['id'], es['sessions']['customer_id']),
        Relationship(es['sessions']['id'], es['log']['session_id']),
        Relationship(es['products']['id'], es['log']['product_id'])
    ])

    return es
Example #4
0
def test_add_relationship_errors_on_dtype_mismatch(es):
    """add_relationship must reject a parent/child pair whose underlying
    pandas dtypes differ, with a message naming both dtypes."""
    log_copy = es['log'].df.copy()
    vtypes = {
        'id': variable_types.Categorical,
        'session_id': variable_types.Id,
        'product_id': variable_types.Id,
        'datetime': variable_types.Datetime,
        'value': variable_types.Numeric,
        'value_2': variable_types.Numeric,
        'latlong': variable_types.LatLong,
        'latlong2': variable_types.LatLong,
        'value_many_nans': variable_types.Numeric,
        'priority_level': variable_types.Ordinal,
        'purchased': variable_types.Boolean,
        'comments': variable_types.Text,
    }
    es.entity_from_dataframe(entity_id='log2',
                             dataframe=log_copy,
                             index='id',
                             variable_types=vtypes,
                             time_index='datetime')

    error_text = u'Unable to add relationship because id in customers is Pandas dtype category and session_id in log2 is Pandas dtype int64.'
    with pytest.raises(ValueError, match=error_text):
        es.add_relationship(Relationship(es[u'customers']['id'],
                                         es['log2']['session_id']))
Example #5
0
def datetime_es():
    """Build a small 'fraud_data' EntitySet (cards -> transactions) with a
    datetime time index and last-time indexes already computed."""
    cards_df = pd.DataFrame({"id": [1, 2, 3, 4, 5]})
    transaction_times = pd.to_datetime([
        '2011-2-28 04:00', '2012-2-28 05:00', '2012-2-29 06:00',
        '2012-3-1 08:00', '2014-4-1 10:00'
    ])
    transactions_df = pd.DataFrame({
        "id": [1, 2, 3, 4, 5],
        "card_id": [1, 1, 5, 1, 5],
        "transaction_time": transaction_times,
        "fraud": [True, False, False, False, True],
    })

    es = EntitySet(id="fraud_data")
    es = es.entity_from_dataframe(entity_id="transactions",
                                  dataframe=transactions_df,
                                  index="id",
                                  time_index="transaction_time")
    es = es.entity_from_dataframe(entity_id="cards",
                                  dataframe=cards_df,
                                  index="id")
    es = es.add_relationship(Relationship(es["cards"]["id"],
                                          es["transactions"]["card_id"]))
    es.add_last_time_indexes()
    return es
Example #6
0
def test_operations_invalidate_metadata(es):
    """Each mutating EntitySet operation should clear the cached data
    description; accessing ``metadata`` should then regenerate it."""

    def assert_metadata_rebuilt(entityset):
        # Cache is invalid, regenerates on access, and is retained afterwards.
        assert entityset._data_description is None
        assert entityset.metadata is not None
        assert entityset._data_description is not None

    new_es = EntitySet(id="test")
    # Metadata gets created on first access.
    assert_metadata_rebuilt(new_es)

    # Non-pandas (Dask/Koalas) frames need explicit variable types.
    customers_vtypes = None
    if not isinstance(es['customers'].df, pd.DataFrame):
        customers_vtypes = es["customers"].variable_types
        customers_vtypes['signup_date'] = variable_types.Datetime
    new_es.entity_from_dataframe("customers",
                                 es["customers"].df,
                                 index=es["customers"].index,
                                 variable_types=customers_vtypes)

    sessions_vtypes = None
    if not isinstance(es['sessions'].df, pd.DataFrame):
        sessions_vtypes = es["sessions"].variable_types
    new_es.entity_from_dataframe("sessions",
                                 es["sessions"].df,
                                 index=es["sessions"].index,
                                 variable_types=sessions_vtypes)
    assert_metadata_rebuilt(new_es)

    new_es = new_es.add_relationship(
        Relationship(new_es["customers"]["id"],
                     new_es["sessions"]["customer_id"]))
    assert_metadata_rebuilt(new_es)

    new_es = new_es.normalize_entity("customers", "cohort", "cohort")
    assert_metadata_rebuilt(new_es)

    new_es.add_last_time_indexes()
    assert_metadata_rebuilt(new_es)

    # Automatically adding interesting values not supported in Dask or Koalas.
    if any(isinstance(entity.df, pd.DataFrame) for entity in new_es.entities):
        new_es.add_interesting_values()
        assert_metadata_rebuilt(new_es)
Example #7
0
def test_add_parent_not_index_varible(entityset):
    """Using a non-index parent variable in a relationship must raise
    AttributeError."""
    parent = entityset[u'régions']['language']
    child = entityset['customers'][u'région_id']
    with pytest.raises(AttributeError):
        entityset.add_relationship(Relationship(parent, child))
Example #8
0
def make_ecommerce_entityset(with_integer_time_index=False, base_path=None, save_files=True, file_location='local',
                             split_by_time=False, compressed=False, entityset_type=EntitySet):
    """Build the ecommerce test EntitySet by reading each entity from CSV.

    When files are saved locally they are (re)generated first; otherwise the
    expected filenames are derived directly.  ``entityset_type`` allows
    substituting an EntitySet subclass.  Returns the populated entityset with
    the 'cohorts' entity normalized out of customers and all relationships
    added.
    """
    if file_location == 'local' and save_files:
        filenames = make_ecommerce_files(with_integer_time_index, base_path=base_path, file_location=file_location,
                                         split_by_time=split_by_time, compressed=compressed)
        entities = filenames.keys()
    else:
        entities = [u'régions', 'stores', 'products',
                    'customers', 'sessions', 'log']
        filenames = {e: entity_filename(e, base_path, file_location=file_location,
                                        glob=(split_by_time and e == 'log'),
                                        compressed=compressed)
                     for e in entities}
    id = 'ecommerce'
    if with_integer_time_index:
        id += "_int_time_index"
    if split_by_time:
        id += "_glob"

    variable_types = make_variable_types(
        with_integer_time_index=with_integer_time_index)
    time_indexes = make_time_indexes(
        with_integer_time_index=with_integer_time_index)

    es = entityset_type(id=id)

    for entity in entities:
        time_index = time_indexes.get(entity, None)
        ti_name = None
        secondary = None
        if time_index is not None:
            ti_name = time_index['name']
            secondary = time_index['secondary']

        df = pd.read_csv(filenames[entity], encoding='utf-8')
        if entity == "customers":
            df["id"] = pd.Categorical(df['id'])
        if entity == 'sessions':
            # This should be changed back when converted to an EntitySet
            df['customer_id'] = pd.Categorical(df['customer_id'])
        # Bug fix: this comparison was `entity is 'log'`, an identity check
        # that only works via CPython string interning; use `==` like the
        # branches above.
        if entity == 'log':
            df['latlong'] = df['latlong'].apply(latlong_unstringify)
            df['latlong2'] = df['latlong2'].apply(latlong_unstringify)

        es.entity_from_dataframe(entity,
                                 df,
                                 index='id',
                                 variable_types=variable_types[entity],
                                 encoding='utf-8',
                                 time_index=ti_name,
                                 secondary_time_index=secondary)

    es.normalize_entity('customers', 'cohorts', 'cohort',
                        additional_variables=['cohort_name'],
                        time_index_reduce='last',
                        make_time_index=True,
                        new_entity_time_index='cohort_end')

    es.add_relationships(
        [Relationship(es[u'régions']['id'], es['customers'][u'région_id']),
         Relationship(es[u'régions']['id'], es['stores'][u'région_id']),
         Relationship(es['customers']['id'], es['sessions']['customer_id']),
         Relationship(es['sessions']['id'], es['log']['session_id']),
         Relationship(es['products']['id'], es['log']['product_id'])])

    return es
Example #9
0
def test_add_parent_not_index_varible(es):
    """Using a non-index parent variable in a relationship must raise
    AttributeError with a message naming the offending variable."""
    error_text = "Parent variable.*is not the index of entity Entity.*"
    with pytest.raises(AttributeError, match=error_text):
        relationship = Relationship(es[u'régions']['language'],
                                    es['customers'][u'région_id'])
        es.add_relationship(relationship)
Example #10
0
def make_ecommerce_entityset(with_integer_time_index=False,
                             base_path=None,
                             save_files=True,
                             file_location='local',
                             split_by_time=False,
                             compressed=False,
                             entityset_type=EntitySet):
    """Assemble the ecommerce test EntitySet, loading each entity via
    ``entity_from_csv``.  ``entityset_type`` allows substituting an
    EntitySet subclass."""
    if file_location == 'local' and save_files:
        filenames = make_ecommerce_files(with_integer_time_index,
                                         base_path=base_path,
                                         file_location=file_location,
                                         split_by_time=split_by_time,
                                         compressed=compressed)
        entities = filenames.keys()
    else:
        entities = ['regions', 'stores', 'products',
                    'customers', 'sessions', 'log']
        filenames = {name: entity_filename(name,
                                           base_path,
                                           file_location=file_location,
                                           glob=(split_by_time and name == 'log'),
                                           compressed=compressed)
                     for name in entities}

    id = 'ecommerce'
    if with_integer_time_index:
        id += "_int_time_index"
    if split_by_time:
        id += "_glob"

    vtypes = make_variable_types(
        with_integer_time_index=with_integer_time_index)
    time_indexes = make_time_indexes(
        with_integer_time_index=with_integer_time_index)

    es = entityset_type(id=id)

    for name in entities:
        # Entities absent from time_indexes get no time index at all.
        ti = time_indexes.get(name)
        es.entity_from_csv(name,
                           filenames[name],
                           index='id',
                           variable_types=vtypes[name],
                           encoding='utf-8',
                           time_index=ti['name'] if ti is not None else None,
                           secondary_time_index=ti['secondary'] if ti is not None else None)

    # Split cohort information out of customers into its own entity.
    es.normalize_entity('customers',
                        'cohorts',
                        'cohort',
                        additional_variables=['cohort_name'],
                        time_index_reduce='last',
                        make_time_index=True,
                        new_entity_time_index='cohort_end')

    relationships = [
        Relationship(es['regions']['id'], es['customers']['region_id']),
        Relationship(es['regions']['id'], es['stores']['region_id']),
        Relationship(es['customers']['id'], es['sessions']['customer_id']),
        Relationship(es['sessions']['id'], es['log']['session_id']),
        Relationship(es['products']['id'], es['log']['product_id']),
    ]
    es.add_relationships(relationships)

    return es