Exemple #1
0
def test_build_demo_metadata_without_tables():
    metadata = Metadata()

    metadata.add_table('users')
    metadata.add_field('users', 'user_id', 'id', 'integer')
    metadata.add_field('users', 'country', 'categorical')
    metadata.add_field('users', 'gender', 'categorical')
    metadata.add_field('users', 'age', 'numerical', 'integer')
    metadata.set_primary_key('users', 'user_id')

    metadata.add_table('sessions')
    metadata.add_field('sessions', 'session_id', 'id', 'integer')
    metadata.add_field('sessions', 'user_id', 'id', 'integer')
    metadata.add_field('sessions', 'device', 'categorical')
    metadata.add_field('sessions', 'os', 'categorical')
    metadata.add_field('sessions', 'minutes', 'numerical', 'integer')
    metadata.set_primary_key('sessions', 'session_id')
    metadata.add_relationship('users', 'sessions')

    metadata.add_table('transactions')
    metadata.add_field('transactions', 'transaction_id', 'id', 'integer')
    metadata.add_field('transactions', 'session_id', 'id', 'integer')
    metadata.add_field('transactions',
                       'timestamp',
                       'datetime',
                       properties={'format': '%Y-%m-%dT%H:%M'})
    metadata.add_field('transactions', 'amount', 'numerical', 'float')
    metadata.add_field('transactions', 'cancelled', 'boolean')
    metadata.set_primary_key('transactions', 'transaction_id')
    metadata.add_relationship('sessions', 'transactions')

    assert DEMO_METADATA == metadata.to_dict()
Exemple #2
0
def test_build_demo_metadata_from_tables():
    """Build metadata from the demo tables.

    Then compare the built metadata with the demo one
    to make sure that they are the same.
    """
    tables = load_demo(metadata=False)

    new_meta = Metadata()
    new_meta.add_table('users', data=tables['users'], primary_key='user_id')
    new_meta.add_table('sessions',
                       data=tables['sessions'],
                       primary_key='session_id',
                       parent='users',
                       foreign_key='user_id')
    transactions_fields = {
        'timestamp': {
            'type': 'datetime',
            'format': '%Y-%m-%dT%H:%M'
        }
    }
    new_meta.add_table('transactions',
                       tables['transactions'],
                       fields_metadata=transactions_fields,
                       primary_key='transaction_id',
                       parent='sessions')

    assert DEMO_METADATA == new_meta.to_dict()
Exemple #3
0
def make_dataset(name,
                 data,
                 table_name=None,
                 entity_columns=None,
                 sequence_index=None,
                 datasets_path='.'):
    """Make a Dataset from a DataFrame.

    Args:
        name (str):
            Name of this dataset.
        data (pandas.DataFrame or str):
            Data passed as a DataFrame or as a path to a CSV file.
        table_name (str or None):
            Optionally give the table a different name.
        entity_columns (list or None):
            (Optional) List of names of the columns that form the entity_id of this
            dataset. If ``None`` (default), no entity columns are set.
        sequence_index (str or None):
            (Optional) Name of the column that is the sequence index of this dataset.
        datasets_path (str):
            (Optional) Path to the folder in which a new folder will be created
            for this dataset. Defaults to the current working directory.
    """
    if isinstance(data, str):
        data = pd.read_csv(data)

    base_path = os.path.join(datasets_path, name)
    if os.path.exists(base_path):
        shutil.rmtree(base_path)

    os.makedirs(base_path, exist_ok=True)

    table_name = table_name or name

    cwd = os.getcwd()
    try:
        os.chdir(base_path)
        csv_name = table_name + '.csv'
        data.to_csv(csv_name, index=False)

        metadata = Metadata()
        metadata.add_table(name, csv_name)
        meta_dict = metadata.to_dict()
        table_meta = meta_dict['tables'][table_name]
        table_meta['entity_columns'] = entity_columns or []
        table_meta['sequence_index'] = sequence_index
        table_meta['deepecho_version'] = Dataset.VERSION

        with open('metadata.json', 'w') as metadata_file:
            json.dump(meta_dict, metadata_file, indent=4)

        LOGGER.info('Dataset %s generated in folder %s', name, base_path)

    finally:
        os.chdir(cwd)