Example #1
0
def make_dataset(name,
                 data,
                 table_name=None,
                 entity_columns=None,
                 sequence_index=None,
                 datasets_path='.'):
    """Make a Dataset from a DataFrame.

    A folder called ``name`` is created inside ``datasets_path`` (any existing
    folder with that name is deleted first). The data is written there as
    ``<table_name>.csv`` together with a ``metadata.json`` file describing it.

    Args:
        name (str):
            Name of this dataset.
        data (pandas.DataFrame or str):
            Data passed as a DataFrame or as a path to a CSV file.
        table_name (str or None):
            Optionally give the table a different name. If ``None`` (default)
            or empty, the dataset ``name`` is used as the table name.
        entity_columns (list or None):
            (Optional) List of names of the columns that form the entity_id of this
            dataset. If ``None`` (default), no entity columns are set.
        sequence_index (str or None):
            (Optional) Name of the column that is the sequence index of this dataset.
        datasets_path (str):
            (Optional) Path to the folder in which a new folder will be created
            for this dataset. Defaults to the current working directory.
    """
    if isinstance(data, str):
        data = pd.read_csv(data)

    base_path = os.path.join(datasets_path, name)
    if os.path.exists(base_path):
        # Start from a clean folder so that stale files never leak into the dataset.
        shutil.rmtree(base_path)

    os.makedirs(base_path, exist_ok=True)

    table_name = table_name or name

    # The CSV and the metadata are written using paths relative to the dataset
    # folder, so work from inside it and always restore the previous directory.
    cwd = os.getcwd()
    try:
        os.chdir(base_path)
        csv_name = table_name + '.csv'
        data.to_csv(csv_name, index=False)

        metadata = Metadata()
        # Register the table under ``table_name`` (not ``name``): the lookup
        # below uses ``table_name`` and would otherwise raise ``KeyError``
        # whenever a custom table name is given.
        metadata.add_table(table_name, csv_name)
        meta_dict = metadata.to_dict()
        table_meta = meta_dict['tables'][table_name]
        table_meta['entity_columns'] = entity_columns or []
        table_meta['sequence_index'] = sequence_index
        table_meta['deepecho_version'] = Dataset.VERSION

        with open('metadata.json', 'w') as metadata_file:
            json.dump(meta_dict, metadata_file, indent=4)

        LOGGER.info('Dataset %s generated in folder %s', name, base_path)

    finally:
        os.chdir(cwd)
Example #2
0
def load_multi_foreign_key():
    """Build a two-table demo in which the child references the parent twice.

    The ``child`` table is linked to ``parent`` through ``parent_1_id`` when it
    is added, and through ``parent_2_id`` with an additional relationship.

    Returns:
        tuple:
            The ``Metadata`` instance and a dict that maps each table name
            to its ``pandas.DataFrame``.
    """
    parent_columns = {'parent_id': range(10), 'value': range(10)}
    child_columns = {
        'parent_1_id': range(10),
        'parent_2_id': range(10),
        'value': range(10),
    }
    tables = {
        'parent': pd.DataFrame(parent_columns),
        'child': pd.DataFrame(child_columns),
    }

    metadata = Metadata()
    metadata.add_table('parent', tables['parent'], primary_key='parent_id')
    metadata.add_table(
        'child',
        tables['child'],
        parent='parent',
        foreign_key='parent_1_id',
    )
    # Second foreign key from the child to the same parent table.
    metadata.add_relationship('parent', 'child', 'parent_2_id')

    return metadata, tables
Example #3
0
def test_evaluate_tables_from_demo():
    """Fit SDV on the demo tables and evaluate the sampled data.

    Metadata is rebuilt from the raw demo tables, a model is fitted and
    sampled, and ``evaluate`` is called once per table and once for the
    whole dataset. The test passes as long as nothing raises.
    """
    tables = load_demo(metadata=False)

    timestamp_meta = {'type': 'datetime', 'format': '%Y-%m-%d'}

    new_meta = Metadata()
    new_meta.add_table('users', data=tables['users'], primary_key='user_id')
    new_meta.add_table(
        'sessions',
        data=tables['sessions'],
        primary_key='session_id',
        parent='users',
        foreign_key='user_id',
    )
    new_meta.add_table(
        'transactions',
        tables['transactions'],
        fields_metadata={'timestamp': timestamp_meta},
        primary_key='transaction_id',
        parent='sessions',
    )

    sdv = SDV()
    sdv.fit(new_meta, tables=tables)
    sampled = sdv.sample_all()

    # Per-table evaluation.
    table_scores = {
        table: evaluate(
            sampled[table],
            real=tables[table],
            metadata=new_meta,
            table_name=table,
        )
        for table in new_meta.get_tables()
    }

    # Whole-dataset evaluation.
    evaluate(sampled, real=tables, metadata=new_meta)
Example #4
0
def test_build_demo_metadata_from_tables():
    """Rebuild the demo metadata from the demo tables.

    The metadata inferred from the tables must be equal to the
    ``DEMO_METADATA`` dict.
    """
    tables = load_demo(metadata=False)

    timestamp_meta = {'type': 'datetime', 'format': '%Y-%m-%dT%H:%M'}

    new_meta = Metadata()
    new_meta.add_table('users', data=tables['users'], primary_key='user_id')
    new_meta.add_table(
        'sessions',
        data=tables['sessions'],
        primary_key='session_id',
        parent='users',
        foreign_key='user_id',
    )
    new_meta.add_table(
        'transactions',
        tables['transactions'],
        fields_metadata={'timestamp': timestamp_meta},
        primary_key='transaction_id',
        parent='sessions',
    )

    built = new_meta.to_dict()
    assert DEMO_METADATA == built
Example #5
0
def test_build_demo_metadata_without_tables():
    """Assemble the demo metadata field by field, without any table data.

    The resulting metadata dict must be equal to ``DEMO_METADATA``.
    """
    # One entry per table: (table, primary key, parent table, fields).
    # Each field is (positional args after the table name, keyword args),
    # exactly as they are handed to ``Metadata.add_field``.
    schema = [
        ('users', 'user_id', None, [
            (('user_id', 'id', 'integer'), {}),
            (('country', 'categorical'), {}),
            (('gender', 'categorical'), {}),
            (('age', 'numerical', 'integer'), {}),
        ]),
        ('sessions', 'session_id', 'users', [
            (('session_id', 'id', 'integer'), {}),
            (('user_id', 'id', 'integer'), {}),
            (('device', 'categorical'), {}),
            (('os', 'categorical'), {}),
            (('minutes', 'numerical', 'integer'), {}),
        ]),
        ('transactions', 'transaction_id', 'sessions', [
            (('transaction_id', 'id', 'integer'), {}),
            (('session_id', 'id', 'integer'), {}),
            (('timestamp', 'datetime'), {'properties': {'format': '%Y-%m-%dT%H:%M'}}),
            (('amount', 'numerical', 'float'), {}),
            (('cancelled', 'boolean'), {}),
        ]),
    ]

    metadata = Metadata()
    for table, primary_key, parent, fields in schema:
        metadata.add_table(table)
        for args, kwargs in fields:
            metadata.add_field(table, *args, **kwargs)

        metadata.set_primary_key(table, primary_key)
        if parent is not None:
            # Link the table to its parent once its own fields are in place.
            metadata.add_relationship(parent, table)

    assert DEMO_METADATA == metadata.to_dict()
Example #6
0
    pd.DataFrame({
        "x": np.random.random(size=size),
        "y": np.random.normal(size=size, loc=10.0)
    })
}
# Synthetic version of "table1" with heavy perturbation: each column is a
# fresh random draw plus unscaled normal noise (np.random.normal defaults).
# NOTE(review): "lq" presumably stands for "low quality" -- confirm.
lq_synthetic = {
    "table1":
    pd.DataFrame({
        "x":
        np.random.random(size=size) + np.random.normal(size=size),
        "y":
        np.random.normal(size=size, loc=10.0) + np.random.normal(size=size)
    })
}
# Synthetic version of "table1" with light perturbation: same construction as
# above, but the added normal noise is divided by 10.
# NOTE(review): "hq" presumably stands for "high quality" -- confirm.
hq_synthetic = {
    "table1":
    pd.DataFrame({
        "x":
        np.random.random(size=size) + np.random.normal(size=size) / 10.0,
        "y":
        np.random.normal(size=size, loc=10.0) +
        np.random.normal(size=size) / 10.0
    })
}

# Register every table in ``tables`` in a fresh Metadata object, passing the
# DataFrame itself as the table data.
metadata = Metadata()
for table_name, df in tables.items():
    metadata.add_table(table_name, data=df)
# Bundle the metadata, the tables and both synthetic versions into a Dataset
# and save it to the directory that contains this script.
dataset = Dataset(metadata, tables, lq_synthetic, hq_synthetic)
dataset.save(os.path.dirname(__file__))
    fake.address().replace('\n', '') for _ in range(suppliers.shape[0])
]

# Write the suppliers table to a CSV file in the current working directory.
suppliers.to_csv('Suppliers.csv', index=False)

# Every table of the dataset, keyed by the name used in the metadata.
tables = {
    'Products': products,
    'Suppliers': suppliers,
    'Customers': customers,
    'Sales orders': sales_orders,
    'Purchase orders': purchase_orders,
}

metadata = Metadata()

# Products is the parent of both order tables.
metadata.add_table(
    name='Products',
    primary_key='Product id',
    data=tables['Products'],
)

# Both order tables reference Products through their 'Product id' column.
metadata.add_table(
    name='Sales orders',
    parent='Products',
    foreign_key='Product id',
    primary_key='Sales order id',
    data=tables['Sales orders'],
)
metadata.add_table(
    name='Purchase orders',
    parent='Products',
    foreign_key='Product id',
    primary_key='Purchase order id',
    data=tables['Purchase orders'],
)

# Customers is added without a parent.
metadata.add_table(
    name='Customers',
    primary_key='Customer id',
    data=tables['Customers'],
)
metadata.add_relationship(parent='Customers',
Example #8
0
    dataset_name = dataset_name.replace(".ts", "")
    dataset_name = dataset_name.replace("_TRAIN", "")
    dataset_dir = "datasets/%s" % dataset_name
    os.makedirs(dataset_dir, exist_ok=True)

    path_to_test = path_to_train.replace("_TRAIN", "_TEST")
    path_to_csv = os.path.join(dataset_dir, "%s.csv" % dataset_name)
    path_to_metadata = os.path.join(dataset_dir, "metadata.json")
    path_to_readme = os.path.join(dataset_dir, "README.md")
    print(path_to_csv, path_to_metadata, path_to_readme)

    df = to_our_format(path_to_train, path_to_test)
    df.to_csv(path_to_csv, index=False)

    metadata = Metadata()
    metadata.add_table('data', data=df, primary_key='e_id')
    metadata.to_json(path_to_metadata)

    with open(os.path.join(dataset_dir, "task.json"), "wt") as fout:
        json.dump({
            "task_type": "classification",
            "key": ["e_id"],
            "target": "ml_class",
            "ignored": ["tt_split", "s_index"]
        }, fout)

    with open(path_to_readme, "wt") as fout:
        fout.write("""# %s

This dataset originates from the Time Series Classification
dataset repository (http://www.timeseriesclassification.com/).