def make_dataset(name, data, table_name=None, entity_columns=None,
                 sequence_index=None, datasets_path='.'):
    """Make a Dataset from a DataFrame.

    Args:
        name (str):
            Name of this dataset.
        data (pandas.DataFrame or str):
            Data passed as a DataFrame or as a path to a CSV file.
        table_name (str or None):
            Optionally give the table a different name.
        entity_columns (list or None):
            (Optional) List of names of the columns that form the entity_id
            of this dataset. If ``None`` (default), no entity columns are set.
        sequence_index (str or None):
            (Optional) Name of the column that is the sequence index of
            this dataset.
        datasets_path (str):
            (Optional) Path to the folder in which a new folder will be
            created for this dataset. Defaults to the current working
            directory.
    """
    if isinstance(data, str):
        data = pd.read_csv(data)

    base_path = os.path.join(datasets_path, name)
    if os.path.exists(base_path):
        shutil.rmtree(base_path)

    os.makedirs(base_path, exist_ok=True)

    table_name = table_name or name
    cwd = os.getcwd()
    try:
        os.chdir(base_path)
        csv_name = table_name + '.csv'
        data.to_csv(csv_name, index=False)

        # Register the table under ``table_name`` so the lookup into
        # ``meta_dict['tables']`` below matches the key that was added.
        metadata = Metadata()
        metadata.add_table(table_name, csv_name)
        meta_dict = metadata.to_dict()
        table_meta = meta_dict['tables'][table_name]
        table_meta['entity_columns'] = entity_columns or []
        table_meta['sequence_index'] = sequence_index
        table_meta['deepecho_version'] = Dataset.VERSION

        with open('metadata.json', 'w') as metadata_file:
            json.dump(meta_dict, metadata_file, indent=4)

        LOGGER.info('Dataset %s generated in folder %s', name, base_path)
    finally:
        os.chdir(cwd)
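# Illustrative usage sketch for ``make_dataset``. The toy DataFrame, its
# column names and the ``/tmp/deepecho-datasets`` output path are assumptions
# made for this example only.
def _example_make_dataset():
    data = pd.DataFrame({
        'entity': [0, 0, 1, 1],
        'timestamp': [0, 1, 0, 1],
        'value': [0.1, 0.2, 0.3, 0.4],
    })
    # Creates /tmp/deepecho-datasets/toy_dataset/ containing toy_dataset.csv
    # and a metadata.json with the entity columns and sequence index set.
    make_dataset(
        'toy_dataset',
        data,
        entity_columns=['entity'],
        sequence_index='timestamp',
        datasets_path='/tmp/deepecho-datasets',
    )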
def load_multi_foreign_key():
    """Return metadata and tables where ``child`` has two foreign keys to ``parent``."""
    parent = pd.DataFrame({
        'parent_id': range(10),
        'value': range(10)
    })
    child = pd.DataFrame({
        'parent_1_id': range(10),
        'parent_2_id': range(10),
        'value': range(10)
    })

    metadata = Metadata()
    metadata.add_table('parent', parent, primary_key='parent_id')
    metadata.add_table('child', child, parent='parent', foreign_key='parent_1_id')
    metadata.add_relationship('parent', 'child', 'parent_2_id')

    return metadata, {'parent': parent, 'child': child}
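# Minimal sketch of consuming ``load_multi_foreign_key``. It assumes SDV's
# legacy ``Metadata.get_children`` / ``Metadata.get_parents`` accessors.
def _example_multi_foreign_key():
    metadata, tables = load_multi_foreign_key()
    # 'child' points at 'parent' through both parent_1_id and parent_2_id.
    assert 'child' in metadata.get_children('parent')
    assert 'parent' in metadata.get_parents('child')
    assert set(tables) == {'parent', 'child'}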
def test_evaluate_tables_from_demo():
    tables = load_demo(metadata=False)

    new_meta = Metadata()
    new_meta.add_table('users', data=tables['users'], primary_key='user_id')
    new_meta.add_table('sessions', data=tables['sessions'], primary_key='session_id',
                       parent='users', foreign_key='user_id')
    transactions_fields = {
        'timestamp': {
            'type': 'datetime',
            'format': '%Y-%m-%d'
        }
    }
    new_meta.add_table('transactions', tables['transactions'],
                       fields_metadata=transactions_fields,
                       primary_key='transaction_id', parent='sessions')

    sdv = SDV()
    sdv.fit(new_meta, tables=tables)
    sampled = sdv.sample_all()

    table_scores = {}
    for table in new_meta.get_tables():
        table_scores[table] = evaluate(
            sampled[table], real=tables[table], metadata=new_meta, table_name=table)

    evaluate(sampled, real=tables, metadata=new_meta)
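# Hedged sketch: assuming the legacy ``evaluate`` returns a single aggregated
# float score by default, the per-table results gathered above could be
# asserted on like this (the helper name is illustrative).
def _example_check_scores(table_scores):
    for table, score in table_scores.items():
        assert isinstance(score, float), table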
def test_build_demo_metadata_from_tables():
    """Build metadata from the demo tables.

    Then compare the built metadata with the demo one to make sure
    that they are the same.
    """
    tables = load_demo(metadata=False)

    new_meta = Metadata()
    new_meta.add_table('users', data=tables['users'], primary_key='user_id')
    new_meta.add_table('sessions', data=tables['sessions'], primary_key='session_id',
                       parent='users', foreign_key='user_id')
    transactions_fields = {
        'timestamp': {
            'type': 'datetime',
            'format': '%Y-%m-%dT%H:%M'
        }
    }
    new_meta.add_table('transactions', tables['transactions'],
                       fields_metadata=transactions_fields,
                       primary_key='transaction_id', parent='sessions')

    assert DEMO_METADATA == new_meta.to_dict()
def test_build_demo_metadata_without_tables():
    metadata = Metadata()

    metadata.add_table('users')
    metadata.add_field('users', 'user_id', 'id', 'integer')
    metadata.add_field('users', 'country', 'categorical')
    metadata.add_field('users', 'gender', 'categorical')
    metadata.add_field('users', 'age', 'numerical', 'integer')
    metadata.set_primary_key('users', 'user_id')

    metadata.add_table('sessions')
    metadata.add_field('sessions', 'session_id', 'id', 'integer')
    metadata.add_field('sessions', 'user_id', 'id', 'integer')
    metadata.add_field('sessions', 'device', 'categorical')
    metadata.add_field('sessions', 'os', 'categorical')
    metadata.add_field('sessions', 'minutes', 'numerical', 'integer')
    metadata.set_primary_key('sessions', 'session_id')
    metadata.add_relationship('users', 'sessions')

    metadata.add_table('transactions')
    metadata.add_field('transactions', 'transaction_id', 'id', 'integer')
    metadata.add_field('transactions', 'session_id', 'id', 'integer')
    metadata.add_field('transactions', 'timestamp', 'datetime',
                       properties={'format': '%Y-%m-%dT%H:%M'})
    metadata.add_field('transactions', 'amount', 'numerical', 'float')
    metadata.add_field('transactions', 'cancelled', 'boolean')
    metadata.set_primary_key('transactions', 'transaction_id')
    metadata.add_relationship('sessions', 'transactions')

    assert DEMO_METADATA == metadata.to_dict()
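# Small sketch of the ``to_dict`` output compared above: table definitions
# live under the 'tables' key, keyed by table name (only this top-level
# shape is asserted here; the full dict is what DEMO_METADATA captures).
def _example_metadata_dict_shape():
    metadata = Metadata()
    metadata.add_table('users')
    metadata.add_field('users', 'user_id', 'id', 'integer')
    metadata.set_primary_key('users', 'user_id')

    meta_dict = metadata.to_dict()
    assert set(meta_dict['tables']) == {'users'}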
tables = {
    "table1": pd.DataFrame({
        "x": np.random.random(size=size),
        "y": np.random.normal(size=size, loc=10.0)
    })
}
lq_synthetic = {
    "table1": pd.DataFrame({
        "x": np.random.random(size=size) + np.random.normal(size=size),
        "y": np.random.normal(size=size, loc=10.0) + np.random.normal(size=size)
    })
}
hq_synthetic = {
    "table1": pd.DataFrame({
        "x": np.random.random(size=size) + np.random.normal(size=size) / 10.0,
        "y": np.random.normal(size=size, loc=10.0) + np.random.normal(size=size) / 10.0
    })
}

metadata = Metadata()
for table_name, df in tables.items():
    metadata.add_table(table_name, data=df)

dataset = Dataset(metadata, tables, lq_synthetic, hq_synthetic)
dataset.save(os.path.dirname(__file__))
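# Hedged sanity check on the fixture above: the high-quality copy should
# track the real table more closely than the low-quality one (compared here
# via the standard deviation of 'x'; the tolerance-free comparison is
# statistical, so it assumes a reasonably large ``size``).
def _check_quality_gap(tables, lq_synthetic, hq_synthetic):
    real_std = tables['table1']['x'].std()
    hq_gap = abs(hq_synthetic['table1']['x'].std() - real_std)
    lq_gap = abs(lq_synthetic['table1']['x'].std() - real_std)
    assert hq_gap < lq_gap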
    fake.address().replace('\n', '')
    for _ in range(suppliers.shape[0])
]
suppliers.to_csv('Suppliers.csv', index=False)

tables = {
    'Products': products,
    'Suppliers': suppliers,
    'Customers': customers,
    'Sales orders': sales_orders,
    'Purchase orders': purchase_orders
}

metadata = Metadata()
metadata.add_table(name='Products', data=tables['Products'],
                   primary_key='Product id')
metadata.add_table(name='Sales orders', data=tables['Sales orders'],
                   primary_key='Sales order id',
                   foreign_key='Product id', parent='Products')
metadata.add_table(name='Purchase orders', data=tables['Purchase orders'],
                   primary_key='Purchase order id',
                   foreign_key='Product id', parent='Products')
metadata.add_table(name='Customers', data=tables['Customers'],
                   primary_key='Customer id')
metadata.add_relationship(parent='Customers',
dataset_name = dataset_name.replace(".ts", "") dataset_name = dataset_name.replace("_TRAIN", "") dataset_dir = "datasets/%s" % dataset_name os.makedirs(dataset_dir, exist_ok=True) path_to_test = path_to_train.replace("_TRAIN", "_TEST") path_to_csv = os.path.join(dataset_dir, "%s.csv" % dataset_name) path_to_metadata = os.path.join(dataset_dir, "metadata.json") path_to_readme = os.path.join(dataset_dir, "README.md") print(path_to_csv, path_to_metadata, path_to_readme) df = to_our_format(path_to_train, path_to_test) df.to_csv(path_to_csv, index=False) metadata = Metadata() metadata.add_table('data', data=df, primary_key='e_id') metadata.to_json(path_to_metadata) with open(os.path.join(dataset_dir, "task.json"), "wt") as fout: json.dump({ "task_type": "classification", "key": ["e_id"], "target": "ml_class", "ignored": ["tt_split", "s_index"] }, fout) with open(path_to_readme, "wt") as fout: fout.write("""# %s This dataset originates from the Time Series Classification dataset repository (http://www.timeseriesclassification.com/).