def mock_client():
    """Return a ``Mock`` wrapping a real ``SQLAlchemyClient``.

    Calls pass through to the wrapped client while staying inspectable;
    ``split_source`` is forced to ``';'`` so statements split on semicolons.
    """
    # `uri` is resolved from an enclosing scope not visible in this chunk
    # — presumably a fixture/module constant; TODO confirm.
    wrapped = Mock(wraps=SQLAlchemyClient(uri))
    wrapped.split_source = ';'
    return wrapped
import pytest
import pandas as pd
import numpy as np


@pytest.fixture
def sample_data():
    """Create ``database.db`` with a ``numbers`` table; yield the db filename."""
    conn = connect('database.db')
    df = pd.DataFrame({'a': np.arange(0, 100), 'b': np.arange(100, 200)})
    df.to_sql('numbers', conn, index=False)
    yield 'database.db'
    conn.close()


@pytest.mark.parametrize('client', [
    SQLAlchemyClient('sqlite:///database.db'),
    DBAPIClient(connect, dict(database='database.db')),
])
def test_sqlscript_load(tmp_directory, sample_data, client):
    """A SQLScript task builds with both client implementations."""
    dag = DAG()
    dag.clients[SQLScript] = client
    dag.clients[SQLiteRelation] = client

    SQLScript('CREATE TABLE {{product}} AS SELECT * FROM numbers',
              SQLiteRelation((None, 'another', 'table')),
              dag=dag,
              name='task')

    # keep connections open so the parametrized client can be reused
    dag.build(close_clients=False)
# NOTE(review): this chunk starts mid-definition — the `def` line of the
# callable owning the next two statements was truncated. Signature
# reconstructed from the body; verify against the original file.
def _get_from_folder(upstream, product):
    # `filename` is a free name resolved outside this chunk — TODO confirm
    # where it is defined (likely a module constant or task param).
    src = Path(str(upstream.first), filename)
    shutil.copy(str(src), str(product))


def _clean(upstream, product):
    """Keep country name and 2018 population with friendlier column names."""
    df = pd.read_csv(str(upstream.first), skiprows=3)
    df = df[['Country Name', '2018']]
    df.rename({'Country Name': 'country', '2018': 'population'},
              axis='columns', inplace=True)
    df.to_csv(str(product), index=False)


# https://data.worldbank.org/indicator/sp.pop.totl
dag = DAG()
client = SQLAlchemyClient('sqlite:///metadata.db')
dag.clients[GenericProduct] = client

source = 'http://api.worldbank.org/v2/en/indicator/SP.POP.TOTL?downloadformat=csv'

compressed = DownloadFromURL(source, File(ROOT / 'population.zip'),
                             dag, name='compressed')
uncompress = PythonCallable(_uncompress, File(ROOT / 'population/'),
                            dag, name='uncompress')
get_from_folder = PythonCallable(_get_from_folder,
                                 File(ROOT / 'population.csv'),
                                 dag, name='get_from_folder')
clean = PythonCallable(_clean, File(ROOT / 'population_clean.csv'),
                       dag, name='pop_clean')

compressed >> uncompress >> get_from_folder >> clean

# deaths
def get_client():
    """Return a SQLAlchemy client backed by the local ``data.db`` SQLite file."""
    return SQLAlchemyClient('sqlite:///data.db')
def test_send_more_than_one_command_in_sqlite(code, split_source,
                                              tmp_directory):
    """Multiple statements execute when the client splits on `split_source`."""
    client = SQLAlchemyClient('sqlite:///my_db.db',
                              split_source=split_source)
    client.execute(code)
def make(date_):
    """Build the covid-cases pipeline DAG for the given date.

    Downloads the population census, scrapes confirmed/suspected case
    PDFs, cleans both, aggregates against population, and (optionally)
    uploads results to S3 when ``args.upload`` is set.
    """
    date_str = date_.strftime('%Y.%m.%d')
    ROOT = Path('data', date_str)
    ROOT.mkdir(exist_ok=True, parents=True)

    dag = DAG()
    client = SQLAlchemyClient('sqlite:///metadata.db')
    dag.clients[GenericProduct] = client
    loader = SourceLoader(path='.')

    # population census (zip -> uncompressed folder -> per-state csv)
    source = 'https://www.inegi.org.mx/contenidos/programas/ccpv/2010/datosabiertos/iter_nal_2010_csv.zip'
    population_zip = DownloadFromURL(source, File(ROOT / 'population.zip'),
                                     dag, name='population.zip')
    population = PythonCallable(_uncompress, File(ROOT / 'population'),
                                dag, name='population')
    pop_by_state = PythonCallable(_pop_by_state,
                                  File(ROOT / 'pop_by_state.csv'),
                                  dag, name='pop_by_state')
    population_zip >> population >> pop_by_state

    # scrape the ministry PDFs and convert them to csv
    confirmed = ShellScript(loader['get_confirmed.sh'], {
        'pdf': File(ROOT / 'confirmed.pdf'),
        'csv': File(ROOT / 'confirmed.csv')
    }, dag, params={'date_str': date_str})
    suspected = ShellScript(loader['get_suspected.sh'], {
        'pdf': File(ROOT / 'suspected.pdf'),
        'csv': File(ROOT / 'suspected.csv')
    }, dag, params={'date_str': date_str})

    # row layout: id, state, sex, age, extra, status
    confirmed_regex = re.compile(
        r'^(\d+)\s{1}([\w\s]+)\s{1}(FEMENINO|MASCULINO)\s{1}(\d+)\s{1}(.+)\s{1}(Confirmado)'
    )
    suspected_regex = re.compile(
        r'^(\d+)\s{1}([\w\s]+)\s{1}(FEMENINO|MASCULINO)\s{1}(\d+)\s{1}(.+)\s?(Sospechoso)'
    )

    clean_confirmed = PythonCallable(_clean,
                                     File(ROOT / 'confirmed_clean.csv'),
                                     dag,
                                     name='clean_confirmed',
                                     params={'regex': confirmed_regex})
    clean_suspected = PythonCallable(_clean,
                                     File(ROOT / 'suspected_clean.csv'),
                                     dag,
                                     name='clean_suspected',
                                     params={'regex': suspected_regex})

    agg = PythonCallable(_agg, File(ROOT / 'cases_and_population.csv'),
                         dag, name='cases_pop')

    confirmed >> clean_confirmed >> agg
    suspected >> clean_suspected >> agg
    pop_by_state >> agg

    if args.upload:
        upload_confirmed = UploadToS3(
            '{{upstream["clean_confirmed"]}}',
            GenericProduct(
                'mx-health-ministry/{}/confirmed.csv'.format(date_str)),
            dag,
            bucket='mx-covid-data',
            name='upload_mx_confirmed')
        upload_suspected = UploadToS3(
            '{{upstream["clean_suspected"]}}',
            GenericProduct(
                'mx-health-ministry/{}/suspected.csv'.format(date_str)),
            dag,
            bucket='mx-covid-data',
            name='upload_mx_suspected')

        clean_confirmed >> upload_confirmed
        clean_suspected >> upload_suspected

        upload_agg = UploadToS3(
            '{{upstream["cases_pop"]}}',
            GenericProduct(
                'mx-health-ministry/{}/cases_pop.csv'.format(date_str)),
            dag,
            bucket='mx-covid-data',
            name='upload_cases_pop')

        agg >> upload_agg

    return dag
def get_client():
    """Return a SQLAlchemy client backed by an in-memory SQLite database."""
    return SQLAlchemyClient('sqlite://')
def get_metadata_client():
    """Return a SQLAlchemy client for the local ``metadata.db`` SQLite file."""
    return SQLAlchemyClient('sqlite:///metadata.db')
def test_creates_absolute_dir_sqlalchemyclient(tmp_directory):
    """Missing parent directories are created for an absolute db path."""
    intermediate_path = Path(tmp_directory, "an/absolute/path")

    client = SQLAlchemyClient(f'sqlite:///{intermediate_path}/my_db.db')
    client.execute('CREATE TABLE my_table (num INT)')

    assert intermediate_path.exists()
def test_does_not_create_in_memory_sqlalchemyclient(tmp_directory):
    """An in-memory URI must not touch the filesystem."""
    client = SQLAlchemyClient('sqlite://')
    client.execute('CREATE TABLE my_table (num INT)')

    # Assert no folder/file was created in the temporary folder:
    assert next(Path(tmp_directory).iterdir(), None) is None
def test_creates_relative_dir_sqlalchemyclient(tmp_directory):
    """Missing parent directories are created for a relative db path."""
    intermediate_path = "a/relative/path"

    client = SQLAlchemyClient(f'sqlite:///{intermediate_path}/my_db.db')
    client.execute('CREATE TABLE my_table (num INT)')

    # the relative path is resolved against the test's working directory
    assert Path(tmp_directory, intermediate_path).exists()
def test_pickle_sqlalchemyclient(tmp_directory):
    """A client that has executed statements can still be pickled."""
    client = SQLAlchemyClient('sqlite:///my_db.db')
    client.execute('CREATE TABLE my_table (num INT)')

    assert pickle.dumps(client)
def test_deepcopy_sqlalchemyclient(tmp_directory):
    """A client that has executed statements can still be deep-copied."""
    client = SQLAlchemyClient('sqlite:///my_db.db')
    client.execute('CREATE TABLE my_table (num INT)')

    assert copy.deepcopy(client)
def test_sql_runtime_params(tmp_directory):
    """Runtime params: a SQL task reads values produced by an upstream task
    via the [[ ... ]] placeholder syntax.
    """
    tmp = Path(tmp_directory)

    # create a db
    conn = connect(str(tmp / "database.db"))
    client = SQLAlchemyClient('sqlite:///{}'.format(tmp / "database.db"))

    # make some data and save it in the db
    df = pd.DataFrame({
        'number': range(10),
        'char': list(string.ascii_letters[:10])
    })
    df.to_sql('data', conn)

    # create the task and run it
    dag = DAG(executor=Serial(build_in_subprocess=False))
    t1 = SQLDump('SELECT * FROM data',
                 File('data.parquet'),
                 dag,
                 name='data',
                 client=client,
                 chunksize=None,
                 io_handler=io.ParquetIO)

    def select(product, upstream):
        # keep even numbers (by value) and every other char (by position);
        # chars are repr()-quoted so they can be spliced into SQL literals
        numbers = list(pd.read_parquet(upstream['data']).number)
        numbers_selected = [n for n in numbers if n % 2 == 0]
        chars = list(pd.read_parquet(upstream['data']).char)
        chars_selected = [repr(c) for i, c in enumerate(chars) if i % 2 == 0]
        Path(product).write_text(
            json.dumps(dict(numbers=numbers_selected, chars=chars_selected)))

    t2 = PythonCallable(select, File('selected.json'), dag, name='selected')

    t3 = SQLDump(
        """ SELECT * FROM data WHERE number NOT IN ([[get_key(upstream["selected"], "numbers") | join(", ") ]]) """,
        File('numbers.parquet'),
        dag,
        name='numbers',
        client=client,
        chunksize=None,
        io_handler=io.ParquetIO)
    t4 = SQLDump(
        """ SELECT * FROM data WHERE char NOT IN ([[get_key(upstream["selected"], "chars") | join(", ") ]]) """,
        File('chars.parquet'),
        dag,
        name='chars',
        client=client,
        chunksize=None,
        io_handler=io.ParquetIO)

    t1 >> t2 >> t3
    t2 >> t4

    dag.build()

    # rows the `select` task excluded must be the only ones left
    assert list(pd.read_parquet('numbers.parquet').number) == [1, 3, 5, 7, 9]
    assert list(
        pd.read_parquet('chars.parquet').char) == ['b', 'd', 'f', 'h', 'j']
###############################################################################
# first generate some sample data, one daily observation from 2010 to 2020
dates = pd.date_range('2010', '2020', freq='D')
df = pd.DataFrame({'date': dates, 'x': np.random.rand(len(dates))})

conn = sqlite3.connect(str(path_to_db))
df.to_sql('data', conn)
conn.close()

###############################################################################
# We now build the DAG
dag = DAG()
dag.clients[SQLDump] = SQLAlchemyClient('sqlite:///' + str(path_to_db))


def make_task(date_start, date_end, path, dag):
    """Task factory: returns a task that dumps certain year
    """
    sql = """ SELECT * FROM data WHERE DATE('{{date_start}}') <= date AND date < DATE('{{date_end}}') """
    name = f'{date_start}-to-{date_end}.csv'
    return SQLDump(sql,
                   product=File(Path(path / name)),
                   dag=dag,
                   name=f'dump_{name}',
                   params={
                       # NOTE(review): the source chunk was truncated at this
                       # point; the dict entries below are reconstructed from
                       # the SQL placeholders above — verify.
                       'date_start': date_start,
                       'date_end': date_end,
                   })
def get_client():
    """Return a SQLAlchemyClient built from the URI provided by ``get_uri()``."""
    return SQLAlchemyClient(get_uri())
def make(date_):
    """Build the DAG that scrapes, cleans and (optionally) uploads the
    ministry's confirmed/suspected case reports for the given date.
    """
    date_str = date_.strftime('%Y.%m.%d')
    ROOT = Path('data', date_str)
    ROOT.mkdir(exist_ok=True, parents=True)

    dag = DAG()
    client = SQLAlchemyClient('sqlite:///metadata.db')
    dag.clients[GenericProduct] = client
    loader = SourceLoader(path='.')

    # scrape the ministry PDFs and convert them to csv
    confirmed = ShellScript(loader['get_confirmed.sh'], {
        'pdf': File(ROOT / 'confirmed.pdf'),
        'csv': File(ROOT / 'confirmed.csv')
    }, dag, params={'date_str': date_str})
    suspected = ShellScript(loader['get_suspected.sh'], {
        'pdf': File(ROOT / 'suspected.pdf'),
        'csv': File(ROOT / 'suspected.csv')
    }, dag, params={'date_str': date_str})

    # row layout: id, state, sex, age, extra, status
    confirmed_regex = re.compile(
        r'^(\d+)\s{1}([\w\s]+)\s{1}(FEMENINO|MASCULINO)\s{1}(\d+),(.+),(Confirmado)'
    )
    suspected_regex = re.compile(
        r'^(\d+)\s{1}([\w\s]+)\s{1}(FEMENINO|MASCULINO)\s{1}(\d+)\s{1}(.+)\s{1}(Sospechoso)'
    )

    clean_confirmed = PythonCallable(_clean,
                                     File(ROOT / 'confirmed_clean.csv'),
                                     dag,
                                     name='clean_confirmed',
                                     params={'regex': confirmed_regex})
    clean_suspected = PythonCallable(_clean,
                                     File(ROOT / 'suspected_clean.csv'),
                                     dag,
                                     name='clean_suspected',
                                     params={'regex': suspected_regex})

    confirmed >> clean_confirmed
    suspected >> clean_suspected

    if args.upload:
        upload_confirmed = UploadToS3(
            '{{upstream["clean_confirmed"]}}',
            GenericProduct(
                'mx-health-ministry/{}/confirmed.csv'.format(date_str)),
            dag,
            bucket='mx-covid-data',
            name='upload_mx_confirmed')
        upload_suspected = UploadToS3(
            '{{upstream["clean_suspected"]}}',
            GenericProduct(
                'mx-health-ministry/{}/suspected.csv'.format(date_str)),
            dag,
            bucket='mx-covid-data',
            name='upload_mx_suspected')

        clean_confirmed >> upload_confirmed
        clean_suspected >> upload_suspected

    return dag
###############################################################################
# There are three core concepts in ``ploomber``: :class:`Tasks <ploomber.tasks>`,
# :class:`Products <ploomber.products>` and :class:`DAGs <ploomber.DAG>`. Tasks
# are units of work that generate Products (which are persistent changes on
# disk). Tasks are organized into a DAG which keeps track of declared
# dependencies among them.
dag = DAG()

# the first task dumps data from the db to the local filesystem
task_dump = SQLDump('SELECT * FROM example',
                    File(tmp_dir / 'example.csv'),
                    dag,
                    name='dump',
                    client=SQLAlchemyClient(uri),
                    chunksize=None)


# since this task will have an upstream dependency, it has to accept the
# upstream parameter, all tasks must accept a product parameter
def _add_one(upstream, product):
    """Add one to column a
    """
    df = pd.read_csv(str(upstream['dump']))
    df['a'] = df['a'] + 1
    df.to_csv(str(product), index=False)


# we convert the Python function into a Task
# NOTE(review): the source chunk was truncated mid-call; the trailing
# arguments are reconstructed following the `task_dump` pattern — verify.
task_add_one = PythonCallable(_add_one,
                              File(tmp_dir / 'add_one.csv'),
                              dag,
                              name='add_one')
from pathlib import Path
import tempfile

import pandas as pd

from ploomber.clients import SQLAlchemyClient
from ploomber.tasks import SQLTransfer, SQLScript
from ploomber.products import SQLiteRelation
from ploomber import DAG, SourceLoader

###############################################################################
# We first set up our sample environment: a sqlite database with some data
tmp_dir = Path(tempfile.mkdtemp())

client = SQLAlchemyClient('sqlite:///' + str(tmp_dir / 'my_db.db'))

df = pd.DataFrame({'x': range(10)})
df.to_sql('data', client.engine)

###############################################################################
# We now simulate our code environment: a folder with SQL scripts. For
# simplicity, we are saving them in the same location as the data, but in a
# real project we should keep code and data separate
_ = Path(tmp_dir, 'data_select.sql').write_text('SELECT * FROM data')

###############################################################################
# Under the hood, SourceLoader initializes a jinja2.Environment, which allows
# us to use features such as macros
# NOTE(review): the source chunk was truncated inside this string literal; the
# macro body below is reconstructed — verify against the original file.
Path(tmp_dir, 'macros.sql').write_text("""
{% macro my_macro() -%}
SELECT * FROM data
{%- endmacro %}
""")