Example 1
from unittest.mock import Mock

from ploomber.clients import SQLAlchemyClient


def mock_client():
    # `uri` is defined elsewhere in the original test suite
    m = Mock(wraps=SQLAlchemyClient(uri))
    m.split_source = ';'
    return m
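Wrapping a real SQLAlchemyClient in a Mock lets a test run real queries while
still recording every call. A minimal usage sketch, assuming a throwaway
sqlite file (the test below is illustrative, not part of the original suite):

from unittest.mock import Mock

from ploomber.clients import SQLAlchemyClient


def test_execute_is_recorded(tmp_directory):
    client = Mock(wraps=SQLAlchemyClient('sqlite:///my_db.db'))
    client.execute('CREATE TABLE my_table (num INT)')
    # the wrapped client executed the statement and the mock recorded it
    client.execute.assert_called_once_with('CREATE TABLE my_table (num INT)')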
Example 2
from sqlite3 import connect

import pytest
import pandas as pd
import numpy as np

from ploomber import DAG
from ploomber.clients import SQLAlchemyClient, DBAPIClient
from ploomber.tasks import SQLScript
from ploomber.products import SQLiteRelation


@pytest.fixture
def sample_data():
    conn = connect('database.db')
    df = pd.DataFrame({'a': np.arange(0, 100), 'b': np.arange(100, 200)})
    df.to_sql('numbers', conn, index=False)
    yield 'database.db'
    conn.close()


@pytest.mark.parametrize('client', [
    SQLAlchemyClient('sqlite:///database.db'),
    DBAPIClient(connect, dict(database='database.db'))
])
def test_sqlscript_load(tmp_directory, sample_data, client):

    dag = DAG()

    dag.clients[SQLScript] = client
    dag.clients[SQLiteRelation] = client

    SQLScript('CREATE TABLE {{product}} AS SELECT * FROM numbers',
              SQLiteRelation((None, 'another', 'table')),
              dag=dag,
              name='task')

    dag.build(close_clients=False)
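Parametrizing the test over SQLAlchemyClient and DBAPIClient (the latter
wrapping sqlite3.connect) checks that the same pipeline behaves identically
under either connection mechanism; close_clients=False leaves the shared
connections open after the build.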
Example 3
    src = Path(str(upstream.first), filename)
    shutil.copy(str(src), str(product))


def _clean(upstream, product):
    df = pd.read_csv(str(upstream.first), skiprows=3)
    df = df[['Country Name', '2018']]
    df.rename({'Country Name': 'country', '2018': 'population'},
              axis='columns', inplace=True)
    df.to_csv(str(product), index=False)


# https://data.worldbank.org/indicator/sp.pop.totl
dag = DAG()

client = SQLAlchemyClient('sqlite:///metadata.db')
dag.clients[GenericProduct] = client

source = 'http://api.worldbank.org/v2/en/indicator/SP.POP.TOTL?downloadformat=csv'
compressed = DownloadFromURL(source, File(ROOT / 'population.zip'), dag,
                             name='compressed')
uncompress = PythonCallable(_uncompress, File(ROOT / 'population/'), dag,
                            name='uncompress')
get_from_folder = PythonCallable(_get_from_folder, File(ROOT / 'population.csv'), dag,
                                 name='get_from_folder')
clean = PythonCallable(_clean, File(
    ROOT / 'population_clean.csv'), dag, name='pop_clean')

compressed >> uncompress >> get_from_folder >> clean

# deaths
Example 4
def get_client():
    return SQLAlchemyClient('sqlite:///data.db')
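A factory like this centralizes client creation. A minimal sketch of how it
could be wired into a DAG, following the pattern from Example 2 (the wiring
below is illustrative):

from ploomber import DAG
from ploomber.products import SQLiteRelation
from ploomber.tasks import SQLScript

dag = DAG()
client = get_client()
# register the same client for executing scripts and for storing
# product metadata
dag.clients[SQLScript] = client
dag.clients[SQLiteRelation] = client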
Example 5
def test_send_more_than_one_command_in_sqlite(code, split_source,
                                              tmp_directory):
    # `code` and `split_source` come from a @pytest.mark.parametrize
    # decorator omitted from this snippet
    client = SQLAlchemyClient('sqlite:///my_db.db', split_source=split_source)
    client.execute(code)
Example 6
def make(date_):

    date_str = date_.strftime('%Y.%m.%d')
    ROOT = Path('data', date_str)
    ROOT.mkdir(exist_ok=True, parents=True)

    dag = DAG()

    client = SQLAlchemyClient('sqlite:///metadata.db')
    dag.clients[GenericProduct] = client

    loader = SourceLoader(path='.')

    source = 'https://www.inegi.org.mx/contenidos/programas/ccpv/2010/datosabiertos/iter_nal_2010_csv.zip'
    population_zip = DownloadFromURL(source,
                                     File(ROOT / 'population.zip'),
                                     dag,
                                     name='population.zip')
    population = PythonCallable(_uncompress,
                                File(ROOT / 'population'),
                                dag,
                                name='population')
    pop_by_state = PythonCallable(_pop_by_state,
                                  File(ROOT / 'pop_by_state.csv'),
                                  dag,
                                  name='pop_by_state')

    population_zip >> population >> pop_by_state

    confirmed = ShellScript(loader['get_confirmed.sh'], {
        'pdf': File(ROOT / 'confirmed.pdf'),
        'csv': File(ROOT / 'confirmed.csv')
    },
                            dag,
                            params={'date_str': date_str})
    suspected = ShellScript(loader['get_suspected.sh'], {
        'pdf': File(ROOT / 'suspected.pdf'),
        'csv': File(ROOT / 'suspected.csv')
    },
                            dag,
                            params={'date_str': date_str})

    confirmed_regex = re.compile(
        r'^(\d+)\s{1}([\w\s]+)\s{1}(FEMENINO|MASCULINO)\s{1}(\d+)\s{1}(.+)\s{1}(Confirmado)'
    )
    suspected_regex = re.compile(
        r'^(\d+)\s{1}([\w\s]+)\s{1}(FEMENINO|MASCULINO)\s{1}(\d+)\s{1}(.+)\s?(Sospechoso)'
    )

    clean_confirmed = PythonCallable(_clean,
                                     File(ROOT / 'confirmed_clean.csv'),
                                     dag,
                                     name='clean_confirmed',
                                     params={'regex': confirmed_regex})

    clean_suspected = PythonCallable(_clean,
                                     File(ROOT / 'suspected_clean.csv'),
                                     dag,
                                     name='clean_suspected',
                                     params={'regex': suspected_regex})

    agg = PythonCallable(_agg,
                         File(ROOT / 'cases_and_population.csv'),
                         dag,
                         name='cases_pop')

    confirmed >> clean_confirmed >> agg
    suspected >> clean_suspected >> agg
    pop_by_state >> agg

    if args.upload:
        upload_confirmed = UploadToS3(
            '{{upstream["clean_confirmed"]}}',
            GenericProduct(
                'mx-health-ministry/{}/confirmed.csv'.format(date_str)),
            dag,
            bucket='mx-covid-data',
            name='upload_mx_confirmed')
        upload_suspected = UploadToS3(
            '{{upstream["clean_suspected"]}}',
            GenericProduct(
                'mx-health-ministry/{}/suspected.csv'.format(date_str)),
            dag,
            bucket='mx-covid-data',
            name='upload_mx_suspected')

        clean_confirmed >> upload_confirmed
        clean_suspected >> upload_suspected

        upload_agg = UploadToS3(
            '{{upstream["cases_pop"]}}',
            GenericProduct(
                'mx-health-ministry/{}/cases_pop.csv'.format(date_str)),
            dag,
            bucket='mx-covid-data',
            name='upload_cases_pop')

        agg >> upload_agg

    return dag
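A sketch of a call site for this factory (hypothetical; note that make reads
args.upload from an argparse namespace defined elsewhere in the original
script):

from datetime import datetime

dag = make(datetime.now())
dag.build()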
Example 7
def get_client():
    return SQLAlchemyClient('sqlite://')
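Note that sqlite:// with no path is SQLAlchemy's URI for an in-memory SQLite
database: nothing is written to disk and the data disappears when the
connection closes, which makes it a good fit for fast, isolated tests
(Example 10 asserts exactly this behavior).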
Example 8
def get_metadata_client():
    return SQLAlchemyClient('sqlite:///metadata.db')
Example 9
def test_creates_absolute_dir_sqlalchemyclient(tmp_directory):
    intermediate_path = Path(tmp_directory, "an/absolute/path")
    client = SQLAlchemyClient(f'sqlite:///{intermediate_path}/my_db.db')
    client.execute('CREATE TABLE my_table (num INT)')
    assert intermediate_path.exists()
Example 10
def test_does_not_create_in_memory_sqlalchemyclient(tmp_directory):
    client = SQLAlchemyClient('sqlite://')
    client.execute('CREATE TABLE my_table (num INT)')
    # Assert no folder/file was created in the temporary folder:
    assert next(Path(tmp_directory).iterdir(), None) is None
Example 11
def test_creates_relative_dir_sqlalchemyclient(tmp_directory):
    intermediate_path = "a/relative/path"
    client = SQLAlchemyClient(f'sqlite:///{intermediate_path}/my_db.db')
    client.execute('CREATE TABLE my_table (num INT)')
    assert Path(tmp_directory + "/" + intermediate_path).exists()
Example 12
def test_pickle_sqlalchemyclient(tmp_directory):
    client = SQLAlchemyClient('sqlite:///my_db.db')
    client.execute('CREATE TABLE my_table (num INT)')
    assert pickle.dumps(client)
Example 13
def test_deepcopy_sqlalchemyclient(tmp_directory):
    client = SQLAlchemyClient('sqlite:///my_db.db')
    client.execute('CREATE TABLE my_table (num INT)')
    assert copy.deepcopy(client)
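Pickling and deep-copying matter because ploomber can build tasks in a
subprocess (Example 14 disables this via Serial(build_in_subprocess=False)),
and sending a task to another process requires serializing everything
attached to it, clients included.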
Example 14
def test_sql_runtime_params(tmp_directory):
    tmp = Path(tmp_directory)

    # create a db
    conn = connect(str(tmp / "database.db"))
    client = SQLAlchemyClient('sqlite:///{}'.format(tmp / "database.db"))

    # make some data and save it in the db
    df = pd.DataFrame({
        'number': range(10),
        'char': list(string.ascii_letters[:10])
    })
    df.to_sql('data', conn)

    # create the task and run it
    dag = DAG(executor=Serial(build_in_subprocess=False))

    t1 = SQLDump('SELECT * FROM data',
                 File('data.parquet'),
                 dag,
                 name='data',
                 client=client,
                 chunksize=None,
                 io_handler=io.ParquetIO)

    def select(product, upstream):
        numbers = list(pd.read_parquet(upstream['data']).number)
        numbers_selected = [n for n in numbers if n % 2 == 0]

        chars = list(pd.read_parquet(upstream['data']).char)
        chars_selected = [repr(c) for i, c in enumerate(chars) if i % 2 == 0]

        Path(product).write_text(
            json.dumps(dict(numbers=numbers_selected, chars=chars_selected)))

    t2 = PythonCallable(select, File('selected.json'), dag, name='selected')

    t3 = SQLDump("""
    SELECT * FROM data WHERE number
    NOT IN ([[get_key(upstream["selected"], "numbers") | join(", ") ]])
""",
                 File('numbers.parquet'),
                 dag,
                 name='numbers',
                 client=client,
                 chunksize=None,
                 io_handler=io.ParquetIO)

    t4 = SQLDump("""
    SELECT * FROM data WHERE char
    NOT IN ([[get_key(upstream["selected"], "chars") | join(", ") ]])
""",
                 File('chars.parquet'),
                 dag,
                 name='chars',
                 client=client,
                 chunksize=None,
                 io_handler=io.ParquetIO)

    t1 >> t2 >> t3
    t2 >> t4

    dag.build()

    assert list(pd.read_parquet('numbers.parquet').number) == [1, 3, 5, 7, 9]
    assert list(
        pd.read_parquet('chars.parquet').char) == ['b', 'd', 'f', 'h', 'j']
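The [[...]] delimiters mark placeholders resolved at build time, once
upstream tasks have produced their outputs, in contrast to the {{...}}
placeholders used elsewhere in these examples, which are rendered when the
DAG is declared; this is what lets t3 and t4 embed values computed by the
selected task.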
Example 15
###############################################################################
# first generate some sample data, one daily observation from 2010 to 2020
dates = pd.date_range('2010', '2020', freq='D')
df = pd.DataFrame({'date': dates, 'x': np.random.rand(len(dates))})

conn = sqlite3.connect(str(path_to_db))
df.to_sql('data', conn)
conn.close()

###############################################################################
# We now build the DAG

dag = DAG()

dag.clients[SQLDump] = SQLAlchemyClient('sqlite:///' + str(path_to_db))


def make_task(date_start, date_end, path, dag):
    """Task factory: returns a task that dumps certain year
    """
    sql = """
    SELECT * FROM data
    WHERE DATE('{{date_start}}') <= date AND date < DATE('{{date_end}}')
    """
    name = f'{date_start}-to-{date_end}.csv'
    return SQLDump(sql,
                   product=File(Path(path / name)),
                   dag=dag,
                   name=f'dump_{name}',
                   params={
Example 16
def get_client():
    return SQLAlchemyClient(get_uri())
Example 17
def make(date_):

    date_str = date_.strftime('%Y.%m.%d')
    ROOT = Path('data', date_str)
    ROOT.mkdir(exist_ok=True, parents=True)

    dag = DAG()

    client = SQLAlchemyClient('sqlite:///metadata.db')
    dag.clients[GenericProduct] = client

    loader = SourceLoader(path='.')

    confirmed = ShellScript(loader['get_confirmed.sh'], {
        'pdf': File(ROOT / 'confirmed.pdf'),
        'csv': File(ROOT / 'confirmed.csv')
    },
                            dag,
                            params={'date_str': date_str})
    suspected = ShellScript(loader['get_suspected.sh'], {
        'pdf': File(ROOT / 'suspected.pdf'),
        'csv': File(ROOT / 'suspected.csv')
    },
                            dag,
                            params={'date_str': date_str})

    confirmed_regex = re.compile(
        r'^(\d+)\s{1}([\w\s]+)\s{1}(FEMENINO|MASCULINO)\s{1}(\d+),(.+),(Confirmado)'
    )
    suspected_regex = re.compile(
        r'^(\d+)\s{1}([\w\s]+)\s{1}(FEMENINO|MASCULINO)\s{1}(\d+)\s{1}(.+)\s{1}(Sospechoso)'
    )

    clean_confirmed = PythonCallable(_clean,
                                     File(ROOT / 'confirmed_clean.csv'),
                                     dag,
                                     name='clean_confirmed',
                                     params={'regex': confirmed_regex})

    clean_suspected = PythonCallable(_clean,
                                     File(ROOT / 'suspected_clean.csv'),
                                     dag,
                                     name='clean_suspected',
                                     params={'regex': suspected_regex})

    confirmed >> clean_confirmed
    suspected >> clean_suspected

    if args.upload:
        upload_confirmed = UploadToS3(
            '{{upstream["clean_confirmed"]}}',
            GenericProduct(
                'mx-health-ministry/{}/confirmed.csv'.format(date_str)),
            dag,
            bucket='mx-covid-data',
            name='upload_mx_confirmed')
        upload_suspected = UploadToS3(
            '{{upstream["clean_suspected"]}}',
            GenericProduct(
                'mx-health-ministry/{}/suspected.csv'.format(date_str)),
            dag,
            bucket='mx-covid-data',
            name='upload_mx_suspected')

        clean_confirmed >> upload_confirmed
        clean_suspected >> upload_suspected

    return dag
Example 18
###############################################################################
# There are three core concepts in ``ploomber``: :class:`Tasks <ploomber.tasks>`,
# :class:`Products <ploomber.products>` and :class:`DAGs <ploomber.DAG>`. Tasks
# are units of work that generate Products (which are persistent changes on
# disk). Tasks are organized into a DAG which keeps track of declared
# dependencies among them.

dag = DAG()

# the first task dumps data from the db to the local filesystem
task_dump = SQLDump('SELECT * FROM example',
                    File(tmp_dir / 'example.csv'),
                    dag,
                    name='dump',
                    client=SQLAlchemyClient(uri),
                    chunksize=None)


# since this task has an upstream dependency, its function must accept an
# `upstream` parameter; every task function must accept a `product` parameter
def _add_one(upstream, product):
    """Add one to column a
    """
    df = pd.read_csv(str(upstream['dump']))
    df['a'] = df['a'] + 1
    df.to_csv(str(product), index=False)

# we convert the Python function into a Task
task_add_one = PythonCallable(_add_one,
                              File(tmp_dir / 'add_one.csv'),
Example 19
from pathlib import Path
import tempfile

import pandas as pd

from ploomber.clients import SQLAlchemyClient
from ploomber.tasks import SQLTransfer, SQLScript
from ploomber.products import SQLiteRelation
from ploomber import DAG, SourceLoader

###############################################################################
# We first setup our sample environment, a sqlite database with some data

tmp_dir = Path(tempfile.mkdtemp())
client = SQLAlchemyClient('sqlite:///' + str(tmp_dir / 'my_db.db'))
df = pd.DataFrame({'x': range(10)})
df.to_sql('data', client.engine)

###############################################################################
# We now simulate our code environment: a folder with SQL scripts. For
# simplicity, we save them in the same location as the data, but in a real
# project we should keep code and data separate

_ = Path(tmp_dir, 'data_select.sql').write_text('SELECT * FROM data')

###############################################################################
# Under the hood, SourceLoader initializes a jinja2.Environment, which allows
# us to use features such as macros
Path(tmp_dir, 'macros.sql').write_text("""
{% macro my_macro() -%}