Ejemplo n.º 1
0
def test_sample_dag(sqlite_client_and_tmp_dir, class_, identifier):
    """A one-task DAG backed by a SQLite metadata client produces its file
    and records metadata for the product.
    """
    client, _ = sqlite_client_and_tmp_dir

    dag = DAG()
    generic_product = GenericProduct('some_file.txt', client=client)
    PythonCallable(touch, generic_product, dag)
    dag.build()

    # the task must have created the file on disk...
    assert Path('some_file.txt').exists()
    # ...and the product must report existence plus stored metadata
    assert generic_product.exists()
    assert generic_product.fetch_metadata() is not None
Ejemplo n.º 2
0
def test_delete_sqlite_backend(sqlite_client_and_tmp_dir, class_, identifier):
    """Deleting a GenericProduct removes it from the SQLite metadata backend."""
    client, _ = sqlite_client_and_tmp_dir

    prod = GenericProduct('some_identifier.txt', client=client)
    prod.save_metadata({'metadata': 'value'})
    prod.delete()

    # after delete, the backend no longer knows about the product
    assert not prod.exists()
Ejemplo n.º 3
0
def test_upload_to_s3(s3, tmp_directory):
    """UploadToS3 pushes a local file into the bucket under the product key."""
    dag = DAG()
    # in-memory SQLite client to store GenericProduct metadata
    dag.clients[GenericProduct] = SQLAlchemyClient('sqlite://')

    Path('somefile.txt').touch()

    UploadToS3('somefile.txt',
               GenericProduct('somefile-in-s3.txt'),
               dag,
               bucket='some-bucket',
               name='s3_upload')

    dag.build()

    objects = s3.list_objects(Bucket='some-bucket')['Contents']

    # exactly one object, stored under the product's identifier
    assert len(objects) == 1
    assert objects[0]['Key'] == 'somefile-in-s3.txt'
def make(date_):
    """Assemble the MX COVID data pipeline DAG for a given date.

    Parameters
    ----------
    date_ : datetime.date or datetime.datetime
        Date used to name the local data directory (``data/YYYY.MM.DD``)
        and the per-date S3 keys.

    Returns
    -------
    DAG
        The assembled (not yet built) pipeline.

    Notes
    -----
    Reads the module-level ``args.upload`` flag to decide whether the
    S3 upload tasks are added to the DAG.
    """
    # one output directory per date, e.g. data/2020.04.01
    date_str = date_.strftime('%Y.%m.%d')
    ROOT = Path('data', date_str)
    ROOT.mkdir(exist_ok=True, parents=True)

    dag = DAG()

    # SQLite-backed metadata store for GenericProduct products
    client = SQLAlchemyClient('sqlite:///metadata.db')
    dag.clients[GenericProduct] = client

    # loads shell script sources from the current directory
    loader = SourceLoader(path='.')

    # population branch: download INEGI 2010 census zip, uncompress,
    # then aggregate population by state
    source = 'https://www.inegi.org.mx/contenidos/programas/ccpv/2010/datosabiertos/iter_nal_2010_csv.zip'
    population_zip = DownloadFromURL(source, File(ROOT / 'population.zip'),
                                     dag, name='population.zip')
    population = PythonCallable(_uncompress, File(ROOT / 'population'),
                                dag, name='population')
    pop_by_state = PythonCallable(_pop_by_state,
                                  File(ROOT / 'pop_by_state.csv'),
                                  dag, name='pop_by_state')

    population_zip >> population >> pop_by_state

    # cases branch: shell scripts fetch the official PDFs and extract text
    confirmed = ShellScript(loader['get_confirmed.sh'],
                            {'pdf': File(ROOT / 'confirmed.pdf'),
                             'csv': File(ROOT / 'confirmed.csv')},
                            dag,
                            params={'date_str': date_str})
    suspected = ShellScript(loader['get_suspected.sh'],
                            {'pdf': File(ROOT / 'suspected.pdf'),
                             'csv': File(ROOT / 'suspected.csv')},
                            dag,
                            params={'date_str': date_str})

    # row format extracted from the PDFs:
    # id, state, sex, age, free text, status
    # NOTE(review): suspected uses \s? before the status token while
    # confirmed uses \s{1} — presumably the PDFs differ; confirm
    confirmed_regex = re.compile(
        r'^(\d+)\s{1}([\w\s]+)\s{1}(FEMENINO|MASCULINO)\s{1}(\d+)\s{1}(.+)\s{1}(Confirmado)')
    suspected_regex = re.compile(
        r'^(\d+)\s{1}([\w\s]+)\s{1}(FEMENINO|MASCULINO)\s{1}(\d+)\s{1}(.+)\s?(Sospechoso)')

    clean_confirmed = PythonCallable(_clean,
                                     File(ROOT / 'confirmed_clean.csv'),
                                     dag,
                                     name='clean_confirmed',
                                     params={'regex': confirmed_regex})

    clean_suspected = PythonCallable(_clean,
                                     File(ROOT / 'suspected_clean.csv'),
                                     dag,
                                     name='clean_suspected',
                                     params={'regex': suspected_regex})

    # join cleaned cases with population-by-state
    agg = PythonCallable(_agg,
                         File(ROOT / 'cases_and_population.csv'),
                         dag,
                         name='cases_pop')

    confirmed >> clean_confirmed >> agg
    suspected >> clean_suspected >> agg
    pop_by_state >> agg

    # optional S3 uploads (source strings are rendered from upstream products)
    if args.upload:
        upload_confirmed = UploadToS3('{{upstream["clean_confirmed"]}}',
                                      GenericProduct(
                                          'mx-health-ministry/{}/confirmed.csv'.format(date_str)), dag,
                                      bucket='mx-covid-data',
                                      name='upload_mx_confirmed')
        upload_suspected = UploadToS3('{{upstream["clean_suspected"]}}',
                                      GenericProduct(
                                          'mx-health-ministry/{}/suspected.csv'.format(date_str)),
                                      dag,
                                      bucket='mx-covid-data',
                                      name='upload_mx_suspected')

        clean_confirmed >> upload_confirmed
        clean_suspected >> upload_suspected

        upload_agg = UploadToS3('{{upstream["cases_pop"]}}',
                                GenericProduct(
                                    'mx-health-ministry/{}/cases_pop.csv'.format(date_str)),
                                dag,
                                bucket='mx-covid-data',
                                name='upload_cases_pop')

        agg >> upload_agg

    return dag
Ejemplo n.º 5
0
             kernelspec_name=None,
             static_analysis=False,
             kwargs={})
    ],
    [
        SQLScript,
        dict(source='CREATE TABLE {{product}} FROM some_table', kwargs={})
    ],
    [SQLDump, dict(source='SELECT * FROM some_tablle', kwargs={})],
])
def test_init_source(class_, kwargs):
    """The task class builds a truthy source object from the given kwargs."""
    initialized = class_._init_source(**kwargs)
    assert initialized


# each case: task class, a matching product, and a source string
@pytest.mark.parametrize('Task, prod, source', [
    (ShellScript, GenericProduct('file.txt'), 'touch {{product}}'),
    (SQLScript, GenericSQLRelation(
        ('name', 'table')), 'CREATE TABLE {{product}}'),
    (SQLDump, GenericProduct('file.txt'), 'SELECT * FROM {{upstream["key"]}}'),
    (SQLTransfer, GenericSQLRelation(
        ('name', 'table')), 'SELECT * FROM {{upstream["key"]}}'),
    (SQLUpload, GenericSQLRelation(('name', 'table')), 'some_file.txt'),
    (PostgresCopyFrom, PostgresRelation(('name', 'table')), 'file.parquet')
])
def test_task_init_source_with_placeholder_obj(Task, prod, source):
    """
    Testing we can initialize a task with a Placeholder as the source argument
    """
    # mock clients so no real database/shell is needed
    dag = DAG()
    dag.clients[Task] = Mock()
    dag.clients[type(prod)] = Mock()
    # NOTE(review): only the client setup is visible here; the actual Task
    # construction from `source`/`prod` appears truncated — verify upstream
Ejemplo n.º 6
0
                              name='deaths_clean')

deaths >> clean_deaths


def _mortality_rate(upstream, product):
    pop = pd.read_csv(str(upstream['pop_clean']))
    deaths = pd.read_csv(str(upstream['deaths_clean']))

    df = deaths.merge(pop, on='country', how='left')
    df['deaths_over_100k'] = df.deaths / (df.population / 100_000)
    df.to_csv(str(product), index=False)


# join cleaned deaths with cleaned population into a per-country rate
# (ROOT, dag, clean_deaths, clean and args are defined earlier in the script)
rate = PythonCallable(_mortality_rate, File(ROOT / 'mortality_rate.csv'), dag,
                      name='mortality_rate')

# both cleaning tasks must finish before computing the rate
(clean_deaths + clean) >> rate

# optional S3 upload of the final CSV
if args.upload:
    upload = UploadToS3('{{upstream["mortality_rate"]}}',
                        GenericProduct('mortality_rate_s3'), dag,
                        bucket='mx-covid-data',
                        name='upload_s3')

    rate >> upload

# run the pipeline and show the build summary
table = dag.build()

print(table)
Ejemplo n.º 7
0
def make(date_):
    """Assemble the MX health-ministry cases pipeline DAG for a given date.

    Parameters
    ----------
    date_ : datetime.date or datetime.datetime
        Date used to name the local data directory (``data/YYYY.MM.DD``)
        and the per-date S3 keys.

    Returns
    -------
    DAG
        The assembled (not yet built) pipeline.

    Notes
    -----
    Reads the module-level ``args.upload`` flag to decide whether the
    S3 upload tasks are added to the DAG.
    """
    # one output directory per date, e.g. data/2020.04.01
    date_str = date_.strftime('%Y.%m.%d')
    ROOT = Path('data', date_str)
    ROOT.mkdir(exist_ok=True, parents=True)

    dag = DAG()

    # SQLite-backed metadata store for GenericProduct products
    client = SQLAlchemyClient('sqlite:///metadata.db')
    dag.clients[GenericProduct] = client

    # loads shell script sources from the current directory
    loader = SourceLoader(path='.')

    # shell scripts fetch the official PDFs and extract their text to CSV
    confirmed = ShellScript(loader['get_confirmed.sh'], {
        'pdf': File(ROOT / 'confirmed.pdf'),
        'csv': File(ROOT / 'confirmed.csv')
    },
                            dag,
                            params={'date_str': date_str})
    suspected = ShellScript(loader['get_suspected.sh'], {
        'pdf': File(ROOT / 'suspected.pdf'),
        'csv': File(ROOT / 'suspected.csv')
    },
                            dag,
                            params={'date_str': date_str})

    # row format extracted from the PDFs:
    # id, state, sex, age, free text, status
    # NOTE(review): the confirmed pattern is comma-delimited after the age
    # field while the suspected one is whitespace-delimited — presumably the
    # two PDF layouts differ; confirm against the source documents
    confirmed_regex = re.compile(
        r'^(\d+)\s{1}([\w\s]+)\s{1}(FEMENINO|MASCULINO)\s{1}(\d+),(.+),(Confirmado)'
    )
    suspected_regex = re.compile(
        r'^(\d+)\s{1}([\w\s]+)\s{1}(FEMENINO|MASCULINO)\s{1}(\d+)\s{1}(.+)\s{1}(Sospechoso)'
    )

    clean_confirmed = PythonCallable(_clean,
                                     File(ROOT / 'confirmed_clean.csv'),
                                     dag,
                                     name='clean_confirmed',
                                     params={'regex': confirmed_regex})

    clean_suspected = PythonCallable(_clean,
                                     File(ROOT / 'suspected_clean.csv'),
                                     dag,
                                     name='clean_suspected',
                                     params={'regex': suspected_regex})

    confirmed >> clean_confirmed
    suspected >> clean_suspected

    # optional S3 uploads (source strings are rendered from upstream products)
    if args.upload:
        upload_confirmed = UploadToS3(
            '{{upstream["clean_confirmed"]}}',
            GenericProduct(
                'mx-health-ministry/{}/confirmed.csv'.format(date_str)),
            dag,
            bucket='mx-covid-data',
            name='upload_mx_confirmed')
        upload_suspected = UploadToS3(
            '{{upstream["clean_suspected"]}}',
            GenericProduct(
                'mx-health-ministry/{}/suspected.csv'.format(date_str)),
            dag,
            bucket='mx-covid-data',
            name='upload_mx_suspected')

        clean_confirmed >> upload_confirmed
        clean_suspected >> upload_suspected

    return dag