from dataflows import Flow, aggregate


def process_data(rows):
    """Group incoming rows by customer id and sum their revenue.

    Yields the aggregated rows produced by ``aggregate``.

    NOTE(review): this usage (``aggregate(rows, fields)``, ``Flow`` as a
    context manager with ``transform``/``process`` methods) does not match
    the published dataflows API, where flows are typically built as
    ``Flow(source, step, ...).process()`` — confirm against the library
    version actually in use.
    """
    # Aggregation spec: group on customer_id, reduce revenue with sum.
    spec = {
        'customer_id': ['customer_id'],
        'total_revenue': ('revenue', sum),
    }
    yield from aggregate(rows, spec)


# Wire the transform into a named flow and run it.
with Flow("customer-revenue") as flow:
    flow.transform(process_data)
    flow.process()
from dataflows import Flow, validate, update_schema


def process_data(rows):
    """Validate customer rows and apply schema updates.

    Yields the rows produced by ``validate`` after applying per-field
    schema updates (missing-value markers, types, defaults) and a
    minimum-length check on ``name``.

    NOTE(review): ``validate(rows, schema_updates=..., validation=...)``
    and the ``Flow`` context-manager usage below do not match the
    published dataflows API — confirm against the library version in use.
    """
    # Per-field schema adjustments: missing-value markers and defaults.
    updates = {
        'name': {'missingValues': ['']},
        'age': {'type': 'integer', 'default': 0},
        'email': {'missingValues': [''], 'default': '[email protected]'},
    }
    # Validation rule: name must be a string of at least 3 characters.
    rules = {'name': {'type': 'string', 'minLength': 3}}
    yield from validate(rows, schema_updates=updates, validation=rules)


# Wire the cleaning transform into a named flow and run it.
with Flow("customer-cleaning") as flow:
    flow.transform(process_data)
    flow.process()

# These examples illustrate the usefulness of Python Dataflows for
# processing data in a structured and scalable way. By using a declarative
# syntax, you can easily define complex data processing flows and maintain
# them over time as your data changes.