Code example #1
def test_change_acl_on_s3_no_path_provided(s3_client, bucket):

    # Prepare paths
    paths = [
        'my/private/datasets/file_1.csv',
        'my/private/datasets/file_2.csv',
    ]

    # Fill the S3 bucket
    for path in paths:
        s3_client.put_object(Body='body', Bucket=bucket, Key=path, ACL='public-read')

    # Set private ACL using the processor
    flow = Flow(
        load('data/data.csv'),
        change_acl_on_s3(
            bucket=bucket,
            acl='private',
            endpoint_url=os.environ['S3_ENDPOINT_URL'],
        ),
    )
    flow.process()

    # Assert everything is private now
    for path in paths:
        url = '{}/{}/{}'.format(os.environ['S3_ENDPOINT_URL'], bucket, path)
        assert requests.get(url).status_code == 403
Code example #2
File: helpers.py Project: saeelparsekar/emojis
def generate_package():
    package_flow = Flow(
        add_metadata(
            name="unicode-emojis",
            title="UTS #51 Unicode Emoji",
            description=(
                "List of emojis available from the Unicode Consortium. "
                "More information can be found in the Unicode® Technical Standard #51."
            ),
            sources=[
                {
                    "name": "unicode-emoji",
                    "path": "https://unicode.org/Public/emoji/latest/emoji-test.txt",
                    "title": "UTS #51 Unicode Emoji",
                },
            ],
            licenses=[
                {
                    "name": "ODC-PDDL-1.0",
                    "path": "http://opendatacommons.org/licenses/pddl/",
                    "title": "Open Data Commons Public Domain Dedication and License v1.0",
                }
            ],
            keywords=["unicode", "emojis", "emoji", "51", "standard", "uts"],
        ),
        load(load_source="data/emojis.csv", format="csv"),
        validate(),
        dump_to_path(),
    )
    package_flow.process()
Code example #3
def test_dump_to_s3_non_existent_bucket(s3_client, bucket):

    # Delete bucket
    s3_client.delete_bucket(Bucket=bucket)

    # Dump to S3 using the processor
    flow = Flow(
        load('data/data.csv'),
        dump_to_s3(
            bucket=bucket,
            acl='private',
            path='my/datapackage',
            endpoint_url=os.environ['S3_ENDPOINT_URL'],
        ),
    )
    flow.process()

    # Check datapackage.json content
    response = s3_client.get_object(Bucket=bucket, Key='my/datapackage/datapackage.json')
    descriptor = json.loads(response['Body'].read().decode('utf-8'))
    assert descriptor['resources'][0]['schema']['fields'][0]['name'] == 'id'
    assert descriptor['resources'][0]['schema']['fields'][1]['name'] == 'name'

    # Check data.csv content
    response = s3_client.get_object(Bucket=bucket, Key='my/datapackage/data.csv')
    contents = response['Body'].read().decode('utf-8')
    assert contents == 'id,name\r\n1,english\r\n2,中国人\r\n'
Code example #4
def test_change_acl_on_s3_handles_more_than_1000_files(s3_client, bucket):
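    # S3 list operations return at most 1,000 keys per page, so 1,100 files exercises pagination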

    # Prepare paths
    paths = []
    for index in range(1, 1101):
        path = 'my/private/datasets/file_%s.csv' % index
        paths.append(path)

    # Fill the S3 bucket
    for path in paths:
        s3_client.put_object(Body='body', Bucket=bucket, Key=path, ACL='public-read')

    # Set private ACL using the processor
    flow = Flow(
        load('data/data.csv'),
        change_acl_on_s3(
            bucket=bucket,
            acl='private',
            path='my/private/datasets',
            endpoint_url=os.environ['S3_ENDPOINT_URL'],
        ),
    )
    flow.process()

    # Assert everything is private now
    for path in paths:
        url = '{}/{}/{}'.format(os.environ['S3_ENDPOINT_URL'], bucket, path)
        assert requests.get(url).status_code == 403
Code example #5
def test_change_acl_on_s3(s3_client, bucket):

    # Prepare paths
    paths = [
        'my/private/datasets/README.md',
        'my/private/datasets/datapackage.json',
        'my/private/datasets/data/mydata.csv',
        'my/public/datasets/data/mydata.csv',
    ]

    # Fill the S3 bucket
    for path in paths:
        s3_client.put_object(Body='body', Bucket=bucket, Key=path, ACL='public-read')

    # Assert all contents are public by default
    for path in paths:
        url = '{}/{}/{}'.format(os.environ['S3_ENDPOINT_URL'], bucket, path)
        assert requests.get(url).status_code == 200

    # Set private ACL using the processor
    flow = Flow(
        load('data/data.csv'),
        change_acl_on_s3(
            bucket=bucket,
            acl='private',
            path='my/private/datasets',
            endpoint_url=os.environ['S3_ENDPOINT_URL'],
        ),
    )
    flow.process()

    # Assert only public contents are public
    for path in paths:
        url = '{}/{}/{}'.format(os.environ['S3_ENDPOINT_URL'], bucket, path)
        assert requests.get(url).status_code == (200 if 'public' in path else 403)
Code example #6
File: update.py Project: loleg/opendatach-stats
def conference_csv():
    flow = Flow(
        # Load inputs
        load(
            od19_base + od19_feedback,
            name='feedback',
            format='csv',
        ),
        load(
            od19_base + od19_analysis,
            name='analysis',
            format='csv',
        ),
        # Process them
        set_type("Anzahl.*", type='integer', resources='analysis'),
        delete_fields([
            "Anzahl Auflistung",
            ".*\\(Formel\\)",
            ".*Duplikate",
        ], resources='analysis'),
        not_empty_groupcol,
        # Save the results
        add_metadata(
            name='opendatach19',
            title='''Opendata.ch/2019 Forum''',
            licenses=[{
                "name":
                "ODC-PDDL-1.0",
                "path":
                "http://opendatacommons.org/licenses/pddl/",
                "title":
                "Open Data Commons Public Domain Dedication and License v1.0"
            }],
            maintainers=[{
                "name": "Oleg Lavrovsky",
                "web": "https://datalets.ch/"
            }],
            views=[{
                "name": "Groups",
                "resources": ["analysis"],
                "spec": {
                    "group": "Alle "
                    "Bedürfnisse"
                    "",
                    "series": ["Anzahl Auflistung (Zahl)"],
                    "type": "bar"
                },
                "specType": "simple",
                "title": "Topic counts"
            }]),
        printer(),
        validate(),
        dump_to_path('data/opendatach19'),
    )
    flow.process()
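
not_empty_groupcol is referenced above but defined elsewhere in update.py; a plausible sketch of such a step (the grouping column name is assumed, not from the original):

# Hypothetical rows processor; the real not_empty_groupcol lives elsewhere in update.py
def not_empty_groupcol(rows):
    # Keep only rows whose grouping column is non-empty
    for row in rows:
        if row.get('Alle Bedürfnisse'):
            yield row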
Code example #7
def decp_processing():
    flow = Flow(

        # Load the CSV resulting from the JSON conversion
        load("decp.csv"),
        set_type("acheteur.id", type="string"),
        set_type("titulaire.id", type="string"),
        set_type("codeCPV", type="string"),
        set_type("lieuExecution.code", type="string"),

        # Sort by rootId and seq to prepare for building donneesActuelles
        sort_rows('{rootId}:{seq}', resources=0, reverse=True),
        donnees_actuelles,

        # rootId and seq can now be deleted
        delete_fields(["rootId", "seq"], resources=0, regex=False),
        sort_rows('{datePublicationDonnees}', resources=0, reverse=True),

        # New table dedicated to the contracts, without contractor data
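        # NB: the print() steps below run when the Flow object is built, not while rows stream through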
        print("Création de la table dédiée aux marchés..."),
        duplicate(source="decp",
                  target_name="decp-sans-titulaires",
                  target_path="decp-sans-titulaires.csv",
                  duplicate_to_end=True),
        delete_fields([
            "titulaire.id", "titulaire.denominationSociale",
            "titulaire.typeIdentifiant"
        ],
                      resources="decp-sans-titulaires",
                      regex=False),
        set_primary_key(["uid"], resources="decp-sans-titulaires"),
        deduplicate(),

        # Load the previous DECP in CSV format, to extract the new data
        # print("Téléchargement des données tabulaires précédentes..."),
        # load("https://decp.info/db/decp.csv?_size=max&_dl=1", name="previous-decp"),
        # set_type("acheteur.id", type="string"),
        # set_type("titulaire.id", type="string"),
        # set_type("codeCPV", type="string"),
        # set_type("lieuExecution.code", type="string"),
        # delete_fields(["rowid"], resources="previous-decp", regex=False),
        # #print("Fusion des données tabulaires précédentes et des données d'aujourd'hui..."),
        # concatenate({},target={"name": "decp-titulaires","path": "decp-titulaires.csv"},resources=["decp","previous-decp"]),

        # Load the previous contractor-specific data
        print("Chargement des données titulaires..."),
        load("decp-titulaires.csv", name="decp-titulaires"),
        set_type("acheteur.id", type="string"),
        set_type("titulaire.id", type="string"),
        set_type("codeCPV", type="string"),
        set_type("lieuExecution.code", type="string"),
        set_type("departement", type="string"),
        set_type("codeAPE", type="string"),
        print("Enregistrement des données sur le disque..."),
        dump_to_path("decp"))
    flow.process()
Code example #8
def AFRR_Data():
    unpivoting_fields = [{
        'name': 'aFRR_DownActivated',
        'keys': {
            'product': 'aFRR_DownActivated'
        }
    }, {
        'name': 'aFRR_UpActivated',
        'keys': {
            'product': 'aFRR_UpActivated'
        }
    }]
    extra_keys = [{'name': 'product', 'type': 'string'}]
    extra_value = {'name': 'amount', 'type': 'number'}
    flow = Flow(
        # Load inputs - using the 'datastore_search_sql' API, load the last 1,000 rows:
        load(
            'https://api.energidataservice.dk/datastore_search_sql?sql=select%20*%20from%20afrrreservesdk1%20order%20by%20"HourUTC"%20desc%20limit%201000',
            format="json",
            property="result.records",
            name="fact_afrr"),
        # Remove extra fields:
        delete_fields(fields=['_id', '_full_text', 'HourDK']),
        # Save the results
        checkpoint('afrr'),
        # Normalize/unpivot:
        unpivot(unpivoting_fields, extra_keys, extra_value),
        add_computed_field([
            dict(target=dict(name='PriceArea', type='string'),
                 operation='constant',
                 with_='DK1'),
            dict(target=dict(name='PriceDKK', type='number'),
                 operation='constant',
                 with_='dummy'),
            dict(target=dict(name='PriceEUR', type='number'),
                 operation='constant',
                 with_='dummy')
        ]),
        add_price,
        delete_fields(fields=[
            'aFRR_DownPriceDKK', 'aFRR_DownPriceEUR', 'aFRR_UpPriceDKK',
            'aFRR_UpPriceEUR'
        ]),
        add_metadata(name='marketdata', title='Marketdata prototype'),
        update_resource(resources=None, mediatype='text/csv'),
        update_resource(
            resources='fact_afrr',
            title='Automatic Frequency Restoration Reserves',
            source=
            'https://www.energidataservice.dk/dataset/afrrreservesdk1/resource_extract/0694e216-6713-4f84-9b98-7bb5bc11d80c'
        ),
        printer(),
        dump_to_path('afrr_data'))
    flow.process()
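
add_price is referenced above but defined elsewhere in the project; a minimal sketch of what such a row processor could look like (the field mapping is assumed, not from the original):

# Hypothetical row processor; the real add_price lives elsewhere in the project
def add_price(row):
    # Copy the price matching the unpivoted product into the placeholder fields
    suffix = 'Down' if row['product'] == 'aFRR_DownActivated' else 'Up'
    row['PriceDKK'] = row['aFRR_%sPriceDKK' % suffix]
    row['PriceEUR'] = row['aFRR_%sPriceEUR' % suffix]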
Code example #9
File: test_examples.py Project: nipoitra80/dataflows
def test_validate():
    import pytest
    from dataflows import Flow, validate, set_type, printer, ValidationError, exceptions

    def adder(row):
        row['a'] += 0.5
        row['a'] = str(row['a'])

    f = Flow((dict(a=x) for x in range(10)), set_type('a', type='integer'),
             adder, validate(), printer())
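    # set_type declares 'a' as an integer field, but adder turns each value into a
    # string, so validate() fails and Flow surfaces it wrapped in a ProcessorError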

    with pytest.raises(exceptions.ProcessorError) as excinfo:
        f.process()
    assert isinstance(excinfo.value.cause, ValidationError)
Code example #10
def test_change_acl_on_s3_handles_non_existing_keys(s3_client, bucket):

    # Set private ACL using the processor
    # Assert not failing (does nothing)
    flow = Flow(
        load('data/data.csv'),
        change_acl_on_s3(
            bucket=bucket,
            acl='private',
            path='my/non-existing/datasets',
            endpoint_url=os.environ['S3_ENDPOINT_URL'],
        ),
    )
    flow.process()
Code example #11
def update_dataset():
    flow = Flow(
        # Load inputs
        load(f'{BASE_URL}{CONFIRMED}'),
        load(f'{BASE_URL}{RECOVERED}'),
        load(f'{BASE_URL}{DEATH}'),
        checkpoint('load_data'),
        # Process them (if necessary)
        # Save the results
        add_metadata(name='csse_covid_19_time_series', title='''csse_covid_19_time_series'''),
        printer(),
        dump_to_path(),
    )
    flow.process()
Code example #12
def test_dump_to_sql():
    from dataflows import Flow, printer, dump_to_sql
    from sqlalchemy import create_engine

    f = Flow(
        data, printer(),
        dump_to_sql(dict(output_table={'resource-name': 'res_1'}),
                    engine='sqlite:///out/test.db'))
    f.process()

    # Check validity
    engine = create_engine('sqlite:///out/test.db')
    result = list(
        dict(x) for x in engine.execute('select * from output_table'))
    assert result == data
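
The test relies on a module-level data fixture defined elsewhere in the file; a minimal sketch of its likely shape (contents assumed, not from the original) is:

# Hypothetical fixture; the real data list is defined elsewhere in the test module
data = [
    {'x': 1, 'y': 'a'},
    {'x': 2, 'y': 'b'},
]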
Code example #13
File: test_examples.py Project: vitaly-am/dataflows
def test_example_5():
    from dataflows import Flow, set_type, dump_to_path

    f = Flow(country_population(),
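             # groupChar=',' lets grouped values such as '1,000,000' parse as numbers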
             set_type('population', type='number', groupChar=','),
             dump_to_path('out/country_population'))
    _ = f.process()
Code example #14
def test_example_75():
    from dataflows import Flow, load, dump_to_path


    def add_is_guitarist_column_to_schema(package):

        # Add a new field to the first resource
        package.pkg.descriptor['resources'][0]['schema']['fields'].append(dict(
            name='is_guitarist',
            type='boolean'
        ))
        # Must yield the modified datapackage
        yield package.pkg
        yield from package

    def add_is_guitarist_column(row):
        row['is_guitarist'] = row['instrument'] == 'guitar'
        return row

    f = Flow(
        # Same one as above
        load('data/beatles.csv'),
        add_is_guitarist_column_to_schema,
        add_is_guitarist_column,
        dump_to_path('out/beatles_guitarists2')
    )
    _ = f.process()
Code example #15
File: test_examples.py Project: vitaly-am/dataflows
def test_example_8():
    from dataflows import Flow, load, dump_to_path

    def find_double_winners(package):

        # Remove the emmies resource - we're going to consume it now
        package.pkg.remove_resource('emmies')
        # Must yield the modified datapackage
        yield package.pkg

        # Now iterate on all resources
        resources = iter(package)

        # Emmies is the first - read all its data and create a set of winner names
        emmy = next(resources)
        emmy_winners = set(
            map(lambda x: x['nominee'], filter(lambda x: x['winner'], emmy)))

        # Oscars are next - filter rows based on the emmy winner set
        academy = next(resources)
        yield filter(lambda row: row['Winner'] and row['Name'] in emmy_winners,
                     academy)

    f = Flow(
        # Emmy award nominees and winners
        load('data/emmy.csv', name='emmies'),
        # Academy award nominees and winners
        load('data/academy.csv', encoding='utf8', name='oscars'),
        find_double_winners,
        dump_to_path('out/double_winners'))
    _ = f.process()
Code example #16
File: test_examples.py Project: vitaly-am/dataflows
def test_example_7():
    from dataflows import Flow, load, dump_to_path

    def add_is_guitarist_column(package):

        # Add a new field to the first resource
        package.pkg.descriptor['resources'][0]['schema']['fields'].append(
            dict(name='is_guitarist', type='boolean'))
        # Must yield the modified datapackage
        yield package.pkg

        # Now iterate on all resources
        resources = iter(package)
        beatles = next(resources)

        def f(row):
            row['is_guitarist'] = row['instrument'] == 'guitar'
            return row

        yield map(f, beatles)

    f = Flow(
        # Same one as above
        load('data/beatles.csv'),
        add_is_guitarist_column,
        dump_to_path('out/beatles_guitarists'))
    _ = f.process()
Code example #17
def main():
    config = Config(sys.argv[1] if len(sys.argv) > 1 else 'dgp.yaml')
    taxonomy_registry = TaxonomyRegistry('taxonomies/index.yaml')
    context = Context(config, taxonomy_registry)

    dgp = SimpleDGP(config, context)
    ret = dgp.analyze()
    if not ret:
        print('Errors:', '\n\t - '.join([str(x) for x in dgp.errors]))
        sys.exit(1)

    flow = dgp.flow()
    flow = Flow(flow, dump_to_path('output'))
    flow.process()

    print('----')
    print('Success:', ret)
Code example #18
File: update.py Project: luc-gr/panoptikum-staging
def WERKVERZEICHNIS_csv():
    flow = Flow(
        # Load inputs
        # load('input/WERKVERZEICHNIS_ohne W.xlsx', format='xlsx', ),
        load(
            'input/WERKVERZEICHNIS.csv',
            format='csv',
        ),
        # Process them (if necessary)
        # ...
        # Save the results
        add_metadata(name='Werkverzeichnis-JStraumann',
                     title='''Werkverzeichnis Jürg Straumann'''),
        # printer(),
        dump_to_path('data'),
    )
    flow.process()
Code example #19
def operator(name, params, pipeline):
    with tempfile.NamedTemporaryFile(mode='w', encoding='utf-8') as config_file:
        params['dgpConfig'].setdefault('publish', {})['allowed'] = True
        metadata = params['dgpConfig'].setdefault('extra', {}).setdefault('metadata', {})
        metadata['title'] = name
        metadata['dag_id'] = pipeline['id']
        metadata['updated_at'] = pipeline['__updated_at']
        metadata['created_at'] = pipeline['__created_at']
        for k, v in params.items():
            if k.startswith('extra.'):
                set_dots(params['dgpConfig'], k, v)
        logging.info('\nCONFIGURATION:\n--------------\n%s', 
                     json.dumps(params['dgpConfig'], sort_keys=True, ensure_ascii=False, indent=2))
        yaml.dump(params['dgpConfig'], config_file)
        config_file.flush()
        config = Config(config_file.name)
        taxonomy_registry = TaxonomyRegistry('taxonomies/index.yaml')
        context = Context(config, taxonomy_registry)

        logging.getLogger().setLevel(logging.INFO)

        steps = [
            FileLoaderDGP,
            LoaderDGP,
            PostLoaderDGP,
            TransformDGP,
            EnricherDGP,
            PublisherDGP,
        ]

        dgp = SimpleDGP(
            config, context,
            steps=steps
        )

        ret = dgp.analyze()
        if not ret:
            logging.error('Errors:')
            logging.error('\n\t - '.join([str(x) for x in dgp.errors]))
            assert False

        # logging.info('\nCONF (POST ANALYSIS):\n--------------\n%s', 
        #              json.dumps(config._unflatten(), sort_keys=True, ensure_ascii=False, indent=2))

        logging.info('Creating Flow')
        flow = dgp.flow()
        flow = Flow(
            flow,
            printer(tablefmt='html')
        )
        logging.info('Running Flow')
        _, stats = flow.process()

        logging.info('Success')

        return stats
Code example #20
File: elspot.py Project: anuveyatsu/marketdata-flows
def Elspot_Prices_Data():
    # field_metadata = get_metadata('c86859d2-942e-4029-aec1-32d56f1a2e5d')
    flow = Flow(
        # Load inputs - using the 'datastore_search_sql' API, load the last 100 rows:
        load(
            'https://api.energidataservice.dk/datastore_search_sql?sql=select%20*%20from%20elspotprices%20order%20by%20"HourUTC"%20desc%20limit%20100',
            format="json",
            property="result.records",
            name="fact_elspot_prices"),
        # Remove extra fields:
        delete_fields(fields=['_id', '_full_text', 'HourDK']),
        # Save the results
        checkpoint('load_data'),
        # Add product:
        add_computed_field([
            dict(target=dict(name='product', type='string'),
                 operation='constant',
                 with_='Elspot'),
            dict(target=dict(name='amount', type='number'),
                 operation='constant',
                 with_=1),
            dict(target=dict(name='PriceDKK', type='number'),
                 operation='constant',
                 with_=-1),
            dict(target=dict(name='PriceEUR', type='number'),
                 operation='constant',
                 with_=-1)
        ]),
        add_price,
        delete_fields(fields=['SpotPriceDKK', 'SpotPriceEUR']),
        add_metadata(name='marketdata', title='Marketdata prototype'),
        update_resource(resources=None, mediatype='text/csv'),
        update_resource(
            resources='fact_elspot_prices',
            title='Elspot Prices Data',
            source=
            'https://www.energidataservice.dk/dataset/elspotprices/resource_extract/c86859d2-942e-4029-aec1-32d56f1a2e5d'
        ),
        printer(),
        dump_to_path('elspot_prices_data'),
        # dump_to_sql(tables={'elspot': {'resource-name': 'Elspot_Prices_Data', 'mode': 'append'}}, engine='postgresql://*****:*****@localhost/cubes')
    )
    flow.process()
Code example #21
def test_dump_to_postgis():
    from dataflows import Flow, load
    from lib import dump_to_postgis
    from sqlalchemy import create_engine
    import os

    table_name = 'nycha_policeservice'
    url = 'https://data.cityofnewyork.us/api/views/bvi6-r9nk/rows.csv?accessType=DOWNLOAD'
    f = Flow(
        load(url, name=table_name, format='csv', force_strings=True),
        dump_to_postgis(engine='env://DATAFLOWS_DB_ENGINE'),
    )

    f.process()

    engine = create_engine(os.environ.get('DATAFLOWS_DB_ENGINE'))
    result = list(dict(x) for x in engine.execute(f'select wkb_geometry from {table_name} limit 1'))
    print(result)
    assert result == [{'wkb_geometry': '0101000020E61000001FCF149C617F52C0D7CE5B5AD9494440'}]
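
Both the 'env://DATAFLOWS_DB_ENGINE' engine argument and the create_engine() call read the connection string from the environment, so the variable must be set before running; an illustrative value (URL assumed, not from the original):

# Hypothetical connection string for a local PostGIS-enabled database
# export DATAFLOWS_DB_ENGINE=postgresql://user:pass@localhost:5432/gisdb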
Code example #22
File: test_examples.py Project: vitaly-am/dataflows
def test_validate():
    from dataflows import Flow, validate, set_type, printer, ValidationError

    def adder(row):
        row['a'] += 0.5
        row['a'] = str(row['a'])

    f = Flow((dict(a=x) for x in range(10)), set_type('a', type='integer'),
             adder, validate(), printer())
    try:
        _ = f.process()
        assert False
    except ValidationError:
        pass
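
Note: this excerpt catches ValidationError directly; newer dataflows releases wrap the failure in exceptions.ProcessorError, as code example #9 above demonstrates.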
Code example #23
File: test_examples.py Project: vitaly-am/dataflows
def test_example_6():
    from dataflows import Flow, set_type, dump_to_path

    def all_triplets():
        for a in range(1, 21):
            for b in range(a, 21):
                for c in range(b + 1, 21):
                    yield dict(a=a, b=b, c=c)

    def filter_pythagorean_triplets(rows):
        for row in rows:
            if row['a']**2 + row['b']**2 == row['c']**2:
                yield row

    f = Flow(all_triplets(), set_type('a', type='integer'),
             set_type('b', type='integer'), set_type('c', type='integer'),
             filter_pythagorean_triplets,
             dump_to_path('out/pythagorean_triplets'))
    _ = f.process()
Code example #24
File: test_examples.py Project: vitaly-am/dataflows
def test_example_9():
    from dataflows import Flow, load, dump_to_path, join, concatenate, filter_rows

    f = Flow(
        # Emmy award nominees and winners
        load('data/emmy.csv', name='emmies'),
        filter_rows(equals=[dict(winner=1)]),
        concatenate(dict(emmy_nominee=['nominee']),
                    dict(name='emmies_filtered'),
                    resources='emmies'),
        # Academy award nominees and winners
        load('data/academy.csv', encoding='utf8', name='oscars'),
        join(
            'emmies_filtered',
            ['emmy_nominee'],  # Source resource
            'oscars',
            ['Name'],  # Target resource
            full=False  # Don't add new fields, remove unmatched rows
        ),
        filter_rows(equals=[dict(Winner='1')]),
        dump_to_path('out/double_winners'))
    _ = f.process()
Code example #25
    set_type('Rate', resources='quarterly', type='number',
             description='Quarterly average yield from British Government Securities, 10 year Nominal Par Yield'),
    set_type('Year', resources='annual', type='date', format='any'),
    set_type('Rate', resources='annual', type='number',
             description='Annual average yield from British Government Securities, 10 year Nominal Par Yield'),
    update_resource('quarterly', **{'path': 'data/quarterly.csv', 'dpp:streaming': True}),
    update_resource('annual', **{'path': 'data/annual.csv', 'dpp:streaming': True}),
    validate(),
    dump_to_path()
)


def flow(parameters, datapackage, resources, stats):
    return bond_uk


if __name__ == '__main__':
    bond_uk.process()
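
The flow(parameters, datapackage, resources, stats) function appears to follow the datapackage-pipelines convention for flow modules, which lets a pipeline step import this file and obtain the Flow to run.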
Code example #26
    ),
    load(
        load_source='http://www.bankofengland.co.uk/boeapps/iadb/fromshowcolumns.asp?csv.x=yes&SeriesCodes=IUQAMNPY&UsingCodes=Y&CSVF=TN&Datefrom=01/Jan/1963',
        skip_rows=[1],
        headers=['Date', 'Rate'],
        format='csv',
        name='quarterly'
    ),
    load(
        load_source='http://www.bankofengland.co.uk/boeapps/iadb/fromshowcolumns.asp?csv.x=yes&SeriesCodes=IUAAMNPY&UsingCodes=Y&CSVF=TN&Datefrom=01/Jan/1963',
        skip_rows=[1],
        headers=['Year', 'Rate'],
        format='csv',
        name='annual'
    ),
    set_type('Date', resources='quarterly', type='date', format='any'),
    set_type('Rate', resources='quarterly', type='number', description='Quarterly average yield from British Government Securities, 10 year Nominal Par Yield'),
    set_type('Year', resources='annual', type='date', format='any'),
    set_type('Rate', resources='annual', type='number', description='Annual average yield from British Government Securities, 10 year Nominal Par Yield'),
    update_resource('quarterly', **{'path':'data/quarterly.csv', 'dpp:streaming': True}),
    update_resource('annual', **{'path':'data/annual.csv', 'dpp:streaming': True}),
    validate(),
    dump_to_path()
)

def flow(parameters, datapackage, resources, stats):
    return bond_uk

if __name__ == '__main__':
    bond_uk.process()
Code example #27
            }
        ],
        version="0.2.0",
        views=[
            {
              "name": "graph",
              "title": "VIX - CBOE Volatility Index",
              "specType": "simple",
              "spec": {"type": "line","group": "Date","series": ["VIX Close"]}
            }
        ],
        readme=readme()
    ),
    load(
        load_source='http://www.cboe.com/publish/ScheduledTask/MktData/datahouse/vixcurrent.csv',
        headers=2,
        name='vix-daily'
    ),
    set_type('Date', type='date', format='any'),
    update_resource('vix-daily', **{'title': 'VIX Daily', 'path':'data/vix-daily.csv', 'dpp:streaming': True}),
    validate()
)


def flow(parameters, datapackage, resources, stats):
    return finance_vix


if __name__ == '__main__':
    finance_vix.process()
Code example #28
File: opec.py Project: gustavhempel/ETL-Pipeline
            ' ', '_', 'Country', '2017', '2018', '2019', '-', 'Q417', '1Q18',
            '2Q18', '3Q18', '4Q18', '1Q19', '2Q19', '3Q19', '4Q19'
        ]),
    load(
        load_source=
        'https://www.opec.org/opec_web/static_files_project/media/downloads/publications/MOMR%20Appendix%20Tables%20(April%202020).xlsx',
        format='xls',
        sheet=5,
        skip_rows=[1, 2, 3, 4, 5, 6],
        headers=[
            ' ', 'Country', '2016', '2017', '2018', '3Q19', '4Q19', '2019',
            'Change 19/18', '1Q20', '2Q20', '3Q20', '4Q20', 'Change 20/19'
        ]),
    load(
        load_source=
        'https://www.opec.org/opec_web/static_files_project/media/downloads/publications/MOMR%20Appendix%20Tables%20(April%202020).xlsx',
        format='xls',
        sheet=6,
        skip_rows=[1, 2, 3, 4, 5, 6],
        headers=[
            ' ', 'Country', '2017', '2018', '2019', 'Change 19/18', '2Q19',
            '3Q19', '4Q19', '1Q20', 'Feb20', 'Mar20', 'Change Mar/Feb'
        ]),
    rename_resources,
    validate(),
    printer(),
    dump_to_path('opec'),
)

oil_prices.process()
Code example #29
            {
              "name": "graph",
              "title": "10 year US Government Bond Yields (Monthly granuarlity)",
              "specType": "simple",
              "spec": {"type": "line","group": "Date","series": ["Rate"]}
            }
        ],
        readme=readme()
    ),
    load(
        load_source='http://www.federalreserve.gov/datadownload/Output.aspx?rel=H15&series=0809abf197c17f1ff0b2180fe7015cc3&lastObs=&from=&to=&filetype=csv&label=include&layout=seriescolumn',
        skip_rows=[i+1 for i in range(6)],
        headers=['Date', 'Rate'],
        format='csv',
        name='monthly'
    ),
    set_type('Date', type='date', format='any', description='Date in ISO 8601'),
    set_type('Rate', type='number', description='Percent per year'),
    update_resource('monthly', **{'path':'data/monthly.csv', 'dpp:streaming': True}),
    validate(),
    dump_to_path()
)


def flow(parameters, datapackage, resources, stats):
    return bond_us


if __name__ == '__main__':
    bond_us.process()
Code example #30
        "with": "{Region, subregion, country or area *}"
        },
        {
        "operation": "format",
        "target": "Country Code",
        "with": "{Country code}"
        },
        {
        "operation": "format",
        "target": "Year",
        "with": "{year}"
        },
        {
        "operation": "format",
        "target": "Population",
        "with": "{population}"
        }
    ]),
    delete_fields(fields=[
        'Type', 'Parent code', 'Region, subregion, country or area *', 'Country code', 'year', 'population'
    ], regex=False),
    validate(),
    dump_to_path()
)

def flow(parameters, datapackage, resources, stats):
    return population_estimates

if __name__ == '__main__':
    population_estimates.process()
Code example #31
        skip_rows=[1, 2, 3, 4, 5, -1],
        headers=['Date', 'Price', 'Empty column'],
        format='csv',
        name='annual'
    ),
    extract_december_rows,
    load(
        load_source='http://www.bundesbank.de/cae/servlet/StatisticDownload?tsId=BBEX3.M.XAU.USD.EA.AC.C06&its_csvFormat=en&its_fileFormat=csv&mode=its',
        skip_rows=[1, 2, 3, 4, 5, -1],
        headers=['Date', 'Price', 'Empty column'],
        format='csv',
        name='monthly'
    ),
    update_resource('monthly', **{'path':'data/monthly.csv', 'dpp:streaming': True}),
    update_resource('annual', **{'path':'data/annual.csv', 'dpp:streaming': True}),
    set_type('Date', resources='annual', type='yearmonth'),
    set_type('Price', resources='annual', type='number'),
    set_type('Date', resources='monthly', type='yearmonth'),
    set_type('Price', resources='monthly', type='number'),
    validate(),
    delete_fields(['Empty column'], resources=None)
)


def flow(parameters, datapackage, resources, stats):
    return gold_price_flow


if __name__ == '__main__':
    gold_price_flow.process()