Example 1
def test_filter_rows():
    from dataflows import Flow, filter_rows

    f = Flow(
        [
            {
                'a': 1,
                'b': 3
            },
            {
                'a': 2,
                'b': 3
            },
            {
                'a': 1,
                'b': 4
            },
            {
                'a': 2,
                'b': 4
            },
        ],
        filter_rows(equals=[dict(a=1)]),
        filter_rows(not_equals=[dict(b=3)]),
    )
    results, _, _ = f.results()
    assert results[0][0] == dict(a=1, b=4)
    assert len(results[0]) == 1
    assert len(results) == 1
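A minimal sketch (not part of the original test suite) showing that the two chained filter_rows steps above amount to a single condition callable:

def test_filter_rows_single_condition():
    from dataflows import Flow, filter_rows

    f = Flow(
        [{'a': 1, 'b': 3}, {'a': 2, 'b': 3}, {'a': 1, 'b': 4}, {'a': 2, 'b': 4}],
        # keep rows where a == 1 and b != 3, as the chained steps above do
        filter_rows(condition=lambda row: row['a'] == 1 and row['b'] != 3),
    )
    results, _, _ = f.results()
    assert results[0] == [dict(a=1, b=4)]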
Example 2
def test_example_8():
    from dataflows import Flow, load, dump_to_path

    def find_double_winners(package):

        # Remove the emmies resource - we're going to consume it now
        package.pkg.remove_resource('emmies')
        # Must yield the modified datapackage
        yield package.pkg

        # Now iterate on all resources
        resources = iter(package)

        # Emmies is the first - read all its data and create a set of winner names
        emmy = next(resources)
        emmy_winners = set(
            map(lambda x: x['nominee'], filter(lambda x: x['winner'], emmy)))

        # Oscars are next - filter rows based on the emmy winner set
        academy = next(resources)
        yield filter(lambda row: row['Winner'] and row['Name'] in emmy_winners,
                     academy)

    f = Flow(
        # Emmy award nominees and winners
        load('data/emmy.csv', name='emmies'),
        # Academy award nominees and winners
        load('data/academy.csv', encoding='utf8', name='oscars'),
        find_double_winners,
        dump_to_path('out/double_winners'))
    _ = f.process()
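For context: find_double_winners above follows the two-phase protocol for package processors in dataflows — first yield the (possibly modified) datapackage, then yield one iterator per remaining resource, in order. A minimal sketch of the pattern (the title mutation is only illustrative):

def package_processor(package):
    # phase 1: mutate the descriptor, then yield it before touching any rows
    package.pkg.descriptor['title'] = 'Updated title'
    yield package.pkg
    # phase 2: yield an iterator per resource (here: pass all resources through unchanged)
    yield from package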
Example 3
def test_example_5():
    from dataflows import Flow, set_type, dump_to_path

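    # country_population() is a row-yielding helper defined elsewhere in the
    # test module (it produces rows with a 'population' field)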
    f = Flow(country_population(),
             set_type('population', type='number', groupChar=','),
             dump_to_path('out/country_population'))
    _ = f.process()
Example 4
def test_load_duplicate_headers_with_deduplicate_headers_flag():
    from dataflows import Flow, load
    flow = Flow(load('data/duplicate_headers.csv', deduplicate_headers=True))
    data, package, stats = flow.results()
    assert package.descriptor['resources'][0]['schema']['fields'] == [
        {
            'name': 'header1',
            'type': 'string',
            'format': 'default'
        },
        {
            'name': 'header2 (1)',
            'type': 'string',
            'format': 'default'
        },
        {
            'name': 'header2 (2)',
            'type': 'string',
            'format': 'default'
        },
    ]
    assert data == [[
        {
            'header1': 'value1',
            'header2 (1)': 'value2',
            'header2 (2)': 'value3'
        },
    ]]
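Judging from the assertions, data/duplicate_headers.csv presumably contains a repeated header that deduplicate_headers=True renames to 'header2 (1)' and 'header2 (2)':

header1,header2,header2
value1,value2,value3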
Example 5
def test_example_7():
    from dataflows import Flow, load, dump_to_path

    def add_is_guitarist_column(package):

        # Add a new field to the first resource
        package.pkg.descriptor['resources'][0]['schema']['fields'].append(
            dict(name='is_guitarist', type='boolean'))
        # Must yield the modified datapackage
        yield package.pkg

        # Now iterate on all resources
        resources = iter(package)
        beatles = next(resources)

        def f(row):
            row['is_guitarist'] = row['instrument'] == 'guitar'
            return row

        yield map(f, beatles)

    f = Flow(
        # Same one as above
        load('data/beatles.csv'),
        add_is_guitarist_column,
        dump_to_path('out/beatles_guitarists'))
    _ = f.process()
Example 6
def test_filter_rows_callable():
    from dataflows import Flow, filter_rows

    f = Flow(
        [
            {
                'a': 1,
                'b': 3
            },
            {
                'a': 2,
                'b': 3
            },
            {
                'a': 1,
                'b': 4
            },
            {
                'a': 2,
                'b': 4
            },
        ],
        filter_rows(condition=lambda row: row['a'] > 1 and row['b'] < 4),
    )
    results, _, _ = f.results()
    assert results[0][0] == dict(a=2, b=3)
    assert len(results[0]) == 1
    assert len(results) == 1
Example 7
def test_concatenate():
    from dataflows import Flow, concatenate

    f = Flow([
        {
            'a': 1,
            'b': 2
        },
        {
            'a': 2,
            'b': 3
        },
        {
            'a': 3,
            'b': 4
        },
    ], [
        {
            'c': 4,
            'd': 5
        },
        {
            'c': 5,
            'd': 6
        },
        {
            'c': 6,
            'd': 7
        },
    ], concatenate({
        'f1': ['a'],       # target field <- list of source field names
        'f2': ['b', 'c'],  # 'b' (first resource) and 'c' (second) merge into 'f2'
        'f3': ['d']        # fields absent from a resource come out as None
    }))
    results, _, _ = f.results()
    assert results[0] == [{
        'f1': 1,
        'f2': 2,
        'f3': None
    }, {
        'f1': 2,
        'f2': 3,
        'f3': None
    }, {
        'f1': 3,
        'f2': 4,
        'f3': None
    }, {
        'f1': None,
        'f2': 4,
        'f3': 5
    }, {
        'f1': None,
        'f2': 5,
        'f3': 6
    }, {
        'f1': None,
        'f2': 6,
        'f3': 7
    }]
Example 8
def test_example_75():
    from dataflows import Flow, load, dump_to_path

    def add_is_guitarist_column_to_schema(package):

        # Add a new field to the first resource
        package.pkg.descriptor['resources'][0]['schema']['fields'].append(dict(
            name='is_guitarist',
            type='boolean'
        ))
        # Must yield the modified datapackage
        yield package.pkg
        yield from package

    def add_is_guitarist_column(row):
        row['is_guitarist'] = row['instrument'] == 'guitar'
        return row

    f = Flow(
        # Same one as above
        load('data/beatles.csv'),
        add_is_guitarist_column_to_schema,
        add_is_guitarist_column,
        dump_to_path('out/beatles_guitarists2')
    )
    _ = f.process()
Example 9
def test_example_3():
    from dataflows import Flow

    f = Flow(
        country_population(),
    )
    data, *_ = f.results()
Example 10
def store_destination_output_package(destination_output, csv_temp_files):
    logging.info("Storing destination output package")
    os.makedirs(destination_output, exist_ok=True)
    logging.info("Writing to destination_output dir: " + destination_output)
    last_package = {}
    if os.path.exists(os.path.join(destination_output, "datapackage.json")):

        def _load_last_package(row):
            last_package[row['name']] = row
            yield row

        Flow(
            load(os.path.join(destination_output, "datapackage.json")),
            _load_last_package
        ).process()

    def _files_list():
        for temp_filepath, name in csv_temp_files.items():
            target_filepath = os.path.join(destination_output, name)
            shutil.move(temp_filepath, target_filepath)
            os.chmod(target_filepath, stat.S_IRUSR | stat.S_IWUSR | stat.S_IRGRP | stat.S_IROTH)
            size = os.path.getsize(target_filepath)
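            # get_hash() is presumably a module-level helper computing the file's content hash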
            file_hash = get_hash(target_filepath)
            last_row = last_package.get(name)
            if last_row and file_hash == last_row.get('hash') and size == last_row['size']:
                mtime = last_row['mtime']
            else:
                mtime = datetime.datetime.fromtimestamp(os.path.getmtime(target_filepath))
            yield {"name": name, "size": size, "mtime": mtime, "hash": file_hash}

    Flow(
        _files_list(),
        update_resource(-1, name='files_list', path='files_list.csv'),
        dump_to_path(destination_output),
    ).process()
Example 11
def test_duplicate():
    from dataflows import Flow, duplicate

    a = [
        {
            'a': 1,
            'b': 3
        },
        {
            'a': 2,
            'b': 3
        },
        {
            'a': 3,
            'b': 1
        },
        {
            'a': 4,
            'b': 1
        },
    ]

    f = Flow(
        a,
        duplicate(),  # duplicate the (single) resource; results then holds two identical resources
    )
    results, _, _ = f.results()
    assert list(results[0]) == a
    assert list(results[1]) == a
Example 12
def flow(self):
    from dataflows import Flow
    if self.flows:
        return Flow(self.flows[1])
    elif self.analyzers:
        return super().flow()
    else:
        return Flow()
Example 13
def test_example_4():
    from dataflows import Flow, set_type

    f = Flow(country_population(),
             set_type('population', type='number', groupChar=','))
    data, dp, _ = f.results()

    print(data[0][:10])
Example 14
def test_select_field():
    from dataflows import Flow, select_fields
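    # 'data' is a fixture defined elsewhere in the test module (rows with a string field 'y', among others)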
    f = Flow(data, select_fields(['y']))
    results, dp, _ = f.results()
    for i in results[0]:
        assert list(i.keys()) == ['y']
    assert dp.descriptor['resources'][0]['schema']['fields'] == \
        [dict(name='y', type='string', format='default')]
Example 15
def test_add_metadata():
    from dataflows import Flow, add_metadata
    f = Flow(
        data,
        add_metadata(author='Adam Kariv')
    )
    _, dp, _ = f.results()
    assert dp.descriptor['author'] == 'Adam Kariv'
Example 16
def test_example_2():
    from dataflows import Flow, load

    def titleName(row):
        row['name'] = row['name'].title()

    f = Flow(load('data/beatles.csv'), titleName)
    data, *_ = f.results()
Example 17
def test_rename_resource2():
    from dataflows import Flow, printer, update_resource

    f = Flow(
        ({'a': x} for x in range(10)),
        update_resource(None, name='renamed'),  # None matches all resources
        printer())
    results, dp, stats = f.results()
    print(dp.descriptor)
    assert dp.descriptor['resources'][0]['name'] == 'renamed'
Example 18
def test_load_from_package():
    from dataflows import Flow, dump_to_path, load

    Flow([{'foo': 'bar'}], dump_to_path('data/load_from_package')).process()

    ds = Flow(load('data/load_from_package/datapackage.json')).datastream()

    assert len(ds.dp.resources) == 1
    assert [list(res) for res in ds.res_iter] == [[{'foo': 'bar'}]]
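A note on the three run methods used across these examples: process() executes the flow and returns (datapackage, stats); results() additionally materializes all rows; datastream() (used here) returns a DataStream exposing the datapackage as dp and the row iterators, consumed lazily, as res_iter.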
Example 19
def operator(name, params, pipeline):
    with tempfile.NamedTemporaryFile(mode='w', encoding='utf-8') as config_file:
        params['dgpConfig'].setdefault('publish', {})['allowed'] = True
        metadata = params['dgpConfig'].setdefault('extra', {}).setdefault('metadata', {})
        metadata['title'] = name
        metadata['dag_id'] = pipeline['id']
        metadata['updated_at'] = pipeline['__updated_at']
        metadata['created_at'] = pipeline['__created_at']
        for k, v in params.items():
            if k.startswith('extra.'):
                set_dots(params['dgpConfig'], k, v)
        logging.info('\nCONFIGURATION:\n--------------\n%s', 
                     json.dumps(params['dgpConfig'], sort_keys=True, ensure_ascii=False, indent=2))
        yaml.dump(params['dgpConfig'], config_file)
        config_file.flush()
        config = Config(config_file.name)
        taxonomy_registry = TaxonomyRegistry('taxonomies/index.yaml')
        context = Context(config, taxonomy_registry)

        logging.getLogger().setLevel(logging.INFO)

        steps = [
            FileLoaderDGP,
            LoaderDGP,
            PostLoaderDGP,
            TransformDGP,
            EnricherDGP,
            PublisherDGP,
        ]

        dgp = SimpleDGP(
            config, context,
            steps=steps
        )

        ret = dgp.analyze()
        if not ret:
            logging.error('Errors:')
            logging.error('\n\t - '.join([str(x) for x in dgp.errors]))
            assert False

        # logging.info('\nCONF (POST ANALYSIS):\n--------------\n%s', 
        #              json.dumps(config._unflatten(), sort_keys=True, ensure_ascii=False, indent=2))

        logging.info('Creating Flow')
        flow = dgp.flow()
        flow = Flow(
            flow,
            printer(tablefmt='html')
        )
        logging.info('Running Flow')
        _, stats = flow.process()

        logging.info('Success')

        return stats
Example 20
def test_unpivot_any_resources():
    from dataflows import Flow, unpivot, validate
    data1 = [
        dict([('name', 'ike{}'.format(i))] +
             [(str(year), year + i) for year in range(1990, 2020, 10)])
        for i in range(5)
    ]
    data2 = [
        dict([('city', 'mike{}'.format(i))] +
             [(str(year), year + i) for year in range(2050, 2080, 10)])
        for i in range(5)
    ]
    f = Flow(
        data1, data2,
        unpivot([dict(name='([0-9]+)', keys=dict(year='\\1'))],  # unpivot every all-digit field, capturing its name as 'year'
                [dict(name='year', type='integer')],             # extra key field added to each output row
                dict(name='amount', type='integer')),            # field that receives the unpivoted values
        validate())
    results, _, _ = f.results()
    assert results[0] == [
        dict(zip(['name', 'year', 'amount'], r)) for r in [
            ['ike0', 1990, 1990],
            ['ike0', 2000, 2000],
            ['ike0', 2010, 2010],
            ['ike1', 1990, 1991],
            ['ike1', 2000, 2001],
            ['ike1', 2010, 2011],
            ['ike2', 1990, 1992],
            ['ike2', 2000, 2002],
            ['ike2', 2010, 2012],
            ['ike3', 1990, 1993],
            ['ike3', 2000, 2003],
            ['ike3', 2010, 2013],
            ['ike4', 1990, 1994],
            ['ike4', 2000, 2004],
            ['ike4', 2010, 2014],
        ]
    ]
    assert results[1] == [
        dict(zip(['city', 'year', 'amount'], r)) for r in [
            ['mike0', 2050, 2050],
            ['mike0', 2060, 2060],
            ['mike0', 2070, 2070],
            ['mike1', 2050, 2051],
            ['mike1', 2060, 2061],
            ['mike1', 2070, 2071],
            ['mike2', 2050, 2052],
            ['mike2', 2060, 2062],
            ['mike2', 2070, 2072],
            ['mike3', 2050, 2053],
            ['mike3', 2060, 2063],
            ['mike3', 2070, 2073],
            ['mike4', 2050, 2054],
            ['mike4', 2060, 2064],
            ['mike4', 2070, 2074],
        ]
    ]
Example 21
def test_load_override_schema_and_fields():
    from dataflows import Flow, load
    flow = Flow(
        load('data/beatles_age.csv',
             override_schema={
                 'title': 'title',
                 'missingValues': ['ringo'],
             },
             override_fields={
                 'age': {
                     'type': 'string'
                 },
             }))
    data, package, stats = flow.results()
    assert package.descriptor == {
        'profile': 'data-package',
        'resources': [{
            'format': 'csv',
            'name': 'beatles_age',
            'path': 'beatles_age.csv',
            'profile': 'tabular-data-resource',
            'schema': {
                'fields': [{
                    'format': 'default',
                    'name': 'name',
                    'type': 'string'
                }, {
                    'format': 'default',
                    'name': 'age',
                    'type': 'string'
                }],
                'missingValues': ['ringo'],
                'title': 'title',
            }
        }]
    }
    assert data == [[
        {
            'name': 'john',
            'age': '18'
        },
        {
            'name': 'paul',
            'age': '16'
        },
        {
            'name': 'george',
            'age': '17'
        },
        {
            'name': None,
            'age': '22'
        },
    ]]
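Two details in the assertions above are worth spelling out: override_fields keeps 'age' typed as a string, so the values stay '18', '16', and so on; and missingValues: ['ringo'] makes the loader treat the literal string 'ringo' as missing, which is why the last row has name: None.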
Example 22
def conference_csv():
    flow = Flow(
        # Load inputs
        load(
            od19_base + od19_feedback,
            name='feedback',
            format='csv',
        ),
        load(
            od19_base + od19_analysis,
            name='analysis',
            format='csv',
        ),
        # Process them
        set_type("Anzahl.*", type='integer', resources='analysis'),
        delete_fields([
            "Anzahl Auflistung",
            ".*\\(Formel\\)",
            ".*Duplikate",
        ], resources='analysis'),
        not_empty_groupcol,  # custom step defined elsewhere in this script
        # Save the results
        add_metadata(
            name='opendatach19',
            title='''Opendata.ch/2019 Forum''',
            licenses=[{
                "name": "ODC-PDDL-1.0",
                "path": "http://opendatacommons.org/licenses/pddl/",
                "title": "Open Data Commons Public Domain Dedication and License v1.0"
            }],
            maintainers=[{
                "name": "Oleg Lavrovsky",
                "web": "https://datalets.ch/"
            }],
            views=[{
                "name": "Groups",
                "resources": ["analysis"],
                "spec": {
                    "group": "Alle "
                    "Bedürfnisse"
                    "",
                    "series": ["Anzahl Auflistung (Zahl)"],
                    "type": "bar"
                },
                "specType": "simple",
                "title": "Topic counts"
            }]),
        printer(),
        validate(),
        dump_to_path('data/opendatach19'),
    )
    flow.process()
Example 23
def decp_processing():
    flow = Flow(

        # Load the CSV produced by the JSON conversion
        load("decp.csv"),
        set_type("acheteur.id", type="string"),
        set_type("titulaire.id", type="string"),
        set_type("codeCPV", type="string"),
        set_type("lieuExecution.code", type="string"),

        # Sort by rootId and seq to prepare for building donneesActuelles
        sort_rows('{rootId}:{seq}', resources=0, reverse=True),
        donnees_actuelles,  # custom step defined elsewhere in this script

        # rootId and seq can now be deleted
        delete_fields(["rootId", "seq"], resources=0, regex=False),
        sort_rows('{datePublicationDonnees}', resources=0, reverse=True),

        # New table dedicated to contracts, without data on the awardees
        print("Création de la table dédiée aux marchés..."),
        duplicate(source="decp",
                  target_name="decp-sans-titulaires",
                  target_path="decp-sans-titulaires.csv",
                  duplicate_to_end=True),
        delete_fields([
            "titulaire.id", "titulaire.denominationSociale",
            "titulaire.typeIdentifiant"
        ], resources="decp-sans-titulaires", regex=False),
        set_primary_key(["uid"], resources="decp-sans-titulaires"),
        deduplicate(),

        # Load the previous DECP in CSV format, to extract the new data
        # print("Téléchargement des données tabulaires précédentes..."),
        # load("https://decp.info/db/decp.csv?_size=max&_dl=1", name="previous-decp"),
        # set_type("acheteur.id", type="string"),
        # set_type("titulaire.id", type="string"),
        # set_type("codeCPV", type="string"),
        # set_type("lieuExecution.code", type="string"),
        # delete_fields(["rowid"], resources="previous-decp", regex=False),
        # #print("Fusion des données tabulaires précédentes et des données d'aujourd'hui..."),
        # concatenate({},target={"name": "decp-titulaires","path": "decp-titulaires.csv"},resources=["decp","previous-decp"]),

        # Load the previous data dedicated to the awardees
        print("Chargement des données titulaires..."),
        load("decp-titulaires.csv", name="decp-titulaires"),
        set_type("acheteur.id", type="string"),
        set_type("titulaire.id", type="string"),
        set_type("codeCPV", type="string"),
        set_type("lieuExecution.code", type="string"),
        set_type("departement", type="string"),
        set_type("codeAPE", type="string"),
        print("Enregistrement des données sur le disque..."),
        dump_to_path("decp"))
    flow.process()
Example 24
def test_example_1():
    from dataflows import Flow

    data = [{'data': 'Hello'}, {'data': 'World'}]

    def lowerData(row):
        row['data'] = row['data'].lower()

    f = Flow(data, lowerData)
    data, *_ = f.results()
Example 25
def spew_flow(flow, ctx: ProcessorContext):
    flow = Flow(
        update_package(**ctx.datapackage),
        load((ctx.datapackage, ctx.resource_iterator)),
        flow,
    )
    datastream = flow.datastream()
    ctx.datapackage = datastream.dp.descriptor
    ctx.resource_iterator = datastream.res_iter
    ctx.stats = MergeableStats(datastream.stats, ctx.stats)
Example 26
def main(request_times_api_url):
    metadata = {}
    stats = collections.defaultdict(int)
    instance_stats = collections.defaultdict(int)
    Flow(get_builds(request_times_api_url, stats),
         aggregate_instance_stats(instance_stats, metadata),
         dump_to_path('data/aggregate_request_times')).process()
    Flow(get_instance_stats_data(instance_stats, metadata),
         dump_to_path('data/aggregate_request_times_stats'),
         printer(num_rows=1)).process()
Example 27
def test_expected_contact_with_patient():
    print("test_expected_contact_with_patient")
    back_from_abroad_db = [169603, 169632, 169813]
    contact_with_patient_db = [10722, 10715, 10697]
    Flow(
        load_from_db.flow({
            "where": "id in (%s)" % ", ".join(
                map(str, back_from_abroad_db + contact_with_patient_db))
        }),
        add_gps_coordinates.flow({
            "source_fields": get_parameters_from_pipeline_spec(
                "pipeline-spec.yaml", "corona_data_collector",
                "corona_data_collector.add_gps_coordinates")["source_fields"],
            "get-coords-callback": lambda street, city: (
                random.uniform(29, 34), random.uniform(34, 36), int(street != city))
        }),
        export_corona_bot_answers.flow({
            "destination_output": "data/corona_data_collector/destination_output"
        }),
    ).process()
    contact_with_patient_key = values_to_convert['insulation_status']['contact-with-patient']
    back_from_abroad_key = values_to_convert['insulation_status']['back-from-abroad']
    contact_with_patient_array = []
    back_from_abroad_array = []
    counts = {"contact_with_patient": 0, "back_from_abroad": 0}

    def _test(row):
        if int(row["isolation"]) == contact_with_patient_key:
            counts["contact_with_patient"] += 1
            contact_with_patient_array.append(int(row["id"]))
        if int(row["isolation"]) == back_from_abroad_key:
            assert int(row["id"]) in back_from_abroad_db
            counts["back_from_abroad"] += 1
            back_from_abroad_array.append(int(row["id"]))

    Flow(
        load(
            'data/corona_data_collector/destination_output/corona_bot_answers_25_3_2020_with_coords.csv'
        ),
        load(
            'data/corona_data_collector/destination_output/corona_bot_answers_22_3_2020_with_coords.csv'
        ),
        _test,
    ).process()
    assert 3 == counts["contact_with_patient"], str(counts)
    assert 3 == counts["back_from_abroad"], str(counts)
    assert set(back_from_abroad_array) == set(back_from_abroad_db)
    assert set(contact_with_patient_array) == set(contact_with_patient_db)
    print("OK")
Example 28
def test_load_from_env_var():
    import os
    from dataflows import Flow, load, dump_to_path

    Flow([{'foo': 'bar'}], dump_to_path('out/load_from_env_var')).process()

    os.environ['MY_DATAPACKAGE'] = 'out/load_from_env_var/datapackage.json'
    # the env:// scheme makes load() resolve the actual path from the environment variable
    results, dp, _ = Flow(load('env://MY_DATAPACKAGE')).results()

    assert len(dp.resources) == 1
    assert results == [[{'foo': 'bar'}]]
Example 29
def test_sort_reverse_many_rows():
    from dataflows import Flow, sort_rows

    f = Flow(
        ({'a': i, 'b': i % 5} for i in range(1000)),
        sort_rows(key='{b}{a}', reverse=True, batch_size=0),  # batch_size=0 disables batching, sorting all rows at once
    )
    results, _, _ = f.results()
    results = results[0]
    assert results[0:2] == [{'a': 999, 'b': 4}, {'a': 994, 'b': 4}]
    assert results[998:1000] == [{'a': 100, 'b': 0}, {'a': 0, 'b': 0}]
Example 30
def test_update_schema():
    from dataflows import Flow, printer, update_schema, validate

    # list-of-list rows get auto-named fields (col0, col1); validate() turns '-' into None via missingValues
    f = Flow([['a', '-'], ['a', 0]], update_schema(-1, missingValues=['-']),
             validate(), printer())
    results, dp, stats = f.results()
    print(dp.descriptor)
    assert results[0] == [
        dict(col0='a', col1=None),
        dict(col0='a', col1=0),
    ]
Example 31
bond_uk = Flow(
    add_metadata(
        name="bond-yields-uk-10y",
        title= "10y UK Government Bond Yields (long-term interest rate)",
        sources=[
            {
              "name": "Bank of England",
              "path": "http://www.bankofengland.co.uk/boeapps/iadb/index.asp?Travel=NIxIRx&levels=1&XNotes=Y&C=DUS&G0Xtop.x=51&G0Xtop.y=7&XNotes2=Y&Nodes=X41514X41515X41516X41517X55047X76909X4051X4052X4128X33880X4053X4058&SectionRequired=I&HideNums=-1&ExtraInfo=true#BM",
              "title": "Bank of England"
            }
        ],
        licenses=[
            {
              "id": "odc-pddl",
              "path": "http://opendatacommons.org/licenses/pddl/",
              "name": "public_domain_dedication_and_license"
            }
        ],
        views=[
            {
              "name": "graph",
              "title": "Average yield from British Government Securities, 10 year Nominal Par Yield",
              "specType": "simple",
              "spec": {"type": "line","group": "Date","series": ["Rate"]}
            }
        ],
        readme=readme()  # readme() is a helper defined elsewhere in the script
    ),
    load(
        load_source='http://www.bankofengland.co.uk/boeapps/iadb/fromshowcolumns.asp?csv.x=yes&SeriesCodes=IUQAMNPY&UsingCodes=Y&CSVF=TN&Datefrom=01/Jan/1963',
        skip_rows=[1],
        headers=['Date', 'Rate'],
        format='csv',
        name='quarterly'
    ),
    load(
        load_source='http://www.bankofengland.co.uk/boeapps/iadb/fromshowcolumns.asp?csv.x=yes&SeriesCodes=IUAAMNPY&UsingCodes=Y&CSVF=TN&Datefrom=01/Jan/1963',
        skip_rows=[1],
        headers=['Year', 'Rate'],
        format='csv',
        name='annual'
    ),
    set_type('Date', resources='quarterly', type='date', format='any'),
    set_type('Rate', resources='quarterly', type='number', description='Quarterly average yield from British Government Securities, 10 year Nominal Par Yield'),
    set_type('Year', resources='annual', type='date', format='any'),
    set_type('Rate', resources='annual', type='number', description='Annual average yield from British Government Securities, 10 year Nominal Par Yield'),
    update_resource('quarterly', **{'path':'data/quarterly.csv', 'dpp:streaming': True}),
    update_resource('annual', **{'path':'data/annual.csv', 'dpp:streaming': True}),
    validate(),
    dump_to_path()
)
Example 32
finance_vix = Flow(
    add_metadata(
        name="finance-vix",
        title= "VIX - CBOE Volatility Index",
        homepage= 'http://www.cboe.com/micro/VIX/',
        sources=[
            {
              "name": "CBOE VIX Page",
              "path": "http://www.cboe.com/micro/vix/historical.aspx",
              "title": "CBOE VIX Page"
            }
        ],
        licenses=[
            {
              "id": "odc-pddl",
              "path": "http://opendatacommons.org/licenses/pddl/",
              "title": "Open Data Commons Public Domain Dedication and License v1.0",
              'name': "open_data_commons_public_domain_dedication_and_license_v1.0"
            }
        ],
        version="0.2.0",
        views=[
            {
              "name": "graph",
              "title": "VIX - CBOE Volatility Index",
              "specType": "simple",
              "spec": {"type": "line","group": "Date","series": ["VIX Close"]}
            }
        ],
        readme=readme()
    ),
    load(
        load_source='http://www.cboe.com/publish/ScheduledTask/MktData/datahouse/vixcurrent.csv',
        headers=2,
        name='vix-daily'
    ),
    set_type('Date', type='date', format='any'),
    update_resource('vix-daily', **{'title': 'VIX Daily', 'path':'data/vix-daily.csv', 'dpp:streaming': True}),
    validate()
)
Example 33
bond_us = Flow(
    add_metadata(
        name="bond-yields-us-10y",
        title="10 year US Government Bond Yields (long-term interest rate)",
        version="0.2.0",
        sources=[
            {
              "name": "Federal Reserve (Release H.15)",
              "path": "http://www.federalreserve.gov/releases/h15/data.htm",
              "title": "Federal Reserve (Release H.15)"
            }
        ],
        licenses=[
            {
              "id": "odc-pddl",
              "path": "http://opendatacommons.org/licenses/pddl/",
              "title": "Open Data Commons Public Domain Dedication and License v1.0",
              'name': "open_data_commons_public_domain_dedication_and_license_v1.0"
            }
        ],
        views=[
            {
              "name": "graph",
              "title": "10 year US Government Bond Yields (Monthly granuarlity)",
              "specType": "simple",
              "spec": {"type": "line","group": "Date","series": ["Rate"]}
            }
        ],
        readme=readme()
    ),
    load(
        load_source='http://www.federalreserve.gov/datadownload/Output.aspx?rel=H15&series=0809abf197c17f1ff0b2180fe7015cc3&lastObs=&from=&to=&filetype=csv&label=include&layout=seriescolumn',
        skip_rows=[i+1 for i in range(6)],
        headers=['Date', 'Rate'],
        format='csv',
        name='monthly'
    ),
    set_type('Date', type='date', format='any', description='Date in ISO 8601'),
    set_type('Rate', type='number', description='Percent per year'),
    update_resource('monthly', **{'path':'data/monthly.csv', 'dpp:streaming': True}),
    validate(),
    dump_to_path()
)
Example 34
gold_price_flow = Flow(
    add_metadata(
        name="gold-prices",
        title="Gold Prices",
        homepage='http://www.bundesbank.de',
        licenses=[
            {
                "id": "odc-pddl",
                "name": "public_domain_dedication_and_license",
                "version": "1.0",
                "url": "http://opendatacommons.org/licenses/pddl/1.0/"
            }
        ],
        sources=[
            {
              "name": "bundesbank-gold-prices",
              "path": "'http://www.bundesbank.de/cae/servlet/StatisticDownload?tsId=BBEX3.M.XAU.USD.EA.AC.C06&its_csvFormat=en&its_fileFormat=csv&mode=its'",
              "title": "Bundesbank gold prices"
            }
        ],
        views=[
            {
                "name": "graph",
                "title": "Gold Prices (Monthly in USD)",
                "specType": "simple",
                "spec": {
                    "type": "lines-and-points",
                    "group": "Date",
                    "series": [
                        "Price"
                    ]
                }
            }
        ],
        related=[
            {
                "title": "Oil prices",
                "path": "/core/oil-prices",
                "publisher": "core",
                "formats": ["CSV", "JSON"]
            },
            {
                "title": "Natural gas",
                "path": "/core/natural-gas",
                "publisher": "core",
                "formats": ["CSV", "JSON"]
            }
        ],
        version="0.2.0"
    ),
    load(
        load_source='http://www.bundesbank.de/cae/servlet/StatisticDownload?tsId=BBEX3.M.XAU.USD.EA.AC.C06&its_csvFormat=en&its_fileFormat=csv&mode=its',
        skip_rows=[1, 2, 3, 4, 5, -1],
        headers=['Date', 'Price', 'Empty column'],
        format='csv',
        name='annual'
    ),
    extract_december_rows,  # custom step defined elsewhere in this script
    load(
        load_source='http://www.bundesbank.de/cae/servlet/StatisticDownload?tsId=BBEX3.M.XAU.USD.EA.AC.C06&its_csvFormat=en&its_fileFormat=csv&mode=its',
        skip_rows=[1, 2, 3, 4, 5, -1],
        headers=['Date', 'Price', 'Empty column'],
        format='csv',
        name='monthly'
    ),
    update_resource('monthly', **{'path':'data/monthly.csv', 'dpp:streaming': True}),
    update_resource('annual', **{'path':'data/annual.csv', 'dpp:streaming': True}),
    set_type('Date', resources='annual', type='yearmonth'),
    set_type('Price', resources='annual', type='number'),
    set_type('Date', resources='monthly', type='yearmonth'),
    set_type('Price', resources='monthly', type='number'),
    validate(),
    delete_fields(['Empty column'], resources=None)  # None matches all resources
)