def main(request_times_api_url):
    metadata = {}
    stats = collections.defaultdict(int)
    instance_stats = collections.defaultdict(int)
    Flow(get_builds(request_times_api_url, stats),
         aggregate_instance_stats(instance_stats, metadata),
         dump_to_path('data/aggregate_request_times')).process()
    Flow(get_instance_stats_data(instance_stats, metadata),
         dump_to_path('data/aggregate_request_times_stats'),
         printer(num_rows=1)).process()
def main(package_url):
    jenkins_user_token = ckan_manager.get_jenkins_token(
        'ckan-cloud-operator-jenkins-creds')
    package_url = package_url.replace(
        'https://', 'https://{}:{}@'.format(*jenkins_user_token))
    stats_rows = []
    Flow(load(package_url), aggregate_stats(stats_rows),
         dump_to_path('data/aggregate_access_logs')).process()
    Flow((row for row in stats_rows),
         dump_to_path('data/aggregate_access_logs_stats'),
         printer()).process()
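aggregate_stats isn't defined in this listing. A minimal sketch of what such a step might look like, assuming the dataflows convention that a function whose single argument is named rows streams a full resource (the counted status_code field is a placeholder):

import collections

def aggregate_stats(stats_rows):
    counts = collections.defaultdict(int)

    def aggregate(rows):
        # stream rows through unchanged while tallying them
        for row in rows:
            counts[row.get('status_code', 'unknown')] += 1
            yield row
        # the resource is exhausted here, so the totals are final
        stats_rows.extend({'status_code': key, 'count': count}
                          for key, count in counts.items())

    return aggregate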
Example #3
def test_example_75():
    from dataflows import Flow, load, dump_to_path


    def add_is_guitarist_column_to_schema(package):

        # Add a new field to the first resource
        package.pkg.descriptor['resources'][0]['schema']['fields'].append(dict(
            name='is_guitarist',
            type='boolean'
        ))
        # Must yield the modified datapackage
        yield package.pkg
        yield from package

    def add_is_guitarist_column(row):
        row['is_guitarist'] = row['instrument'] == 'guitar'
        return row

    f = Flow(
        # Same one as above
        load('data/beatles.csv'),
        add_is_guitarist_column_to_schema,
        add_is_guitarist_column,
        dump_to_path('out/beatles_guitarists2')
    )
    _ = f.process()
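dataflows picks a step's behavior from its signature: add_is_guitarist_column_to_schema takes a package and rewrites the datapackage, while add_is_guitarist_column takes a row. The same change can be fused into a single package processor, sketched here using only the patterns these examples already rely on:

def add_is_guitarist(package):
    # mutate the schema of the first resource...
    package.pkg.descriptor['resources'][0]['schema']['fields'].append(
        dict(name='is_guitarist', type='boolean'))
    yield package.pkg
    # ...then stream every resource, deriving the new value per row
    for resource in package:
        yield ({**row, 'is_guitarist': row['instrument'] == 'guitar'}
               for row in resource)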
def flow(*args):
    is_dpp = len(args) > 3
    return Flow(
        load('data/unique_records_full/datapackage.json',
             resources=['unique_records']),
        load('data/app_records_full/datapackage.json',
             resources=['search_app_records']),
        add_field('__revision', 'integer', REVISION),
        *(add_field(f['name'], f['type']) for f in STATUS_FIELDS),
        manage_revisions,
        *(dump_to_sql(
            {
                DB_TABLE: {
                    'resource-name': resource_name,
                    'mode': 'update',
                    'update_keys': KEY_FIELDS
                }
            }, DATAFLOWS_DB_ENGINE)
          for resource_name in ['unique_records', 'search_app_records']),
        *(add_field(f'rev_{name}', 'date')
          for name in ['last_updated_at', 'last_modified_at', 'created_at']),
        set_revisions,
        filter_rows(equals=[{
            '__next_update_days': FILTER_NEXT_UPDATE_DAYS
        }]) if FILTER_NEXT_UPDATE_DAYS else None,
        add_date_range(),
        dump_to_path('data/publications_for_es'),
        printer(tablefmt='plain' if is_dpp else 'html',
                num_rows=1,
                fields=['doc_id']),
        update_resource(None, **{'dpp:streaming': True}))
Example #5
def es_dumper(resource_name, revision, path):
    now = time.time()
    return DF.Flow(
        update_pk('doc_id'),
        DF.add_field('revision', 'integer', default=revision),
        DF.add_field('score', 'number', default=1),
        DF.add_field('create_timestamp', 'number', now),
        my_dump_to_es(indexes={
            'migdar__' + resource_name: [{
                'resource-name': resource_name,
                'revision': revision
            }]
        },
                      mapper_cls=BoostingMappingGenerator,
                      index_settings={'index.mapping.coerce': True},
                      elasticsearch_options=dict(timeout=60)),
        DF.dump_to_path('data/{}'.format(path)),
        collate(revision),
        my_dump_to_es(indexes={
            'migdar__docs': [{
                'resource-name': resource_name,
                'revision': revision
            }]
        },
                      mapper_cls=BoostingMappingGenerator,
                      index_settings={'index.mapping.coerce': True}),
        DF.update_resource(None, **{'dpp:streaming': True}),
        DF.printer(),
    )
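update_pk is not part of dataflows. A plausible sketch of it as a package processor that stamps a primary key onto every resource schema (an assumption, matching how update_pk('doc_id') is used above):

def update_pk(pk):
    def step(package):
        # set the primary key on every resource schema
        for descriptor in package.pkg.descriptor['resources']:
            descriptor['schema']['primaryKey'] = [pk]
        yield package.pkg
        yield from package
    return step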
Example #6
def flow(*args):
    is_dpp = len(args) > 3
    return Flow(
        load(
            'https://migdar-internal-search.odata.org.il/__data/search_import/index.csv',
            encoding='utf-8',
            http_session=get_migdar_session()),
        update_resource('index',
                        name='search_import_index',
                        path='search_import_index.csv'),
        load_from_gdrive_files,
        update_resource('search_import_index',
                        name='search_import',
                        path='search_import.csv',
                        schema={
                            'fields': [{
                                'name': n,
                                'type': 'string'
                            } for n in SEARCH_IMPORT_FIELD_NAMES]
                        },
                        **{'dpp:streaming': True}),
        printer(num_rows=20,
                tablefmt='plain' if is_dpp else 'html',
                fields=['migdar_id', 'pubyear', 'title']),
        dump_to_path('data/search_import_from_gdrive'))
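get_migdar_session is external to this snippet; load() simply forwards http_session to the underlying HTTP stream. A hedged sketch with placeholder credentials:

import requests

def get_migdar_session():
    # placeholder credentials; the real helper presumably authenticates
    session = requests.Session()
    session.auth = ('user', 'secret')
    return session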
Example #7
def join_unique_records(*args):
    is_dpp = len(args) > 3
    return Flow(
        load('data/search_import_from_gdrive/datapackage.json',
             resources=['search_import']),
        load('data/search_results/unique_records.csv',
             resources=['unique_records']),
        set_type('migdar_id',
                 type='string',
                 resources=['unique_records', 'search_import']),
        join(source_name='search_import',
             source_key=['migdar_id'],
             target_name='unique_records',
             target_key=['migdar_id'],
             fields={
                 f'gd_{field}': {
                     'name': field
                 }
                 for field in SEARCH_IMPORT_FIELD_NAMES
             },
             full=False),
        printer(tablefmt='plain' if is_dpp else 'html',
                num_rows=1,
                fields=['migdar_id']),
        dump_to_path('data/unique_records_full'),
        update_resource(None, **{'dpp:streaming': True}))
Example #8
def run_flow(datetime_format=None):
    Flow([{
        'today': str(_today),
        'now': str(_now)
    }],
         set_type('today', type='date'),
         set_type('now', type='datetime', format=datetime_format),
         dump_to_path('out/dump_dates')).process()
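set_type parses the string through the given strptime-style format. A usage sketch, assuming _today and _now are the module-level date and datetime used above:

import datetime

_today = datetime.date.today()
_now = datetime.datetime.now()

# str(_now) renders like '2024-05-01 12:34:56.789012', which this
# strptime-style format parses back into a datetime:
run_flow(datetime_format='%Y-%m-%d %H:%M:%S.%f')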
Example #9
def test_example_7():
    from dataflows import Flow, load, dump_to_path

    def add_is_guitarist_column(package):

        # Add a new field to the first resource
        package.pkg.descriptor['resources'][0]['schema']['fields'].append(
            dict(name='is_guitarist', type='boolean'))
        # Must yield the modified datapackage
        yield package.pkg

        # Now iterate on all resources
        resources = iter(package)
        beatles = next(resources)

        def f(row):
            row['is_guitarist'] = row['instrument'] == 'guitar'
            return row

        yield map(f, beatles)

    f = Flow(
        # Same one as above
        load('data/beatles.csv'),
        add_is_guitarist_column,
        dump_to_path('out/beatles_guitarists'))
    _ = f.process()
Example #10
def test_example_8():
    from dataflows import Flow, load, dump_to_path

    def find_double_winners(package):

        # Remove the emmies resource - we're going to consume it now
        package.pkg.remove_resource('emmies')
        # Must yield the modified datapackage
        yield package.pkg

        # Now iterate on all resources
        resources = iter(package)

        # Emmies is the first - read all its data and create a set of winner names
        emmy = next(resources)
        emmy_winners = set(
            map(lambda x: x['nominee'], filter(lambda x: x['winner'], emmy)))

        # Oscars are next - filter rows based on the emmy winner set
        academy = next(resources)
        yield filter(lambda row: row['Winner'] and row['Name'] in emmy_winners,
                     academy)

    f = Flow(
        # Emmy award nominees and winners
        load('data/emmy.csv', name='emmies'),
        # Academy award nominees and winners
        load('data/academy.csv', encoding='utf8', name='oscars'),
        find_double_winners,
        dump_to_path('out/double_winners'))
    _ = f.process()
Example #11
def test_example_5():
    from dataflows import Flow, set_type, dump_to_path

    f = Flow(country_population(),
             set_type('population', type='number', groupChar=','),
             dump_to_path('out/country_population'))
    _ = f.process()
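country_population() is defined elsewhere in the dataflows tutorial; any iterable of dicts works as a resource. A stand-in sketch showing why groupChar=',' is needed (sample figures, not real data):

def country_population():
    # stand-in for the tutorial's scraper: a generator of rows is a resource
    yield {'country': 'China', 'population': '1,409,517,397'}
    yield {'country': 'India', 'population': '1,339,180,127'}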
Example #12
def flow(*_):
    DF.Flow(
        DF.load(filename, name='welfare'),
        DF.add_field('activity_name', 'string',
                     lambda r: r['שם השירות (ציבורי)']),
        DF.filter_rows(lambda r: r['activity_name']),
        DF.add_field(
            'activity_description', 'array',
            lambda r: [r['תיאור השירות (תיאור קצר)'] + '\n' +
                       r['השירות (מטרת השירות)']]),
        DF.add_field(
            'history', 'array', lambda r: [
                dict(
                    year=2019,
                    unit=r['יחידה ארגונית נותנת השירות'].split('/')[0].strip(),
                    subunit=r['יחידה ארגונית נותנת השירות'].split('/')[1].strip(),
                    subsubunit=r['יחידה ארגונית נותנת השירות'].split('/')[1].strip(),
                )
            ]),
        DF.add_field('target_audience', 'array', splitter('אוכלוסייה')),
        DF.add_field('subject', 'array', splitter('תחום ההתערבות')),
        DF.add_field('intervention', 'array', splitter('אופן התערבות')),
        DF.select_fields(FIELDS),
        DF.add_field('publisher_name', 'string', 'משרד הרווחה'),
        DF.add_field('min_year', 'integer', 2019),
        DF.add_field('max_year', 'integer', 2019),
        DF.add_field('kind', 'string', 'gov_social_service'),
        DF.add_field('kind_he', 'string', 'שירות חברתי'),
        DF.printer(),
        DF.validate(),
        DF.dump_to_path('tmp/activities-welfare')).process()
    return DF.Flow(
        DF.load('tmp/activities-welfare/datapackage.json'),
        DF.update_resource(-1, **{'dpp:streaming': True}),
    )
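splitter is a helper not shown here. Since add_field accepts a callable evaluated per row, a plausible sketch (the comma delimiter is an assumption):

def splitter(column):
    def split(row):
        # turn a delimited source column into an array value
        value = row.get(column) or ''
        return [part.strip() for part in value.split(',') if part.strip()]
    return split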
Example #13
def generate_package():
    package_flow = Flow(
        add_metadata(
            name="unicode-emojis",
            title="UTS #51 Unicode Emoji",
            description=(
                "List of emojis available from the Unicode Consortium. "
                "More information can be found in the Unicode® Technical Standard #51."
            ),
            sources=[
                {
                    "name": "unicode-emoji",
                    "path": "https://unicode.org/Public/emoji/latest/emoji-test.txt",
                    "title": "UTS #51 Unicode Emoji",
                },
            ],
            licenses=[
                {
                    "name": "ODC-PDDL-1.0",
                    "path": "http://opendatacommons.org/licenses/pddl/",
                    "title": "Open Data Commons Public Domain Dedication and License v1.0",
                }
            ],
            keywords=["unicode", "emojis", "emoji", "51", "standard", "uts"],
        ),
        load(load_source="data/emojis.csv", format="csv"),
        validate(),
        dump_to_path(),
    )
    package_flow.process()
Example #14
def prepare_locations():
    prepare_addresses()
    return DF.Flow(
        DF.load('_cache_addresses/datapackage.json'),
        DF.add_field(
            'address', 'string', lambda r: '{} {}{}'.format(
                r['street_name'], r['house_number'], r['letter'] or '')),
        DF.add_field(
            'item', 'object',
            lambda r: dict(value=dict(lat=float(r['lat']),
                                      lon=float(r['lon']),
                                      arnona_zones=r['arnona_zones'],
                                      שם=r['address']),
                           display=r['address'])),
        DF.sort_rows('{house_number}'),
        DF.delete_fields([
            'house_number', 'letter', 'lat', 'lon', 'arnona_zones', 'address'
        ]),
        DF.join_with_self(
            'concat', ['street_name'],
            dict(display=dict(name='street_name'),
                 items=dict(name='item', aggregate='array'))),
        DF.add_field('sort_street_address', 'string',
                     lambda r: sort_street_address(r['display'])),
        DF.sort_rows('{sort_street_address}'),
        DF.delete_fields(['sort_street_address']),
        DF.printer(),
        DF.dump_to_path('_cache_locations'),
        DF.checkpoint('_cache_locations')).results()[0][0]
Example #15
def operator(name, params):
    connection_string = params['db_url']
    source_table = params['db_table']
    target_instance_name = params['target_instance_name']
    target_package_id = params['target_package_id']
    target_organization_id = params['target_organization_id']

    print('starting db_fetcher operator')
    print(
        'source_table={} target_instance_name={} target_package_id={} target_organization_id={}'
        .format(source_table, target_instance_name, target_package_id,
                target_organization_id))
    with tempfile.TemporaryDirectory() as tempdir:
        csv_filename = target_package_id + '.csv'
        DF.Flow(
            DF.load(connection_string,
                    table=source_table,
                    name=target_package_id,
                    infer_strategy=DF.load.INFER_PYTHON_TYPES),
            DF.update_resource(-1, path=csv_filename),
            DF.delete_fields(['_source']),
            DF.dump_to_path(tempdir)).process()
        csv_filename = os.path.join(tempdir, csv_filename)
        print('{}, {:,} bytes'.format(csv_filename,
                                      os.stat(csv_filename).st_size))
        update_package(target_instance_name, target_organization_id,
                       target_package_id, target_package_id,
                       [('CSV', csv_filename)])
Example #16
def get_secondary_chain(source_chain, secondary, num_secondaries, workdir):
    for step_idx, step in enumerate(source_chain):
        serverless_step_config = getattr(step, '__serverless_step', None)
        if serverless_step_config:
            wait_primary_step_complete(secondary, num_secondaries, step_idx,
                                       workdir)
            notify_complete = partial(notify_secondary_step_complete, step_idx,
                                      secondary, num_secondaries, workdir,
                                      serverless_step_config)
            print('secondary {}/{}: running step {} flow'.format(
                secondary, num_secondaries, step_idx))
            try:
                Flow(
                    load(
                        PRIMARY_INPUT_DATAPACKAGE_FILE_TEMPLATE.format(
                            workdir=workdir, step_idx=step_idx)),
                    get_secondary_step(step, serverless_step_config, secondary,
                                       num_secondaries, step_idx, workdir),
                    dump_to_path(
                        SECONDARY_OUTPUT_DATAPACKAGE_PATH_TEMPLATE.format(
                            workdir=workdir,
                            secondary=secondary,
                            step_idx=step_idx))).process()
            except Exception as e:
                notify_complete(str(e))
                raise
            notify_complete(None)
    return [[]]
Example #17
def judges_flow(out_path):
    return Flow(
        get_tribunals(),
        update_resource(['res_1'], name='tribunals', path='tribunals.csv'),
        checkpoint('judges_tribunals'),
        get_judges(),
        update_resource(['res_2'], name='judges_list', path='judges_list.csv'),
        set_type('Is_In_Dimus_List', resources=['judges_list'], type='boolean'),
        checkpoint('judges_judges_list'),
        join('tribunals', ['Tribunal_Code'],
             'judges_list', ['Tribunal_Code'],
             fields={
                 'Tribunal_Type_Code': {},
                 'Tribunal_Arkaa_Code': {'name': 'Arkaa_Code'},
                 'Tribunal_District_Code': {'name': 'District_Code'},
                 'Tribunal_Name': {'name': 'Name'}
             }),
        fetch_judges_details,
        checkpoint('judges_details'),
        add_field('tribunal_type_name', 'string'),
        parse_judges_extra_details,
        checkpoint('judges_extra_details'),
        parse_judge_events,
        dump_to_path(out_path),
        printer(num_rows=1))
def prepare():
    for resource_name, load in loads:
        DF.Flow(
            load,
            # DF.printer(tablefmt='html'),
            DF.concatenate(
                FIELD_MAPPING,
                dict(name=resource_name, path=resource_name + '.csv')),
            DF.set_type('activity_name',
                        type='string',
                        constraints=dict(required=True),
                        on_error=DF.schema_validator.drop),
            DF.set_type('allocated_budget',
                        type='number',
                        groupChar=',',
                        bareNumber=False),
            DF.set_type('num_beneficiaries',
                        type='number',
                        groupChar=',',
                        bareNumber=False,
                        on_error=DF.schema_validator.ignore),
            fix_beneficiaries,
            DF.set_type('num_beneficiaries', type='string'),
            multiply_budget,
            fill_org_hierarchy,
            # DF.printer(tablefmt='html'),
            DF.dump_to_path('tmp/' + resource_name),
        ).process()
Example #19
def list_instances():
    os.makedirs('data/list_instances', exist_ok=True)
    data = []
    Flow((get_instance_row(instance)
          for instance in ckan_instance_manager.list_instances(full=True)),
         dump_to_json(data), dump_to_path('data/list_instances'),
         printer(num_rows=99999)).process()
    with open('data/list_instances.json', 'w') as f:
        json.dump(data, f)
Example #20
def test_load_from_package():
    from dataflows import dump_to_path, load

    Flow([{'foo': 'bar'}], dump_to_path('data/load_from_package')).process()

    ds = Flow(load('data/load_from_package/datapackage.json')).datastream()

    assert len(ds.dp.resources) == 1
    assert [list(res) for res in ds.res_iter] == [[{'foo': 'bar'}]]
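For comparison, the same package can be materialized in one call with results(), the accessor the env-var test below also uses:

results, dp, stats = Flow(
    load('data/load_from_package/datapackage.json')).results()
assert results == [[{'foo': 'bar'}]]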
Example #21
def dump_print_flow(flow,
                    dump_path,
                    num_rows=1,
                    fields=None,
                    checkpoint_name=None):
    return Flow(flow,
                checkpoint(checkpoint_name) if checkpoint_name else None,
                dump_to_path(dump_path),
                printer(num_rows=num_rows, fields=fields))
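Note the None step when checkpoint_name is falsy, an idiom used throughout these snippets that relies on Flow tolerating None. A hypothetical usage:

dump_print_flow(
    Flow([{'foo': 'bar'}]),      # any source flow
    'data/example_dump',         # hypothetical output path
    fields=['foo'],
    checkpoint_name='example_checkpoint',
).process()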
def flow(*_):
    return DF.Flow(
        DF.load('/var/datapackages/activities/social_services/datapackage.json'),
        DF.add_field('entity_id', 'string'),
        DF.add_field('soproc_supplier', 'boolean'),
        unwind(),
        DF.select_fields(['entity_id', 'soproc_supplier']),
        DF.dump_to_path('/var/datapackages/activities/social_services_suppliers'),
    )
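unwind() is project-specific. A hedged sketch of what it might do, fanning each activity row out to one row per supplier (the suppliers structure and soproc flag are assumptions):

def unwind():
    def step(rows):
        # fan each activity out to one row per supplier entity
        for row in rows:
            for supplier in (row.get('suppliers') or []):
                yield dict(row,
                           entity_id=supplier.get('entity_id'),
                           soproc_supplier=bool(supplier.get('soproc')))
    return step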
Example #23
def decp_processing():
    flow = Flow(

        # Load the CSV produced by the conversion from JSON
        load("decp.csv"),
        set_type("acheteur.id", type="string"),
        set_type("titulaire.id", type="string"),
        set_type("codeCPV", type="string"),
        set_type("lieuExecution.code", type="string"),

        # Sort by rootId and seq to prepare for building donneesActuelles
        sort_rows('{rootId}:{seq}', resources=0, reverse=True),
        donnees_actuelles,

        # rootId and seq can now be dropped
        delete_fields(["rootId", "seq"], resources=0, regex=False),
        sort_rows('{datePublicationDonnees}', resources=0, reverse=True),

        # New table dedicated to the contracts, without contractor data
        print("Création de la table dédiée aux marchés..."),
        duplicate(source="decp",
                  target_name="decp-sans-titulaires",
                  target_path="decp-sans-titulaires.csv",
                  duplicate_to_end=True),
        delete_fields([
            "titulaire.id", "titulaire.denominationSociale",
            "titulaire.typeIdentifiant"
        ],
                      resources="decp-sans-titulaires",
                      regex=False),
        set_primary_key(["uid"], resources="decp-sans-titulaires"),
        deduplicate(),

        # Load the previous DECP as CSV, to extract the new data
        # print("Téléchargement des données tabulaires précédentes..."),
        # load("https://decp.info/db/decp.csv?_size=max&_dl=1", name="previous-decp"),
        # set_type("acheteur.id", type="string"),
        # set_type("titulaire.id", type="string"),
        # set_type("codeCPV", type="string"),
        # set_type("lieuExecution.code", type="string"),
        # delete_fields(["rowid"], resources="previous-decp", regex=False),
        # #print("Fusion des données tabulaires précédentes et des données d'aujourd'hui..."),
        # concatenate({},target={"name": "decp-titulaires","path": "decp-titulaires.csv"},resources=["decp","previous-decp"]),

        # Load the previous contractor-dedicated data
        print("Chargement des données titulaires..."),
        load("decp-titulaires.csv", name="decp-titulaires"),
        set_type("acheteur.id", type="string"),
        set_type("titulaire.id", type="string"),
        set_type("codeCPV", type="string"),
        set_type("lieuExecution.code", type="string"),
        set_type("departement", type="string"),
        set_type("codeAPE", type="string"),
        print("Enregistrement des données sur le disque..."),
        dump_to_path("decp"))
    flow.process()
Example #24
def conference_csv():
    flow = Flow(
        # Load inputs
        load(
            od19_base + od19_feedback,
            name='feedback',
            format='csv',
        ),
        load(
            od19_base + od19_analysis,
            name='analysis',
            format='csv',
        ),
        # Process them
        set_type("Anzahl.*", type='integer', resources='analysis'),
        delete_fields([
            "Anzahl Auflistung",
            ".*\\(Formel\\)",
            ".*Duplikate",
        ],
                      resources='analysis'),
        not_empty_groupcol,
        # Save the results
        add_metadata(
            name='opendatach19',
            title='''Opendata.ch/2019 Forum''',
            licenses=[{
                "name": "ODC-PDDL-1.0",
                "path": "http://opendatacommons.org/licenses/pddl/",
                "title": "Open Data Commons Public Domain Dedication and License v1.0"
            }],
            maintainers=[{
                "name": "Oleg Lavrovsky",
                "web": "https://datalets.ch/"
            }],
            views=[{
                "name": "Groups",
                "resources": ["analysis"],
                "spec": {
                    "group": "Alle "
                    "Bedürfnisse"
                    "",
                    "series": ["Anzahl Auflistung (Zahl)"],
                    "type": "bar"
                },
                "specType": "simple",
                "title": "Topic counts"
            }]),
        printer(),
        validate(),
        dump_to_path('data/opendatach19'),
    )
    flow.process()
Example #25
def main(instance_ids_or_names, approve_code):
    instance_ids_or_names = [
        i.strip() for i in instance_ids_or_names.split(',') if i.strip()
    ]
    approve_code = approve_code.strip()
    logs.info(instance_ids_or_names=instance_ids_or_names,
              approve_code=approve_code)
    Flow(delete_instances(instance_ids_or_names, approve_code),
         dump_to_path('data/delete_instances'),
         printer(num_rows=9999)).process()
Example #26
    def prepare(self):
        self.ref_hash = md5(self.REF_DATAPACKAGE.encode('utf8')).hexdigest()
        self.key = self.__class__.__name__

        check = checkpoint(self.ref_hash)
        if not check.exists():
            Flow(load(self.REF_DATAPACKAGE),
                 rename_last_resource(self.ref_hash),
                 dump_to_path('.cache/{}'.format(self.ref_hash)),
                 check).process()
        logger.debug('DONE PREPARING %s', self.key)
Example #27
def AFRR_Data():
    unpivoting_fields = [{
        'name': 'aFRR_DownActivated',
        'keys': {
            'product': 'aFRR_DownActivated'
        }
    }, {
        'name': 'aFRR_UpActivated',
        'keys': {
            'product': 'aFRR_UpActivated'
        }
    }]
    extra_keys = [{'name': 'product', 'type': 'string'}]
    extra_value = {'name': 'amount', 'type': 'number'}
    flow = Flow(
        # Load inputs - using 'datastore_search_sql' API load last 10k rows:
        load(
            'https://api.energidataservice.dk/datastore_search_sql?sql=select%20*%20from%20afrrreservesdk1%20order%20by%20"HourUTC"%20desc%20limit%201000',
            format="json",
            property="result.records",
            name="fact_afrr"),
        # Remove extra fields:
        delete_fields(fields=['_id', '_full_text', 'HourDK']),
        # Save the results
        checkpoint('afrr'),
        # Normalize/unpivot:
        unpivot(unpivoting_fields, extra_keys, extra_value),
        add_computed_field([
            dict(target=dict(name='PriceArea', type='string'),
                 operation='constant',
                 with_='DK1'),
            dict(target=dict(name='PriceDKK', type='number'),
                 operation='constant',
                 with_='dummy'),
            dict(target=dict(name='PriceEUR', type='number'),
                 operation='constant',
                 with_='dummy')
        ]),
        add_price,
        delete_fields(fields=[
            'aFRR_DownPriceDKK', 'aFRR_DownPriceEUR', 'aFRR_UpPriceDKK',
            'aFRR_UpPriceEUR'
        ]),
        add_metadata(name='marketdata', title='Marketdata prototype'),
        update_resource(resources=None, mediatype='text/csv'),
        update_resource(
            resources='fact_afrr',
            title='Automatic Frequency Restoration Reserves',
            source=
            'https://www.energidataservice.dk/dataset/afrrreservesdk1/resource_extract/0694e216-6713-4f84-9b98-7bb5bc11d80c'
        ),
        printer(),
        dump_to_path('afrr_data'))
    flow.process()
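add_price is not shown. Given that the per-direction price fields are deleted immediately after it, a plausible sketch is a row processor that fills the 'dummy' constants from them:

def add_price(row):
    # fill the 'dummy' constants from the per-direction price fields,
    # which delete_fields() drops immediately after this step
    direction = 'Down' if row['product'] == 'aFRR_DownActivated' else 'Up'
    row['PriceDKK'] = row['aFRR_{}PriceDKK'.format(direction)]
    row['PriceEUR'] = row['aFRR_{}PriceEUR'.format(direction)]
    return row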
Example #28
def test_load_from_env_var():
    import os
    from dataflows import load, dump_to_path

    Flow([{'foo': 'bar'}], dump_to_path('out/load_from_env_var')).process()

    os.environ['MY_DATAPACKAGE'] = 'out/load_from_env_var/datapackage.json'
    results, dp, _ = Flow(load('env://MY_DATAPACKAGE')).results()

    assert len(dp.resources) == 1
    assert results == [[{'foo': 'bar'}]]
Example #29
def flow(parameters, *_):
    year = parameters['year']
    return DF.Flow(
        DF.load(wrapper(year),
                format='csv',
                infer_strategy=DF.load.INFER_STRINGS,
                cast_strategy=DF.load.CAST_DO_NOTHING),
        DF.update_resource(
            None, **{
                'dpp:streaming': True,
                'name': 'supports',
                'path': 'data/supports.csv'
            }), DF.dump_to_path(f'/var/datapackages/supports/yearly-{year}'))
Example #30
def update_dataset():
    flow = Flow(
        # Load inputs
        load(f'{BASE_URL}{CONFIRMED}'),
        load(f'{BASE_URL}{RECOVERED}'),
        load(f'{BASE_URL}{DEATH}'),
        checkpoint('load_data'),
        # Process them (if necessary)
        # Save the results
        add_metadata(name='csse_covid_19_time_series', title='''csse_covid_19_time_series'''),
        printer(),
        dump_to_path(),
    )
    flow.process()
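The constants this snippet assumes are not shown; plausible values pointing at the JHU CSSE time-series files (assumptions, not verified against the original repo):

BASE_URL = ('https://raw.githubusercontent.com/CSSEGISandData/COVID-19/'
            'master/csse_covid_19_data/csse_covid_19_time_series/')
CONFIRMED = 'time_series_covid19_confirmed_global.csv'
RECOVERED = 'time_series_covid19_recovered_global.csv'
DEATH = 'time_series_covid19_deaths_global.csv'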
Example #31
            {
              "name": "graph",
              "title": "10 year US Government Bond Yields (Monthly granuarlity)",
              "specType": "simple",
              "spec": {"type": "line","group": "Date","series": ["Rate"]}
            }
        ],
        readme=readme()
    ),
    load(
        load_source='http://www.federalreserve.gov/datadownload/Output.aspx?rel=H15&series=0809abf197c17f1ff0b2180fe7015cc3&lastObs=&from=&to=&filetype=csv&label=include&layout=seriescolumn',
        skip_rows=[i+1 for i in range(6)],
        headers=['Date', 'Rate'],
        format='csv',
        name='monthly'
    ),
    set_type('Date', type='date', format='any', description='Date in ISO 8601'),
    set_type('Rate', type='number', description='Percent per year'),
    update_resource('monthly', **{'path':'data/monthly.csv', 'dpp:streaming': True}),
    validate(),
    dump_to_path()
)


def flow(parameters, datapackage, resources, stats):
    return bond_us


if __name__ == '__main__':
    bond_us.process()