Code example #1
0
def test_add_computed_field_func():
    """Verify add_computed_field with both a callable operation and the
    built-in 'format' operation."""
    # Import Flow alongside add_computed_field so the test does not rely on
    # a module-level import being present.
    from dataflows import Flow, add_computed_field

    data = [dict(x=i) for i in range(3)]

    f = Flow(
        data,
        add_computed_field([
            # Callable operation: receives the row dict, returns the value.
            dict(target=dict(name='sq', type='integer'),
                 operation=lambda row: row['x']**2),
            # 'format' operation: with_ is a format string over row fields.
            dict(target='f', operation='format', with_='{x} - {x}')
        ]))
    results, *_ = f.results()
    results = list(results[0])

    assert results == [
        dict(x=0, sq=0, f='0 - 0'),
        dict(x=1, sq=1, f='1 - 1'),
        dict(x=2, sq=4, f='2 - 2'),
    ]
Code example #2
0
     unpivot_fields=[{'name': '([0-9]{4})', 'keys': {'year': '\\1'}}],
     extra_keys=[{'name': 'year', 'type': 'year'}],
     extra_value={'name': 'population', 'type': 'number'},
     resources=resource_names[1:]
 ),
 add_computed_field([
     {
     "operation": "format",
     "target": "Region",
     "with": "{Region, subregion, country or area *}"
     },
     {
     "operation": "format",
     "target": "Country Code",
     "with": "{Country code}"
     },
     {
     "operation": "format",
     "target": "Year",
     "with": "{year}"
     },
     {
     "operation": "format",
     "target": "Population",
     "with": "{population}"
     }
 ]),
 delete_fields(fields=[
     'Type', 'Parent code', 'Region, subregion, country or area *', 'Country code', 'year', 'population'
 ], regex=False),
 validate(),
 dump_to_path()
Code example #3
0
def flow(parameters, *args):
    """Return a Flow that adds a computed 'date' field (type 'date') to the
    resources named in ``parameters['resources']``.

    NOTE(review): the operation lambda ignores ``row`` and references a name
    ``date`` that is not defined in this function — it must come from an
    enclosing/module scope or this raises NameError at runtime; confirm
    (possibly ``row['date']`` or ``datetime.now()`` was intended).
    """
    return Flow(
        add_computed_field(
            target=dict(name='date', type='date'),
            operation=lambda row: datetime.strftime(date, '%Y-%m-%d'),
            resources=parameters["resources"]))
Code example #4
0
    set_type('Date', type='date', format='%d-%m-%y', resources=None),
    set_type('Case', type='number', resources=None),
    join(source_name='time_series_19-covid-Confirmed',
         source_key=['Province/State', 'Country/Region', 'Date'],
         source_delete=True,
         target_name='time_series_19-covid-Deaths',
         target_key=['Province/State', 'Country/Region', 'Date'],
         fields=dict(Confirmed={
             'name': 'Case',
             'aggregate': 'first'
         })),
    join(source_name='time_series_19-covid-Recovered',
         source_key=['Province/State', 'Country/Region', 'Date'],
         source_delete=True,
         target_name='time_series_19-covid-Deaths',
         target_key=['Province/State', 'Country/Region', 'Date'],
         fields=dict(Recovered={
             'name': 'Case',
             'aggregate': 'first'
         })),
    add_computed_field(target={
        'name': 'Deaths',
        'type': 'number'
    },
                       operation='format',
                       with_='{Case}'), delete_fields(['Case']),
    update_resource('time_series_19-covid-Deaths',
                    name='time-series-19-covid-combined',
                    path='time-series-19-covid-combined.csv'),
    dump_to_path()).results()[0]
Code example #5
0
      fields=dict(Confirmed={
          'name': 'Case',
          'aggregate': 'first'
      })),
 join(source_name='time_series_19-covid-Recovered',
      source_key=['Province/State', 'Date'],
      source_delete=True,
      target_name='time_series_19-covid-Deaths',
      target_key=['Province/State', 'Date'],
      fields=dict(Recovered={
          'name': 'Case',
          'aggregate': 'first'
      })),
 add_computed_field(target={
     'name': 'Deaths',
     'type': 'number'
 },
                    operation='format',
                    with_='{Case}'),
 add_computed_field(target={
     'name': 'Country',
     'type': 'string'
 },
                    operation='format',
                    with_='{Country/Region}'),
 add_computed_field(target={
     'name': 'Province',
     'type': 'string'
 },
                    operation='format',
                    with_='{Province/State}'),
 delete_fields(['Case', 'Country/Region', 'Province/State']),
Code example #6
0
def add_fields(names, type):
    """Return an add_computed_field step that adds one field per name in
    ``names``, each of the given ``type`` and always computed as None."""
    def _null_spec(field_name):
        # Each spec adds a field whose computed value is always None.
        return dict(target=field_name, type=type, operation=(lambda row: None))

    return add_computed_field([_null_spec(field_name) for field_name in names])
Code example #7
0
     target_name="time_series_covid19_deaths_global",
     target_key=["Province/State", "Country/Region", "Date"],
     fields=dict(Recovered={
         "name": "Case",
         "aggregate": "first"
     }),
     mode="full-outer",
 ),
 # Add missing columns, e.g., after 'full-outer' join, the rows structure
 # is inconsistent
 fix_canada_recovered_data,
 add_computed_field(
     target={
         "name": "Deaths",
         "type": "number"
     },
     operation="format",
     with_="{Case}",
     resources=["time_series_covid19_deaths_global"],
 ),
 delete_fields(["Case"], resources=["time_series_covid19_deaths_global"]),
 update_resource(
     "time_series_covid19_deaths_global",
     name="time-series-19-covid-combined",
     path="data/time-series-19-covid-combined.csv",
 ),
 update_resource(
     "time_series_covid19_confirmed_US",
     name="us_confirmed",
     path="data/us_confirmed.csv",
 ),
Code example #8
0
def main_flow(prefix=''):
    """Build the Flow that merges the publications datapackage with the
    Zotero dataset into a single 'publications' resource.

    :param prefix: optional URL/path prefix for locating the source
                   datapackage (defaults to the current directory).
    """
    source_url = '{}data/publications_for_es/datapackage.json'.format(prefix)
    package = Package(source_url)
    # Union of all field names across every resource, so the first
    # concatenate() can accept rows from any of them.
    all_fields = set(field.name for resource in package.resources
                     for field in resource.schema.fields)
    all_fields = dict((field_name, []) for field_name in all_fields)
    return Flow(
        load(source_url),
        # Give every row a 'json' column; it is deleted again right after
        # concatenation below.
        lambda row: dict(row, json='{}'),
        concatenate(all_fields,
                    target=dict(name='publications', path='publications.csv')),
        delete_fields(['json']),
        # Prefer the 'gd_'-prefixed variants of these fields where present.
        prefer_gd('title'),
        prefer_gd('notes'),
        prefer_gd('publisher'),
        prefer_gd('tags'),
        prefer_gd('language_code'),
        prefer_gd('pubyear'),
        split_keyword_list('item_kind', 'gd_Item Type'),
        split_keyword_list('life_areas', 'gd_Life Domains'),
        split_keyword_list('source_kind', 'gd_Resource Type'),
        split_keyword_list('languages', 'language_code', ' '),
        split_keyword_list('tags', 'tags'),
        # Merge in the Zotero data and normalise both sources to a fixed
        # set of columns.
        load('data/zotero/datapackage.json'),
        concatenate(dict(
            title=[],
            pubyear=[],
            publisher=[],
            authors=[],
            life_areas=[],
            notes=[],
            languages=[],
            tags=[],
            url=[],
            migdar_id=[],
            item_kind=[],
            source_kind=[],
            isbn=[],
            physical_description=[],
            publication_distribution_details=[],
            doc_id=[],
        ),
                    target=dict(name='publications', path='publications.csv')),
        # Elasticsearch-oriented type annotations.
        set_type('title', **{'es:title': True}),
        set_type('notes', **{'es:hebrew': True}),
        set_type('publisher', **{'es:keyword': True}),
        add_field('year', 'integer', default=extract_year),
        split_and_translate('tags', 'tags', keyword=True),
        split_and_translate('life_areas', 'life_areas', keyword=True),
        split_and_translate('languages', 'languages', keyword=True),
        split_and_translate('source_kind', 'source_kind', keyword=True),
        split_and_translate('item_kind', 'item_kind', keyword=True),
        printer(),
        # Derive document id and page title from the configured patterns.
        add_computed_field([
            {
                'operation': 'format',
                'target': 'doc_id',
                'with': KEY_PATTERN
            },
            {
                'operation': 'format',
                'target': 'page_title',
                'with': PAGE_TITLE_PATTERN
            },
        ]),
        # NOTE(review): empty spec list — this step adds nothing; confirm
        # whether it is intentional or leftover.
        add_computed_field([]),
    )
Code example #9
0
            "path":
            "http://opendatacommons.org/licenses/pddl/",
            "title":
            "Open Data Commons Public Domain Dedication and License v1.0",
            'name':
            "open_data_commons_public_domain_dedication_and_license_v1.0"
        }],
        sources=[{
            "name": "Our Airports",
            "path": "http://ourairports.com/data/",
            "title": "Our Airports"
        }],
        readme=readme()),
    add_computed_field([{
        "operation": "format",
        "target": "coordinates",
        "with": "{latitude_deg}, {longitude_deg}"
    }]),
    delete_fields(fields=[
        "id", "longitude_deg", "latitude_deg", "scheduled_service",
        "home_link", "wikipedia_link", "keywords"
    ]), update_resource('airport-codes', **{'path': 'data/airport-codes.csv'}),
    validate(), dump_to_path())


def flow(parameters, datapackage, resources, stats):
    """Pipeline entry point: return the module-level ``dialing_info_cldr``
    Flow (the arguments are part of the expected signature — presumably the
    datapackage-pipelines convention — and are unused here)."""
    return dialing_info_cldr


# Run the predefined Flow when this file is executed as a script.
if __name__ == '__main__':
    dialing_info_cldr.process()
Code example #10
0
def flow(parameters):
    """Return a Flow that adds the computed fields described in
    ``parameters['fields']`` to the resources in ``parameters['resources']``."""
    field_specs = parameters.get('fields', [])
    target_resources = parameters.get('resources')
    return Flow(
        add_computed_field(field_specs, target_resources),
    )
Code example #11
0
def base_flow():
    """Build the Flow that loads publication spreadsheets from Google Drive
    plus the Zotero CSV, concatenates them into a 'publications' resource
    and annotates fields for Elasticsearch indexing."""
    # First pass: list xlsx files on Drive, download them and discover their
    # sheets; results are consumed to drive the load() steps below.
    sources, *_ = Flow(
        list_gdrive(),
        filter_rows(lambda row: (
            row['kind'] == 'drive#file' and
            row['mimeType'] == 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
        )),
        add_field('filename', 'string',
                  default=lambda row: 'pubfiles/{modifiedTime}-{id}.xlsx'.format(**row)),
        download_files(),
        add_field('sheet', 'string'),
        add_field('headers', 'integer', 1),
        get_sheets(),
    ).results()
    return Flow(
        # Load every discovered sheet as strings (no type inference).
        *[
            load(source['filename'],
                 sheet=source['sheet'],
                 headers=source['headers'],
                 infer_strategy=load.INFER_STRINGS,
                 cast_strategy=load.CAST_TO_STRINGS,
                 name=source['filename'])
            for source in sources[0]
        ],
        # Drop rows without a usable migdar_id.
        filter_rows(lambda row: row.get('migdar_id') not in ('', 'None', None)),
        load('data/zotero/zotero.csv'),
        # Map the various source column spellings onto one canonical schema.
        concatenate(
            fields={
                'migdar_id': [],
                'title': ['Title', ],
                'bib_title': [],
                'bib_related_parts': [],

                'notes': [],
                'tags': ['Tags'],
                'publisher': [],
                'languages': ['language_code'],
                'item_kind': ['Item Type', 'Item type', 'item_type'],
                'pubyear': ['pubyear/pubdate'],
                'life_areas': ['Life Domains', 'Domain'],
                'source_kind': ['Resource Type', 'Resource type'],
                'authors': ['author'],
                'url': ['URL'],

            },
            target=dict(
                name='publications',
                path='data/publications.csv'
            )
        ),
        fix_nones(),
        fix_urls(['url']),
        # Elasticsearch-oriented type annotations.
        set_type('title',        **{'es:title': True}),
        set_type('authors',       **{'es:boost': True}),
        set_type('notes',        **{'es:hebrew': True}),
        set_type('publisher',    **{'es:boost': True}),
        add_field('year', 'integer',
                  default=extract_year),
        split_and_translate('tags', 'tags', keyword=True, delimiter=','),
        split_and_translate('life_areas', 'life_areas', keyword=True, delimiter=','),
        split_and_translate('languages', 'languages', keyword=True, delimiter=' '),
        split_and_translate('source_kind', 'source_kind', keyword=True),
        split_and_translate('item_kind', 'item_kind', keyword=True),
        fix_links('notes'), 
        verify_migdar_id(),
        # Derive document id and page title from the configured patterns.
        add_computed_field([
            {'operation': 'format', 'target': 'doc_id', 'with': KEY_PATTERN},
            {'operation': 'format', 'target': 'page_title',
             'with': PAGE_TITLE_PATTERN},
        ]),
        add_field('title_kw', 'string',
                  default=lambda row: row.get('title'),
                  **{'es:keyword': True}),
    )
Code example #12
0
    def flow(self):
        """Build a Flow that derives a deduplicated 'configurations'
        resource from the main resource and upserts it into SQL.

        The snippet template is assembled from the taxonomy's key-fields,
        using each matching column type's human-readable title, in the form
        '<taxonomy title> for: <Title>: "{field}", ...'.
        """
        taxonomy = self.context.taxonomy
        txn_config = taxonomy.config
        fmt_str = [taxonomy.title + ' for:']
        fields = txn_config['key-fields']
        for f in fields:
            # Find the column-type entry for this key field to get its title.
            for ct in taxonomy.column_types:
                if ct['name'] == f:
                    fmt_str.append(
                        '%s: "{%s}",' % (ct['title'], f.replace(':', '-'))
                    )
                    break
        fmt_str = ' '.join(fmt_str)
        # Column-type names use ':' but the datapackage field names use '-'.
        fields = [
            ct.replace(':', '-')
            for ct in fields
        ]
        all_fields = ['_source'] + fields

        TARGET = 'configurations'
        saved_config = self.config._unflatten()
        # The saved configuration copy is always marked as not publishable.
        saved_config.setdefault('publish', {})['allowed'] = False

        return Flow(
            duplicate(RESOURCE_NAME, TARGET),
            # Deduplicate rows on source + key fields.
            join_with_self(
                TARGET,
                all_fields,
                dict((f, {}) for f in all_fields),
            ),
            add_computed_field(
                [
                    dict(
                        operation='format',
                        target='snippets',
                        with_=fmt_str
                    ),
                    # NOTE(review): constant None placeholder — presumably
                    # populated elsewhere before the aggregation below; confirm.
                    dict(
                        operation='constant',
                        target='key_values',
                        with_=None
                    ),
                ],
                resources=TARGET
            ),
            add_field('config', 'object', saved_config, resources=TARGET),
            add_field('fields', type='object', 
                      default=self.collate_values(fields), resources=TARGET),
            # Collapse to one row per _source, aggregating the per-row
            # key_values/snippets into arrays.
            join_with_self(
                TARGET,
                ['_source'],
                dict(
                    source=dict(name='_source'),
                    config={},
                    key_values=dict(aggregate='array'),
                    snippets=dict(aggregate='array'),
                )
            ),
            set_type('source', type='string'),
            set_type('config', type='object'),
            set_type('key_values', type='array'),
            set_type('snippets', type='array'),
            set_primary_key(['source']),
            # Upsert into the 'configurations' table.
            dump_to_sql(
                dict([
                    (TARGET, {
                        'resource-name': TARGET,
                        'mode': 'update'
                    })
                ]),
                engine=self.lazy_engine(),
            ),
        )
Code example #13
0
File: pull.py  Project: TheCulliganMan/covid-19
def data_pull_csv():
    """Download the configured confirmed/recovered/death time-series CSVs,
    unpivot the per-date columns into rows, merge the three resources on
    (Province/State, Country/Region, Date) and dump the combined CSV."""
    # Columns named like a date (e.g. '1/22/20') become rows; the whole
    # column name is captured as the 'Date' key.
    unpivoting_fields = [{
        "name": r"([0-9]+\/[0-9]+\/[0-9]+)",
        "keys": {
            "Date": r"\1"
        }
    }]

    extra_keys = [{"name": "Date", "type": "string"}]
    extra_value = {"name": "Case", "type": "number"}

    Flow(
        load(f"{BASE_URL}{CONFIRMED}"),
        load(f"{BASE_URL}{RECOVERED}"),
        load(f"{BASE_URL}{DEATH}"),
        unpivot(unpivoting_fields, extra_keys, extra_value),
        # Replace '/' with '-' in the Date strings before parsing.
        find_replace([{
            "name": "Date",
            "patterns": [{
                "find": "/",
                "replace": "-"
            }]
        }]),
        to_normal_date,
        set_type("Date", type="date", format="%d-%m-%y", resources=None),
        set_type("Case", type="number", resources=None),
        # Fold the Confirmed counts into the Deaths resource.
        join(
            source_name="time_series_19-covid-Confirmed",
            source_key=["Province/State", "Country/Region", "Date"],
            source_delete=True,
            target_name="time_series_19-covid-Deaths",
            target_key=["Province/State", "Country/Region", "Date"],
            fields=dict(Confirmed={
                "name": "Case",
                "aggregate": "first"
            }),
        ),
        # Fold the Recovered counts into the Deaths resource.
        join(
            source_name="time_series_19-covid-Recovered",
            source_key=["Province/State", "Country/Region", "Date"],
            source_delete=True,
            target_name="time_series_19-covid-Deaths",
            target_key=["Province/State", "Country/Region", "Date"],
            fields=dict(Recovered={
                "name": "Case",
                "aggregate": "first"
            }),
        ),
        # Copy the remaining 'Case' column into 'Deaths', then drop 'Case'.
        add_computed_field(
            target={
                "name": "Deaths",
                "type": "number"
            },
            operation="format",
            with_="{Case}",
        ),
        delete_fields(["Case"]),
        update_resource(
            "time_series_19-covid-Deaths",
            name="time-series-19-covid-combined",
            path=RAW_OUTPUT_CSV,
        ),
        dump_to_path(),
    ).results()[0]
Code example #14
0
def Olap_Datapackage():
    """Build an OLAP-style datapackage: a 'fact' table concatenated from the
    three source datapackages, plus 'time', 'area' and 'product' dimension
    tables, dumped to 'olap_datapackage'."""
    flow = Flow(
        # Load datapackages:
        load('elspot_prices_data/datapackage.json'),
        load('afrr_data/datapackage.json'),
        load('fcr_dk1_data/datapackage.json'),
        concatenate(fields={
            'Timestamp': ['HourUTC'],
            'Area': ['PriceArea'],
            'Product': ['product'],
            'Amount': ['amount'],
            'Price_DKK': ['PriceDKK'],
            'Price_EUR': ['PriceEUR']
        },
                    target={
                        'name': 'fact',
                        'path': 'data/fact.csv'
                    }),
        # Add a placeholder 'id' column (constant 'dummy'); add_id presumably
        # fills in the real values — confirm against its definition.
        add_computed_field(
            [dict(target='id', operation='constant', with_='dummy')]),
        add_id,
        set_type('id', type='integer'),
        set_primary_key(primary_key=['id']),
        # Reorder so that 'id' column is the first:
        select_fields([
            'id', 'Timestamp', 'Area', 'Product', 'Amount', 'Price_DKK',
            'Price_EUR'
        ],
                      resources='fact'),
        # Add foreign keys:
        add_foreign_keys,
        # Fact table is ready. Now duplicate the resource to generate dim tables:
        # First is 'time' table:
        duplicate(source='fact', target_name='time', target_path='time.csv'),
        select_fields(['Timestamp'], resources=['time']),
        join_self(source_name='time',
                  source_key=['Timestamp'],
                  target_name='time',
                  fields={'Timestamp': {}}),
        # Parse datetime fields and add a separate field for year, month and day:
        add_computed_field([
            dict(target=dict(name='day', type='string'),
                 operation=lambda row: datetime.strptime(
                     row['Timestamp'], '%Y-%m-%dT%H:%M:%S+00:00').strftime('%d'
                                                                           )),
            dict(target=dict(name='month', type='string'),
                 operation=lambda row: datetime.strptime(
                     row['Timestamp'], '%Y-%m-%dT%H:%M:%S+00:00').strftime('%m'
                                                                           )),
            dict(target=dict(name='month_name', type='string'),
                 operation=lambda row: datetime.strptime(
                     row['Timestamp'], '%Y-%m-%dT%H:%M:%S+00:00').strftime('%B'
                                                                           )),
            dict(target=dict(name='year', type='year'),
                 operation=lambda row: datetime.strptime(
                     row['Timestamp'], '%Y-%m-%dT%H:%M:%S+00:00').strftime('%Y'
                                                                           )),
        ],
                           resources=['time']),
        set_primary_key(primary_key=['Timestamp'], resources=['time']),
        # Now 'area' table:
        duplicate(source='fact', target_name='area', target_path='area.csv'),
        select_fields(['Area'], resources=['area']),
        join_self(source_name='area',
                  source_key=['Area'],
                  target_name='area',
                  fields={'Area': {}}),
        set_primary_key(primary_key=['Area'], resources=['area']),
        # Now 'product' table:
        duplicate(source='fact',
                  target_name='product',
                  target_path='product.csv'),
        select_fields(['Product'], resources=['product']),
        join_self(source_name='product',
                  source_key=['Product'],
                  target_name='product',
                  fields={'Product': {}}),
        set_primary_key(primary_key=['Product'], resources=['product']),
        dump_to_path('olap_datapackage'))
    flow.process()
Code example #15
0
File: process.py  Project: RajkumarMittal/covid-19-1
      source_key=['Province/State', 'Country/Region', 'Date'],
      source_delete=True,
      target_name='time_series_covid19_deaths_global',
      target_key=['Province/State', 'Country/Region', 'Date'],
      fields=dict(Recovered={
          'name': 'Case',
          'aggregate': 'first'
      }),
      mode='full-outer'),
 # Add missing columns, e.g., after 'full-outer' join, the rows structure
 # is inconsistent
 fix_canada_recovered_data,
 add_computed_field(target={
     'name': 'Deaths',
     'type': 'number'
 },
                    operation='format',
                    with_='{Case}',
                    resources=['time_series_covid19_deaths_global']),
 delete_fields(['Case'], resources=['time_series_covid19_deaths_global']),
 delete_fields(
     ['UID', 'iso2', 'iso3', 'code3', 'FIPS', 'Admin2', 'Combined_Key'],
     resources=[
         'time_series_covid19_confirmed_US', 'time_series_covid19_deaths_US'
     ]),
 update_resource('time_series_covid19_deaths_global',
                 name='time-series-19-covid-combined',
                 path='data/time-series-19-covid-combined.csv'),
 update_resource('time_series_covid19_confirmed_US',
                 name='us_confirmed',
                 path='data/us_confirmed.csv'),
Code example #16
0
 DF.delete_fields(['alt_name[1-5]']),
 *[
     split_and_translate(
         f, f, 
         delimiter=',',
         keyword=f in ('org_kind', 'life_areas', 'languages', 'tags', 'compact_services')
     )
     for f in ('languages', 'life_areas', 'tags', 'regions', 'org_kind',
               'specialties', 'provided_services', 'target_audiences', 'compact_services')
 ],
 DF.add_field('title_kw', 'string',
              default=lambda row: row.get('org_name'),
              **{'es:keyword': True}),
 DF.add_computed_field(
     target='doc_id',
     operation='format',
     with_='org/{entity_id}'
 ),
 fix_doc_id,
 fix_links('objective'),
 fix_links('objective__en'),
 fix_links('objective__ar'),
 DF.add_field('year', 'integer', default=cur_year),
 DF.set_type('org_name',        **{'es:title': True}),
 DF.set_type('org_name__ar',    **{'es:title': True}),
 DF.set_type('alt_names',       
             **{'es:itemType': 'string', 'es:title': True}),
 *[
     DF.set_type(f, **{'es:index': False})
     for f in [
         'org_website', 'org_facebook', 'org_phone_number',
Code example #17
0
 set_defaults,
 extrapulate_years,
 fix_values,
 DF.set_type('value', groupChar=',', bareNumber=True),
 fix_units,
 DF.set_type('extrapulation_years',
             type='array',
             **{'es:itemType': 'string'}),
 DF.validate(),
 DF.add_computed_field([
     dict(target=dict(name='life_areas',
                      type='array',
                      **{
                          'es:itemType': 'string',
                          'es:keyword': True
                      }),
          operation=lambda row: [
              x for x in
              [row.get('life_area{}'.format(i)) for i in range(1, 4)]
              if x is not None
          ])
 ]),
 DF.delete_fields(['life_area{}'.format(i) for i in range(1, 4)]),
 DF.join_self(
     'out', ['chart_title', 'series_title'], 'out',
     dict([(k, None) for k in CHART_FIELDS + SERIES_FIELDS] +
          [(k, dict(aggregate='array')) for k in ['year', 'value']] +
          [('max_year', dict(name='year', aggregate='max'))])),
 verify_percents,
 DF.add_computed_field([
     dict(target=dict(name='dataset', type='array'),