# Example #1
# 0
def test_filter_rows():
    from dataflows import filter_rows

    # 2x2 grid of rows, a in {1, 2} crossed with b in {3, 4},
    # in the same order as the original literal list.
    rows = [dict(a=a, b=b) for b in (3, 4) for a in (1, 2)]
    flow = Flow(
        rows,
        filter_rows(equals=[dict(a=1)]),      # keep rows where a == 1
        filter_rows(not_equals=[dict(b=3)]),  # drop rows where b == 3
    )
    results, _, _ = flow.results()
    # Exactly one resource with exactly one surviving row: {'a': 1, 'b': 4}.
    assert len(results) == 1
    assert len(results[0]) == 1
    assert results[0][0] == dict(a=1, b=4)
def broken_links_flow():
    """Build a flow that scans every configured source for URLs and keeps
    only the rows whose links failed the broken-link check.

    Relies on module-level names not visible here: ``configuration`` (list
    of source descriptors with at least 'name' and 'title'),
    ``URL_TEMPLATE``, ``RE`` (a compiled URL regex), ``get_title``,
    ``unwind`` and ``check_broken`` — verify their contracts at the
    definition site.
    """
    return DF.Flow(
        # One sub-flow per configured source: load it and tag every row
        # with its source name/title so they survive the concatenate below.
        *[
            DF.Flow(
                DF.load(URL_TEMPLATE.format(**c), name=c['name']),
                DF.add_field('__name',
                             'string',
                             c['name'],
                             resources=c['name']),
                DF.add_field('__title',
                             'string',
                             get_title(c['title']),
                             resources=c['name']),
            ) for c in configuration
        ],
        # Collect every URL-looking substring from the stringified row.
        DF.add_field('urls', 'array', lambda r: RE.findall(str(r))),
        # Assumes each row carries a 'doc_id' field — TODO confirm.
        DF.add_field('link', 'string',
                     lambda r: 'https://yodaat.org/item/{doc_id}'.format(**r)),
        # Merge all per-source resources into one, renaming the temp fields.
        DF.concatenate(
            dict(
                name=['__name'],
                title=['__title'],
                link=[],
                urls=[],
            )),
        DF.add_field('url', 'string'),
        DF.add_field('error', 'string'),
        # NOTE(review): presumably expands 'urls' into one row per URL,
        # filling 'url' — confirm at unwind()'s definition.
        unwind(),
        DF.delete_fields(['urls']),
        # Run the broken-link check with 4 parallel workers.
        DF.parallelize(check_broken(), 4),
        # Keep only rows where the check recorded an error.
        DF.filter_rows(lambda r: r['error'] is not None),
    )
def flow(parameters):
    """Build a Flow that keeps rows matching the 'in' specs and drops rows
    matching the 'out' specs, on the requested resources."""
    keep_specs = parameters.get('in', [])
    drop_specs = parameters.get('out', [])
    return Flow(
        filter_rows(
            equals=keep_specs,
            not_equals=drop_specs,
            resources=parameters.get('resources'),
        ))
# Example #4
# 0
def flow(*_):
    """Two-stage welfare-activities flow.

    Stage 1 (executed eagerly via .process()): load the welfare spreadsheet,
    derive activity fields from the Hebrew source columns and dump the
    result to tmp/activities-welfare. Stage 2 (returned lazily): reload that
    dump as a streaming resource for the pipeline runner.
    Relies on module-level ``filename``, ``splitter`` and ``FIELDS``.
    """
    DF.Flow(
        DF.load(filename, name='welfare'),
        DF.add_field('activity_name', 'string',
                     lambda r: r['שם השירות (ציבורי)']),
        # Drop rows with an empty service name.
        DF.filter_rows(lambda r: r['activity_name']),
        DF.add_field(
            'activity_description', 'array', lambda r:
            [r['תיאור השירות (תיאור קצר)'] + '\n' + r['השירות (מטרת השירות)']
             ]),
        # History entry derived from the 'providing organisational unit'
        # column, split on '/'. Raises IndexError if no '/' is present —
        # presumably the data always has one; verify.
        DF.add_field(
            'history', 'array', lambda r: [
                dict(
                    year=2019,
                    unit=r['יחידה ארגונית נותנת השירות'].split('/')[0].strip(),
                    subunit=r['יחידה ארגונית נותנת השירות'].split('/')[1].
                    strip(),
                    # NOTE(review): subsubunit re-uses index [1], identical
                    # to subunit — looks like a copy-paste slip ([2]
                    # expected?); confirm against the source data.
                    subsubunit=r['יחידה ארגונית נותנת השירות'].split('/')[
                        1].strip(),
                )
            ]), DF.add_field('target_audience', 'array',
                             splitter('אוכלוסייה')),
        DF.add_field('subject', 'array', splitter('תחום ההתערבות')),
        DF.add_field('intervention', 'array', splitter('אופן התערבות')),
        DF.select_fields(FIELDS),
        DF.add_field('publisher_name', 'string', 'משרד הרווחה'),
        DF.add_field('min_year', 'integer', 2019),
        DF.add_field('max_year', 'integer', 2019),
        DF.add_field('kind', 'string', 'gov_social_service'),
        DF.add_field('kind_he', 'string', 'שירות חברתי'), DF.printer(),
        DF.validate(), DF.dump_to_path('tmp/activities-welfare')).process()
    return DF.Flow(
        DF.load('tmp/activities-welfare/datapackage.json'),
        DF.update_resource(-1, **{'dpp:streaming': True}),
    )
def flow(*args):
    """Assemble the publications-for-ES flow: merge unique/app records,
    stamp revision bookkeeping fields, persist to SQL and dump for ES.

    Relies on module-level names: REVISION, STATUS_FIELDS,
    manage_revisions, DB_TABLE, KEY_FIELDS, DATAFLOWS_DB_ENGINE,
    set_revisions, FILTER_NEXT_UPDATE_DAYS and add_date_range.
    """
    # The pipeline runner (DPP) invokes flows with extra positional args;
    # use that to pick a terminal-friendly printer format below.
    is_dpp = len(args) > 3
    return Flow(
        load('data/unique_records_full/datapackage.json',
             resources=['unique_records']),
        load('data/app_records_full/datapackage.json',
             resources=['search_app_records']),
        add_field('__revision', 'integer', REVISION),
        *(add_field(f['name'], f['type']) for f in STATUS_FIELDS),
        manage_revisions,
        # Upsert both resources into the same DB table, keyed by KEY_FIELDS.
        *(dump_to_sql(
            {
                DB_TABLE: {
                    'resource-name': resource_name,
                    'mode': 'update',
                    'update_keys': KEY_FIELDS
                }
            }, DATAFLOWS_DB_ENGINE)
          for resource_name in ['unique_records', 'search_app_records']),
        *(add_field(f'rev_{name}', 'date')
          for name in ['last_updated_at', 'last_modified_at', 'created_at']),
        set_revisions,
        # NOTE(review): when FILTER_NEXT_UPDATE_DAYS is falsy this inserts a
        # literal None step into the Flow — confirm dataflows tolerates it.
        filter_rows(equals=[{
            '__next_update_days': FILTER_NEXT_UPDATE_DAYS
        }]) if FILTER_NEXT_UPDATE_DAYS else None, add_date_range(),
        dump_to_path('data/publications_for_es'),
        printer(tablefmt='plain' if is_dpp else 'html',
                num_rows=1,
                fields=['doc_id']),
        update_resource(None, **{'dpp:streaming': True}))
# Example #6
# 0
def test_filter_rows_callable():
    from dataflows import filter_rows

    # 2x2 grid of rows, a in {1, 2} crossed with b in {3, 4},
    # in the same order as the original literal list.
    rows = [dict(a=a, b=b) for b in (3, 4) for a in (1, 2)]
    flow = Flow(
        rows,
        # A callable condition: only {'a': 2, 'b': 3} satisfies it.
        filter_rows(condition=lambda row: row['a'] > 1 and row['b'] < 4),
    )
    results, _, _ = flow.results()
    # Exactly one resource with exactly one surviving row.
    assert len(results) == 1
    assert len(results[0]) == 1
    assert results[0][0] == dict(a=2, b=3)
# Example #7
# 0
def test_example_9():
    from dataflows import Flow, load, dump_to_path, join, concatenate, filter_rows

    # People who won both an Emmy and an Academy Award.
    double_winners = Flow(
        # Emmy nominees: keep winners only, then reduce them to a single
        # 'emmy_nominee' column in a resource of their own.
        load('data/emmy.csv', name='emmies'),
        filter_rows(equals=[dict(winner=1)]),
        concatenate(dict(emmy_nominee=['nominee'], ),
                    dict(name='emmies_filtered'),
                    resources='emmies'),
        # Oscar nominees: inner-join against the Emmy winners by name,
        # adding no new fields and discarding unmatched rows.
        load('data/academy.csv', encoding='utf8', name='oscars'),
        join(
            'emmies_filtered',
            ['emmy_nominee'],
            'oscars',
            ['Name'],
            full=False
        ),
        # Keep Oscar winners only, then write the datapackage out.
        filter_rows(equals=[dict(Winner='1')]),
        dump_to_path('out/double_winners'))
    _ = double_winners.process()
def flow(*_):
    """Build the social-services units resource: flatten each unit's
    hierarchical ``path`` into office/unit/subunit/subsubunit columns,
    keep only top-level offices, and dump to disk and SQL.

    Relies on module-level ``all_units`` and ``DF`` (dataflows).
    """
    return DF.Flow(
        all_units(),
        # path[0..3] -> office / unit / subunit / subsubunit (None when the
        # path is shorter than that level).
        DF.add_field('office', 'string', lambda r: r['path'][0]
                     if len(r['path']) > 0 else None, **{'es:keyword': True}),
        DF.add_field('unit', 'string', lambda r: r['path'][1]
                     if len(r['path']) > 1 else None, **{'es:keyword': True}),
        DF.add_field('subunit', 'string', lambda r: r['path'][2]
                     if len(r['path']) > 2 else None, **{'es:keyword': True}),
        DF.add_field('subsubunit', 'string', lambda r: r['path'][3]
                     if len(r['path']) > 3 else None, **{'es:keyword': True}),
        # Human-readable breadcrumb; falls back to the Hebrew label for
        # "government ministries" when the path is empty.
        DF.add_field('breadcrumbs', 'string',
                     lambda r: '/'.join(r['path']) or 'משרדי הממשלה',
                     **{'es:exclude': True}),
        DF.add_field('id', 'string', lambda r: '__'.join(r['path']) or 'main',
                     **{'es:exclude': True}),
        DF.delete_fields([
            'path',
        ]),
        DF.add_field('min_year', 'integer', 2020),
        DF.add_field('max_year', 'integer', 2020),
        DF.add_field('kind', 'string', 'gov_social_service_unit', **{
            'es:keyword': True,
            'es:exclude': True
        }),
        DF.add_field('kind_he', 'string', 'שירותים חברתיים במיקור חוץ', **{
            'es:keyword': True,
            'es:exclude': True
        }),
        DF.add_field('score', 'number', 1000, **{'es:score-column': True}),
        DF.set_primary_key(['kind', 'id']),
        DF.update_resource(-1, name='units', **{'dpp:streaming': True}),

        # Ensure we only have the main offices
        DF.filter_rows(lambda r: r['unit'] is None),
        # Explicitly exclude one ministry (Aliyah and Integration).
        DF.filter_rows(lambda r: r['office'] != 'משרד העליה והקליטה'),
        DF.dump_to_path('/var/datapackages/units/social_services'),
        DF.dump_to_sql(dict(units={'resource-name': 'units'})))
def flow(*_):
    """Scrape BTL publications, mark the resource as streaming, normalise
    the date fields and stamp a publication id (source code 7)."""
    steps = [
        scrape(),
        DF.update_resource(-1, **{
            'dpp:streaming': True,
            'name': 'btl'
        }),
        DF.set_type('claim_date',
                    type='datetime',
                    format='%d/%m/%Y %H:%M',
                    resources=-1),
        DF.set_type('start_date', type='date', format='%d/%m/%Y',
                    resources=-1),
        # Drop rows that carry no publication id.
        DF.filter_rows(lambda r: r['publication_id']),
        calculate_publication_id(7),
    ]
    return DF.Flow(*steps)
def flow(*_):
    """Scrape pages and keep only 'קול קורא' (call-for-proposals) entries,
    normalising their date fields into a streaming 'negev_galil' resource.

    Relies on module-level ``scraper``, ``calculate_publication_id`` and
    ``PROP_STREAMING``.
    """
    return DF.Flow(
        scraper(),
        # Keep only pages whose title starts with the CFP prefix.
        DF.filter_rows(lambda row: row['page_title'] and row['page_title'].
                       startswith('קול קורא'),
                       resources=-1),
        DF.set_type('start_date', type='date', format='%d/%m/%Y',
                    resources=-1),
        DF.set_type('claim_date',
                    type='datetime',
                    format='%d/%m/%Y',
                    resources=-1),
        # 9 is presumably this source's publisher code — confirm at the
        # calculate_publication_id definition.
        calculate_publication_id(9),
        DF.validate(),
        DF.update_resource(-1, name='negev_galil', **{PROP_STREAMING: True}),
    )
def flow(*_):
    """Normalise updated tender sources into a single 'tenders' resource:
    concatenate, deduplicate by KEY (keeping the last occurrence), coerce
    types, and derive display/contact fields.

    Relies on module-level ``get_updated_sources``, ``TENDER_MAPPING``,
    ``TENDER_KINDS``, ``KEY`` and the ``datetime`` module.
    """
    return DF.Flow(
        get_updated_sources(),
        DF.concatenate(fields=TENDER_MAPPING, target=dict(name='tenders')),
        DF.validate(),
        # Drop rows without a publication id.
        DF.filter_rows(lambda r: r['publication_id']),
        # Map the Hebrew tender-type label to its canonical code; raises
        # KeyError on an unknown label (intentionally loud — verify).
        DF.add_field('tender_type', 'string',
                     lambda r: TENDER_KINDS[r['tender_type_he']],
                     **{'es:keyword': True}),
        # Deduplicate by KEY, keeping the last value of every field.
        DF.join_with_self(
            'tenders', KEY,
            dict((k, dict(aggregate='last'))
                 for k in list(TENDER_MAPPING.keys()) + ['tender_type'])),
        DF.set_type('publication_id', type='string', transform=str),
        DF.set_type('supplier_id', type='string', transform=str),
        DF.set_type('tender_id',
                    type='string',
                    transform=lambda v: v or 'none'),
        # All *_date fields share one format; unparsable values are cleared
        # rather than failing validation.
        DF.set_type('.+_date',
                    type='date',
                    format='%d.%m.%Y',
                    on_error=DF.schema_validator.clear),
        # Normalise the comma-separated subjects into ';'-joined form.
        DF.set_type('subjects',
                    type='string',
                    transform=lambda v: ';'.join(x.strip()
                                                 for x in v.split(','))
                    if v else ''),
        # Combine claim date + claim time (midnight when time is missing).
        DF.set_type('claim_date',
                    type='datetime',
                    transform=lambda v, field_name, row: datetime.datetime.
                    combine(v, row['claim_time'] or datetime.time(0))
                    if v else None),
        DF.set_type('tender_type_he', **{'es:keyword': True}),
        DF.delete_fields(['claim_time']),
        DF.add_field(
            'page_url', 'string', lambda r:
            f'https://mr.gov.il/ilgstorefront/he/p/{r["publication_id"]}'),
        DF.add_field('page_title', 'string', lambda r: r['description']),
        DF.add_field('reason', 'string', lambda r: r['regulation']),
        DF.add_field('documents', 'array', []),
        DF.add_field('contact', 'string'),
        DF.add_field('contact_email', 'string'),
        DF.validate(),
        DF.update_resource(-1, **{'dpp:streaming': True}),
        DF.printer(),
    )
# Example #12
# 0
def flow(*_):
    """Build the social-services 'activities' resource: clean the raw
    services rows, split/float composite columns, add scoring and kind
    metadata, and dump both the full and the non-deleted sets.

    Relies on module-level ``services``, ``splitter``, ``floater``,
    ``fix_suppliers``, ``fix_tenders``, ``add_current_budget``,
    ``add_current_beneficiaries``, ``CURRENT_YEAR`` and ``get_score``.
    """
    return DF.Flow(
        services(),
        # Drop internal bookkeeping columns from the source sheet.
        DF.delete_fields(
            ['__tab', 'complete', 'non_suppliers', 'non_tenders', 'notes']),
        DF.add_field('publisher_name', 'string', lambda r: r['office'],
                     **{'es:keyword': True}),
        # Split delimited columns into arrays / coerce numeric columns.
        splitter('target_audience'),
        splitter('subject'),
        splitter('intervention'),
        splitter('target_age_group'),
        floater('beneficiaries'),
        floater('budgetItems'),
        floater('manualBudget'),
        floater('tenders'),
        floater('suppliers'),
        floater('virtue_of_table'),
        fix_suppliers(),
        fix_tenders(),
        add_current_budget(),
        add_current_beneficiaries(),
        DF.add_field('min_year', 'integer', 2020),
        DF.add_field('max_year', 'integer', CURRENT_YEAR),
        DF.add_field('kind', 'string', 'gov_social_service', **{
            'es:keyword': True,
            'es:exclude': True
        }),
        DF.add_field('kind_he', 'string', 'שירות חברתי', **{
            'es:keyword': True,
            'es:exclude': True
        }),
        DF.set_type('name', **{'es:title': True}),
        DF.set_type('description', **{
            'es:itemType': 'string',
            'es:boost': True
        }),
        DF.add_field('score', 'number', get_score, **{'es:score-column':
                                                      True}),
        DF.set_primary_key(['kind', 'id']),
        DF.update_resource(-1, name='activities', **{'dpp:streaming': True}),
        # Persist everything (including deleted rows) first...
        DF.dump_to_sql(dict(all_activities={'resource-name': 'activities'})),
        # ...then keep only live rows for the public dump/table.
        DF.filter_rows(lambda r: not r['deleted']),
        DF.delete_fields(['deleted']),
        DF.dump_to_path('/var/datapackages/activities/social_services'),
        DF.dump_to_sql(dict(activities={'resource-name': 'activities'})),
    )
def flow(*args):
    """Normalise Zotero items into the 'zotero' resource: keep titled rows,
    extract tag-based facets, derive an authors string, and dump to disk.

    Relies on module-level ``get``, ``simplify_tags``, ``extract_tags``
    and ``MAPPING``.
    """
    return DF.Flow(
        get(),
        # Keep only rows that have both a key and a title.
        DF.filter_rows(
            lambda row: bool(row['key']) and bool(row.get('title'))),
        # 'Resouce' below matches a typo present in the source data —
        # presumably intentional; do not "fix" without checking the data.
        simplify_tags, extract_tags('life_areas', ['Domain']),
        extract_tags('source_kind', ['Source', 'Resource', 'Resouce']),
        # Join author creators into "name, name, ..."; creators may carry
        # either a single 'name' or firstName/lastName pairs.
        DF.add_field(
            'authors', 'string', lambda r: None
            if not r.get('creators') else ', '.join(
                ('{name}'.format(**c)
                 if 'name' in c else '{firstName} {lastName}'.format(**c))
                for c in r.get('creators', [])
                if c.get('creatorType') == 'author')),
        DF.add_field('item_kind', 'string',
                     lambda r: r.get('reportType') or r.get('itemKind')),
        DF.concatenate(MAPPING,
                       target={
                           'name': 'zotero',
                           'path': 'zotero.csv'
                       }), DF.dump_to_path('data/zotero'),
        DF.update_resource(None, **{'dpp:streaming': True}), DF.printer())
def flow(*_):
    """Scrape 'קול קורא' (call-for-proposals) pages, parse each page into
    structured fields, and emit the streaming 'negev_galil' resource.

    Relies on module-level ``scraper``, ``page_parser``,
    ``calculate_publication_id`` and ``PROP_STREAMING``.
    """
    return DF.Flow(
        scraper(),
        # Keep only pages whose title starts with the CFP prefix.
        DF.filter_rows(lambda row: row['page_title'] and row['page_title'].startswith('קול קורא'), resources=-1),
        # page_parser() fills row['parsed']; unpack it into proper fields.
        page_parser(),
        DF.add_field('decision', 'string',
                     default=lambda row: row['parsed']['decision'], resources=-1),
        DF.add_field('start_date', 'date', format='%d/%m/%Y',
                     default=lambda row: row['parsed']['start_date'], resources=-1),
        DF.add_field('claim_date', 'datetime', format='%d/%m/%Y',
                     default=lambda row: row['parsed']['claim_date'], resources=-1),
        DF.add_field('documents', 'array',
                     default=lambda row: row['parsed']['documents'], resources=-1),
        DF.delete_fields(['parsed'], resources=-1),
        # 9 is presumably this source's publisher code — confirm.
        calculate_publication_id(9),
        DF.validate(),
        DF.update_resource(
            -1, name='negev_galil',
            **{
                PROP_STREAMING: True
            }
        ),
    )
def loader(name, cat):
    """Load mosadot.xlsx, normalise the Hebrew columns to English field
    names, keep only rows in category ``cat``, geocode them, and dump the
    result under ``name``. Returns the rows of the first resource as a
    list (``.results()[0][0]``).

    Relies on module-level ``fixer``, ``category`` and ``geo``.
    """
    return DF.Flow(
        DF.load('mosadot.xlsx'),
        # Map the Hebrew source headers to canonical field names.
        DF.concatenate(
            dict(
                municipality=['מועצה אזורית'],
                town=['שם יישוב'],
                name=['שם המוסד'],
                kind=['סוג המוסד'],
                address=['כתובת'],
                status=['סטטוס'],
                target_audience=['קהל יעד'],
                area=['שטח'],
                lat=['Y'],
                lon=['X'],
            )),
        fixer,
        category(),
        DF.filter_rows(lambda r: r['category'] == cat),
        geo(),
        #     DF.join_with_self('concat', ['kind'], dict(kind=None)),
        DF.update_resource(-1, name=name, path=name + '.csv'),
        DF.dump_to_path(name),
    ).results()[0][0]
# Example #16
# 0
import dataflows as DF
import glob

# Find orgs with no matching logo file: start from the *.png files in the
# working directory, full-join them onto the orgs CSV, and print the orgs
# left without a filename.
x = DF.Flow(
    # One row per local PNG, keyed by the file name minus its extension.
    ({'filename': x[:-4]} for x in glob.glob('*png')),
    DF.update_resource(-1, name='files'),
    DF.load('http://api.yodaat.org/data/orgs_in_es/data/orgs.csv', name='orgs'),
    DF.join(
        # NOTE(review): the source key '(unknown)' looks like scraping
        # corruption — a key template such as '{filename}' matching
        # '{entity_id}' is expected here; restore from the original source.
        'files', '(unknown)',
        'orgs', '{entity_id}',
        {
            'filename': {},
        }, full=True, source_delete=True
    ),
    # After the full join, orgs without a PNG have filename None.
    DF.filter_rows(lambda row: row['filename'] is None),
    DF.select_fields(['org_name', 'entity_id']),
    DF.printer()
).process()
# Example #17
# 0
        for k in LANGS
        if row.get(k)
    )
    values = list(set(values))
    return values


translations = {}
for source, gid in sources.items():
    url = URL.format(gid)
    translations[source] = DF.Flow(
        DF.load(url),
        clean_row,
        DF.add_field('values', 'array',
                     default=extract_values),
        DF.filter_rows(lambda row: row['hebrew']),
        DF.select_fields(list(LANGS) + ['values'])
    ).results()[0][0]
    tx = {}
    complained = set()
    for row in translations[source]:
        v = row.get('values')
        if not v:
            continue
        for vv in v:
            vv = clean(vv)
            if tx.get(vv) not in (None, row):
                if vv not in complained:
                    complained.add(vv)
            tx[vv] = row
    if len(complained) > 0:
def base_flow():
    """Two-phase publications flow.

    Phase 1 (eager): list Google Drive files, keep only .xlsx spreadsheets,
    download each to pubfiles/, and enumerate their sheets. Phase 2
    (returned): load every discovered sheet plus the Zotero CSV,
    concatenate them into a 'publications' resource, and normalise/enrich
    the fields for Elasticsearch.

    Relies on module-level ``list_gdrive``, ``download_files``,
    ``get_sheets``, ``fix_nones``, ``fix_urls``, ``extract_year``,
    ``split_and_translate``, ``fix_links``, ``verify_migdar_id``,
    ``KEY_PATTERN`` and ``PAGE_TITLE_PATTERN``.
    """
    sources, *_ = Flow(
        list_gdrive(),
        # Keep only real files that are OOXML spreadsheets.
        filter_rows(lambda row: (
            row['kind'] == 'drive#file' and
            row['mimeType'] == 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
        )),
        add_field('filename', 'string',
                  default=lambda row: 'pubfiles/{modifiedTime}-{id}.xlsx'.format(**row)),
        download_files(),
        add_field('sheet', 'string'),
        add_field('headers', 'integer', 1),
        get_sheets(),
    ).results()
    return Flow(
        # One load() per discovered sheet; everything read as strings so
        # the concatenate below sees uniform types.
        *[
            load(source['filename'],
                 sheet=source['sheet'],
                 headers=source['headers'],
                 infer_strategy=load.INFER_STRINGS,
                 cast_strategy=load.CAST_TO_STRINGS,
                 name=source['filename'])
            for source in sources[0]
        ],
        # Drop rows with a missing/placeholder migdar_id.
        filter_rows(lambda row: row.get('migdar_id') not in ('', 'None', None)),
        load('data/zotero/zotero.csv'),
        concatenate(
            fields={
                'migdar_id': [],
                'title': ['Title', ],
                'bib_title': [],
                'bib_related_parts': [],

                'notes': [],
                'tags': ['Tags'],
                'publisher': [],
                'languages': ['language_code'],
                'item_kind': ['Item Type', 'Item type', 'item_type'],
                'pubyear': ['pubyear/pubdate'],
                'life_areas': ['Life Domains', 'Domain'],
                'source_kind': ['Resource Type', 'Resource type'],
                'authors': ['author'],
                'url': ['URL'],

            },
            target=dict(
                name='publications',
                path='data/publications.csv'
            )
        ),
        fix_nones(),
        fix_urls(['url']),
        # Elasticsearch field annotations.
        set_type('title',        **{'es:title': True}),
        set_type('authors',       **{'es:boost': True}),
        set_type('notes',        **{'es:hebrew': True}),
        set_type('publisher',    **{'es:boost': True}),
        add_field('year', 'integer',
                  default=extract_year),
        # Split multi-valued columns and translate their values.
        split_and_translate('tags', 'tags', keyword=True, delimiter=','),
        split_and_translate('life_areas', 'life_areas', keyword=True, delimiter=','),
        split_and_translate('languages', 'languages', keyword=True, delimiter=' '),
        split_and_translate('source_kind', 'source_kind', keyword=True),
        split_and_translate('item_kind', 'item_kind', keyword=True),
        fix_links('notes'),
        verify_migdar_id(),
        add_computed_field([
            {'operation': 'format', 'target': 'doc_id', 'with': KEY_PATTERN},
            {'operation': 'format', 'target': 'page_title',
             'with': PAGE_TITLE_PATTERN},
        ]),
        # Keyword copy of the title for exact matching.
        add_field('title_kw', 'string',
                  default=lambda row: row.get('title'),
                  **{'es:keyword': True}),
    )
# Example #19
# 0
            r['rank'] = i + 1
            yield r
    return func

def sort_limit_scores():
    """Return a row processor that orders row['scores'] chronologically by
    each entry's 'date' key and keeps only the newest 30 entries.

    A missing 'scores' key is treated as an empty list. The processor
    mutates the row in place and returns None.
    """
    def func(row):
        ordered = sorted(row.get('scores', []),
                         key=lambda score: score['date'])
        row['scores'] = ordered[-30:]
    return func

if __name__ == '__main__':

    r, _, _ = DF.Flow(
        DF.load(all_data(), name='cities', headers=1,
                override_fields=dict(area_id=dict(type='string')),
                cast_strategy=DF.load.CAST_WITH_SCHEMA),
        DF.filter_rows(lambda r: r['is_city']),
        DF.add_field('score_date', 'object', lambda r: dict(
            date=r['date'].isoformat(), sr=float(r['symptoms_ratio_weighted'] or 0), nr=int(r['num_reports_weighted']))
        ),
        DF.concatenate(dict(
            id=[], city_name=[], score_date=[]
        ), target=dict(name='ranking')),
        DF.join_with_self('ranking', '{city_name}', dict(
            id=None, city_name=None, scores=dict(name='score_date', aggregate='array')
        )),
        sort_limit_scores(),
        DF.filter_rows(lambda r: r['scores'][-1]['nr'] >= 200),
        DF.add_field('sortkey', 'integer', lambda r: int(r['scores'][-1]['sr'] * 1000000) + r['scores'][-1]['nr']),
        DF.sort_rows('{sortkey}', reverse=True),
        DF.delete_fields(['sortkey']),
        DF.add_field('rank', 'integer', 0),
# Example #20
# 0
                weeks[-1].append(dict(weekday=weeks[-1][-1]['weekday'] + 1))
        assert all(len(x) == 7 for x in weeks)
        row['scores'] = weeks

    return func


if __name__ == '__main__':

    r, _, _ = DF.Flow(
        DF.load(all_data(),
                name='cities',
                headers=1,
                override_fields=dict(area_id=dict(type='string')),
                cast_strategy=DF.load.CAST_WITH_SCHEMA),
        DF.filter_rows(lambda r: r['is_city']),
        DF.add_field(
            'score_date', 'object',
            lambda r: dict(weekday=r['date'].isoweekday() % 7,
                           date=r['date'].toordinal(),
                           sr=float(r['symptoms_ratio_weighted'] or 0),
                           nr=int(r['num_reports_weighted']))),
        DF.concatenate(dict(id=[], city_name=[], score_date=[]),
                       target=dict(name='popup_data')),
        DF.join_with_self(
            'popup_data', '{city_name}',
            dict(id=None,
                 city_name=None,
                 scores=dict(name='score_date', aggregate='array'))),
        sort_limit_scores(),
        DF.filter_rows(lambda r: r['scores'] is not None),
    return func


if __name__ == '__main__':

    city_fill_color_cases = ['case']
    city_fill_pattern_cases = ['case']
    neighborhood_fill_color_cases = ['case']
    neighborhood_fill_pattern_cases = ['case']

    r, _, _ = DF.Flow(
        DF.load(latest_file(),
                name='cities',
                override_fields=dict(area_id=dict(type='string')),
                cast_strategy=DF.load.CAST_WITH_SCHEMA),
        DF.filter_rows(lambda r: r['is_city']),
        DF.load(latest_file(),
                name='out',
                override_fields=dict(area_id=dict(type='string')),
                cast_strategy=DF.load.CAST_WITH_SCHEMA),
        DF.add_field('city_area_id', 'string',
                     lambda r: r['area_id'].split('-')[0]),
        DF.join('cities', ['city_area_id'], 'out', ['city_area_id'],
                dict(num_city_reports=dict(name='num_reports_weighted'))),
        DF.add_field('desc', 'string', ''),
        DF.add_field('kind', 'string', ''),
        DF.add_field('property', 'string', ''),
        props(),
        DF.join_with_self(
            'out', ['is_city', 'kind', 'desc', 'property'],
            dict(is_city=None,
# Example #22
# 0
 load('{{input_url}}', format='{{format}}', {% if sheet %}sheet={{sheet}}{% endif %}),
 {% endif %}
 {% if input == 'remote' %}
 load('{{input_url}}', format='{{format}}', {% if sheet %}sheet={{sheet}}{% endif %}),
 {% endif %}
 {% if input == 'sql' %}
 load('{{input_url}}', table='{{input_db_table}}'),
 {% endif %}
 {% if input == 'other' %}
 {% endif %}
 # Process them (if necessary)
 {% if 'sort' in processing %}
 sort_rows('{field_name}'),  # Key is a Python format string or a list of field names
 {% endif %}
 {% if 'filter' in processing %}
 filter_rows(),
 {% endif %}
 {% if 'find_replace' in processing %}
 find_replace([
     dict(name='field_name',
          patterns=[
              dict(find='re-pattern-to-find', replace='re-pattern-to-replace-with'),                     
          ])
 ]),
 {% endif %}
 {% if 'delete_fields' in processing %}
 delete_fields(['field_name']),  # Pass a list of field names to delete from the data
 {% endif %}
 {% if 'set_type' in processing %}
 set_type('field_name', type='number', constraints=dict(minimum=3)),  # There are quite a few options you can use here
                                                                      # Take a look at https://frictionlessdata.io/specs/table-schema/
def process_demographics(stack):
    """Attach demographic score cards (per population kind, scored per
    neighborhood) to ``stack`` and configure its map layout.

    Cards are built once from demographics/*.csv and cached under
    'stack:demographics'. Relies on module-level ``_cache``, ``glob``,
    ``DF`` and ``get_neighborhood_features``.
    """
    key = 'stack:demographics'
    try:
        demographics_cards = _cache.get(key)
    except KeyError:
        # Cache miss — rebuild the cards from the raw CSVs.
        def add_source():
            # Tag every row with the name of the resource (CSV) it came
            # from, so map_to_cards can dispatch per source report.
            def f(rows):
                for row in rows:
                    row['source'] = rows.res.name
                    yield row
            return DF.Flow(
                DF.add_field('source', 'string'),
                f
            )

        def map_to_cards():
            # (source report title, raw column kinds) -> card kind.
            # Rows not matching any entry are dropped.
            MAP = {
                ("דו''ח אג''ס לפי עולים וותיקים",
                        ("סה''כ עולים",)
                ): 'immigrants',
                ("דו''ח אג''ס לפי קבוצות גיל",
                        ('0-5', '6-12')
                ): 'kids',
                ("דו''ח אג''ס לפי קבוצות גיל",
                        ('13-17',)
                ): 'teenagers',
                ("דו''ח אג''ס לפי קבוצות גיל",
                        ('60-64', '65-69', '70-74', '75-120')
                ): 'elderly',
                ("דו''ח אג''ס לפי קבוצות גיל",
                        ('18-21','22-24','25-29','30-34','35-39','40-44','45-49','50-54','55-59')
                ): 'adults',
            }

            def f(rows):
                for row in rows:
                    for (source, kinds), kind in MAP.items():
                        if row['source'] == source and row['kind'] in kinds:
                            row['kind'] = kind
                            yield row
            return f

        # stat area id -> neighborhood title.
        s2n = dict(
            (int(stat_area), f['properties']['title'])
            for f in get_neighborhood_features()
            for stat_area in f['properties']['stat_areas']
        )

        # card kind -> (Hebrew title, Hebrew description, display order).
        MAP2 = dict(
            adults=('אוכלוסיה בוגרת', 'גברים ונשים בין גיל 18 ל-60', 0),
            kids=('ילדים', 'תינוקות וילדים עד גיל 12', 1),
            teenagers=('בני נוער', 'נערים ונערות עד גיל 18', 2),
            elderly=('הגיל השלישי', 'גברים ונשים מעל גיל 60', 3),
            immigrants=('עולים לישראל', 'תושבים שאינם ילידי ישראל', 4),
        )

        demographics_cards = DF.Flow(
            *[
                DF.load(f, headers=4)
                for f in glob.glob('demographics/*.csv')
            ],
            DF.add_field('stat_id', 'string', lambda r: r["אג''ס"]),
            DF.add_field('total', 'number', lambda r: r.get("סה''כ")),
            # NOTE: the trailing space in the second name matches the raw
            # CSV header exactly — do not strip it.
            DF.delete_fields(["אג''ס", "סה''כ "]),
            # Unpivot the remaining per-kind columns into kind/value rows.
            DF.unpivot([dict(
                name="([-'א-ת0-9 ].+)",
                keys=dict(
                    kind=r'\1'
                )
            )], [dict(
                name='kind', type='string'
            )], dict(
                name='value', type='number'
            )),
            DF.validate(),
            add_source(),
            map_to_cards(),
            DF.concatenate(dict(
                total=[], value=[], kind=[], stat_id=[]
            )),
            # Map stat areas to neighborhoods; drop unmapped rows.
            DF.add_field('neighborhood', 'string', lambda r: s2n.get(int(r['stat_id']))),
            DF.filter_rows(lambda r: r['neighborhood']),
            # Aggregate values per (neighborhood, kind).
            DF.join_with_self('concat', ['neighborhood', 'kind'], dict(
                neighborhood=None,
                kind=None,
                total=dict(aggregate='sum'),
                value=dict(aggregate='sum'),
            )),
            DF.duplicate('concat', 'maxes'),
            DF.join_with_self('concat', ['neighborhood'], dict(neighborhood=None, total=None)),
            DF.join('concat', ['neighborhood'], 'maxes', ['neighborhood'], dict(
                total=None,
            )),
            DF.add_field('score_value', 'number', lambda r: r['value']), # /r['total']
            DF.sort_rows('{score_value}', reverse=True),
            DF.duplicate('maxes', 'demographics'),
            # Per-kind maximum, used to normalise the geometry score to 0-6.
            DF.join_with_self('maxes', ['kind'], dict(kind=None, max_value=dict(name='score_value', aggregate='max'))),
            DF.join('maxes', ['kind'], 'demographics', ['kind'], dict(max_value=None)),
            DF.add_field('geometry_score', 'number', lambda r: 6*r['score_value']/r['max_value']),
            DF.add_field('score_display', 'string', lambda r: '{:,} ({:.0f}%)'.format(r['value'], 100*r['score_value']/r['total'])),
            DF.add_field('scores', 'object', lambda r: dict(
                title=r['neighborhood'],
                score_value=float(r['score_value']),
                score_display=r['score_display'],
                geometry_score=float(r['geometry_score']),
            )),
            # Collapse to one row per kind, with an array of per-
            # neighborhood score objects.
            DF.join_with_self('demographics', ['kind'], dict(
                kind=None, scores=dict(aggregate='array'),
            )),
            DF.add_field('title', 'string', lambda r: MAP2[r['kind']][0]),
            DF.add_field('content', 'string', lambda r: MAP2[r['kind']][1]),
            DF.add_field('order', 'integer', lambda r: MAP2[r['kind']][2]),
            DF.sort_rows('{order}'),
            DF.delete_fields(['kind']),
        ).results()[0][0]
        _cache.set(key, demographics_cards)

    # features = [
    #     dict(type='Feature', geometry=r['geometry'], properties=dict(title=r['neighborhoods'][0]))
    #     for r in DF.Flow(
    #         DF.load('geo/stat-areas/stat-areas/datapackage.json'),
    #     ).results()[0][0]
    # ]
    # geometry=dict(type='FeatureCollection', features=features)

    stack.update(dict(
        map=True,
        scheme='green',
        currentField='neighborhood',
        layout='scores',
        # geometry=geometry
    ))
    stack.setdefault('cards', []).extend(demographics_cards)
# Example #24
# 0
def flow(parameters, *_):
    """Fetch government tender publications and normalise them into one
    deduplicated, streaming resource.

    ``parameters`` must carry 'tender_type' and 'resource'; it may carry
    'filter-out' (a phrase that disqualifies rows). Relies on module-level
    ``fetcher``, ``dedup`` and ``DATE_FMT``.
    """
    def take_first(field):
        # If the field arrived as a list, keep only its first element,
        # then pin the field's type to string.
        def f(row):
            if field in row and isinstance(row[field], list):
                row[field] = row[field][0]

        return Flow(
            f,
            set_type(field, type='string'),
        )

    def datetime_to_date(field):
        # Truncate a datetime field to its date part, then retype it.
        def f(row):
            if row.get(field):
                row[field] = row[field].date()

        return Flow(
            f,
            set_type(field, type='date'),
        )

    def approve(parameters):
        # Row predicate: reject rows whose title/description contains the
        # configured 'filter-out' phrase; accept everything otherwise.
        def func(row):
            if parameters.get('filter-out') is None:
                return True
            bad_phrase = parameters['filter-out']
            for f in ('page_title', 'description'):
                if row.get(f) and bad_phrase in row[f]:
                    return False
            return True

        return func

    return Flow(
        fetcher(parameters),
        # Map the upstream API field names onto our canonical schema.
        concatenate(dict(
            page_title=['Title'],
            publication_id=['ItemId'],
            tender_id=['ItemUniqueId'],
            publisher=['OfficeDesc'],
            start_date=['PublishDate'],
            claim_date=['LastDate'],
            decision=['StatusDesc'],
            description=['Description'],
            last_update_date=['UpdateDate'],
            base_url=['BaseUrl'],
            url_name=['UrlName'],
            tender_type_he=['PublicationTypeDesc'],
        ),
                    resources=-1),
        add_field('tender_type',
                  'string',
                  default=parameters['tender_type'],
                  resources=-1),
        take_first('publisher'),
        take_first('tender_type_he'),
        add_field('page_url',
                  'string',
                  default=lambda row:
                  'https://www.gov.il/he{base_url}{url_name}'.format(**row)),
        # delete_fields(['base_url', 'url_name']),
        filter_rows(approve(parameters)),
        set_type('publication_id', type='integer'),
        set_type('start_date', type='datetime', format=DATE_FMT),
        set_type('last_update_date', type='datetime', format=DATE_FMT),
        set_type('claim_date', type='datetime', format=DATE_FMT),
        datetime_to_date('last_update_date'),
        datetime_to_date('start_date'),
        set_primary_key(['publication_id', 'tender_type', 'tender_id']),
        dedup(),
        update_resource(-1, **parameters.pop('resource')),
        update_resource(-1, **{'dpp:streaming': True}),
        validate(),
    )