def flow(self):
        """Build a dataflows Flow that condenses RESOURCE_NAME into one
        'configurations' row per data source and upserts it into SQL.

        Steps: build a human-readable snippet template from the taxonomy's
        key fields, deduplicate rows by ('_source' + key fields), attach the
        saved (non-publishable) config and collated field values, then
        aggregate everything into a single row per '_source'.
        """
        taxonomy = self.context.taxonomy
        txn_config = taxonomy.config
        # Snippet template parts; the Hebrew word means 'for:'.
        fmt_str = [taxonomy.title + ' עבור:']
        fields = txn_config['key-fields']
        for f in fields:
            # Look up each key field's display title among the taxonomy columns.
            for ct in taxonomy.column_types:
                if ct['name'] == f:
                    fmt_str.append('%s: "{%s}",' %
                                   (ct['title'], f.replace(':', '-')))
                    break
        fmt_str = ' '.join(fmt_str)
        # Materialized column names use '-' in place of ':'.
        fields = [ct.replace(':', '-') for ct in fields]
        all_fields = ['_source'] + fields

        TARGET = 'configurations'
        saved_config = self.config._unflatten()
        # Stored configurations must never be auto-published.
        saved_config.setdefault('publish', {})['allowed'] = False

        return Flow(
            duplicate(RESOURCE_NAME, TARGET),
            # Deduplicate by source + key fields (empty spec = no aggregation).
            join_with_self(
                TARGET,
                all_fields,
                dict((f, {}) for f in all_fields),
            ),
            add_computed_field([
                dict(operation='format', target='snippets', with_=fmt_str),
                dict(operation='constant', target='key_values', with_=None),
            ],
                               resources=TARGET),
            add_field('config', 'object', saved_config, resources=TARGET),
            add_field('fields',
                      type='object',
                      default=self.collate_values(fields),
                      resources=TARGET),
            # Collapse to one row per source, collecting snippets/key values.
            join_with_self(
                TARGET, ['_source'],
                dict(
                    source=dict(name='_source'),
                    config={},
                    key_values=dict(aggregate='array'),
                    snippets=dict(aggregate='array'),
                )),
            set_type('source', type='string'),
            set_type('config', type='object'),
            set_type('key_values', type='array'),
            set_type('snippets', type='array'),
            set_primary_key(['source']),
            # 'update' mode upserts by the primary key set above.
            dump_to_sql(
                dict([(TARGET, {
                    'resource-name': TARGET,
                    'mode': 'update'
                })]),
                engine=self.lazy_engine(),
            ),
        )
def prepare_locations():
    """Build the per-street location list from the cached addresses.

    Groups every address of a street into one row (street display name plus an
    array of address items), sorted by a street-specific sort key. The result
    is dumped and checkpointed under '_cache_locations' and returned as rows.
    """
    prepare_addresses()

    def full_address(r):
        # "<street> <number><letter>"; letter may be empty.
        return '{} {}{}'.format(r['street_name'], r['house_number'],
                                r['letter'] or '')

    def address_item(r):
        value = dict(lat=float(r['lat']),
                     lon=float(r['lon']),
                     arnona_zones=r['arnona_zones'],
                     שם=r['address'])
        return dict(value=value, display=r['address'])

    steps = [
        DF.load('_cache_addresses/datapackage.json'),
        DF.add_field('address', 'string', full_address),
        DF.add_field('item', 'object', address_item),
        DF.sort_rows('{house_number}'),
        DF.delete_fields([
            'house_number', 'letter', 'lat', 'lon', 'arnona_zones', 'address'
        ]),
        DF.join_with_self(
            'concat', ['street_name'],
            dict(display=dict(name='street_name'),
                 items=dict(name='item', aggregate='array'))),
        DF.add_field('sort_street_address', 'string',
                     lambda r: sort_street_address(r['display'])),
        DF.sort_rows('{sort_street_address}'),
        DF.delete_fields(['sort_street_address']),
        DF.printer(),
        DF.dump_to_path('_cache_locations'),
        DF.checkpoint('_cache_locations'),
    ]
    return DF.Flow(*steps).results()[0][0]
def get_neighborhood_features():
    """Return one row per neighborhood: a united geometry plus a `properties`
    dict (title and the stat-areas composing it).

    Reads neighborhoods.xlsx, explodes each stat-area's neighborhood columns,
    regroups by neighborhood, and checkpoints under '_cache_neighborhoods'.
    """

    def neighborhood_names(r):
        # All non-empty values of the 'neighborhood*' columns.
        return [v for k, v in r.items() if v and k.startswith('neighborhood')]

    def stat_area_geometry(r):
        return geometries[r['stat-area']]

    def united_geometry(r):
        return unite_geometries(r['geometries'])

    def feature_properties(r):
        return dict(x=3, title=r['neighborhood'], stat_areas=r['stat_areas'])

    pipeline = DF.Flow(
        DF.load('neighborhoods.xlsx',
                name='stat-areas',
                deduplicate_headers=True),
        DF.add_field('neighborhoods', 'array', neighborhood_names),
        DF.add_field('geometry', 'object', stat_area_geometry),
        DF.concatenate(
            dict(stat_area=['stat-area'], neighborhoods=[], geometry=[])),
        DF.update_resource(-1, name='stat-areas'),
        unwind_neighborhoods(),
        DF.join_with_self(
            'stat-areas', ['neighborhood'],
            dict(
                neighborhood=None,
                stat_areas=dict(name='stat_area', aggregate='array'),
                geometries=dict(name='geometry', aggregate='array'),
            )),
        DF.add_field('geometry', 'object', united_geometry),
        DF.delete_fields(['geometries']),
        DF.update_resource(-1, name='neighborhoods'),
        DF.add_field('properties', 'object', feature_properties),
        DF.delete_fields(['neighborhood', 'stat_areas']),
        DF.checkpoint('_cache_neighborhoods'),
    )
    return pipeline.results()[0][0]
def flow(*_):
    """Pipeline flow: merge government tender sources into a single,
    deduplicated 'tenders' resource with normalized types and derived fields.
    """

    def tender_type(r):
        # Map the Hebrew tender-kind label to its canonical identifier.
        return TENDER_KINDS[r['tender_type_he']]

    def join_subjects(v):
        return ';'.join(x.strip() for x in v.split(',')) if v else ''

    def claim_datetime(v, field_name, row):
        # Combine the claim date with its (optional) time-of-day column.
        if not v:
            return None
        return datetime.datetime.combine(v,
                                         row['claim_time'] or
                                         datetime.time(0))

    def page_url(r):
        return f'https://mr.gov.il/ilgstorefront/he/p/{r["publication_id"]}'

    # Keep the last-seen value of every mapped field per KEY.
    last_value = dict((k, dict(aggregate='last'))
                      for k in list(TENDER_MAPPING.keys()) + ['tender_type'])

    return DF.Flow(
        get_updated_sources(),
        DF.concatenate(fields=TENDER_MAPPING, target=dict(name='tenders')),
        DF.validate(),
        DF.filter_rows(lambda r: r['publication_id']),
        DF.add_field('tender_type', 'string', tender_type,
                     **{'es:keyword': True}),
        DF.join_with_self('tenders', KEY, last_value),
        DF.set_type('publication_id', type='string', transform=str),
        DF.set_type('supplier_id', type='string', transform=str),
        DF.set_type('tender_id',
                    type='string',
                    transform=lambda v: v or 'none'),
        DF.set_type('.+_date',
                    type='date',
                    format='%d.%m.%Y',
                    on_error=DF.schema_validator.clear),
        DF.set_type('subjects', type='string', transform=join_subjects),
        DF.set_type('claim_date', type='datetime', transform=claim_datetime),
        DF.set_type('tender_type_he', **{'es:keyword': True}),
        DF.delete_fields(['claim_time']),
        DF.add_field('page_url', 'string', page_url),
        DF.add_field('page_title', 'string', lambda r: r['description']),
        DF.add_field('reason', 'string', lambda r: r['regulation']),
        DF.add_field('documents', 'array', []),
        DF.add_field('contact', 'string'),
        DF.add_field('contact_email', 'string'),
        DF.validate(),
        DF.update_resource(-1, **{'dpp:streaming': True}),
        DF.printer(),
    )
Exemple #5
0
    def postflow(self):
        """Deduplicate RESOURCE_NAME by the configured primary-key fields,
        keeping the last row per key after sorting by ORDER_BY_KEY, while
        preserving the resource's original 'primaryKey' schema entry
        (join_with_self would otherwise replace it).
        """
        key_field_names = [
            name.replace(':', '-')
            for name in self.config.get(CONFIG_PRIMARY_KEY)
        ]

        def pk_step(store, restore):
            # Package-level step: snapshots the resource's primaryKey into
            # `store` (restore=False) or writes it back (restore=True),
            # then streams the package through unchanged.
            def func(package: PackageWrapper):
                for res in package.pkg.descriptor['resources']:
                    if res['name'] == RESOURCE_NAME:
                        if restore:
                            res['schema']['primaryKey'] = store['pk']
                        else:
                            store['pk'] = res['schema'].get('primaryKey', [])
                yield package.pkg
                yield from package

            return func

        saved_pk = dict(pk=[])
        # Key fields pass through untouched; every other field keeps its
        # last value within each key group.
        aggregation = {f: {} for f in key_field_names}
        aggregation['*'] = dict(aggregate='last')
        return Flow(
            pk_step(saved_pk, restore=False),
            sort_rows(self.ORDER_BY_KEY, resources=RESOURCE_NAME),
            join_with_self(RESOURCE_NAME, key_field_names, aggregation),
            pk_step(saved_pk, restore=True),
        )
def flow(*_):
    """Pipeline flow: load the prepared social-service resources, merge them
    into one 'social_services' resource, and aggregate per
    (publisher_name, activity_name) with a per-year history array.
    """
    prepare()
    yearly_fields = [
        'year', 'unit', 'subunit', 'subsubunit', 'allocated_budget',
        'num_beneficiaries'
    ]

    def history_record(r):
        # Snapshot of the yearly fields; Decimals become plain ints so the
        # aggregated history serializes cleanly.
        record = {}
        for key in yearly_fields:
            value = r[key]
            if isinstance(value, decimal.Decimal):
                value = int(value)
            record[key] = value
        return record

    loaders = [
        DF.load('tmp/' + resource_name + '/datapackage.json')
        for resource_name, _ in loads
    ]
    return DF.Flow(
        *loaders,
        DF.concatenate(
            FIELD_MAPPING,
            dict(name='social_services', path='social_services.csv')),
        DF.sort_rows('{year}', reverse=True),
        DF.add_field('history', 'object', history_record),
        DF.printer(),
        DF.join_with_self(
            'social_services', ['publisher_name', 'activity_name'],
            dict(
                publisher_name=None,
                activity_name=None,
                activity_description=dict(aggregate='set'),
                min_year=dict(name='year', aggregate='min'),
                max_year=dict(name='year', aggregate='max'),
                history=dict(aggregate='array'),
            )),
        DF.add_field('kind', 'string', 'gov_social_service'),
        DF.add_field('kind_he', 'string', 'שירות חברתי'),
        DF.update_resource(-1, **{'dpp:streaming': True}),
        DF.printer(),
    )
Exemple #7
0
def test_join():
    """Exercise dataflows' join and join_with_self processors.

    Part 1: join characters (res_1) onto houses (res_2) with several
    aggregations. Part 2: use join_with_self after sorting by age to pick
    the youngest member of each house ('first' aggregation).
    """
    from dataflows import Flow, join, join_with_self, set_type, sort_rows
    from decimal import Decimal

    characters = [
        {
            'first_name': 'Jaime',
            'house': 'Lannister',
            'last_name': 'Lannister',
            'age': 34
        },
        {
            'first_name': 'Tyrion',
            'house': 'Lannister',
            'last_name': 'Lannister',
            'age': 27
        },
        {
            'first_name': 'Cersei',
            'house': 'Lannister',
            'last_name': 'Lannister',
            'age': 34
        },
        {
            'first_name': 'Jon',
            'house': 'Stark',
            'last_name': 'Snow',
            'age': 17
        },
        {
            'first_name': 'Sansa',
            'house': 'Stark',
            'last_name': 'Stark',
            'age': 14
        },
        {
            'first_name': 'Rickon',
            'house': 'Stark',
            'last_name': 'Stark',
            'age': 5
        },
        {
            'first_name': 'Arya',
            'house': 'Stark',
            'last_name': 'Stark',
            'age': 11
        },
        {
            'first_name': 'Bran',
            'house': 'Stark',
            'last_name': 'Stark',
            'age': 10
        },
        {
            'first_name': 'Daenerys',
            'house': 'Targaryen',
            'last_name': 'Targaryen',
            'age': 16
        },
    ]

    houses = [
        {
            'house': 'House of Lannister'
        },
        {
            'house': 'House of Greyjoy'
        },
        {
            'house': 'House of Stark'
        },
        {
            'house': 'House of Targaryen'
        },
        {
            'house': 'House of Martell'
        },
        {
            'house': 'House of Tyrell'
        },
    ]

    # Join by house name; 'House of {house}' on the source matches '{house}'
    # on the target. full=False drops houses with no characters;
    # source_delete=True removes res_1 from the output package.
    res, _, _ = Flow(
        characters, set_type('age', type='number'), houses,
        join('res_1',
             'House of {house}',
             'res_2',
             '{house}',
             dict(max_age={
                 'name': 'age',
                 'aggregate': 'max'
             },
                  avg_age={
                      'name': 'age',
                      'aggregate': 'avg'
                  },
                  representative={
                      'name': 'first_name',
                      'aggregate': 'last'
                  },
                  representative_age={'name': 'age'},
                  number_of_characters={'aggregate': 'count'},
                  last_names={
                      'name': 'last_name',
                      'aggregate': 'counters'
                  }),
             full=False,
             source_delete=True)).results()

    assert res[0] == [
        {
            'avg_age': Decimal('31.66666666666666666666666667'),
            'house': 'House of Lannister',
            'max_age': Decimal(34),
            'number_of_characters': 3,
            'representative': 'Cersei',
            'representative_age': Decimal(34),
            'last_names': [('Lannister', 3)]
        },
        {
            'avg_age': Decimal('11.4'),
            'house': 'House of Stark',
            'max_age': Decimal(17),
            'number_of_characters': 5,
            'representative': 'Bran',
            'representative_age': Decimal(10),
            'last_names': [('Stark', 4), ('Snow', 1)]
        },
        {
            'avg_age': Decimal(16),
            'house': 'House of Targaryen',
            'max_age': Decimal(16),
            'number_of_characters': 1,
            'representative': 'Daenerys',
            'representative_age': Decimal(16),
            'last_names': [('Targaryen', 1)]
        },
    ]

    # Find youngest of each house
    # ('{age:02}' zero-pads so string sorting matches numeric order).
    res, _, _ = Flow(
        characters, set_type('age', type='number'), sort_rows('{age:02}'),
        join_with_self('res_1', '{house}', {
            'the_house': {
                'name': 'house'
            },
            '*': {
                'aggregate': 'first'
            },
        }), sort_rows('{the_house}')).results()

    assert res[0] == [{
        'the_house': 'Lannister',
        'first_name': 'Tyrion',
        'last_name': 'Lannister',
        'age': Decimal('27')
    }, {
        'the_house': 'Stark',
        'first_name': 'Rickon',
        'last_name': 'Stark',
        'age': Decimal('5')
    }, {
        'the_house': 'Targaryen',
        'first_name': 'Daenerys',
        'last_name': 'Targaryen',
        'age': Decimal('16')
    }]
Exemple #8
0
 checkpoint('processed_data'),
 # Sort rows by date and country
 sort_rows('{Country/Region}{Province/State}{Date}',
           resources='time-series-19-covid-combined'),
 # Duplicate the stream to create aggregated data
 duplicate(source='time-series-19-covid-combined',
           target_name='worldwide-aggregated',
           target_path='data/worldwide-aggregated.csv'),
 join_with_self(resource_name='worldwide-aggregated',
                join_key=['Date'],
                fields=dict(Date={'name': 'Date'},
                            Confirmed={
                                'name': 'Confirmed',
                                'aggregate': 'sum'
                            },
                            Recovered={
                                'name': 'Recovered',
                                'aggregate': 'sum'
                            },
                            Deaths={
                                'name': 'Deaths',
                                'aggregate': 'sum'
                            })),
 update_schema('worldwide-aggregated',
               missingValues=['None', ''],
               fields=[{
                   "format": "%Y-%m-%d",
                   "name": "Date",
                   "type": "date"
               }, {
                   "format": "default",
Exemple #9
0
if __name__ == '__main__':

    r, _, _ = DF.Flow(
        DF.load(all_data(), name='cities', headers=1,
                override_fields=dict(area_id=dict(type='string')),
                cast_strategy=DF.load.CAST_WITH_SCHEMA),
        DF.filter_rows(lambda r: r['is_city']),
        DF.add_field('score_date', 'object', lambda r: dict(
            date=r['date'].isoformat(), sr=float(r['symptoms_ratio_weighted'] or 0), nr=int(r['num_reports_weighted']))
        ),
        DF.concatenate(dict(
            id=[], city_name=[], score_date=[]
        ), target=dict(name='ranking')),
        DF.join_with_self('ranking', '{city_name}', dict(
            id=None, city_name=None, scores=dict(name='score_date', aggregate='array')
        )),
        sort_limit_scores(),
        DF.filter_rows(lambda r: r['scores'][-1]['nr'] >= 200),
        DF.add_field('sortkey', 'integer', lambda r: int(r['scores'][-1]['sr'] * 1000000) + r['scores'][-1]['nr']),
        DF.sort_rows('{sortkey}', reverse=True),
        DF.delete_fields(['sortkey']),
        DF.add_field('rank', 'integer', 0),
        DF.add_field('translations', 'object', lambda r: city_translations[r['city_name']]),
        DF.add_field('image', 'object', lambda r: upload_static_image(r['id'], width=280*2, height=160*2)),
        ranker(),
    ).results()
    rankings = r[0]

    r, _, _ = DF.Flow(
        DF.load(all_data(), name='cities', headers=1,
Exemple #10
0
                name='cities',
                headers=1,
                override_fields=dict(area_id=dict(type='string')),
                cast_strategy=DF.load.CAST_WITH_SCHEMA),
        DF.filter_rows(lambda r: r['is_city']),
        DF.add_field(
            'score_date', 'object',
            lambda r: dict(weekday=r['date'].isoweekday() % 7,
                           date=r['date'].toordinal(),
                           sr=float(r['symptoms_ratio_weighted'] or 0),
                           nr=int(r['num_reports_weighted']))),
        DF.concatenate(dict(id=[], city_name=[], score_date=[]),
                       target=dict(name='popup_data')),
        DF.join_with_self(
            'popup_data', '{city_name}',
            dict(id=None,
                 city_name=None,
                 scores=dict(name='score_date', aggregate='array'))),
        sort_limit_scores(),
        DF.filter_rows(lambda r: r['scores'] is not None),
        DF.add_field('nr', 'integer', lambda r: r['scores'][-1]['nr']),
        DF.add_field('sr', 'number', lambda r: r['scores'][-1]['sr']),
        split_to_weeks(),
        DF.add_field('translations', 'object',
                     lambda r: city_translations[r['city_name']]),
    ).results()
    popup_data = r[0]
    popup_data = dict((x.pop('id'), x) for x in popup_data)

    upload_file(
        json.dumps(popup_data, cls=json_encoder, indent=2).encode('utf8'),
Exemple #11
0
 # Duplicate the stream to create aggregated data
 duplicate(
     source="time-series-19-covid-combined",
     target_name="worldwide-aggregated",
     target_path="data/worldwide-aggregated.csv",
 ),
 join_with_self(
     resource_name="worldwide-aggregated",
     join_key=["Date"],
     fields=dict(
         Date={"name": "Date"},
         Confirmed={
             "name": "Confirmed",
             "aggregate": "sum"
         },
         Recovered={
             "name": "Recovered",
             "aggregate": "sum"
         },
         Deaths={
             "name": "Deaths",
             "aggregate": "sum"
         },
     ),
 ),
 printer(),
 update_schema(
     "worldwide-aggregated",
     missingValues=["None", ""],
     fields=[
         {
def process_stack_demand(stack):
    """Populate `stack` with per-neighborhood demand score cards.

    Loads demand.xlsx, tags rows with category/subcategory from the survey's
    header rows, unpivots per-neighborhood demand percentages, scales each
    row's score relative to its (category, subcategory) maximum, and groups
    the resulting score cards by category. Results are cached under
    'stack:demand'. Mutates `stack` in place.
    """

    def collect_cats():
        # The survey sheet interleaves header rows ('total ...' / '--- ...')
        # with data rows in this column; propagate category/subcategory from
        # the headers onto the data rows.
        F = 'כלל המדגם'

        def f(rows):
            cat = None
            for row in rows:
                if F in row:
                    v = row[F]
                    if v.startswith('סך הכל '):
                        # 'total <category>' header — remember the category.
                        cat = v[7:]
                    elif v.startswith('--- '):
                        # '--- <subcategory>' row; skip 'no breakdown' rows.
                        if not v.endswith('ללא פירוט'):
                            subcat = v[4:]
                            row['category'] = cat
                            row['subcategory'] = subcat
                            yield row
                else:
                    yield row
        return DF.Flow(
            DF.add_field('category', 'string', resources=-1),
            DF.add_field('subcategory', 'string', resources=-1),
            f,
            DF.delete_fields([F], resources=-1),
        )

    def fix_nones(row):
        # Missing demand values count as zero.
        row['demand_pct'] = row['demand_pct'] or 0

    key = 'stack:demand'
    try:
        demand_stacks = _cache.get(key)
    except KeyError:
        demand_stacks = DF.Flow(
            DF.load('demand.xlsx', infer_strategy=DF.load.INFER_STRINGS, headers=2),
            collect_cats(),
            DF.update_schema(-1, missingValues=['--']),
            # Columns like '<neighborhood> (A)' become per-neighborhood rows
            # with a 'demand_pct' value.
            DF.unpivot(
                unpivot_fields=[dict(
                    name='(.+) \\([A-Z]\\)',
                    keys=dict(
                        neighborhood='\\1'
                    ),
                )],
                extra_keys=[dict(
                    name='neighborhood', type='string'
                )],
                extra_value=dict(
                    name='demand_pct', type='number'
                ),
                resources=-1
            ),
            DF.validate(),
            DF.duplicate('demand', 'demand_stacks'),
            # Max demand per (category, subcategory), joined back onto rows
            # so each score can be normalized to a 0-6 scale.
            DF.join_with_self('demand', ['category', 'subcategory'], dict(
                category=None, subcategory=None, max_demand=dict(name='demand_pct', aggregate='max')
            )),
            DF.join(
                'demand', ['category', 'subcategory'],
                'demand_stacks', ['category', 'subcategory'],
                dict(
                    max_demand=None
                )
            ),
            fix_nones,
            DF.add_field('display', 'string', lambda r: '{:.0f}%'.format(r['demand_pct'] * 100)),
            DF.add_field('value', 'number', lambda r: r['demand_pct']),
            DF.add_field('score', 'number', lambda r: r['demand_pct'] / r['max_demand'] * 6),
            DF.delete_fields(['demand_pct', 'max_demand']),
            DF.sort_rows('{score}', reverse=True),
            DF.add_field('scores', 'object', lambda r: dict(
                title=r['neighborhood'],
                score_display=r['display'],
                score_value=float(r['value']),
                geometry_score=float(r['score']),
            )),
            # One card per (category, subcategory)...
            DF.join_with_self('demand_stacks', ['category', 'subcategory'], dict(
                category=None, subcategory=None,
                scores=dict(aggregate='array'),
            )),
            DF.add_field('card', 'object', lambda r: dict(
                title='ביקוש ל{}'.format(r['subcategory']),
                content='',
                scores=r['scores'],
                test='demand__{category}__{subcategory}'.format(**r).replace(' ', '_')
            )),
            # ...then one stack of cards per category.
            DF.join_with_self('demand_stacks', ['category'], dict(
                category=None,
                cards=dict(name='card', aggregate='array'),
            )),
            DF.add_field('name', 'string', lambda r: 'demand.{}'.format(r['category']).replace(' ', '_')),
        ).results()[0][0]
        _cache.set(key, demand_stacks)

    # Pick the card set matching this stack's name and merge it in.
    cards = [s for s in demand_stacks if s['name'] == stack['name']][0]['cards']
    stack.update(dict(
        layout='scores',
        currentField='neighborhood',
        map=True
    ))
    stack.setdefault('cards', []).extend(cards)
def process_institutions(stack):
    """Populate `stack` with per-institution-kind map cards.

    Loads institution spreadsheets, converts their projected X/Y coordinates
    to lon/lat, builds a GeoJSON FeatureCollection per institution kind, and
    merges the resulting cards into `stack` (preserving existing card
    content where titles match). Cached under 'stack:institutions'.
    Mutates `stack` in place.
    """
    key = 'stack:institutions'
    try:
        institutions_cards = _cache.get(key)
    except KeyError:
        # Proj4 string for the local transverse-Mercator grid the source
        # X/Y coordinates are expressed in.
        CRS = '+ellps=GRS80 +k=1.00007 +lat_0=31.73439361111111 +lon_0=35.20451694444445 +no_defs +proj=tmerc +units=m +x_0=219529.584 +y_0=626907.39'
        projector = pyproj.Proj(CRS)

        def proj():
            # Replace projected X/Y with WGS84 lon/lat (inverse projection).
            def func(row):
                row['lon'], row['lat'] = projector(row['X'], row['Y'], inverse=True)
            return DF.Flow(
                DF.add_field('lon', 'number'),
                DF.add_field('lat', 'number'),
                func,
                DF.delete_fields(['X', 'Y'])
            )

        def translate_kind():
            # Normalize source institution-kind labels to display categories
            # (Hebrew singular forms mapped to their plural card titles).
            translations = {
                'מרפאה': 'מרפאות',
                'איצטדיון': 'איצטדיון',
                'ספרייה': 'ספריות',
                'בית ספר': 'בתי ספר',
                'מועדון קהילתי כולל מרכז צעירים': 'מועדון קהילתי',
                'בית כנסת': 'בתי כנסת',
                'מועדון נוער': 'מועדון נוער',
                'אולם מופעים, היכל תרבות': 'מוסדות תרבות',
                'מועדון קשישים, מרכז לאזרחים ותיקים,מרכז יום לקשישים': 'מרכזי פעילות לקשישים',
            }
            def func(row):
                row['kind'] = translations[row['kind']]
            return func

        institutions_cards = DF.Flow(
            *[
                DF.load(f)
                for f in glob.glob('institutions/*xlsx')
            ],
            # Source columns: institution kind / name / address (Hebrew).
            DF.concatenate(dict(
                kind=['סוג המוסד'],
                title=['שם המוסד'],
                address=['כתובת'],
                X=[], Y=[]
            )),
            translate_kind(),
            proj(),
            DF.add_field('feature', 'object', 
                        lambda r: geojson.Feature(
                            properties=dict(title=r['title'], address=r['address']),
                            geometry=geojson.Point(coordinates=[float(r['lon']), float(r['lat'])])
                        )),
            DF.delete_fields(['title', 'lon', 'lat', 'address']),
            # One card per institution kind, holding all its point features.
            DF.join_with_self('concat', ['kind'], dict(
                title=dict(name='kind'),
                features=dict(name='feature', aggregate='array')
            )),
            DF.sort_rows('{title}', reverse=True),
            DF.add_field('pointGeometry', 'object', lambda r: geojson.FeatureCollection(features=r['features'])),
            DF.add_field('content', 'string', ' '),
            DF.delete_fields(['features']),
        #     DF.printer(tablefmt='html')
        ).results()[0][0]
        _cache.set(key, institutions_cards)

    stack.update(dict(
        map=True,
    ))
    stack.setdefault('cards', [])
    current_cards = dict(
        (c['title'], c) for c in stack['cards']
    )
    # Carry over manually-edited content for cards that already exist;
    # report generated cards that have no counterpart in the stack.
    for card in institutions_cards:
        current_card = current_cards.pop(card['title'], None)
        if current_card is not None:
            card['content'] = current_card['content']
        else:
            print('SPURIOUS CARD for INSTITUTIONS', card['title'])
    # Keep only non-institution cards (those not matched above), then append
    # the freshly generated institution cards.
    stack['cards'] = [
        c for c in stack['cards']
        if c['title'] in current_cards
    ] + institutions_cards
def process_demographics(stack):
    """Populate `stack` with per-neighborhood demographics score cards.

    Loads statistical-area CSV reports, unpivots the per-group columns,
    maps (report, column-group) pairs onto demographic kinds (kids,
    teenagers, adults, elderly, immigrants), aggregates per neighborhood,
    scales scores against each kind's maximum, and emits one ordered card
    per kind. Cached under 'stack:demographics'. Mutates `stack` in place.
    """
    key = 'stack:demographics'
    try:
        demographics_cards = _cache.get(key)
    except KeyError:        
        def add_source():
            # Tag every row with the name of the resource (report) it came
            # from, so map_to_cards can dispatch on it.
            def f(rows):
                for row in rows:
                    row['source'] = rows.res.name
                    yield row
            return DF.Flow(
                DF.add_field('source', 'string'),
                f
            )

        def map_to_cards():
            # (report title, column names) -> demographic kind. Rows whose
            # (source, kind) pair is not listed are dropped.
            MAP = {
                ("דו''ח אג''ס לפי עולים וותיקים",
                        ("סה''כ עולים",)
                ): 'immigrants',
                ("דו''ח אג''ס לפי קבוצות גיל",
                        ('0-5', '6-12')
                ): 'kids',
                ("דו''ח אג''ס לפי קבוצות גיל",
                        ('13-17',)
                ): 'teenagers',
                ("דו''ח אג''ס לפי קבוצות גיל",
                        ('60-64', '65-69', '70-74', '75-120')
                ): 'elderly',
                ("דו''ח אג''ס לפי קבוצות גיל",
                        ('18-21','22-24','25-29','30-34','35-39','40-44','45-49','50-54','55-59')
                ): 'adults',
            }
            
            def f(rows):
                for row in rows:
                    for (source, kinds), kind in MAP.items():
                        if row['source'] == source and row['kind'] in kinds:
                            row['kind'] = kind
                            yield row
            return f

        # stat-area id -> neighborhood title, built from the neighborhood
        # features' stat_areas lists.
        s2n = dict(
            (int(stat_area), f['properties']['title'])
            for f in get_neighborhood_features()
            for stat_area in f['properties']['stat_areas']
        )

        # kind -> (card title, card description, display order), in Hebrew.
        MAP2 = dict(
            adults=('אוכלוסיה בוגרת', 'גברים ונשים בין גיל 18 ל-60', 0),
            kids=('ילדים', 'תינוקות וילדים עד גיל 12', 1),
            teenagers=('בני נוער', 'נערים ונערות עד גיל 18', 2),
            elderly=('הגיל השלישי', 'גברים ונשים מעל גיל 60', 3),
            immigrants=('עולים לישראל', 'תושבים שאינם ילידי ישראל', 4),
        )

        demographics_cards = DF.Flow(
            *[
                DF.load(f, headers=4)
                for f in glob.glob('demographics/*.csv')
            ],
            # "אג''ס" = statistical-area id column; "סה''כ" = totals column.
            DF.add_field('stat_id', 'string', lambda r: r["אג''ס"]),
            DF.add_field('total', 'number', lambda r: r.get("סה''כ")),
            DF.delete_fields(["אג''ס", "סה''כ "]),
            # Unpivot the remaining per-group columns into (kind, value) rows.
            DF.unpivot([dict(
                name="([-'א-ת0-9 ].+)",
                keys=dict(
                    kind=r'\1'
                )
            )], [dict(
                name='kind', type='string'
            )], dict(
                name='value', type='number'
            )),
            DF.validate(),
            add_source(),
            map_to_cards(),
            DF.concatenate(dict(
                total=[], value=[], kind=[], stat_id=[]
            )),
            DF.add_field('neighborhood', 'string', lambda r: s2n.get(int(r['stat_id']))),
            DF.filter_rows(lambda r: r['neighborhood']),
            # Sum stat-area values up to neighborhood level, per kind.
            DF.join_with_self('concat', ['neighborhood', 'kind'], dict(
                neighborhood=None,
                kind=None,
                total=dict(aggregate='sum'),
                value=dict(aggregate='sum'),
            )),
            DF.duplicate('concat', 'maxes'),
            DF.join_with_self('concat', ['neighborhood'], dict(neighborhood=None, total=None)),
            DF.join('concat', ['neighborhood'], 'maxes', ['neighborhood'], dict(
                total=None,
            )),
            DF.add_field('score_value', 'number', lambda r: r['value']), # /r['total']  
            DF.sort_rows('{score_value}', reverse=True),
            DF.duplicate('maxes', 'demographics'),
            # Per-kind maximum, joined back so scores scale to a 0-6 range.
            DF.join_with_self('maxes', ['kind'], dict(kind=None, max_value=dict(name='score_value', aggregate='max'))),
            DF.join('maxes', ['kind'], 'demographics', ['kind'], dict(max_value=None)),
            DF.add_field('geometry_score', 'number', lambda r: 6*r['score_value']/r['max_value']),
            DF.add_field('score_display', 'string', lambda r: '{:,} ({:.0f}%)'.format(r['value'], 100*r['score_value']/r['total'])),
            DF.add_field('scores', 'object', lambda r: dict(
                title=r['neighborhood'],
                score_value=float(r['score_value']),
                score_display=r['score_display'],
                geometry_score=float(r['geometry_score']),
            )),
            # One card per demographic kind, in MAP2 display order.
            DF.join_with_self('demographics', ['kind'], dict(
                kind=None, scores=dict(aggregate='array'),
            )),
            DF.add_field('title', 'string', lambda r: MAP2[r['kind']][0]),
            DF.add_field('content', 'string', lambda r: MAP2[r['kind']][1]),
            DF.add_field('order', 'integer', lambda r: MAP2[r['kind']][2]),
            DF.sort_rows('{order}'),
            DF.delete_fields(['kind']),
        ).results()[0][0]
        _cache.set(key, demographics_cards)

    # features = [
    #     dict(type='Feature', geometry=r['geometry'], properties=dict(title=r['neighborhoods'][0]))
    #     for r in DF.Flow(
    #         DF.load('geo/stat-areas/stat-areas/datapackage.json'),
    #     ).results()[0][0]
    # ]
    # geometry=dict(type='FeatureCollection', features=features)

    stack.update(dict(
        map=True,
        scheme='green',
        currentField='neighborhood',
        layout='scores',
        # geometry=geometry
    ))
    stack.setdefault('cards', []).extend(demographics_cards)
        DF.load(latest_file(),
                name='out',
                override_fields=dict(area_id=dict(type='string')),
                cast_strategy=DF.load.CAST_WITH_SCHEMA),
        DF.add_field('city_area_id', 'string',
                     lambda r: r['area_id'].split('-')[0]),
        DF.join('cities', ['city_area_id'], 'out', ['city_area_id'],
                dict(num_city_reports=dict(name='num_reports_weighted'))),
        DF.add_field('desc', 'string', ''),
        DF.add_field('kind', 'string', ''),
        DF.add_field('property', 'string', ''),
        props(),
        DF.join_with_self(
            'out', ['is_city', 'kind', 'desc', 'property'],
            dict(is_city=None,
                 kind=None,
                 desc=None,
                 property=None,
                 id=dict(aggregate='array'))),
    ).results()

    for item in r[0]:
        print('bucket for {} {} {}: {}'.format(
            'city' if item['is_city'] else 'neighborhood', item['kind'],
            item['desc'], len(item['id'])))
        if item['property'] is None:
            continue
        if item['kind'] == 'fill':
            if item['is_city']:
                city_fill_color_cases.extend(
                    [['in', ['get', 'id'], ['literal', item['id']]],
Exemple #16
0
                   "title": "Cumulative total deaths to date",
                   "type": "integer"
               }]),
 checkpoint('processed_data'),
 # Duplicate the stream to create aggregated data
 duplicate(source='time-series-19-covid-combined',
           target_name='worldwide-aggregated',
           target_path='worldwide-aggregated.csv'),
 join_with_self(resource_name='worldwide-aggregated',
                join_key=['Date'],
                fields=dict(Date={'name': 'Date'},
                            Confirmed={
                                'name': 'Confirmed',
                                'aggregate': 'sum'
                            },
                            Recovered={
                                'name': 'Recovered',
                                'aggregate': 'sum'
                            },
                            Deaths={
                                'name': 'Deaths',
                                'aggregate': 'sum'
                            })),
 update_schema('worldwide-aggregated',
               fields=[{
                   "format": "%Y-%m-%d",
                   "name": "Date",
                   "type": "date"
               }, {
                   "format": "default",
                   "groupChar": "",