def flow(*args):
    is_dpp = len(args) > 3
    return Flow(
        load('data/unique_records_full/datapackage.json',
             resources=['unique_records']),
        load('data/app_records_full/datapackage.json',
             resources=['search_app_records']),
        add_field('__revision', 'integer', REVISION),
        *(add_field(f['name'], f['type']) for f in STATUS_FIELDS),
        manage_revisions,
        *(dump_to_sql(
            {
                DB_TABLE: {
                    'resource-name': resource_name,
                    'mode': 'update',
                    'update_keys': KEY_FIELDS
                }
            }, DATAFLOWS_DB_ENGINE)
          for resource_name in ['unique_records', 'search_app_records']),
        *(add_field(f'rev_{name}', 'date')
          for name in ['last_updated_at', 'last_modified_at', 'created_at']),
        set_revisions,
        filter_rows(equals=[{
            '__next_update_days': FILTER_NEXT_UPDATE_DAYS
        }]) if FILTER_NEXT_UPDATE_DAYS else None, add_date_range(),
        dump_to_path('data/publications_for_es'),
        printer(tablefmt='plain' if is_dpp else 'html',
                num_rows=1,
                fields=['doc_id']),
        update_resource(None, **{'dpp:streaming': True}))
Ejemplo n.º 2
0
def flow(*_):
    return DF.Flow(
        services(),
        DF.delete_fields(
            ['__tab', 'complete', 'non_suppliers', 'non_tenders', 'notes']),
        DF.add_field('publisher_name', 'string', lambda r: r['office'],
                     **{'es:keyword': True}),
        splitter('target_audience'),
        splitter('subject'),
        splitter('intervention'),
        splitter('target_age_group'),
        floater('beneficiaries'),
        floater('budgetItems'),
        floater('manualBudget'),
        floater('tenders'),
        floater('suppliers'),
        floater('virtue_of_table'),
        fix_suppliers(),
        fix_tenders(),
        add_current_budget(),
        add_current_beneficiaries(),
        DF.add_field('min_year', 'integer', 2020),
        DF.add_field('max_year', 'integer', CURRENT_YEAR),
        DF.add_field('kind', 'string', 'gov_social_service', **{
            'es:keyword': True,
            'es:exclude': True
        }),
        DF.add_field('kind_he', 'string', 'שירות חברתי', **{
            'es:keyword': True,
            'es:exclude': True
        }),
        DF.set_type('name', **{'es:title': True}),
        DF.set_type('description', **{
            'es:itemType': 'string',
            'es:boost': True
        }),
        DF.add_field('score', 'number', get_score, **{'es:score-column':
                                                      True}),
        DF.set_primary_key(['kind', 'id']),
        DF.update_resource(-1, name='activities', **{'dpp:streaming': True}),
        DF.dump_to_sql(dict(all_activities={'resource-name': 'activities'})),
        DF.filter_rows(lambda r: not r['deleted']),
        DF.delete_fields(['deleted']),
        DF.dump_to_path('/var/datapackages/activities/social_services'),
        DF.dump_to_sql(dict(activities={'resource-name': 'activities'})),
    )
Ejemplo n.º 3
0
    def flow(self):
        taxonomy = self.context.taxonomy
        txn_config = taxonomy.config
        fmt_str = [taxonomy.title + ' עבור:']
        fields = txn_config['key-fields']
        for f in fields:
            for ct in taxonomy.column_types:
                if ct['name'] == f:
                    fmt_str.append('%s: "{%s}",' %
                                   (ct['title'], f.replace(':', '-')))
                    break
        fmt_str = ' '.join(fmt_str)
        fields = [ct.replace(':', '-') for ct in fields]
        all_fields = ['_source'] + fields

        TARGET = 'configurations'
        saved_config = self.config._unflatten()
        saved_config.setdefault('publish', {})['allowed'] = False

        return Flow(
            duplicate(RESOURCE_NAME, TARGET),
            join_with_self(
                TARGET,
                all_fields,
                dict((f, {}) for f in all_fields),
            ),
            add_computed_field([
                dict(operation='format', target='snippets', with_=fmt_str),
                dict(operation='constant', target='key_values', with_=None),
            ],
                               resources=TARGET),
            add_field('config', 'object', saved_config, resources=TARGET),
            add_field('fields',
                      type='object',
                      default=self.collate_values(fields),
                      resources=TARGET),
            join_with_self(
                TARGET, ['_source'],
                dict(
                    source=dict(name='_source'),
                    config={},
                    key_values=dict(aggregate='array'),
                    snippets=dict(aggregate='array'),
                )),
            set_type('source', type='string'),
            set_type('config', type='object'),
            set_type('key_values', type='array'),
            set_type('snippets', type='array'),
            set_primary_key(['source']),
            dump_to_sql(
                dict([(TARGET, {
                    'resource-name': TARGET,
                    'mode': 'update'
                })]),
                engine=self.lazy_engine(),
            ),
        )
def flow(*_):
    return DF.Flow(
        DF.load(
            '/var/datapackages/activities/social_services/social_services/datapackage.json'
        ),
        DF.concatenate(
            dict(kind=[],
                 kind_he=[],
                 activity_name=[],
                 activity_description=[],
                 publisher_name=[],
                 history=[],
                 max_year=[],
                 min_year=[]), dict(name='activities', path='activities.csv')),
        DF.set_primary_key(['kind', 'publisher_name', 'activity_name']),
        DF.set_type('activity_name', **{'es:title': True}),
        DF.set_type('activity_description', **{
            'es:itemType': 'string',
            'es:boost': True
        }), DF.set_type('kind', **{
            'es:keyword': True,
            'es:exclude': True
        }), DF.set_type('kind_he', **{
            'es:keyword': True,
            'es:exclude': True
        }), DF.set_type('publisher_name', **{'es:keyword': True}),
        DF.set_type(
            'history', **{
                'es:itemType':
                'object',
                'es:schema':
                dict(fields=[
                    dict(name='year', type='integer'),
                    dict(name='unit', type='string'),
                    dict(name='subunit', type='string'),
                    dict(name='subsubunit', type='string'),
                    dict(name='allocated_budget', type='integer'),
                    dict(name='num_beneficiaries',
                         type='string',
                         **{'es:index': False}),
                ])
            }),
        DF.add_field(
            'score', 'number', lambda x:
            (x['history'][0].get('allocated_budget') or 1000) / 1000,
            **{'es:score-column': True}),
        DF.update_resource(-1, **{'dpp:streaming': True}),
        DF.dump_to_path('/var/datapackages/activities/all'),
        DF.dump_to_sql(dict(activities={'resource-name': 'activities'})))
Ejemplo n.º 5
0
def test_dump_to_sql():
    from dataflows import Flow, printer, dump_to_sql
    from sqlalchemy import create_engine

    f = Flow(
        data, printer(),
        dump_to_sql(dict(output_table={'resource-name': 'res_1'}),
                    engine='sqlite:///out/test.db'))
    f.process()

    # Check validity
    engine = create_engine('sqlite:///out/test.db')
    result = list(
        dict(x) for x in engine.execute('select * from output_table'))
    assert result == data
def flow(*_):
    return DF.Flow(
        all_units(),
        DF.add_field('office', 'string', lambda r: r['path'][0]
                     if len(r['path']) > 0 else None, **{'es:keyword': True}),
        DF.add_field('unit', 'string', lambda r: r['path'][1]
                     if len(r['path']) > 1 else None, **{'es:keyword': True}),
        DF.add_field('subunit', 'string', lambda r: r['path'][2]
                     if len(r['path']) > 2 else None, **{'es:keyword': True}),
        DF.add_field('subsubunit', 'string', lambda r: r['path'][3]
                     if len(r['path']) > 3 else None, **{'es:keyword': True}),
        DF.add_field('breadcrumbs', 'string',
                     lambda r: '/'.join(r['path']) or 'משרדי הממשלה',
                     **{'es:exclude': True}),
        DF.add_field('id', 'string', lambda r: '__'.join(r['path']) or 'main',
                     **{'es:exclude': True}),
        DF.delete_fields([
            'path',
        ]),
        DF.add_field('min_year', 'integer', 2020),
        DF.add_field('max_year', 'integer', 2020),
        DF.add_field('kind', 'string', 'gov_social_service_unit', **{
            'es:keyword': True,
            'es:exclude': True
        }),
        DF.add_field('kind_he', 'string', 'שירותים חברתיים במיקור חוץ', **{
            'es:keyword': True,
            'es:exclude': True
        }),
        DF.add_field('score', 'number', 1000, **{'es:score-column': True}),
        DF.set_primary_key(['kind', 'id']),
        DF.update_resource(-1, name='units', **{'dpp:streaming': True}),

        # Ensure we only have the main offices
        DF.filter_rows(lambda r: r['unit'] is None),
        DF.filter_rows(lambda r: r['office'] != 'משרד העליה והקליטה'),
        DF.dump_to_path('/var/datapackages/units/social_services'),
        DF.dump_to_sql(dict(units={'resource-name': 'units'})))
Ejemplo n.º 7
0
def flow(parameters):
    return Flow(
        dump_to_sql(parameters['tables'],
                    engine=parameters.get('engine', 'env://DPP_DB_ENGINE'),
                    updated_column=parameters.get("updated_column"),
                    updated_id_column=parameters.get("updated_id_column")))