コード例 #1
0
def flow(*_):
    """Build the welfare services activities datapackage.

    Loads the welfare catalog spreadsheet (module-level ``filename``),
    derives activity fields from the Hebrew source columns, dumps the
    result to ``tmp/activities-welfare`` and returns a new flow that
    re-loads the dump as a streaming resource.
    """
    DF.Flow(
        DF.load(filename, name='welfare'),
        # Public service name -> activity_name
        DF.add_field('activity_name', 'string',
                     lambda r: r['שם השירות (ציבורי)']),
        # Drop rows that have no service name
        DF.filter_rows(lambda r: r['activity_name']),
        # Short description + service goal, joined into one text entry
        DF.add_field(
            'activity_description', 'array', lambda r:
            [r['תיאור השירות (תיאור קצר)'] + '\n' + r['השירות (מטרת השירות)']
             ]),
        # Organisational hierarchy ('unit / subunit') for the 2019 snapshot.
        # NOTE(review): subunit and subsubunit both use split('/')[1] -
        # subsubunit may have been intended to use index [2]; confirm.
        DF.add_field(
            'history', 'array', lambda r: [
                dict(
                    year=2019,
                    unit=r['יחידה ארגונית נותנת השירות'].split('/')[0].strip(),
                    subunit=r['יחידה ארגונית נותנת השירות'].split('/')[1].
                    strip(),
                    subsubunit=r['יחידה ארגונית נותנת השירות'].split('/')[
                        1].strip(),
                )
            ]), DF.add_field('target_audience', 'array',
                             splitter('אוכלוסייה')),
        DF.add_field('subject', 'array', splitter('תחום ההתערבות')),
        DF.add_field('intervention', 'array', splitter('אופן התערבות')),
        DF.select_fields(FIELDS),
        # Constant metadata fields for this source
        DF.add_field('publisher_name', 'string', 'משרד הרווחה'),
        DF.add_field('min_year', 'integer', 2019),
        DF.add_field('max_year', 'integer', 2019),
        DF.add_field('kind', 'string', 'gov_social_service'),
        DF.add_field('kind_he', 'string', 'שירות חברתי'), DF.printer(),
        DF.validate(), DF.dump_to_path('tmp/activities-welfare')).process()
    return DF.Flow(
        DF.load('tmp/activities-welfare/datapackage.json'),
        DF.update_resource(-1, **{'dpp:streaming': True}),
    )
コード例 #2
0
def broken_links_flow():
    """Load every configured source, extract candidate URLs from each row,
    check them in parallel and keep only rows whose URL check errored."""
    loaders = []
    for conf in configuration:
        loaders.append(
            DF.Flow(
                DF.load(URL_TEMPLATE.format(**conf), name=conf['name']),
                DF.add_field('__name', 'string', conf['name'],
                             resources=conf['name']),
                DF.add_field('__title', 'string', get_title(conf['title']),
                             resources=conf['name']),
            ))
    return DF.Flow(
        *loaders,
        # Any URL-looking substring in the row's string form
        DF.add_field('urls', 'array', lambda r: RE.findall(str(r))),
        DF.add_field('link', 'string',
                     lambda r: 'https://yodaat.org/item/{doc_id}'.format(**r)),
        DF.concatenate(
            dict(name=['__name'], title=['__title'], link=[], urls=[])),
        DF.add_field('url', 'string'),
        DF.add_field('error', 'string'),
        unwind(),
        DF.delete_fields(['urls']),
        DF.parallelize(check_broken(), 4),
        DF.filter_rows(lambda r: r['error'] is not None),
    )
コード例 #3
0
def main_stats(num_files, file_size, download_iterations, download_threads,
               output_dir, only_upload, only_download, **kwargs):
    """Locate the single upload- and download-report CSVs under
    output_dir and print aggregated statistics for each."""
    download_report_filename = None
    upload_report_filename = None
    for report_path in glob(os.path.join(output_dir, '*.csv')):
        if '/download-report-' in report_path:
            assert not download_report_filename  # expect exactly one
            download_report_filename = report_path
        elif '/upload-report-' in report_path:
            assert not upload_report_filename  # expect exactly one
            upload_report_filename = report_path
    # Both reports must exist before we aggregate anything
    assert download_report_filename and upload_report_filename
    print('upload_report_filename', upload_report_filename)
    print('download_report_filename', download_report_filename)
    print("Generating upload stats...")
    upload_stats = defaultdict(int)
    df.Flow(
        df.load(upload_report_filename),
        stats_process_upload_rows(upload_stats),
    ).process()
    print("Generating download stats...")
    download_stats = defaultdict(int)
    df.Flow(
        df.load(download_report_filename),
        stats_process_download_rows(download_stats, file_size),
    ).process()
    print("Upload Stats")
    pprint(dict(upload_stats))
    print("Download Stats")
    pprint(dict(download_stats))
コード例 #4
0
def get_updated_sources():
    """Scrape the gov.il news page for '.zip' data links.

    Returns a DF.Flow loading each discovered source as excel-xml when
    exactly two links (tenders + exemptions) were found; otherwise
    falls back to the data.gov.il resources.
    """
    import requests
    from pyquery import PyQuery as pq
    URL = 'https://mr.gov.il/ilgstorefront/he/news/details/230920201036'
    sources = []

    page = pq(requests.get(URL).text)
    anchors = page.find('a')
    for anchor in anchors:
        anchor = pq(anchor)
        href = anchor.attr('href')
        # BUGFIX: anchors without an href attribute yield None here, and
        # `'.zip' in None` raises TypeError - skip them explicitly.
        if href and '.zip' in href:
            sources.append(href + '#.xlsx')
    sources = [
        DF.load(source,
                format='excel-xml',
                encoding='utf8',
                bytes_sample_size=0) for source in sources
    ]
    if len(sources) != 2:
        # Unexpected page layout: fall back to the stable resources
        return DF.Flow(
            data_gov_il_resource.flow(tenders),
            data_gov_il_resource.flow(exemptions),
        )
    else:
        return DF.Flow(*sources)
コード例 #5
0
def extract_tags(field='tags', prefixes=None):
    """Split prefixed entries out of the 'tags' column.

    With ``prefixes``: adds ``field`` holding the de-prefixed tags whose
    prefix matches, removes those entries from 'tags', and prints the
    full set of collected options. Without ``prefixes``: only warns
    about tags in ``field`` that still contain an underscore.
    """
    if prefixes is None:

        def verify_tags(row):
            # Nothing to extract - just report leftover prefixed tags
            for tag in row[field]:
                if '_' in tag:
                    print('Found prefix: {}'.format(tag))

        return DF.Flow(verify_tags, )

    # Precompute the "<prefix>_" matchers once
    matchers = ['{}_'.format(prefix) for prefix in prefixes]

    def matches(tag):
        return any(tag.startswith(m) for m in matchers)

    def extract(row):
        # De-prefixed values of all matching tags
        return [t.split('_', 1)[1] for t in row['tags'] if matches(t)]

    def remove_prefix(row):
        row['tags'] = [x for x in row['tags'] if not matches(x)]

    def collect(rows):
        options = set()
        for row in rows:
            options.update(row[field])
            yield row
        print('OPTIONS FOR {}: {}'.format(field, sorted(options)))

    return DF.Flow(
        DF.add_field(field, 'array', extract), remove_prefix, collect)
コード例 #6
0
def get_neighborhood_features():
    """Build neighborhood records from the stat-areas spreadsheet.

    Loads 'neighborhoods.xlsx', attaches a geometry to every statistical
    area (module-level ``geometries``), unwinds to one row per
    (neighborhood, stat-area) pair, then groups by neighborhood and
    unites the member geometries. Returns the resulting rows from the
    '_cache_neighborhoods' checkpoint.
    """
    return DF.Flow(
        DF.load('neighborhoods.xlsx',
                name='stat-areas',
                deduplicate_headers=True),
        # Gather every non-empty 'neighborhood*' column into one array
        DF.add_field(
            'neighborhoods', 'array', lambda r:
            [v for k, v in r.items() if v and k.startswith('neighborhood')]),
        DF.add_field('geometry', 'object',
                     lambda r: geometries[r['stat-area']]),
        DF.concatenate(
            dict(stat_area=['stat-area'], neighborhoods=[], geometry=[])),
        DF.update_resource(-1, name='stat-areas'), unwind_neighborhoods(),
        # One row per neighborhood, aggregating its stat areas/geometries
        DF.join_with_self(
            'stat-areas', ['neighborhood'],
            dict(
                neighborhood=None,
                stat_areas=dict(name='stat_area', aggregate='array'),
                geometries=dict(name='geometry', aggregate='array'),
            )),
        DF.add_field('geometry', 'object',
                     lambda r: unite_geometries(r['geometries'])),
        DF.delete_fields(['geometries']),
        DF.update_resource(-1, name='neighborhoods'),
        # NOTE(review): x=3 looks like a fixed marker value - confirm meaning
        DF.add_field(
            'properties', 'object', lambda r: dict(
                x=3, title=r['neighborhood'], stat_areas=r['stat_areas'])),
        DF.delete_fields(['neighborhood', 'stat_areas']),
        DF.checkpoint('_cache_neighborhoods')).results()[0][0]
コード例 #7
0
def es_dumper(resource_name, revision, path):
    """Flow that stamps rows with revision/score/timestamp, indexes them
    into the per-resource ES index, dumps them to disk, collates, and
    indexes the collated docs into the shared 'migdar__docs' index."""
    now = time.time()

    def index_spec(index_name):
        # Shared shape of the `indexes` argument for both ES dumps
        return {
            index_name: [{
                'resource-name': resource_name,
                'revision': revision
            }]
        }

    return DF.Flow(
        update_pk('doc_id'),
        DF.add_field('revision', 'integer', default=revision),
        DF.add_field('score', 'number', default=1),
        DF.add_field('create_timestamp', 'number', now),
        my_dump_to_es(
            indexes=index_spec('migdar__' + resource_name),
            mapper_cls=BoostingMappingGenerator,
            index_settings={'index.mapping.coerce': True},
            elasticsearch_options=dict(timeout=60)),
        DF.dump_to_path('data/{}'.format(path)),
        collate(revision),
        my_dump_to_es(
            indexes=index_spec('migdar__docs'),
            mapper_cls=BoostingMappingGenerator,
            index_settings={'index.mapping.coerce': True}),
        DF.update_resource(None, **{'dpp:streaming': True}),
        DF.printer(),
    )
コード例 #8
0
def flow(parameters, *_):
    """Fetch all rows, parse 'report-date' as an ISO datetime-formatted
    date, and apply the configured target-resource metadata (streaming)."""
    steps = [
        get_all(),
        DF.set_type('report-date', type='date', format='%Y-%m-%dT%H:%M:%SZ'),
        DF.update_resource(-1, **parameters['target-resource']),
        DF.update_resource(-1, **{'dpp:streaming': True}),
    ]
    return DF.Flow(*steps)
コード例 #9
0
def category():
    """Flow fragment that derives a broad 'category' from a row's 'kind'.

    An unknown 'kind' raises KeyError from the lookup table.
    """

    # Facility kind (Hebrew) -> broad category (Hebrew):
    # culture / sport / religion / education / health / welfare /
    # community / general.
    CATEGORIES = {
        'אולם מופעים, היכל תרבות': 'תרבות',
        'אולם ספורט, התעמלות': 'ספורט',
        'איצטדיון': 'ספורט',
        'בית כנסת': 'דת',
        'בית ספר': 'חינוך',
        'בריכת שחייה ציבורית או עירונית': 'ספורט',
        'גן ילדים': 'חינוך',
        'טיפת חלב': 'בריאות',
        'לשכת רווחה של הרשות המקומית': 'רווחה',
        'מבנה עירייה': 'כלליים',
        'מגרש ספורט': 'ספורט',
        'מגרש ציבורי פנוי': 'כלליים',
        'מועדון נוער': 'קהילה',
        'מועדון קהילתי כולל מרכז צעירים': 'קהילה',
        'מועדון קשישים, מרכז לאזרחים ותיקים,מרכז יום לקשישים': 'קהילה',
        'מעון יום': 'חינוך',
        'מקווה טוהרה': 'דת',
        'מקלט': 'כלליים',
        'מרפאה': 'בריאות',
        'ספרייה': 'תרבות',
        'פנימייה, כפר נוער': 'חינוך',
    }

    def cat(row):
        # Direct lookup - a kind missing from the table raises KeyError
        row['category'] = CATEGORIES[row['kind']]

    return DF.Flow(DF.add_field('category', 'string'), cat)
コード例 #10
0
ファイル: geojson.py プロジェクト: hasadna/datacity-ckan-dgp
def process_resource(instance_name, package, resource,
                     package_extras_processed_res):
    """Convert a tabular CKAN resource with lat/lon columns to GeoJSON.

    Streams the resource rows, builds a Point feature for every row that
    has both coordinates, uploads the FeatureCollection as a new
    '.geojson' resource on the same package, then records the processing
    in the package extras.
    """
    lat_field = resource.get("geo_lat_field")
    lon_field = resource.get("geo_lon_field")
    features = []
    # Load all cells as strings; coordinate parsing happens downstream
    for row in DF.Flow(
            DF.load(resource['url'],
                    infer_strategy=DF.load.INFER_STRINGS)).results()[0][0]:
        properties = get_properties(row)
        lon, lat = get_lat_lon_values(row, lon_field, lat_field)
        if lon and lat:
            features.append(
                Feature(geometry=Point((lon, lat)), properties=properties))
    fc = FeatureCollection(features)
    with utils.tempdir() as tmpdir:
        # Write the GeoJSON to disk, then re-open it for the upload call
        with open(os.path.join(tmpdir, "data.geojson"), 'w') as f:
            geojson.dump(fc, f)
        with open(os.path.join(tmpdir, "data.geojson")) as f:
            ckan.resource_create(
                instance_name, {
                    'package_id': package['id'],
                    'description': resource['description'],
                    'format': 'GeoJSON',
                    'name': resource['name'].replace('.csv', '') + '.geojson',
                },
                files=[('upload', f)])
    common.update_package_extras(instance_name, package,
                                 package_extras_processed_res)
コード例 #11
0
def prepare_locations():
    """Build the per-street locations cache from the addresses cache.

    Every address row becomes an 'item' object (lat/lon/arnona_zones +
    display text); rows are then grouped per street with their items
    aggregated into an array, sorted by a street sort key, dumped and
    checkpointed to '_cache_locations'. Returns the resulting rows.
    """
    prepare_addresses()
    return DF.Flow(
        DF.load('_cache_addresses/datapackage.json'),
        # Human-readable address: "<street> <number><letter>"
        DF.add_field(
            'address', 'string', lambda r: '{} {}{}'.format(
                r['street_name'], r['house_number'], r['letter'] or '')),
        # Item payload: coordinates + zones, keyed with Hebrew 'שם' (name),
        # plus the display string
        DF.add_field(
            'item', 'object',
            lambda r: dict(value=dict(lat=float(r['lat']),
                                      lon=float(r['lon']),
                                      arnona_zones=r['arnona_zones'],
                                      שם=r['address']),
                           display=r['address'])),
        # Keep items within each street ordered by house number
        DF.sort_rows('{house_number}'),
        DF.delete_fields([
            'house_number', 'letter', 'lat', 'lon', 'arnona_zones', 'address'
        ]),
        # One row per street, items aggregated into an array
        DF.join_with_self(
            'concat', ['street_name'],
            dict(display=dict(name='street_name'),
                 items=dict(name='item', aggregate='array'))),
        DF.add_field('sort_street_address', 'string',
                     lambda r: sort_street_address(r['display'])),
        DF.sort_rows('{sort_street_address}'),
        DF.delete_fields(['sort_street_address']), DF.printer(),
        DF.dump_to_path('_cache_locations'),
        DF.checkpoint('_cache_locations')).results()[0][0]
コード例 #12
0
def test_basic_flow_no_mapping_type():
    """End-to-end: dump 1000 keyed rows to Elasticsearch and read them
    back unchanged (sorted by key)."""
    conn_str = 'localhost:9200'
    data = [{'key': 'key%04d' % i, 'value': i} for i in range(1000)]

    DF.Flow(
        data,
        DF.update_resource(-1, name='data'),
        DF.set_primary_key(['key']),
        dump_to_es(
            engine=conn_str,
            indexes=dict(
                test_basic_flow_no_mapping_type=[dict(resource_name='data')]
            ),
        ),
    ).process()

    # Give ES a moment to make the documents searchable
    time.sleep(1)
    storage = Storage(Elasticsearch(hosts=[conn_str]))
    out = list(storage.read('test_basic_flow_no_mapping_type'))
    assert data == sorted(out, key=lambda r: r['key'])
コード例 #13
0
def prepare():
    """Normalize every configured source into 'tmp/<resource_name>'.

    For each (resource_name, load) pair in the module-level ``loads``:
    concatenate into the shared FIELD_MAPPING schema, coerce types
    (dropping rows without an activity name), apply the custom fixer
    steps and dump the result.
    """
    for resource_name, load in loads:
        DF.Flow(
            load,
            # DF.printer(tablefmt='html'),
            DF.concatenate(
                FIELD_MAPPING,
                dict(name=resource_name, path=resource_name + '.csv')),
            # activity_name is required - rows failing validation are dropped
            DF.set_type('activity_name',
                        type='string',
                        constraints=dict(required=True),
                        on_error=DF.schema_validator.drop),
            # Numbers arrive as grouped strings (e.g. '1,234'), possibly
            # with surrounding text (bareNumber=False)
            DF.set_type('allocated_budget',
                        type='number',
                        groupChar=',',
                        bareNumber=False),
            # Bad beneficiary values are tolerated, not dropped
            DF.set_type('num_beneficiaries',
                        type='number',
                        groupChar=',',
                        bareNumber=False,
                        on_error=DF.schema_validator.ignore),
            fix_beneficiaries,
            # Back to string after the numeric cleanup step
            DF.set_type('num_beneficiaries', type='string'),
            multiply_budget,
            fill_org_hierarchy,
            # DF.printer(tablefmt='html'),
            DF.dump_to_path('tmp/' + resource_name),
        ).process()
コード例 #14
0
def datarecords(kind):
    """Return a lazy iterator over the 'value' of each datarecord of the
    given kind, fetched from the data-input API."""
    url = f'https://data-input.obudget.org/api/datarecords/{kind}'
    records = DF.Flow(
        DF.load(url, format='json', property='result')
    ).results()[0][0]
    return map(lambda record: record['value'], records)
コード例 #15
0
def flow(*_):
    """Run the scraper and expose its output as the streaming 'joint'
    resource."""
    resource_meta = {
        'name': 'joint',
        'path': 'joint.csv',
        'dpp:streaming': True,
    }
    return DF.Flow(
        scraper(),
        DF.update_resource(-1, **resource_meta))
コード例 #16
0
    def func(rows):
        """Sync welfare catalog numbers onto incoming datarecord rows.

        Loads the welfare catalog workbook, builds a service-name ->
        catalog-number map, and yields only the welfare-office rows
        whose catalog_number actually changed. Names not found in the
        catalog are reported with a link to their data-input record.
        NOTE(review): rows that match but need no update (and rows from
        other offices) are NOT yielded - confirm this filtering is
        intentional.
        """
        filename = 'קטלוג רווחה למערכת ההזנה 18.5.21.xlsx'
        cats = DF.Flow(
            DF.load(filename, name='welfare'),
            # DF.printer(),
            DF.rename_fields(
                {
                    'id': 'catalog_number',
                    'שם השירות (ציבורי)': 'name'
                },
                regex=False),
            DF.select_fields(['catalog_number', 'name']),
        ).results()[0][0]
        # name -> {'catalog_number': ...}; pop() below marks names as used
        cats = dict((k.pop('name'), k) for k in cats)

        missing = []
        for row in rows:
            v = row['value']
            if v['office'] == 'משרד הרווחה':
                name = v['name']
                if name in cats:
                    rec = cats.pop(name)
                    cn = str(rec['catalog_number'])
                    if v.get('catalog_number') != cn:
                        v['catalog_number'] = cn
                        yield row
                else:
                    missing.append((name, v['id']))
        # Report services that exist in the input but not in the catalog
        for x in missing:
            print(
                '{} (https://data-input.obudget.org/he/datarecords/social_service/{})'
                .format(*x))
コード例 #17
0
def flow(*_, path='data/datasets_in_es'):
    """Load the datasets datapackage from ``path``, take screenshots,
    write the excel export, and mark the resource as streaming."""
    datapackage_url = '{}/datapackage.json'.format(path)
    return DF.Flow(
        DF.load(datapackage_url),
        do_screenshot(),
        write_excel(),
        DF.update_resource(-1, **{'dpp:streaming': True}),
    )
コード例 #18
0
def operator(name, params):
    """Fetch a DB table into a CSV file and push it to the target CKAN
    instance as a package resource."""
    connection_string = params['db_url']
    source_table = params['db_table']
    target_instance_name = params['target_instance_name']
    target_package_id = params['target_package_id']
    target_organization_id = params['target_organization_id']

    print('starting db_fetcher operator')
    print(
        'source_table={} target_instance_name={} target_package_id={} target_organization_id={}'
        .format(source_table, target_instance_name, target_package_id,
                target_organization_id))
    with tempfile.TemporaryDirectory() as workdir:
        csv_name = target_package_id + '.csv'
        DF.Flow(
            DF.load(connection_string,
                    table=source_table,
                    name=target_package_id,
                    infer_strategy=DF.load.INFER_PYTHON_TYPES),
            DF.update_resource(-1, path=csv_name),
            # Drop the bookkeeping column before dumping
            DF.delete_fields(['_source']),
            DF.dump_to_path(workdir),
        ).process()
        csv_path = os.path.join(workdir, csv_name)
        print('{}, {:,} bytes'.format(csv_path, os.stat(csv_path).st_size))
        update_package(target_instance_name, target_organization_id,
                       target_package_id, target_package_id,
                       [('CSV', csv_path)])
コード例 #19
0
def flow(*_):
    """Start a headless Chrome driver (kept in the module-level ``gcd``),
    scrape with it, and tear it down as the final flow step."""
    global gcd
    gcd = google_chrome_driver(wait=False)
    steps = (
        scraper(gcd),
        DF.update_resource(-1, **{'dpp:streaming': True}),
        finalize_teardown(gcd),
    )
    return DF.Flow(*steps)
コード例 #20
0
 def proj():
     """Flow fragment: derive lon/lat from X/Y via the module-level
     projector, then drop the source columns."""
     def convert(row):
         row['lon'], row['lat'] = projector(row['X'], row['Y'], inverse=True)

     return DF.Flow(
         DF.add_field('lon', 'number'),
         DF.add_field('lat', 'number'),
         convert,
         DF.delete_fields(['X', 'Y']),
     )
コード例 #21
0
 def add_source():
     """Flow fragment: stamp every row with the name of the resource it
     came from."""
     def tag_rows(rows):
         for row in rows:
             row['source'] = rows.res.name
             yield row

     return DF.Flow(DF.add_field('source', 'string'), tag_rows)
コード例 #22
0
def flow(*_):
    """Derive the social-services suppliers datapackage from the
    social-services one (entity_id + soproc_supplier per unwound row)."""
    source = '/var/datapackages/activities/social_services/datapackage.json'
    target = '/var/datapackages/activities/social_services_suppliers'
    return DF.Flow(
        DF.load(source),
        DF.add_field('entity_id', 'string'),
        DF.add_field('soproc_supplier', 'boolean'),
        unwind(),
        DF.select_fields(['entity_id', 'soproc_supplier']),
        DF.dump_to_path(target),
    )
コード例 #23
0
def unwind_neighborhoods():
    """Flow fragment: explode each row's 'neighborhoods' array into one
    row per neighborhood (stored in the new 'neighborhood' field), then
    drop the array column.

    NOTE(review): the same row dict is mutated and re-yielded for each
    neighborhood; fine for purely streaming steps, but any downstream
    step that holds row references would see aliased rows - confirm.
    """
    def f(rows):
        for row in rows:
            for n in row['neighborhoods']:
                row['neighborhood'] = n
                yield row

    return DF.Flow(DF.add_field('neighborhood', 'string'), f,
                   DF.delete_fields(['neighborhoods']))
コード例 #24
0
def flow(*_):
    """Seed the flow with every (office, report kind) combination, run
    the query step and print the streamed results."""
    combos = []
    for office in offices:
        for kind in report_kinds:
            combos.append(dict(office=office, kind=kind))
    return DF.Flow(
        combos,
        do_query(),
        DF.printer(),
        DF.update_resource(-1, **{'dpp:streaming': True}),
    )
コード例 #25
0
def flow(*_):
    """Open a DB engine (module-level ``engine``) and add a non-indexed
    'charts' array field before fetching the extra data."""
    global engine
    engine = create_engine(os.environ['DPP_DB_ENGINE'])
    charts_es_options = {
        'es:itemType': 'object',
        'es:index': False,
    }
    return DF.Flow(
        DF.add_field('charts', 'array', default=[], **charts_es_options),
        fetch_extra_data,
    )
コード例 #26
0
def flow(parameters, *_):
    """Load every CSV produced by wrapper(year) as an all-strings,
    no-cast resource, and mark all resources as streaming."""
    loaders = []
    for index, url in enumerate(wrapper(parameters['year'])):
        loaders.append(
            DF.load(url, format='csv', name='res%d' % index,
                    infer_strategy=DF.load.INFER_STRINGS,
                    cast_strategy=DF.load.CAST_DO_NOTHING))
    return DF.Flow(
        *loaders,
        DF.update_resource(None, **{'dpp:streaming': True})
    )
コード例 #27
0
def geo():
    """Flow fragment: reproject lon/lat in place via the module-level
    projector, then retype both columns as numbers."""
    def reproject(row):
        row['lon'], row['lat'] = projector(
            row['lon'], row['lat'], inverse=True)

    return DF.Flow(
        reproject,
        DF.set_type('lon', type='number'),
        DF.set_type('lat', type='number'),
    )
コード例 #28
0
def lang_flow(lang, prefix):
    """Build the per-language search documents flow.

    Loads publications, orgs and datasets and attaches an item URL to
    each row (URL path starts with ``prefix``). Appends a 'tags-<lang>'
    resource built from the unique (prefix, hebrew, translated) tag
    triples, each pointing at a tag-search URL.
    """

    # Unique, sorted tag triples; each becomes a row whose doc_id is the
    # 3-element list [prefix, hebrew, translation]
    tags = [dict(doc_id=list(k)) for k in sorted(set(
            (prefix, x['hebrew'], x[lang])
            for x in translations['tags'].values()
        ))]

    def add_url(prefix_):
        # Adds 'url' to the latest resource. Rows without a 'url' key
        # pass through untouched; rows with an empty doc_id are dropped
        # (and logged).
        def func(rows):
            for row in rows:
                if 'url' not in row:
                    yield row
                elif row.get('doc_id'):
                    row['url'] = 'https://yodaat.org/{}item/{}'.format(prefix_, row['doc_id'])
                    yield row
                else:
                    print('MMMMMMMM MISSING DOC ID', row)

        return DF.Flow(
            DF.add_field('url', 'string', resources=-1),
            func,
        )

    return DF.Flow(
        *[
            DF.Flow(
                DF.load('https://api.yodaat.org/data/{}_in_es/data/{}.csv'.format(x, y), name='{}-{}'.format(x, lang)),
                add_url(prefix)
            )
            for x, y in [
                ('publications', 'publications'),
                ('orgs', 'orgs'),
                ('datasets', 'out')
            ]
        ],
        tags,
        # Tag rows get a search URL built from their (prefix, tag, itag)
        DF.add_field('url', 'string',
                     lambda row: 'https://yodaat.org/{}search?tag={}&itag={}&kind=all&filters={{}}&sortOrder=-year'.format(*row.get('doc_id')),
                     resources=-1),
        DF.update_resource(-1, name='tags-{}'.format(lang)),
    )
コード例 #29
0
def flow(*_):
    """Load each wrapped CSV (windows-1255, unquoted, all-strings,
    no casting) and mark every resource as streaming."""
    loaders = []
    for index, url in enumerate(wrapper()):
        loaders.append(
            DF.load(url,
                    encoding='windows-1255',
                    format='csv',
                    name='res%d' % index,
                    quoting=csv.QUOTE_NONE,
                    infer_strategy=DF.load.INFER_STRINGS,
                    cast_strategy=DF.load.CAST_DO_NOTHING))
    return DF.Flow(*loaders,
                   DF.update_resource(None, **{'dpp:streaming': True}))
コード例 #30
0
def flow(*_, ):
    """Scrape class actions, parse 'claim_date' as dd/mm/YYYY, and
    assign publication ids (series 8)."""
    resource_meta = {
        'name': 'class_action',
        'dpp:streaming': True,
    }
    return DF.Flow(
        scrape(),
        DF.update_resource(-1, **resource_meta),
        DF.set_type('claim_date',
                    type='datetime',
                    format='%d/%m/%Y',
                    resources=-1),
        calculate_publication_id(8),
    )