Ejemplo n.º 1
0
def _create_resources(source_package, tmpdir, target_instance_name,
                      target_package_id, resource_hashes):
    for i, resource in enumerate(source_package['resources']):
        if resource['state'] != 'active':
            continue
        try:
            resource_filename = resource['url'].split(
                '/')[-1] if resource_hashes[i] else None
            resource_kwargs = {
                'package_id': target_package_id,
                'description': resource['description'],
                'format': resource['format'],
                'name': resource['name'],
            }
            if resource_filename:
                with open(os.path.join(tmpdir, 'resource{}'.format(i)),
                          'rb') as f:
                    res = ckan.resource_create(
                        target_instance_name, {
                            **resource_kwargs, 'url': resource_filename,
                            'hash': resource_hashes[i]
                        },
                        files={'upload': (resource_filename, f)})
            else:
                res = ckan.resource_create(target_instance_name,
                                           resource_kwargs)
            assert res['success'], 'create resource {} failed: {}'.format(
                i, res)
        except Exception as e:
            print('Failed to process resource {}: {}'.format(i, resource))
            raise
Ejemplo n.º 2
0
def process_resource(instance_name, package, resource,
                     package_extras_processed_res):
    """Publish a GeoJSON point layer built from the rows of ``resource``.

    The rows are loaded from ``resource['url']`` with the ``INFER_STRINGS``
    strategy. Every row for which ``get_lat_lon_values`` returns a truthy
    longitude and latitude becomes a ``Point`` feature carrying the row's
    ``get_properties`` result; other rows are dropped. The collection is
    dumped to ``data.geojson`` in a temporary directory and uploaded as a
    new ``GeoJSON`` resource of ``package``. Finally the package extras are
    updated through ``common.update_package_extras``.
    """
    lat_field = resource.get("geo_lat_field")
    lon_field = resource.get("geo_lon_field")
    loader = DF.load(resource['url'], infer_strategy=DF.load.INFER_STRINGS)
    rows = DF.Flow(loader).results()[0][0]
    point_features = []
    for row in rows:
        row_properties = get_properties(row)
        lon, lat = get_lat_lon_values(row, lon_field, lat_field)
        if not (lon and lat):
            continue
        point_features.append(
            Feature(geometry=Point((lon, lat)), properties=row_properties))
    collection = FeatureCollection(point_features)
    with utils.tempdir() as tmpdir:
        geojson_path = os.path.join(tmpdir, "data.geojson")
        with open(geojson_path, 'w') as out:
            geojson.dump(collection, out)
        new_resource = {
            'package_id': package['id'],
            'description': resource['description'],
            'format': 'GeoJSON',
            'name': resource['name'].replace('.csv', '') + '.geojson',
        }
        with open(geojson_path) as upload:
            ckan.resource_create(instance_name,
                                 new_resource,
                                 files=[('upload', upload)])
    common.update_package_extras(instance_name, package,
                                 package_extras_processed_res)
Ejemplo n.º 3
0
def update_package(instance_name, org_id, package_name, title, resources):
    """Ensure ``package_name`` exists and upload one resource per format.

    The package is created (public, owned by ``org_id``) when
    ``ckan.package_show`` returns nothing or a package in ``'deleted'``
    state. ``resources`` is an iterable of ``(format, filename)`` pairs;
    each file is uploaded as a resource named after its format. Existing
    resources are matched by their ``format``: a match is updated in place,
    anything else is created. Progress and failures are only printed, never
    raised.
    """
    print("Creating/updating package {}@{} {}".format(package_name, org_id,
                                                      title))

    package = ckan.package_show(instance_name, package_name)
    if not package or package['state'] == 'deleted':
        create_res = ckan.package_create(
            instance_name,
            dict(name=package_name,
                 title=title,
                 private=False,
                 owner_org=org_id))
        if create_res['success']:
            package = ckan.package_show(instance_name, package_name)
        else:
            print('Failed to create package', create_res)
    print(package)
    if not package:
        return

    # Map format -> resource id; with duplicate formats the last one wins.
    ids_by_format = {
        existing['format']: existing['id']
        for existing in package.get('resources', [])
    }
    print(ids_by_format)
    for fmt, filename in resources:
        print(fmt, filename)
        with open(filename, 'rb') as upload:
            payload = {
                'package_id': package['id'],
                'description': '{} - {}'.format(title, fmt),
                'format': fmt,
                'name': fmt,
            }
            if fmt in ids_by_format:
                print('Updating resource', ids_by_format[fmt])
                payload['id'] = ids_by_format[fmt]
                api_call = ckan.resource_update
                failure_msg = 'update resource failed: {}'
                success_msg = 'updated resource {} {}: {}'
            else:
                print('Creating resource', payload)
                api_call = ckan.resource_create
                failure_msg = 'create resource failed: {}'
                success_msg = 'created resource {} {}: {}'
            response = api_call(instance_name,
                                payload,
                                files=[('upload', upload)])
            if response['success']:
                print(success_msg.format(package_name, fmt, response))
            else:
                print(failure_msg.format(response))
Ejemplo n.º 4
0
def process_resource(instance_name, package, resource,
                     package_extras_processed_res):
    """Publish an XLSX copy of ``resource`` as a new resource of ``package``.

    The data at ``resource['url']`` is loaded with the ``INFER_STRINGS``
    strategy and dumped in ``xlsx`` format into a temporary directory.
    Exactly one ``*.xlsx`` file is expected there (enforced by ``assert``);
    it is uploaded as an ``XLSX`` resource whose name is the source name
    with ``.csv`` removed and ``.xlsx`` appended. Finally the package
    extras are updated through ``common.update_package_extras``.
    """
    with utils.tempdir() as tmpdir:
        loader = DF.load(resource['url'],
                         infer_strategy=DF.load.INFER_STRINGS)
        dumper = dataflows_xlsx.dump_to_path(os.path.join(tmpdir),
                                             format='xlsx')
        DF.Flow(loader, dumper).process()
        xlsx_paths = list(glob(os.path.join(tmpdir, "*.xlsx")))
        assert len(xlsx_paths) == 1
        new_resource = {
            'package_id': package['id'],
            'description': resource['description'],
            'format': 'XLSX',
            'name': resource['name'].replace('.csv', '') + '.xlsx',
        }
        with open(xlsx_paths[0], "rb") as upload:
            ckan.resource_create(instance_name,
                                 new_resource,
                                 files=[('upload', upload)])
    common.update_package_extras(instance_name, package,
                                 package_extras_processed_res)
def _select_source_resource(source_package, resource_id):
    """Return the resource of ``source_package`` whose id is ``resource_id``.

    When no resource matches, a warning is printed and the first resource
    of the package is returned instead.
    """
    for resource in source_package['resources']:
        if resource['id'] == resource_id:
            return resource
    print("WARNING! Using first resource instead of the specified resource")
    return source_package['resources'][0]


def _create_filtered_package(instance_name, package, muni_filter_texts):
    """Create the target package from the muni-filtered source resource.

    Looks up the source package on ``package['ckan']['url']``, downloads the
    configured resource, runs ``muni_resource_filter`` on it and, when the
    returned row count is positive, creates a private package holding the
    filtered CSV and then switches it to public.

    Returns the ``ckan.package_update`` response of the call that made the
    package public, or ``None`` when the filter left no rows (in which case
    nothing is created).

    A failed search, create or update trips an ``assert``.
    """
    source_ckan = package['ckan']
    search_res = ckan.package_search(
        source_ckan['url'],
        {'q': 'name:{}'.format(source_ckan['package_id'])})
    assert search_res['count'] == 1, search_res
    source_package = search_res['results'][0]
    source_resource = _select_source_resource(source_package,
                                              source_ckan['resource_id'])
    try:
        resource_name = source_resource['name']
        resource_url = source_resource['url']
        # Download from the data.gov.il host rather than e.data.gov.il.
        if "://e.data.gov.il" in resource_url:
            resource_url = resource_url.replace("://e.data.gov.il",
                                                "://data.gov.il")
    except Exception:
        print("Failed to get metadata from source resource")
        print(source_resource)
        raise
    resource_description = source_resource.get('description')

    with utils.tempdir(keep=False) as tmpdir:
        resource_filename = resource_url.split("/")[-1]
        if resource_filename.startswith("."):
            # The URL ended in a bare extension; give the file a base name.
            resource_filename = "data{}".format(resource_filename)
        headers = {}
        if "://data.gov.il" in resource_url:
            headers['User-Agent'] = 'datagov-external-client'
        download_path = os.path.join(tmpdir, resource_filename)
        utils.http_stream_download(download_path, {
            "url": resource_url,
            "headers": headers
        })
        num_filtered_rows = muni_resource_filter(
            download_path, os.path.join(tmpdir, "muni_filtered"),
            package.get('muni_filter_column'), muni_filter_texts,
            package.get('geo_wgs_parsing'),
            package.get('muni_filter_column_in'))
        if not num_filtered_rows > 0:
            print("no rows after muni filter")
            return None

        csv_paths = list(glob(os.path.join(tmpdir, "muni_filtered",
                                           "*.csv")))
        assert len(csv_paths) == 1
        muni_filtered_csv_filename = csv_paths[0]

        # Build the dataset URL with plain string operations: os.path.join
        # would insert backslashes on Windows.
        source_package_url = (source_ckan['url'].strip('/') +
                              '/dataset/{}'.format(source_ckan['package_id']))
        source_package_note = "מקור המידע: " + source_package_url
        package_description = source_package.get('notes')
        if package_description:
            package_description = (package_description + "\n\n" +
                                   source_package_note)
        else:
            package_description = source_package_note

        # Created private first; made public only after the resource upload
        # succeeded.
        res = ckan.package_create(
            instance_name, {
                'name': package['id'],
                'title': '{} | {}'.format(package['title'],
                                          package['source_title']),
                'private': True,
                'license_id': source_package.get('license_id'),
                'notes': package_description,
                'url': source_package.get('url'),
                'version': source_package.get('version'),
                'owner_org': DEFAULT_ORGANIZATION_ID,
                'extras': [{
                    "key": "sync_source_package_url",
                    "value": source_package_url
                }, {
                    'key': 'sync_source_org_description',
                    'value': source_package.get('organization',
                                                {}).get('description')
                }]
            })
        assert res['success'], 'create package failed: {}'.format(res)
        target_package_id = res['result']['id']

        geo_wgs_parsing = package.get('geo_wgs_parsing')
        geo_fields = {}
        if geo_wgs_parsing:
            geo_fields = {
                "geo_lat_field": geo_wgs_parsing["output_field_lat"],
                "geo_lon_field": geo_wgs_parsing["output_field_lon"],
            }
        upload_name = os.path.basename(muni_filtered_csv_filename)
        # Binary mode: the bytes are uploaded unchanged, independent of the
        # locale's default text encoding and newline translation.
        with open(muni_filtered_csv_filename, 'rb') as f:
            res = ckan.resource_create(
                instance_name, {
                    'package_id': target_package_id,
                    'description': resource_description,
                    'format': "CSV",
                    'name': resource_name,
                    'url': upload_name,
                    **geo_fields,
                },
                files={'upload': (upload_name, f)})
        assert res['success'], 'create resource failed: {}'.format(res)

        created_package = ckan.package_show(instance_name, target_package_id)
        created_package['private'] = False
        created_package_res = ckan.package_update(instance_name,
                                                  created_package)
        assert created_package_res['success'], (
            'failed to set package to public: {}'.format(created_package_res))
        return created_package_res


def _sync_package_groups(instance_name, target_package, wanted_group_names):
    """Add the ``wanted_group_names`` missing from ``target_package``.

    The package's groups are reduced to ``{"name": ...}`` entries, the
    missing names are appended and the package is sent to
    ``ckan.package_update`` (also when nothing was added).
    """
    current_names = [g['name'] for g in target_package.get('groups', [])]
    missing_names = [
        name for name in wanted_group_names if name not in current_names
    ]
    target_package['groups'] = [{
        "name": name
    } for name in current_names + missing_names]
    res = ckan.package_update(instance_name, target_package)
    assert res[
        'success'], 'failed to add groups to created package: {}'.format(res)
    print("Added {} groups".format(len(missing_names)))


def init_package(instance_name, package, muni_filter_texts):
    """Make sure the target package for ``package`` exists with its groups.

    If ``ckan.package_show`` finds ``package['id']`` on ``instance_name`` the
    existing package is reused. Otherwise the package is created from the
    source CKAN instance described by ``package['ckan']``, containing only
    the rows kept by ``muni_resource_filter``.

    Afterwards every name in ``package['groups']`` that the target package
    lacks is added to it.

    When the package did not exist and the filter left no rows, nothing is
    created and the function returns without touching groups (there is no
    package to update).

    Returns ``None``. Failed CKAN calls trip an ``assert``.
    """
    existing_package = ckan.package_show(instance_name, package['id'])
    if existing_package:
        print("Package already exists ({})".format(existing_package['id']))
        target_package = existing_package
    else:
        created_package_res = _create_filtered_package(
            instance_name, package, muni_filter_texts)
        if not created_package_res:
            return
        target_package = created_package_res['result']
    _sync_package_groups(instance_name, target_package,
                         package.get('groups', []))