Example #1
0
def update_package(instance_name, org_id, package_name, title, resources):
    """Ensure the package exists, then upload every resource file to it.

    The package is looked up by name. When it is missing or in the
    ``deleted`` state, a non-private package owned by ``org_id`` is created
    and re-read. Each ``(format, filename)`` pair in ``resources`` is then
    uploaded: if the package already has a resource with that format the
    resource is updated, otherwise a new resource is created.

    Unsuccessful API responses are printed rather than raised. Returns None.
    """
    print("Creating/updating package {}@{} {}".format(package_name, org_id,
                                                      title))

    package = ckan.package_show(instance_name, package_name)
    if not package or package['state'] == 'deleted':
        create_res = ckan.package_create(
            instance_name, {
                'name': package_name,
                'title': title,
                'private': False,
                'owner_org': org_id
            })
        if create_res['success']:
            package = ckan.package_show(instance_name, package_name)
        else:
            print('Failed to create package', create_res)
    print(package)
    if not package:
        return

    # Map each existing resource's format to its id; a later resource with
    # the same format overrides an earlier one.
    resource_id_by_format = {
        existing['format']: existing['id']
        for existing in package.get('resources', [])
    }
    print(resource_id_by_format)
    for fmt, path in resources:
        print(fmt, path)
        with open(path, 'rb') as upload:
            payload = {
                'package_id': package['id'],
                'description': '{} - {}'.format(title, fmt),
                'format': fmt,
                'name': fmt,
            }
            if fmt in resource_id_by_format:
                print('Updating resource', resource_id_by_format[fmt])
                payload['id'] = resource_id_by_format[fmt]
                send = ckan.resource_update
                failed_msg = 'update resource failed: {}'
                done_msg = 'updated resource {} {}: {}'
            else:
                print('Creating resource', payload)
                send = ckan.resource_create
                failed_msg = 'create resource failed: {}'
                done_msg = 'created resource {} {}: {}'
            res = send(instance_name, payload, files=[('upload', upload)])
            if res['success']:
                print(done_msg.format(package_name, fmt, res))
            else:
                print(failed_msg.format(res))
Example #2
0
def update_package_extras(instance_name, package,
                          package_extras_processed_res):
    """Append a ``"yes"`` extra keyed by ``package_extras_processed_res``.

    The package is re-read by ``package['id']`` before the extra is appended,
    and the re-read copy (not the ``package`` argument) is written back with
    ``ckan.package_update``.
    """
    latest = ckan.package_show(instance_name, package['id'])
    extras = latest.setdefault('extras', [])
    extras.append({"key": package_extras_processed_res, "value": "yes"})
    ckan.package_update(instance_name, latest)
Example #3
0
def _create_new_package(source_instance_name, source_package,
                        target_instance_name, target_organization_id,
                        target_package_name, target_package_title_prefix,
                        stats):
    """Create a copy of ``source_package`` on the target instance.

    The target package is created as private, the source package's active
    resources are downloaded and re-created under it, and only then is the
    package switched to public. The source dataset URL and the source
    organization description are stored as the ``sync_source_package_url``
    and ``sync_source_org_description`` extras.

    Args:
        source_instance_name: Name used to look up the source instance URL.
        source_package: Source package dict; must contain ``name``,
            ``title``, ``license_id``, ``notes``, ``url`` and ``version``.
        target_instance_name: Instance on which the package is created.
        target_organization_id: ``owner_org`` of the created package.
        target_package_name: ``name`` of the created package.
        target_package_title_prefix: Optional prefix prepended (with a
            space) to the source title.
        stats: Mapping of counters; ``packages_new_created`` is incremented.

    Raises:
        AssertionError: If creating the package or making it public does not
            report success.
    """
    print("Creating new package ({} > {})".format(source_package['name'],
                                                  target_package_name))
    _, source_instance_url = ckan.get_instance_api_key_url(
        source_instance_name)
    source_package_url = source_instance_url.strip('/') + '/dataset/{}'.format(
        source_package['name'])
    # The 'organization' key can be present with a None value (presumably a
    # dataset without an owning organization); `.get(key, {})` would return
    # that None and the chained `.get` would raise AttributeError.
    source_org_description = (source_package.get('organization')
                              or {}).get('description', '')
    target_package_title = source_package['title']
    if target_package_title_prefix:
        target_package_title = '{} {}'.format(target_package_title_prefix,
                                              target_package_title)
    with _download_active_resources(source_package) as (tmpdir,
                                                        resource_hashes):
        res = ckan.package_create(
            target_instance_name, {
                'name': target_package_name,
                'title': target_package_title,
                # Stay private until all resources have been created.
                'private': True,
                'license_id': source_package['license_id'],
                'notes': source_package['notes'],
                'url': source_package['url'],
                'version': source_package['version'],
                'owner_org': target_organization_id,
                'extras': [{
                    "key": "sync_source_package_url",
                    "value": source_package_url
                }, {
                    "key": "sync_source_org_description",
                    "value": source_org_description
                }]
            })
        # Raised explicitly (instead of `assert`) so the check survives
        # `python -O`; the exception type callers see is unchanged.
        if not res['success']:
            raise AssertionError('create package failed: {}'.format(res))
        target_package_id = res['result']['id']
        _create_resources(source_package, tmpdir, target_instance_name,
                          target_package_id, resource_hashes)
        package = ckan.package_show(target_instance_name, target_package_id)
        package['private'] = False
        res = ckan.package_update(target_instance_name, package)
        if not res['success']:
            raise AssertionError(
                'failed to set package to public: {}'.format(res))
    stats['packages_new_created'] += 1
Example #4
0
def operator(name, params):
    """Sync the source instance's public packages into the target instance.

    For each package name listed by the source instance:

    * names that already start with the target name prefix are skipped;
    * packages that are private, not ``active`` or not of type ``dataset``
      are skipped;
    * otherwise the target package (prefix + source name) is updated when it
      exists and is not deleted, or created when it does not.

    An exception while handling one package is printed and counted, and the
    loop continues with the next package.

    Args:
        name: Not used by this function.
        params: Mapping with the keys ``source_instance_name``,
            ``target_instance_name``, ``target_organization_id``,
            ``target_package_name_prefix`` and
            ``target_package_title_prefix``; a missing key raises KeyError.

    Returns:
        True when no package raised an exception, False otherwise.
    """
    source_instance_name = params['source_instance_name']
    target_instance_name = params['target_instance_name']
    target_organization_id = params['target_organization_id']
    name_prefix = params['target_package_name_prefix']
    title_prefix = params['target_package_title_prefix']
    print('starting ckan_sync operator')
    settings_line = 'source_instance_name={} target_instance_name={} target_organization_id={} target_package_name_prefix={} target_package_title_prefix={}'
    print(
        settings_line.format(source_instance_name, target_instance_name,
                             target_organization_id, name_prefix,
                             title_prefix))
    stats = defaultdict(int)
    for source_package_name in ckan.package_list_public(source_instance_name):
        # Kept outside the try so the except handler can always print it.
        source_package = None
        try:
            if source_package_name.startswith(name_prefix):
                stats['source_packages_invalid_prefix'] += 1
                continue
            source_package = ckan.package_show_public(source_instance_name,
                                                      source_package_name)
            is_public_active_dataset = (
                not source_package['private']
                and source_package['state'] == 'active'
                and source_package['type'] == 'dataset')
            if not is_public_active_dataset:
                stats['source_packages_invalid_attrs'] += 1
                continue
            stats['source_packages_valid'] += 1
            target_package_name = '{}{}'.format(name_prefix,
                                                source_package_name)
            target_existing_package = ckan.package_show(
                target_instance_name, target_package_name)
            if (not target_existing_package
                    or target_existing_package['state'] == 'deleted'):
                _create_new_package(source_instance_name, source_package,
                                    target_instance_name,
                                    target_organization_id,
                                    target_package_name, title_prefix, stats)
            else:
                _update_existing_package(source_instance_name, source_package,
                                         target_instance_name,
                                         target_organization_id,
                                         target_package_name,
                                         target_existing_package,
                                         title_prefix, stats)
            # Progress report every 10 valid source packages.
            if stats['source_packages_valid'] % 10 == 0:
                print(dict(stats))
        except Exception:
            traceback.print_exc()
            print('exception processing source package {}: {}'.format(
                source_package_name, source_package))
            stats['source_packages_exceptions'] += 1
    print(dict(stats))
    return stats['source_packages_exceptions'] == 0
Example #5
0
def process_package(instance_name, package_id, task_id,
                    is_resource_valid_for_processing, process_resource):
    """Run ``process_resource`` on each valid, not-yet-processed resource.

    A resource counts as processed when the package has an extra with key
    ``processed_res_<task_id>_<resource id>`` and value ``"yes"``. That key
    is passed to ``process_resource`` as its last argument.

    Args:
        instance_name: Instance the package is read from.
        package_id: Identifier passed to ``ckan.package_show``.
        task_id: Task identifier embedded in the extras key and log lines.
        is_resource_valid_for_processing: Callable
            ``(instance_name, package, resource)``; falsy results skip the
            resource silently.
        process_resource: Callable
            ``(instance_name, package, resource, extras_key)``.
    """
    package = ckan.package_show(instance_name, package_id)
    for resource in package['resources']:
        if not is_resource_valid_for_processing(instance_name, package,
                                                resource):
            continue
        processed_key = "processed_res_{}_{}".format(task_id, resource['id'])
        extras_by_key = dict(
            (extra['key'], extra['value'])
            for extra in package.get('extras', []))
        already_processed = extras_by_key.get(processed_key) == "yes"
        log_args = (task_id, instance_name, package['name'], resource['name'],
                    resource['id'])
        if already_processed:
            print("Already processed {} ({} > {} > {} {})".format(*log_args))
            continue
        print("Starting {} processing ({} > {} > {} {})".format(*log_args))
        process_resource(instance_name, package, resource, processed_key)
        print("OK")
def init_package(instance_name, package, muni_filter_texts):
    """Create the municipality-filtered package if missing, then add groups.

    When no package with id ``package['id']`` exists on ``instance_name``:
    the source package is found by name on ``package['ckan']['url']``, the
    configured resource (or the first one, with a warning) is downloaded and
    passed through ``muni_resource_filter``. If any rows remain, a private
    package is created, the filtered CSV is uploaded as its only resource and
    the package is made public.

    Finally every group name in ``package['groups']`` that the package does
    not already have is added. If the package did not exist and the filter
    left no rows, nothing is created and the function returns without
    touching groups.

    Args:
        instance_name: Target instance.
        package: Package configuration dict; reads ``id``, ``title``,
            ``source_title``, ``ckan`` (``url``, ``package_id``,
            ``resource_id``) and the optional ``muni_filter_column``,
            ``muni_filter_column_in``, ``geo_wgs_parsing`` and ``groups``.
        muni_filter_texts: Passed through to ``muni_resource_filter``.

    Raises:
        AssertionError: If the source search does not return exactly one
            package, the filter does not produce exactly one CSV, or a
            create/update call does not report success.
    """
    existing_package, created_package_res = ckan.package_show(
        instance_name, package['id']), None
    if not existing_package:
        res = ckan.package_search(
            package['ckan']['url'],
            {'q': 'name:{}'.format(package['ckan']['package_id'])})
        # Checks are raised explicitly (instead of `assert`) so they survive
        # `python -O`; the exception type is unchanged.
        if res['count'] != 1:
            raise AssertionError(res)
        source_package = res['results'][0]
        source_resource = next(
            (resource for resource in source_package['resources']
             if resource['id'] == package['ckan']['resource_id']), None)
        if not source_resource:
            print(
                "WARNING! Using first resource instead of the specified resource"
            )
            source_resource = source_package['resources'][0]
        try:
            resource_name = source_resource['name']
            resource_url = source_resource['url']
            if "://e.data.gov.il" in resource_url:
                resource_url = resource_url.replace("://e.data.gov.il",
                                                    "://data.gov.il")
        except Exception:
            print("Failed to get metadata from source resource")
            print(source_resource)
            raise
        resource_description = source_resource.get('description')
        with utils.tempdir(keep=False) as tmpdir:
            resource_filename = resource_url.split("/")[-1]
            # A URL ending in a bare extension (e.g. ".csv") would give a
            # hidden file name; give it a stem.
            if resource_filename.startswith("."):
                resource_filename = "data{}".format(resource_filename)
            headers = {}
            if "://data.gov.il" in resource_url:
                headers['User-Agent'] = 'datagov-external-client'
            downloaded_path = os.path.join(tmpdir, resource_filename)
            utils.http_stream_download(downloaded_path, {
                "url": resource_url,
                "headers": headers
            })
            num_filtered_rows = muni_resource_filter(
                downloaded_path, os.path.join(tmpdir, "muni_filtered"),
                package.get('muni_filter_column'), muni_filter_texts,
                package.get('geo_wgs_parsing'),
                package.get('muni_filter_column_in'))
            if num_filtered_rows > 0:
                csv_paths = glob(
                    os.path.join(tmpdir, "muni_filtered", "*.csv"))
                if len(csv_paths) != 1:
                    raise AssertionError(
                        'expected exactly one filtered csv: {}'.format(
                            csv_paths))
                muni_filtered_csv_filename = csv_paths[0]
                csv_basename = os.path.basename(muni_filtered_csv_filename)
                # Built once with string operations (not os.path.join, which
                # is for filesystem paths) and shared by the description
                # note and the sync extra.
                source_package_url = (
                    package['ckan']['url'].strip('/') +
                    '/dataset/{}'.format(package['ckan']['package_id']))
                source_package_note = "מקור המידע: " + source_package_url
                package_description = source_package.get('notes')
                if package_description:
                    package_description = (package_description + "\n\n" +
                                           source_package_note)
                else:
                    package_description = source_package_note
                res = ckan.package_create(
                    instance_name, {
                        'name': package['id'],
                        'title': '{} | {}'.format(package['title'],
                                                  package['source_title']),
                        # Stay private until the resource has been uploaded.
                        'private': True,
                        'license_id': source_package.get('license_id'),
                        'notes': package_description,
                        'url': source_package.get('url'),
                        'version': source_package.get('version'),
                        'owner_org': DEFAULT_ORGANIZATION_ID,
                        'extras': [{
                            "key": "sync_source_package_url",
                            "value": source_package_url
                        }, {
                            'key': 'sync_source_org_description',
                            'value': source_package.get('organization',
                                                        {}).get('description')
                        }]
                    })
                if not res['success']:
                    raise AssertionError(
                        'create package failed: {}'.format(res))
                target_package_id = res['result']['id']
                # Binary mode: the bytes are uploaded as-is instead of being
                # decoded with the platform's default text encoding first.
                with open(muni_filtered_csv_filename, 'rb') as f:
                    res = ckan.resource_create(
                        instance_name, {
                            'package_id': target_package_id,
                            'description': resource_description,
                            'format': "CSV",
                            'name': resource_name,
                            'url': csv_basename,
                            **({
                                "geo_lat_field":
                                package['geo_wgs_parsing']["output_field_lat"],
                                "geo_lon_field":
                                package['geo_wgs_parsing']["output_field_lon"],
                            } if package.get('geo_wgs_parsing') else {}),
                        },
                        files={'upload': (csv_basename, f)})
                if not res['success']:
                    raise AssertionError(
                        'create resource failed: {}'.format(res))
                created_package = ckan.package_show(instance_name,
                                                    target_package_id)
                created_package['private'] = False
                created_package_res = ckan.package_update(
                    instance_name, created_package)
                if not created_package_res['success']:
                    raise AssertionError(
                        'failed to set package to public: {}'.format(
                            created_package_res))
            else:
                print("no rows after muni filter")
    else:
        print("Package already exists ({})".format(existing_package['id']))
    if created_package_res:
        created_package = created_package_res['result']
    elif existing_package:
        created_package = existing_package
    else:
        # The package did not exist and the filter left no rows, so there is
        # no package to attach groups to.
        return
    created_package_group_names = [
        g['name'] for g in created_package.get('groups', [])
    ]
    # Reduce existing groups to name-only references before sending them
    # back, then append the configured groups that are still missing.
    created_package['groups'] = [{
        "name": name
    } for name in created_package_group_names]
    num_added_groups = 0
    for group_name in package.get('groups', []):
        if group_name not in created_package_group_names:
            created_package['groups'].append({'name': group_name})
            num_added_groups += 1
    res = ckan.package_update(instance_name, created_package)
    if not res['success']:
        raise AssertionError(
            'failed to add groups to created package: {}'.format(res))
    print("Added {} groups".format(num_added_groups))
Example #7
0
def _update_existing_package(source_instance_name, source_package,
                             target_instance_name, target_organization_id,
                             target_package_name, target_existing_package,
                             target_package_title_prefix, stats):
    """Bring an existing target package in line with its source package.

    Package-level changes are detected on ``license_id``, ``notes``, ``url``,
    ``version``, the (optionally prefixed) title and the two ``sync_source_*``
    extras. Resource changes are detected by comparing the hashes of active
    resources position by position and, where hashes match, their
    ``description``, ``format`` and ``name``.

    * Only package changes: the package is updated in place.
    * Any resource change: the package is made private, its resources are
      cleared and re-created from the downloaded source files, and it is made
      public again.
    * No changes: nothing is written.

    Args:
        source_instance_name: Name used to look up the source instance URL.
        source_package: Source package dict.
        target_instance_name: Instance holding the package to update.
        target_organization_id: Not used by this function.
        target_package_name: Name of the target package.
        target_existing_package: Current target package dict. It is copied
            shallowly, so its ``extras`` entries may be modified in place.
        target_package_title_prefix: Optional prefix prepended (with a
            space) to the source title.
        stats: Mapping of counters; ``packages_existing`` and one of
            ``packages_existing_only_package_changes``,
            ``packages_existing_has_resource_changes`` or
            ``packages_existing_no_changes`` are incremented.

    Raises:
        Exception: Any error is printed with the target package and
            re-raised unchanged.
    """
    try:
        stats['packages_existing'] += 1
        package = {**target_existing_package}
        has_package_changes = False
        for attr in ['license_id', 'notes', 'url', 'version']:
            # Compare with the same normalization used when writing, so a
            # None on one side and '' on the other is not reported as a
            # change on every run.
            new_value = source_package.get(attr) or ''
            if (package.get(attr) or '') != new_value:
                has_package_changes = True
                package[attr] = new_value
        _, source_instance_url = ckan.get_instance_api_key_url(
            source_instance_name)
        source_package_url = source_instance_url.strip(
            '/') + '/dataset/{}'.format(source_package['name'])
        # The 'organization' key can be present with a None value (presumably
        # a dataset without an owning organization); `.get(key, {})` would
        # return that None and the chained `.get` would raise AttributeError.
        source_org_description = (source_package.get('organization')
                                  or {}).get('description', '')
        new_target_package_title = source_package['title']
        if target_package_title_prefix:
            new_target_package_title = '{} {}'.format(
                target_package_title_prefix, new_target_package_title)
        if package['title'] != new_target_package_title:
            has_package_changes = True
            package['title'] = new_target_package_title
        got_extra_url, got_extra_description = False, False
        for extra in (package.get('extras') or []):
            if extra["key"] == "sync_source_package_url":
                got_extra_url = True
                if extra["value"] != source_package_url:
                    has_package_changes = True
                    extra["value"] = source_package_url
            elif extra["key"] == "sync_source_org_description":
                got_extra_description = True
                if extra["value"] != source_org_description:
                    has_package_changes = True
                    extra["value"] = source_org_description
        if not got_extra_url:
            has_package_changes = True
            package.setdefault('extras', []).append({
                "key": "sync_source_package_url",
                "value": source_package_url
            })
        if not got_extra_description:
            has_package_changes = True
            package.setdefault('extras', []).append({
                "key": "sync_source_org_description",
                "value": source_org_description
            })
        with _download_active_resources(source_package) as (
                tmpdir, source_resource_hashes):
            # Hash of every active target resource, keyed by its position.
            target_resource_hashes = {
                i: resource['hash']
                for i, resource in enumerate(package['resources'])
                if resource['state'] == 'active'
            }
            has_resource_changes = len(source_resource_hashes) != len(
                target_resource_hashes)
            if not has_resource_changes:
                for i in source_resource_hashes:
                    if (source_resource_hashes[i]
                            or '') != (target_resource_hashes.get(i) or ''):
                        has_resource_changes = True
                    else:
                        try:
                            source_resource = source_package['resources'][i]
                            target_resource = target_existing_package[
                                'resources'][i]
                        except (AttributeError, LookupError, TypeError):
                            # Indexing raises IndexError/KeyError (both
                            # LookupError) when position i is missing on one
                            # side; treat that as a change rather than
                            # letting it abort the update.
                            source_resource = None
                            target_resource = None
                        if not source_resource or not target_resource:
                            has_resource_changes = True
                        else:
                            for attr in ['description', 'format', 'name']:
                                if source_resource.get(
                                        attr) != target_resource.get(attr):
                                    has_resource_changes = True
                    if has_resource_changes:
                        # One difference is enough to trigger a full
                        # resource re-sync.
                        break
            # NOTE(review): the package_update results below are not checked
            # for success, unlike in _create_new_package - confirm whether
            # that is intentional.
            if has_package_changes and not has_resource_changes:
                print('updating package, no resource changes ({} > {})'.format(
                    source_package['name'], target_package_name))
                ckan.package_update(target_instance_name, package)
                stats['packages_existing_only_package_changes'] += 1
            elif has_resource_changes:
                print(
                    'updating package, with resource changes ({} > {})'.format(
                        source_package['name'], target_package_name))
                # Hide the package while its resources are being replaced.
                package['private'] = True
                package['resources'] = []
                ckan.package_update(target_instance_name, package)
                _create_resources(source_package, tmpdir, target_instance_name,
                                  package['id'], source_resource_hashes)
                package = ckan.package_show(target_instance_name,
                                            target_package_name)
                package['private'] = False
                ckan.package_update(target_instance_name, package)
                stats['packages_existing_has_resource_changes'] += 1
            else:
                stats['packages_existing_no_changes'] += 1
    except Exception:
        print('exception updating existing package {}: {}'.format(
            target_package_name, target_existing_package))
        raise