def update_package(instance_name, org_id, package_name, title, resources):
    """Ensure a package exists and upload one resource per format.

    The package is looked up by name and created (public, owned by
    ``org_id``) when it is missing or in the ``deleted`` state. Every
    ``(format, filename)`` pair in ``resources`` is then uploaded: a format
    that already has a resource on the package updates that resource, any
    other format creates a new one. Unsuccessful API responses are printed
    rather than raised.
    """
    print("Creating/updating package {}@{} {}".format(package_name, org_id,
                                                      title))
    package = ckan.package_show(instance_name, package_name)
    if not package or package['state'] == 'deleted':
        create_res = ckan.package_create(
            instance_name, {
                'name': package_name,
                'title': title,
                'private': False,
                'owner_org': org_id
            })
        if create_res['success']:
            package = ckan.package_show(instance_name, package_name)
        else:
            print('Failed to create package', create_res)
    print(package)
    if not package:
        return

    # Map each resource format already on the package to its resource id.
    resource_id_by_format = {
        r['format']: r['id'] for r in package.get('resources', [])
    }
    print(resource_id_by_format)

    for fmt, path in resources:
        print(fmt, path)
        with open(path, 'rb') as upload:
            payload = {
                'package_id': package['id'],
                'description': '{} - {}'.format(title, fmt),
                'format': fmt,
                'name': fmt,
            }
            if fmt in resource_id_by_format:
                print('Updating resource', resource_id_by_format[fmt])
                payload['id'] = resource_id_by_format[fmt]
                action = ckan.resource_update
                failure_msg = 'update resource failed: {}'
                success_msg = 'updated resource {} {}: {}'
            else:
                print('Creating resource', payload)
                action = ckan.resource_create
                failure_msg = 'create resource failed: {}'
                success_msg = 'created resource {} {}: {}'
            res = action(instance_name, payload, files=[('upload', upload)])
            if res['success']:
                print(success_msg.format(package_name, fmt, res))
            else:
                print(failure_msg.format(res))
def update_package_extras(instance_name, package, package_extras_processed_res):
    """Record a processed marker in the package extras.

    The package is re-fetched by ``package['id']`` and the extra named
    ``package_extras_processed_res`` is set to ``"yes"`` before the package
    is sent to ``ckan.package_update``.

    An existing entry with the same key is updated in place instead of
    appending a second one, so the extras list never carries duplicate keys
    (NOTE(review): CKAN is expected to reject duplicate extras keys -
    confirm against the target CKAN version).
    """
    package = ckan.package_show(instance_name, package['id'])
    # Treat a missing or null extras list as empty.
    extras = package.get('extras') or []
    package['extras'] = extras
    for extra in extras:
        if extra.get('key') == package_extras_processed_res:
            extra['value'] = "yes"
            break
    else:
        extras.append({
            "key": package_extras_processed_res,
            "value": "yes"
        })
    ckan.package_update(instance_name, package)
def _create_new_package(source_instance_name, source_package,
                        target_instance_name, target_organization_id,
                        target_package_name, target_package_title_prefix,
                        stats):
    """Create a target package mirroring ``source_package``.

    The package is created private, populated via ``_create_resources`` from
    the files provided by ``_download_active_resources``, and only then
    switched to public. ``stats['packages_new_created']`` is incremented on
    success.

    Raises:
        AssertionError: if the create call or the final update to public
            reports failure.
    """
    print("Creating new package ({} > {})".format(source_package['name'],
                                                  target_package_name))
    _, source_instance_url = ckan.get_instance_api_key_url(
        source_instance_name)
    source_package_url = source_instance_url.strip('/') + '/dataset/{}'.format(
        source_package['name'])
    # 'organization' may be present with a None value; a plain
    # .get('organization', {}) would then fail on the chained .get().
    source_org = source_package.get('organization') or {}
    source_org_description = source_org.get('description') or ''
    target_package_title = source_package['title']
    if target_package_title_prefix:
        target_package_title = '{} {}'.format(target_package_title_prefix,
                                              target_package_title)
    with _download_active_resources(source_package) as (tmpdir,
                                                        resource_hashes):
        # Stay private until all resources are uploaded.
        res = ckan.package_create(
            target_instance_name, {
                'name': target_package_name,
                'title': target_package_title,
                'private': True,
                'license_id': source_package.get('license_id'),
                'notes': source_package.get('notes'),
                'url': source_package.get('url'),
                'version': source_package.get('version'),
                'owner_org': target_organization_id,
                'extras': [{
                    "key": "sync_source_package_url",
                    "value": source_package_url
                }, {
                    "key": "sync_source_org_description",
                    "value": source_org_description
                }]
            })
        assert res['success'], 'create package failed: {}'.format(res)
        target_package_id = res['result']['id']
        _create_resources(source_package, tmpdir, target_instance_name,
                          target_package_id, resource_hashes)
        package = ckan.package_show(target_instance_name, target_package_id)
        package['private'] = False
        res = ckan.package_update(target_instance_name, package)
        assert res['success'], 'failed to set package to public: {}'.format(
            res)
    stats['packages_new_created'] += 1
def operator(name, params):
    """Sync every eligible public package from a source to a target instance.

    A source package is skipped when its name already starts with the target
    name prefix, or when it is private, not active, or not of type
    ``dataset``. Every other package is updated on the target when a
    non-deleted package with the prefixed name exists there, and created
    otherwise. An exception while handling one package is printed and
    counted, and the loop moves on to the next package.

    Returns:
        True when no package raised an exception, False otherwise.
    """
    (source_instance_name, target_instance_name, target_organization_id,
     target_package_name_prefix, target_package_title_prefix) = (
         params[key] for key in (
             'source_instance_name',
             'target_instance_name',
             'target_organization_id',
             'target_package_name_prefix',
             'target_package_title_prefix',
         ))
    print('starting ckan_sync operator')
    print(
        'source_instance_name={} target_instance_name={} target_organization_id={} target_package_name_prefix={} target_package_title_prefix={}'
        .format(source_instance_name, target_instance_name,
                target_organization_id, target_package_name_prefix,
                target_package_title_prefix))
    stats = defaultdict(int)
    for source_package_name in ckan.package_list_public(source_instance_name):
        # Kept outside the try so the except branch can always print it.
        source_package = None
        try:
            if source_package_name.startswith(target_package_name_prefix):
                stats['source_packages_invalid_prefix'] += 1
                continue
            source_package = ckan.package_show_public(source_instance_name,
                                                      source_package_name)
            is_public_active_dataset = (
                not source_package['private']
                and source_package['state'] == 'active'
                and source_package['type'] == 'dataset')
            if not is_public_active_dataset:
                stats['source_packages_invalid_attrs'] += 1
                continue
            stats['source_packages_valid'] += 1
            target_package_name = '{}{}'.format(target_package_name_prefix,
                                                source_package_name)
            target_existing_package = ckan.package_show(
                target_instance_name, target_package_name)
            if (not target_existing_package
                    or target_existing_package['state'] == 'deleted'):
                _create_new_package(source_instance_name, source_package,
                                    target_instance_name,
                                    target_organization_id,
                                    target_package_name,
                                    target_package_title_prefix, stats)
            else:
                _update_existing_package(source_instance_name, source_package,
                                         target_instance_name,
                                         target_organization_id,
                                         target_package_name,
                                         target_existing_package,
                                         target_package_title_prefix, stats)
            # Progress report every tenth valid package.
            if stats['source_packages_valid'] % 10 == 0:
                print(dict(stats))
        except Exception:
            traceback.print_exc()
            print('exception processing source package {}: {}'.format(
                source_package_name, source_package))
            stats['source_packages_exceptions'] += 1
    print(dict(stats))
    return stats['source_packages_exceptions'] == 0
def process_package(instance_name, package_id, task_id,
                    is_resource_valid_for_processing, process_resource):
    """Run ``process_resource`` on each valid, not yet processed resource.

    For every resource accepted by ``is_resource_valid_for_processing`` the
    package extras are checked for the key
    ``processed_res_<task_id>_<resource id>``. A value of ``"yes"`` means the
    resource is skipped; otherwise ``process_resource`` is called with that
    key as its last argument.
    """
    package = ckan.package_show(instance_name, package_id)
    for resource in package['resources']:
        if not is_resource_valid_for_processing(instance_name, package,
                                                resource):
            continue
        marker_key = "processed_res_{}_{}".format(task_id, resource['id'])
        extras_by_key = {}
        for extra in package.get('extras', []):
            extras_by_key[extra['key']] = extra['value']
        log_args = (task_id, instance_name, package['name'],
                    resource['name'], resource['id'])
        if extras_by_key.get(marker_key) == "yes":
            print("Already processed {} ({} > {} > {} {})".format(*log_args))
            continue
        print("Starting {} processing ({} > {} > {} {})".format(*log_args))
        process_resource(instance_name, package, resource, marker_key)
        print("OK")
def init_package(instance_name, package, muni_filter_texts):
    """Create the package from a filtered source resource and add its groups.

    When no package with id ``package['id']`` exists on ``instance_name``,
    the source package is searched on ``package['ckan']['url']``, the
    configured resource (or the first one, with a warning) is downloaded and
    passed through ``muni_resource_filter``. If rows remain, a private
    package holding the single filtered CSV is created and then made public.
    Finally the names in ``package['groups']`` are added to the created or
    pre-existing package.

    If the package did not exist and the filter left no rows, nothing is
    created and the function returns without touching groups.

    Raises:
        AssertionError: if the search does not return exactly one package,
            the filter output does not contain exactly one CSV, or a CKAN
            create/update call reports failure.
    """
    existing_package = ckan.package_show(instance_name, package['id'])
    created_package_res = None
    if existing_package:
        print("Package already exists ({})".format(existing_package['id']))
    else:
        res = ckan.package_search(
            package['ckan']['url'],
            {'q': 'name:{}'.format(package['ckan']['package_id'])})
        assert res['count'] == 1, res
        source_package = res['results'][0]
        source_resource = next(
            (resource for resource in source_package['resources']
             if resource['id'] == package['ckan']['resource_id']), None)
        if not source_resource:
            print(
                "WARNING! Using first resource instead of the specified resource"
            )
            source_resource = source_package['resources'][0]
        try:
            resource_name = source_resource['name']
            resource_url = source_resource['url']
            if "://e.data.gov.il" in resource_url:
                resource_url = resource_url.replace("://e.data.gov.il",
                                                    "://data.gov.il")
        except Exception:
            print("Failed to get metadata from source resource")
            print(source_resource)
            raise
        resource_description = source_resource.get('description')
        # resource_last_modified = source_resource.get('last_modified')
        # Build the source URL once with string operations: os.path.join is
        # for filesystem paths and is platform dependent.
        source_package_url = package['ckan']['url'].strip(
            '/') + '/dataset/{}'.format(package['ckan']['package_id'])
        with utils.tempdir(keep=False) as tmpdir:
            resource_filename = resource_url.split("/")[-1]
            if resource_filename.startswith("."):
                # URL ended in a bare extension; give the file a base name.
                resource_filename = "data{}".format(resource_filename)
            headers = {}
            if "://data.gov.il" in resource_url:
                headers['User-Agent'] = 'datagov-external-client'
            downloaded_path = os.path.join(tmpdir, resource_filename)
            utils.http_stream_download(downloaded_path, {
                "url": resource_url,
                "headers": headers
            })
            num_filtered_rows = muni_resource_filter(
                downloaded_path, os.path.join(tmpdir, "muni_filtered"),
                package.get('muni_filter_column'), muni_filter_texts,
                package.get('geo_wgs_parsing'),
                package.get('muni_filter_column_in'))
            if num_filtered_rows > 0:
                csv_paths = glob(os.path.join(tmpdir, "muni_filtered",
                                              "*.csv"))
                assert len(csv_paths) == 1
                muni_filtered_csv_filename = csv_paths[0]
                package_description = source_package.get('notes')
                source_package_note = "מקור המידע: " + source_package_url
                if package_description:
                    package_description = (package_description + "\n\n" +
                                           source_package_note)
                else:
                    package_description = source_package_note
                # 'organization' may be present with a None value.
                source_org = source_package.get('organization') or {}
                # Stay private until the resource is uploaded.
                res = ckan.package_create(
                    instance_name, {
                        'name': package['id'],
                        'title': '{} | {}'.format(package['title'],
                                                  package['source_title']),
                        'private': True,
                        'license_id': source_package.get('license_id'),
                        'notes': package_description,
                        'url': source_package.get('url'),
                        'version': source_package.get('version'),
                        'owner_org': DEFAULT_ORGANIZATION_ID,
                        'extras': [{
                            "key": "sync_source_package_url",
                            "value": source_package_url
                        }, {
                            'key': 'sync_source_org_description',
                            'value': source_org.get('description')
                        }]
                    })
                assert res['success'], 'create package failed: {}'.format(res)
                target_package_id = res['result']['id']
                # Binary mode, as update_package uses for its uploads, so the
                # file bytes are sent without locale-dependent decoding.
                with open(muni_filtered_csv_filename, 'rb') as f:
                    res = ckan.resource_create(
                        instance_name, {
                            'package_id': target_package_id,
                            'description': resource_description,
                            'format': "CSV",
                            'name': resource_name,
                            'url': os.path.basename(
                                muni_filtered_csv_filename),
                            **({
                                "geo_lat_field":
                                package['geo_wgs_parsing']["output_field_lat"],
                                "geo_lon_field":
                                package['geo_wgs_parsing']["output_field_lon"],
                            } if package.get('geo_wgs_parsing') else {}),
                        },
                        files={
                            'upload':
                            (os.path.basename(muni_filtered_csv_filename), f)
                        })
                assert res['success'], 'create resource failed: {}'.format(
                    res)
                created_package = ckan.package_show(instance_name,
                                                    target_package_id)
                created_package['private'] = False
                created_package_res = ckan.package_update(
                    instance_name, created_package)
                assert created_package_res[
                    'success'], 'failed to set package to public: {}'.format(
                        created_package_res)
            else:
                print("no rows after muni filter")

    created_package = created_package_res[
        'result'] if created_package_res else existing_package
    if not created_package:
        # Nothing existed and nothing was created (no rows after the filter),
        # so there is no package to add groups to.
        return

    created_package_group_names = [
        g['name'] for g in created_package.get('groups', [])
    ]
    # Send groups back as name-only dicts.
    created_package['groups'] = [{
        "name": group_name
    } for group_name in created_package_group_names]
    num_added_groups = 0
    for group_name in package.get('groups', []):
        if group_name not in created_package_group_names:
            created_package['groups'].append({'name': group_name})
            num_added_groups += 1
    res = ckan.package_update(instance_name, created_package)
    assert res[
        'success'], 'failed to add groups to created package: {}'.format(res)
    print("Added {} groups".format(num_added_groups))
def _update_existing_package(source_instance_name, source_package,
                             target_instance_name, target_organization_id,
                             target_package_name, target_existing_package,
                             target_package_title_prefix, stats):
    """Bring an existing target package in line with ``source_package``.

    Package-level fields (license_id, notes, url, version, title and the two
    ``sync_source_*`` extras) are compared and updated on a copy of
    ``target_existing_package``. Resources are compared by hash and by
    description/format/name. On any resource difference the package is made
    private, its resources are cleared and re-created via
    ``_create_resources``, and it is made public again.

    One of ``packages_existing_only_package_changes``,
    ``packages_existing_has_resource_changes`` or
    ``packages_existing_no_changes`` is incremented in ``stats``, in addition
    to ``packages_existing``.

    Any exception is printed with the target package and re-raised.
    """
    try:
        stats['packages_existing'] += 1
        # Shallow copy: nested lists/dicts (extras, resources) are shared
        # with target_existing_package.
        package = {**target_existing_package}
        has_package_changes = False
        for attr in ['license_id', 'notes', 'url', 'version']:
            # Compare with None normalized to '' on both sides; the value
            # written is normalized the same way, so a None/'' mismatch must
            # not be reported as a change on every run.
            source_value = source_package.get(attr) or ''
            if (package.get(attr) or '') != source_value:
                has_package_changes = True
                package[attr] = source_value
        _, source_instance_url = ckan.get_instance_api_key_url(
            source_instance_name)
        source_package_url = source_instance_url.strip(
            '/') + '/dataset/{}'.format(source_package['name'])
        # 'organization' may be present with a None value.
        source_org = source_package.get('organization') or {}
        source_org_description = source_org.get('description') or ''
        new_target_package_title = source_package['title']
        if target_package_title_prefix:
            new_target_package_title = '{} {}'.format(
                target_package_title_prefix, new_target_package_title)
        if package['title'] != new_target_package_title:
            has_package_changes = True
            package['title'] = new_target_package_title

        got_extra_url, got_extra_description = False, False
        for extra in (package.get('extras') or []):
            if extra["key"] == "sync_source_package_url":
                got_extra_url = True
                if extra["value"] != source_package_url:
                    has_package_changes = True
                    extra["value"] = source_package_url
            elif extra["key"] == "sync_source_org_description":
                got_extra_description = True
                if extra["value"] != source_org_description:
                    has_package_changes = True
                    extra["value"] = source_org_description
        if not got_extra_url:
            has_package_changes = True
            package.setdefault('extras', []).append({
                "key": "sync_source_package_url",
                "value": source_package_url
            })
        if not got_extra_description:
            has_package_changes = True
            package.setdefault('extras', []).append({
                "key": "sync_source_org_description",
                "value": source_org_description
            })

        with _download_active_resources(source_package) as (
                tmpdir, source_resource_hashes):
            # Hashes of active target resources, keyed by list position.
            target_resource_hashes = {
                i: resource['hash']
                for i, resource in enumerate(package['resources'])
                if resource['state'] == 'active'
            }
            has_resource_changes = len(source_resource_hashes) != len(
                target_resource_hashes)
            if not has_resource_changes:
                # Assumes the keys of source_resource_hashes are positions in
                # source_package['resources'] - TODO confirm against
                # _download_active_resources.
                for i in source_resource_hashes:
                    if (source_resource_hashes[i]
                            or '') != (target_resource_hashes.get(i) or ''):
                        has_resource_changes = True
                        break
                    try:
                        source_resource = source_package['resources'][i]
                        target_resource = target_existing_package[
                            'resources'][i]
                    except (IndexError, KeyError, AttributeError):
                        # A missing position raises IndexError/KeyError, not
                        # AttributeError; treat it as a difference.
                        source_resource = None
                        target_resource = None
                    if not source_resource or not target_resource:
                        has_resource_changes = True
                        break
                    if any(
                            source_resource.get(attr) != target_resource.get(
                                attr)
                            for attr in ('description', 'format', 'name')):
                        has_resource_changes = True
                        break

            # NOTE(review): the package_update results below are not checked,
            # unlike in _create_new_package - confirm whether that is
            # intended.
            if has_package_changes and not has_resource_changes:
                print('updating package, no resource changes ({} > {})'.format(
                    source_package['name'], target_package_name))
                ckan.package_update(target_instance_name, package)
                stats['packages_existing_only_package_changes'] += 1
            elif has_resource_changes:
                print(
                    'updating package, with resource changes ({} > {})'.format(
                        source_package['name'], target_package_name))
                # Hide the package while its resources are replaced.
                package['private'] = True
                package['resources'] = []
                ckan.package_update(target_instance_name, package)
                _create_resources(source_package, tmpdir,
                                  target_instance_name, package['id'],
                                  source_resource_hashes)
                package = ckan.package_show(target_instance_name,
                                            target_package_name)
                package['private'] = False
                ckan.package_update(target_instance_name, package)
                stats['packages_existing_has_resource_changes'] += 1
            else:
                stats['packages_existing_no_changes'] += 1
    except Exception:
        print('exception updating existing package {}: {}'.format(
            target_package_name, target_existing_package))
        raise