def _create_resources(source_package, tmpdir, target_instance_name, target_package_id, resource_hashes): for i, resource in enumerate(source_package['resources']): if resource['state'] != 'active': continue try: resource_filename = resource['url'].split( '/')[-1] if resource_hashes[i] else None resource_kwargs = { 'package_id': target_package_id, 'description': resource['description'], 'format': resource['format'], 'name': resource['name'], } if resource_filename: with open(os.path.join(tmpdir, 'resource{}'.format(i)), 'rb') as f: res = ckan.resource_create( target_instance_name, { **resource_kwargs, 'url': resource_filename, 'hash': resource_hashes[i] }, files={'upload': (resource_filename, f)}) else: res = ckan.resource_create(target_instance_name, resource_kwargs) assert res['success'], 'create resource {} failed: {}'.format( i, res) except Exception as e: print('Failed to process resource {}: {}'.format(i, resource)) raise
def process_resource(instance_name, package, resource,
                     package_extras_processed_res):
    """Upload a GeoJSON point layer derived from a tabular resource.

    Rows are loaded from ``resource['url']`` with string inference. Every row
    for which both a longitude and a latitude value are truthy becomes a point
    feature carrying the row's properties. The collection is written to a
    temporary ``data.geojson`` file and uploaded as a new ``GeoJSON`` resource
    of ``package``; finally ``common.update_package_extras`` is called.

    Args:
        instance_name: instance identifier passed to ``ckan``.
        package: package dict; ``package['id']`` receives the new resource.
        resource: source resource dict (``url``, ``name``, ``description`` and
            the optional ``geo_lat_field`` / ``geo_lon_field`` keys are read).
        package_extras_processed_res: forwarded untouched to
            ``common.update_package_extras``.
    """
    lat_field = resource.get("geo_lat_field")
    lon_field = resource.get("geo_lon_field")
    # results()[0][0]: presumably the rows of the first loaded resource.
    source_rows = DF.Flow(
        DF.load(resource['url'], infer_strategy=DF.load.INFER_STRINGS)
    ).results()[0][0]
    point_features = []
    for row in source_rows:
        row_properties = get_properties(row)
        lon, lat = get_lat_lon_values(row, lon_field, lat_field)
        if not (lon and lat):
            continue
        point_features.append(
            Feature(geometry=Point((lon, lat)), properties=row_properties))
    collection = FeatureCollection(point_features)
    with utils.tempdir() as tmpdir:
        geojson_path = os.path.join(tmpdir, "data.geojson")
        with open(geojson_path, 'w') as out:
            geojson.dump(collection, out)
        with open(geojson_path) as upload:
            new_resource = {
                'package_id': package['id'],
                'description': resource['description'],
                'format': 'GeoJSON',
                'name': resource['name'].replace('.csv', '') + '.geojson',
            }
            ckan.resource_create(instance_name, new_resource,
                                 files=[('upload', upload)])
    common.update_package_extras(instance_name, package,
                                 package_extras_processed_res)
def update_package(instance_name, org_id, package_name, title, resources):
    """Create the package if needed and upload one resource per format.

    The package is created (public, owned by ``org_id``) when it does not
    exist or is in state ``'deleted'``. If a package is available afterwards,
    each ``(format, filename)`` pair in ``resources`` is uploaded: a resource
    of the same format that already exists on the package is updated in
    place, otherwise a new resource is created. Failures are printed, not
    raised.

    Args:
        instance_name: instance identifier passed to ``ckan``.
        org_id: owner organization used when the package has to be created.
        package_name: name of the package to show / create.
        title: package title; also used in each resource description.
        resources: iterable of ``(format, filename)`` pairs.
    """
    print("Creating/updating package {}@{} {}".format(package_name, org_id,
                                                      title))
    package = ckan.package_show(instance_name, package_name)
    if not package or package['state'] == 'deleted':
        create_res = ckan.package_create(
            instance_name,
            dict(name=package_name, title=title, private=False,
                 owner_org=org_id))
        if create_res['success']:
            package = ckan.package_show(instance_name, package_name)
        else:
            print('Failed to create package', create_res)
    print(package)
    if not package:
        return
    # One id per format; for duplicate formats the last resource wins.
    resource_id_by_format = {
        existing['format']: existing['id']
        for existing in package.get('resources', [])
    }
    print(resource_id_by_format)
    for fmt, filename in resources:
        print(fmt, filename)
        with open(filename, 'rb') as upload:
            resource = {
                'package_id': package['id'],
                'description': '{} - {}'.format(title, fmt),
                'format': fmt,
                'name': fmt,
            }
            if fmt not in resource_id_by_format:
                print('Creating resource', resource)
                res = ckan.resource_create(instance_name, resource,
                                           files=[('upload', upload)])
                if res['success']:
                    print('created resource {} {}: {}'.format(
                        package_name, fmt, res))
                else:
                    print('create resource failed: {}'.format(res))
            else:
                print('Updating resource', resource_id_by_format[fmt])
                resource['id'] = resource_id_by_format[fmt]
                res = ckan.resource_update(instance_name, resource,
                                           files=[('upload', upload)])
                if res['success']:
                    print('updated resource {} {}: {}'.format(
                        package_name, fmt, res))
                else:
                    print('update resource failed: {}'.format(res))
def process_resource(instance_name, package, resource,
                     package_extras_processed_res):
    """Upload an XLSX rendition of a tabular resource.

    The rows at ``resource['url']`` are loaded with string inference and
    dumped as xlsx into a temporary directory. Exactly one ``*.xlsx`` file is
    expected there; it is uploaded as a new ``XLSX`` resource of ``package``
    and ``common.update_package_extras`` is called afterwards.

    Args:
        instance_name: instance identifier passed to ``ckan``.
        package: package dict; ``package['id']`` receives the new resource.
        resource: source resource dict (``url``, ``name`` and ``description``
            are read).
        package_extras_processed_res: forwarded untouched to
            ``common.update_package_extras``.

    Raises:
        AssertionError: when the dump did not produce exactly one xlsx file.
    """
    with utils.tempdir() as tmpdir:
        loader = DF.load(resource['url'],
                         infer_strategy=DF.load.INFER_STRINGS)
        dumper = dataflows_xlsx.dump_to_path(os.path.join(tmpdir),
                                             format='xlsx')
        DF.Flow(loader, dumper).process()
        xlsx_paths = list(glob(os.path.join(tmpdir, "*.xlsx")))
        assert len(xlsx_paths) == 1
        xlsx_path = xlsx_paths[0]
        with open(xlsx_path, "rb") as upload:
            new_resource = {
                'package_id': package['id'],
                'description': resource['description'],
                'format': 'XLSX',
                'name': resource['name'].replace('.csv', '') + '.xlsx',
            }
            ckan.resource_create(instance_name, new_resource,
                                 files=[('upload', upload)])
    common.update_package_extras(instance_name, package,
                                 package_extras_processed_res)
def init_package(instance_name, package, muni_filter_texts):
    """Ensure the municipality-filtered copy of a source dataset exists.

    If the target package ``package['id']`` does not exist yet, the source
    dataset is looked up on ``package['ckan']['url']``, the selected resource
    is downloaded and passed through ``muni_resource_filter``, and - when the
    filter leaves at least one row - a new package with the filtered CSV is
    created (private first, switched to public once the upload succeeded).
    Afterwards the groups listed in ``package['groups']`` that the package is
    not yet a member of are added to it.

    When the filter leaves no rows nothing is created and the function
    returns right after logging that fact.

    Args:
        instance_name: instance identifier passed to ``ckan``.
        package: configuration dict. Keys read here: ``id``, ``title``,
            ``source_title``, ``ckan`` (``url``, ``package_id``,
            ``resource_id``) and the optional ``muni_filter_column``,
            ``muni_filter_column_in``, ``geo_wgs_parsing``
            (``output_field_lat`` / ``output_field_lon``) and ``groups``.
        muni_filter_texts: forwarded to ``muni_resource_filter``.

    Raises:
        AssertionError: when the source search does not return exactly one
            dataset, the filter output is not exactly one CSV file, or a
            create / update call does not report success.
    """
    existing_package = ckan.package_show(instance_name, package['id'])
    created_package_res = None
    if existing_package:
        print("Package already exists ({})".format(existing_package['id']))
    else:
        res = ckan.package_search(
            package['ckan']['url'],
            {'q': 'name:{}'.format(package['ckan']['package_id'])})
        assert res['count'] == 1, res
        source_package = res['results'][0]

        # Prefer the configured resource, fall back to the first one.
        source_resource = next(
            (candidate for candidate in source_package['resources']
             if candidate['id'] == package['ckan']['resource_id']),
            None)
        if not source_resource:
            print(
                "WARNING! Using first resource instead of the specified resource"
            )
            source_resource = source_package['resources'][0]
        try:
            resource_name = source_resource['name']
            resource_url = source_resource['url']
            if "://e.data.gov.il" in resource_url:
                resource_url = resource_url.replace("://e.data.gov.il",
                                                    "://data.gov.il")
        except Exception:
            print("Failed to get metadata from source resource")
            print(source_resource)
            raise
        resource_description = source_resource.get('description')

        with utils.tempdir(keep=False) as tmpdir:
            resource_filename = resource_url.split("/")[-1]
            if resource_filename.startswith("."):
                # URL ended in a bare extension; give the file a base name.
                resource_filename = "data{}".format(resource_filename)
            headers = {}
            if "://data.gov.il" in resource_url:
                headers['User-Agent'] = 'datagov-external-client'
            downloaded_path = os.path.join(tmpdir, resource_filename)
            utils.http_stream_download(downloaded_path, {
                "url": resource_url,
                "headers": headers
            })
            num_filtered_rows = muni_resource_filter(
                downloaded_path, os.path.join(tmpdir, "muni_filtered"),
                package.get('muni_filter_column'), muni_filter_texts,
                package.get('geo_wgs_parsing'),
                package.get('muni_filter_column_in'))
            if num_filtered_rows <= 0:
                # Nothing was created and no package exists, so there is no
                # package to attach groups to.
                print("no rows after muni filter")
                return

            csv_paths = list(
                glob(os.path.join(tmpdir, "muni_filtered", "*.csv")))
            assert len(csv_paths) == 1
            muni_filtered_csv_filename = csv_paths[0]
            csv_basename = os.path.basename(muni_filtered_csv_filename)

            # Build the URL by string joining (not os.path.join, which is
            # OS dependent) and use the same value for note and extras.
            source_package_url = (
                package['ckan']['url'].strip('/') +
                '/dataset/{}'.format(package['ckan']['package_id']))
            source_package_note = "מקור המידע: " + source_package_url
            package_description = source_package.get('notes')
            if package_description:
                package_description = (package_description + "\n\n" +
                                       source_package_note)
            else:
                package_description = source_package_note

            # 'organization' may be present but None on the source dataset.
            source_org = source_package.get('organization') or {}
            res = ckan.package_create(
                instance_name, {
                    'name': package['id'],
                    'title': '{} | {}'.format(package['title'],
                                              package['source_title']),
                    # Stay private until the resource upload succeeded.
                    'private': True,
                    'license_id': source_package.get('license_id'),
                    'notes': package_description,
                    'url': source_package.get('url'),
                    'version': source_package.get('version'),
                    'owner_org': DEFAULT_ORGANIZATION_ID,
                    'extras': [{
                        "key": "sync_source_package_url",
                        "value": source_package_url
                    }, {
                        'key': 'sync_source_org_description',
                        'value': source_org.get('description')
                    }]
                })
            assert res['success'], 'create package failed: {}'.format(res)
            target_package_id = res['result']['id']

            resource_fields = {
                'package_id': target_package_id,
                'description': resource_description,
                'format': "CSV",
                'name': resource_name,
                'url': csv_basename,
            }
            if package.get('geo_wgs_parsing'):
                geo_parsing = package['geo_wgs_parsing']
                resource_fields["geo_lat_field"] = geo_parsing[
                    "output_field_lat"]
                resource_fields["geo_lon_field"] = geo_parsing[
                    "output_field_lon"]
            # Binary mode: upload the file bytes as written, without locale
            # decoding or newline translation.
            with open(muni_filtered_csv_filename, 'rb') as f:
                res = ckan.resource_create(
                    instance_name,
                    resource_fields,
                    files={'upload': (csv_basename, f)})
            assert res['success'], 'create resource failed: {}'.format(res)

            created_package = ckan.package_show(instance_name,
                                                target_package_id)
            created_package['private'] = False
            created_package_res = ckan.package_update(instance_name,
                                                      created_package)
            assert created_package_res[
                'success'], 'failed to set package to public: {}'.format(
                    created_package_res)

    created_package = (created_package_res['result']
                       if created_package_res else existing_package)
    created_package_group_names = [
        g['name'] for g in created_package.get('groups', [])
    ]
    # Reduce group entries to their names before sending the update.
    created_package['groups'] = [{
        "name": name
    } for name in created_package_group_names]
    num_added_groups = 0
    for group_name in package.get('groups', []):
        if group_name not in created_package_group_names:
            created_package['groups'].append({'name': group_name})
            num_added_groups += 1
    res = ckan.package_update(instance_name, created_package)
    assert res[
        'success'], 'failed to add groups to created package: {}'.format(res)
    print("Added {} groups".format(num_added_groups))