def fetch(resource_id, **kwargs):
    """Downloads a filestore resource"""
    verbose = not kwargs['quiet']
    filepath = kwargs['destination']
    name_from_id = kwargs.get('name_from_id')
    chunksize = kwargs.get('chunksize_bytes')
    ckan_kwargs = {k: v for k, v in kwargs.items() if k in api.CKAN_KEYS}
    ckan = CKAN(**ckan_kwargs)

    try:
        r = ckan.fetch_resource(resource_id)
    except api.NotAuthorized as err:
        sys.exit('ERROR: %s\n' % str(err))
    else:
        fkwargs = {
            'headers': r.headers,
            'name_from_id': name_from_id,
            'resource_id': resource_id}

        filepath = tup.make_filepath(filepath, **fkwargs)
        tio.write(filepath, r.iter_content, chunksize=chunksize)

        # save encoding to extended attributes
        x = xattr(filepath)

        if verbose and r.encoding:
            print('saving encoding %s to extended attributes' % r.encoding)

        if r.encoding:
            x['com.ckanny.encoding'] = r.encoding

        print(filepath)

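
# Hypothetical usage sketch (not part of the original module): calling `fetch`
# directly from Python rather than via the CLI. The kwargs mirror the keys the
# function reads above; `remote` is assumed to be accepted by the CKAN
# constructor (it is passed that way in `migrate` below), and the resource id
# is a placeholder.
def _example_fetch():
    fetch(
        'deadbeef-0000-0000-0000-placeholder',  # placeholder resource id
        quiet=False,                     # enables the verbose messages
        destination='downloads',         # passed to tup.make_filepath
        name_from_id=True,               # derive the filename from the id
        chunksize_bytes=2 ** 20,         # stream the download in 1 MiB chunks
        remote='https://demo.ckan.org')  # assumed CKAN constructor kwarg
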
def upload(source, resource_id=None, **kwargs):
    """Uploads a file to a datastore table"""
    verbose = not kwargs['quiet']
    resource_id = resource_id or p.splitext(p.basename(source))[0]

    if '.' in resource_id:
        resource_id = resource_id.split('.')[0]

    ckan_kwargs = {k: v for k, v in kwargs.items() if k in api.CKAN_KEYS}

    if verbose:
        print(
            'Uploading %s to datastore resource %s...' % (source, resource_id))

    # read encoding from extended attributes
    x = xattr(source)

    try:
        kwargs['encoding'] = x.get('com.ckanny.encoding')
    except IOError:
        # no stored encoding attribute; fall back to None so the lookup
        # below can't raise a KeyError
        kwargs['encoding'] = None

    if verbose and kwargs['encoding']:
        print('Using encoding %s' % kwargs['encoding'])

    ckan = CKAN(**ckan_kwargs)

    if ckan.update_datastore(resource_id, source, **kwargs):
        print('Success! Resource %s uploaded.' % resource_id)
    else:
        sys.exit('ERROR: resource %s not uploaded.' % resource_id)

def upload(source, resource_id=None, package_id=None, **kwargs):
    """Updates the filestore of an existing resource or creates a new one"""
    verbose = not kwargs['quiet']
    resource_id = resource_id or p.splitext(p.basename(source))[0]
    ckan_kwargs = {k: v for k, v in kwargs.items() if k in api.CKAN_KEYS}

    if package_id and verbose:
        print(
            'Creating filestore resource %s in dataset %s...' % (
                source, package_id))
    elif verbose:
        print(
            'Uploading %s to filestore resource %s...' % (source, resource_id))

    ckan = CKAN(**ckan_kwargs)

    resource_kwargs = {
        'url' if 'http' in source else 'filepath': source,
        'name': kwargs.get('name')}

    if package_id:
        resource = ckan.create_resource(package_id, **resource_kwargs)
    else:
        resource = ckan.update_filestore(resource_id, **resource_kwargs)

    if package_id and resource and verbose:
        infix = '%s ' % resource['id'] if resource.get('id') else ''
        print('Success! Resource %screated.' % infix)
    elif resource and verbose:
        print('Success! Resource %s updated.' % resource_id)
    elif not resource:
        sys.exit('Error uploading file!')

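
# Hypothetical usage sketch (not part of the original module): the filestore
# `upload` above branches on `package_id` (create a new resource) vs
# `resource_id` (update an existing one), and on whether `source` looks like a
# URL. `remote` is assumed to be accepted by the CKAN constructor; the ids are
# placeholders.
def _example_filestore_upload():
    # create a brand new resource in an existing dataset from a remote URL
    upload(
        'https://example.com/data.csv', package_id='my-dataset-placeholder',
        name='data.csv', quiet=False, remote='https://demo.ckan.org')

    # overwrite the file of an existing resource from a local path
    upload(
        'data.csv', resource_id='deadbeef-0000-0000-0000-placeholder',
        quiet=True, remote='https://demo.ckan.org')
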
def update(endpoint, **kwargs):
    """Updates the database

    Args:
        endpoint (str): The api resource url.
        kwargs (dict): passed to CKAN constructor.

    Kwargs:
        chunk_size (int): Number of rows to process at a time (default: All).
        row_limit (int): Total number of rows to process (default: All).
        err_limit (int): Number of errors to encounter before failing
            (default: Inf).

    Returns:
        (dict): Update details
    """
    start = timer()
    pid = kwargs.pop('pid', None)
    chunk_size = kwargs.pop('chunk_size', 0)
    row_limit = kwargs.pop('row_limit', None)
    err_limit = kwargs.pop('err_limit', None)
    rows = 0
    ckan = CKAN(**kwargs)

    if pid:
        pids = [pid]
    else:
        org_show = partial(ckan.organization_show, include_datasets=True)
        orgs_basic = ckan.organization_list(permission='read')
        org_ids = it.imap(itemgetter('id'), orgs_basic)
        orgs = (org_show(id=org_id) for org_id in org_ids)
        package_lists = it.imap(itemgetter('packages'), orgs)
        pid_getter = partial(map, itemgetter('id'))
        pids = it.chain.from_iterable(it.imap(pid_getter, package_lists))

    data = gen_data(ckan, pids, kwargs.get('mock_freq'))
    errors = {}

    for records in tup.chunk(data, min(row_limit or float('inf'), chunk_size)):
        rs = map(partial(patch_or_post, endpoint), records)
        rows += len(filter(lambda r: r.ok, rs))
        ids = map(itemgetter('dataset_id'), records)
        errors.update(dict((k, r.json()) for k, r in zip(ids, rs) if not r.ok))

        if row_limit and rows >= row_limit:
            break

        if err_limit and len(errors) >= err_limit:
            raise Exception(errors)

    elapsed_time = ', '.join(fmt_elapsed(timer() - start))
    return {'rows_added': rows, 'errors': errors, 'elapsed_time': elapsed_time}

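
# Hypothetical usage sketch (not part of the original module): driving `update`
# against a placeholder endpoint, processing 50 rows per request and stopping
# after 200 rows or 5 errors. The endpoint URL is an assumption; the kwargs
# match the Kwargs documented above.
def _example_update():
    result = update(
        'https://example.org/api/age',  # placeholder api resource url
        chunk_size=50, row_limit=200, err_limit=5)
    print(result['rows_added'], result['elapsed_time'])
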
def status():
    """ Displays the current status """
    kwargs = {k: parse(v) for k, v in request.args.to_dict().items()}
    ckan = CKAN(**kwargs)

    resp = {
        'online': True,
        'message': 'Service for checking and updating HDX dataset ages.',
        'CKAN_instance': ckan.address,
        'version': __version__,
        'repository': c.REPO}

    return jsonify(**resp)

def migrate(resource_id, **kwargs):
    """Copies a filestore resource from one ckan instance to another"""
    src_remote, dest_remote = kwargs['src_remote'], kwargs['dest_remote']

    if src_remote == dest_remote:
        msg = (
            'ERROR: `dest-remote` of %s is the same as `src-remote` of %s.\n'
            'The dest and src remotes must be different.\n' % (
                src_remote, dest_remote))

        sys.exit(msg)

    verbose = not kwargs['quiet']
    chunksize = kwargs['chunksize_bytes']
    ckan_kwargs = {k: v for k, v in kwargs.items() if k in api.CKAN_KEYS}
    src_ckan = CKAN(remote=src_remote, **ckan_kwargs)
    dest_ckan = CKAN(remote=dest_remote, **ckan_kwargs)
    filepath = None

    try:
        r = src_ckan.fetch_resource(resource_id)
        filepath = NamedTemporaryFile(delete=False).name
    except api.NotAuthorized as err:
        sys.exit('ERROR: %s\n' % str(err))
    except Exception as err:
        sys.exit('ERROR: %s\n' % str(err))
    else:
        tio.write(filepath, r.raw.read(), chunksize=chunksize)
        resource = dest_ckan.update_filestore(resource_id, filepath=filepath)

        if resource and verbose:
            print('Success! Resource %s updated.' % resource_id)
        elif not resource:
            sys.exit('Error uploading file!')
    finally:
        # only clean up if the tempfile was actually created
        if filepath:
            if verbose:
                print('Removing tempfile...')

            unlink(filepath)

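
# Hypothetical usage sketch (not part of the original module): copying one
# resource between two CKAN instances. Both remotes and the resource id are
# placeholders; `quiet` and `chunksize_bytes` are the kwargs read above.
def _example_migrate():
    migrate(
        'deadbeef-0000-0000-0000-placeholder',
        src_remote='https://demo.ckan.org',
        dest_remote='https://staging.example.org',
        quiet=False,
        chunksize_bytes=2 ** 20)
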
def update(pid, **kwargs):
    """Updates a package (aka dataset)"""
    kw = ft.Objectify(kwargs, type='dataset')
    verbose = not kw.quiet
    ckan_kwargs = {k: v for k, v in kwargs.items() if k in api.CKAN_KEYS}
    ckan = CKAN(**ckan_kwargs)
    licenses = it.imap(itemgetter('id'), ckan.license_list())
    groups = ckan.group_list()
    raw_tags = filter(None, kw.tags.split(',')) if kw.tags else []
    tags = [{'state': 'active', 'name': t} for t in raw_tags]

    if kw.start:
        start = parse(str(kw.start)).strftime('%m/%d/%Y')
    else:
        date = None

    if kw.start and kw.end:
        date = '%s-%s' % (start, parse(str(kw.end)).strftime('%m/%d/%Y'))
    elif kw.start:
        date = start

    if kw.location and kw.location in set(groups):
        group_list = [{'name': kw.location}]
    elif kw.location:
        sys.exit('group name: %s not found!' % kw.location)
    else:
        group_list = []

    if kw.license_id and kw.license_id not in set(licenses):
        sys.exit('license id: %s not found!' % kw.license_id)

    package_kwargs = {
        'title': kw.title,
        'name': kw.name,
        'license_id': kw.license_id,
        'dataset_source': kw.source,
        'notes': kw.description or kw.title,
        'type': kw.type,
        'tags': tags,
        'groups': group_list,
        'dataset_date': date,
        'caveats': kw.caveats,
        'methodology': methods.get(kw.methodology, 'Other'),
        'methodology_other': methods.get(kw.methodology) or kw.methodology,
    }

    try:
        old_package = ckan.package_show(id=pid)
    except api.ValidationError as e:
        exit(e)

    if any(package_kwargs.values()):
        # combine keys by returning the last non-empty result
        pred = lambda key: True
        last = lambda pair: filter(None, pair)[-1] if any(pair) else None
        records = [old_package, package_kwargs]
        new_kwargs = pr.merge(records, pred=pred, op=last)

        if verbose:
            print('Submitting your package request.')
            pprint(new_kwargs)
            print('\n')

        package = ckan.package_update(**new_kwargs)
    else:
        package = old_package

    if kw.private:
        org = package['organization']
        ckan.package_privatize(org_id=org['id'], datasets=[package['id']])

    print(package['id'])
    print('\n')

def create(org_id, **kwargs):
    """Creates a package (aka dataset)"""
    kw = ft.Objectify(kwargs, type='dataset')
    verbose = not kw.quiet
    ckan_kwargs = {k: v for k, v in kwargs.items() if k in api.CKAN_KEYS}
    ckan = CKAN(**ckan_kwargs)
    licenses = it.imap(itemgetter('id'), ckan.license_list())
    orgs = ckan.organization_list()
    org_ids = it.imap(itemgetter('id'), orgs)
    org_names = it.imap(itemgetter('name'), orgs)
    groups = ckan.group_list()
    name = kw.name or slugify(kw.title)
    raw_tags = filter(None, kw.tags.split(','))
    tags = [{'state': 'active', 'name': t} for t in raw_tags]

    if kw.start:
        start = parse(str(kw.start)).strftime('%m/%d/%Y')
    else:
        date = None

    if kw.start and kw.end:
        date = '%s-%s' % (start, parse(str(kw.end)).strftime('%m/%d/%Y'))
    elif kw.start:
        date = start

    if kw.location in set(groups):
        group_list = [{'name': kw.location}]
    elif kw.location:
        sys.exit('group name: %s not found!' % kw.location)
    else:
        group_list = []

    if org_id not in set(it.chain(org_ids, org_names)):
        sys.exit('organization id: %s not found!' % org_id)

    if kw.license_id not in set(licenses):
        sys.exit('license id: %s not found!' % kw.license_id)

    files = filter(None, kw.files.split(','))
    names = filter(None, kw.names.split(','))
    resource_list = list(it.starmap(make_rkwargs, zip(files, names))) or []

    package_kwargs = {
        'title': kw.title,
        'name': name,
        'license_id': kw.license_id,
        'owner_org': org_id,
        'dataset_source': kw.source,
        'notes': kw.description or kw.title,
        'type': kw.type,
        'tags': tags,
        'resources': resource_list,
        'package_creator': ckan.user['name'],
        'groups': group_list,
        'dataset_date': date,
        'caveats': kw.caveats,
        'methodology': methods.get(kw.methodology, 'Other'),
        'methodology_other': methods.get(kw.methodology) or kw.methodology,
    }

    if verbose:
        print('Submitting your package request.')
        pprint(package_kwargs)
        print('\n')

    try:
        package = ckan.package_create(**package_kwargs)
    except api.ValidationError as e:
        exit(e)

    if kw.private:
        org = package['organization']
        ckan.package_privatize(org_id=org['id'], datasets=[package['id']])

    if verbose:
        print('Your package response.')
        pprint(package)
        print('\n')

    print(package['id'])
    print('\n')

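
# Hypothetical usage sketch (not part of the original module): creating a
# dataset directly from Python. The org id, license, title and remote are
# placeholders; the kwargs are the attributes `create` reads from the
# Objectify wrapper above (quiet, title, tags, files, names, source,
# description, license_id, location, start, end, methodology, caveats,
# private), and `remote` is an assumed CKAN constructor kwarg.
def _example_create():
    create(
        'example-org',                    # owner organization id or name
        quiet=False,
        title='Example 3W Dataset',
        tags='3w,example',                # comma-separated, split above
        files='data.csv', names='Data',   # zipped into the resource list
        source='Example source',
        description='An example dataset',
        license_id='cc-by',               # must exist on the target instance
        location='',                      # empty -> no group assigned
        start='2015-01-01', end='2015-12-31',
        methodology='Other', caveats='', private=False,
        remote='https://demo.ckan.org')
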
def update(three_dub_id, topline_id=None, **kwargs):
    ckan_kwargs = {k: v for k, v in kwargs.items() if k in api.CKAN_KEYS}
    ckan = CKAN(**ckan_kwargs)

    if topline_id:
        ds.update(topline_id, ckan=ckan)

    ds.update(three_dub_id, ckan=ckan)

def delete(resource_id, **kwargs):
    """Deletes a datastore table"""
    ckan_kwargs = {k: v for k, v in kwargs.items() if k in api.CKAN_KEYS}
    ckan = CKAN(**ckan_kwargs)
    ckan.delete_table(resource_id, filters=kwargs.get('filters'))

def update(resource_id, force=None, **kwargs):
    """Updates a datastore table based on the current filestore resource"""
    verbose = not kwargs.get('quiet')
    chunk_bytes = kwargs.get('chunk_bytes', api.CHUNKSIZE_BYTES)
    ckan_kwargs = {k: v for k, v in kwargs.items() if k in api.CKAN_KEYS}
    hash_kwargs = {'chunksize': chunk_bytes, 'verbose': verbose}
    ckan = CKAN(**ckan_kwargs)

    try:
        r = ckan.fetch_resource(resource_id)
    except (api.NotFound, api.NotAuthorized) as err:
        sys.exit('ERROR: %s\n' % str(err))
    else:
        f = SpooledTemporaryFile(suffix='.xlsx', mode='r+b')
        write_kwargs = {
            'length': r.headers.get('content-length'),
            'chunksize': chunk_bytes}

        tio.write(f, r.iter_content, **write_kwargs)

        try:
            old_hash = ckan.get_hash(resource_id)
        except api.NotFound as err:
            item = err.args[0]['item']

            if item == 'package':
                orgs = ckan.organization_list(permission='admin_group')
                owner_org = (
                    o['id'] for o in orgs
                    if o['display_name'] == kwargs['hash_group']).next()

                package_kwargs = {
                    'name': kwargs['hash_table'],
                    'owner_org': owner_org,
                    'package_creator': 'Hash Table',
                    'dataset_source': 'Multiple sources',
                    'notes': 'Datastore resource hash table'}

                ckan.hash_table_pack = ckan.package_create(**package_kwargs)

            if item in {'package', 'resource'}:
                fileobj = StringIO('datastore_id,hash\n')
                create_kwargs = {'fileobj': fileobj, 'name': api.DEF_HASH_RES}
                table = kwargs['hash_table']
                resource = ckan.create_resource(table, **create_kwargs)
                ckan.hash_table_id = resource['id']

            ckan.create_hash_table(verbose)
            old_hash = ckan.get_hash(resource_id)

        new_hash = tio.hash_file(f, **hash_kwargs)
        changed = new_hash != old_hash if old_hash else True

        if verbose:
            print(get_message(changed, force))

        if not (changed or force):
            sys.exit(0)

        kwargs['encoding'] = r.encoding
        kwargs['content_type'] = r.headers['content-type']
        updated = ckan.update_datastore(resource_id, f, **kwargs)

        if updated and verbose:
            print('Success! Resource %s updated.' % resource_id)

        if updated and changed:
            ckan.update_hash_table(resource_id, new_hash, verbose)
        elif not updated:
            sys.exit('ERROR: resource %s not updated.' % resource_id)

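
# Hypothetical usage sketch (not part of the original module): pushing the
# current filestore file into the datastore and forcing the push even if the
# content hash is unchanged. The resource id, hash table/group names and
# `remote` are placeholders/assumptions; `hash_table` and `hash_group` are the
# kwargs read above when the hash table has to be created.
def _example_datastore_update():
    update(
        'deadbeef-0000-0000-0000-placeholder',
        force=True,
        quiet=False,
        chunk_bytes=2 ** 20,
        hash_table='hash-table',
        hash_group='Example Group',
        remote='https://demo.ckan.org')
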
def customize(org_id, **kwargs):
    """Introspects custom organization values"""
    verbose = not kwargs['quiet']
    ckan_kwargs = {k: v for k, v in kwargs.items() if k in api.CKAN_KEYS}
    image_sq = kwargs.get('image_sq')
    image_rect = kwargs.get('image_rect')
    sanitize = kwargs.get('sanitize')
    three_dub_id = kwargs.get('3w')
    geojson_id = kwargs.get('geojson')
    topline_id = kwargs.get('topline')
    ckan = CKAN(**ckan_kwargs)
    organization = ckan.organization_show(id=org_id, include_datasets=True)
    org_packages = organization['packages']
    hdx = ckan.organization_show(id='hdx', include_datasets=True)
    extras = {e['key']: e['value'] for e in organization['extras']}

    if three_dub_id:
        three_dub_set_id = ckan.get_package_id(three_dub_id)
    else:
        ids = ckan.find_ids(org_packages, pnamed='3w', ptagged='3w')
        three_dub_set_id = ids['pname']
        three_dub_id = ids['rid']

    if not three_dub_id:
        sys.exit(1)

    if not topline_id:
        topline_id = ckan.find_ids(org_packages, pnamed='topline')['rid']

    if geojson_id:
        geojson_set_id = ckan.get_package_id(geojson_id)
    else:
        country = org_id.split('-')[1]
        hkwargs = {'pnamed': 'json-repository', 'rnamed': country}
        ids = ckan.find_ids(hdx['packages'], **hkwargs)
        geojson_set_id = ids['pname']
        geojson_id = ids['rid']

    viz_url = '%s/dataset/%s' % (kwargs['remote'], three_dub_set_id)
    three_dub_r = ckan.fetch_resource(three_dub_id)
    _fields = three_dub_r.iter_lines().next().split(',')
    three_dub_fields = tup.underscorify(_fields) if sanitize else _fields

    if geojson_id:
        geojson_r = ckan.fetch_resource(geojson_id)
        geojson_fields = geojson_r.json()['features'][0]['properties'].keys()
    else:
        geojson_fields = []

    if verbose:
        print('3w fields:')
        pprint(three_dub_fields)
        print('geojson fields:')
        pprint(geojson_fields)

    def_where = tup.find(three_dub_fields, geojson_fields) or ''
    who_column = find_field(three_dub_fields, 'who', **kwargs)
    what_column = find_field(three_dub_fields, 'what', **kwargs)
    where_column = find_field(three_dub_fields, 'where', def_where, **kwargs)
    where_column_2 = find_field(geojson_fields, 'where', def_where, **kwargs)
    name_column = kwargs.get('where') or def_where

    if 'http' not in image_sq:
        gdocs = 'https://docs.google.com'
        image_sq = '%s/uc?id=%s&export=download' % (gdocs, image_sq)

    if 'http' not in image_rect:
        gdocs = 'https://docs.google.com'
        image_rect = '%s/uc?id=%s&export=download' % (gdocs, image_rect)

    data = {
        'name': org_id,
        'resource_id_1': three_dub_id,
        'resource_id_2': geojson_id,
        'topline_resource': topline_id,
        'datatype_1': kwargs.get('datatype_1') or 'datastore',
        'datatype_2': kwargs.get('datatype_2') or 'filestore',
        'org_url': extras['org_url'],
        'description': organization['description'],
        'title': organization['title'],
        'image_sq': image_sq,
        'image_rect': image_rect,
        'highlight_color': kwargs.get('color'),
        'dataset_id_1': three_dub_set_id,
        'dataset_id_2': geojson_set_id,
        'who_column': deref_field(three_dub_fields, who_column),
        'what_column': deref_field(three_dub_fields, what_column),
        'where_column': deref_field(three_dub_fields, where_column),
        'where_column_2': deref_field(geojson_fields, where_column_2),
        'map_district_name_column': deref_field(geojson_fields, name_column),
        'viz_data_link_url': viz_url,
        'visualization_select': kwargs.get('viz_type', '3W-dashboard'),
        'viz_title': kwargs.get('viz_title', "Who's doing what and where?"),
        'colors': [
            '#c6d5ed', '#95b5df', '#659ad2', '#026bb5', '#659ad2', '#213b68',
            '#101d4e', '#000035'],
        'use_org_color': True,
        'modified_at': int(time()),
    }

    control_sheet_data = [data[k] for k in control_sheet_keys]

    if verbose:
        print('\nCustom pages control sheet data:')
        print(control_sheet_data)

    return control_sheet_data

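
# Hypothetical usage sketch (not part of the original module): keys such as
# '3w' are not valid Python identifiers, so they have to be supplied via dict
# unpacking rather than as keyword arguments. The org id, resource id, image
# ids and remote are placeholders.
def _example_customize():
    row = customize('hdx-example-country', **{
        'quiet': False,
        'remote': 'https://data.example.org',
        '3w': 'deadbeef-0000-0000-0000-placeholder',
        'image_sq': 'GDOC_FILE_ID_SQ',    # expanded to a Google Docs URL above
        'image_rect': 'GDOC_FILE_ID_RECT',
        'sanitize': True})
    print(row)
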