def test_remoteckan_validlocations(self, project_config_yaml):
    """Exercise the ways a RemoteCKAN client can be attached to the configuration.

    Covers attaching a client after creation via ``setup_remoteckan``,
    passing one to ``Configuration._create``, and the ``ConfigurationError``
    raised by ``remoteckan()`` once the client has been cleared or the
    configuration deleted.
    """
    def create_configuration(**extra):
        # A fresh ``hdx_config_dict`` literal is built on every call.
        Configuration._create(hdx_site='prod', hdx_key='TEST_HDX_KEY',
                              hdx_config_dict={},
                              project_config_yaml=project_config_yaml,
                              **extra)

    # Client supplied after the configuration exists.
    create_configuration()
    late_client = ckanapi.RemoteCKAN('http://lalala', apikey='12345',
                                     user_agent='HDXPythonLibrary/1.0')
    Configuration.read().setup_remoteckan(late_client)
    assert Configuration.read().remoteckan() == late_client

    # Client supplied while the configuration is being created.
    early_client = ckanapi.RemoteCKAN('http://hahaha', apikey='54321',
                                      user_agent='HDXPythonLibrary/0.5')
    create_configuration(remoteckan=early_client)
    assert Configuration.read().remoteckan() == early_client

    # Clearing the stored client makes the accessor raise.
    Configuration.read()._remoteckan = None
    with pytest.raises(ConfigurationError):
        Configuration.read().remoteckan()

    # The same holds once the configuration has been deleted.
    Configuration.delete()
    with pytest.raises(ConfigurationError):
        Configuration.read().remoteckan()
def get_number_of_rows(site, resource_id, API_key=None):
    """Return the number of rows in a datastore, or None if it cannot be read.

    Note that even when there is a limit placed on the number of results
    a CKAN API call can return, this function will still give the true
    number of rows.

    Args:
        site: Address of the CKAN instance.
        resource_id: ID of the resource whose datastore is inspected.
        API_key: Optional API key passed to the CKAN client.

    Returns:
        The ``['meta']['count']`` value of the ``datastore_info`` response,
        or None when the request fails or the response lacks that value.
    """
    # The lookup is done once, inside the try block, so that failures
    # consistently yield None instead of escaping before the handler.
    try:
        ckan = ckanapi.RemoteCKAN(site, apikey=API_key)
        results_dict = ckan.action.datastore_info(id=resource_id)
        return results_dict['meta']['count']
    except Exception:
        # Best effort: any API, transport or response-shape error means
        # "count unknown". KeyboardInterrupt/SystemExit still propagate.
        return None
def source_type(self, catalog, package):
    """Work out the source type of a harvest-source package.

    The branches are tried in order, depending on which keys the package
    carries. The two-letter tags in the comments are presumably the
    catalogs (by country code) that use each layout - TODO confirm.

    :param catalog: catalog the package belongs to; only used to build URLs
        for warnings and passed through on recursion.
    :param package: package dict; the code reads its ``source_type``,
        ``url`` and ``extras`` entries.
    :returns: the value produced by ``normalize_source_type`` or
        ``normalize_metadata_scheme``, a ``'waf-...'`` string, ``'ckan'``,
        or None (implicitly) when no branch produces a value.
    """
    # AU, FI, IE, IT, MX, PY
    if package.get('source_type'):
        return normalize_source_type(package, package['source_type'])
    # IT
    elif '/api/rest/dataset/' in package['url']:
        # The URL points at another CKAN dataset: fetch that package and
        # classify it instead.
        url, name = package['url'].split('api/rest/dataset/', 1)
        return self.source_type(catalog, ckanapi.RemoteCKAN(url).call_action('package_show', {'id': name}))
    # US
    # @see https://github.com/ckan/ckanext-spatial/blob/master/doc/harvesters.rst
    # @see https://github.com/GSA/ckanext-geodatagov/tree/master/ckanext/geodatagov/harvesters
    elif package.get('extras'):
        # next() has no default here: a package whose extras lack a
        # 'source_type' key raises StopIteration.
        source_type = next(extra['value'] for extra in package['extras'] if extra['key'] == 'source_type')
        # @see https://github.com/GSA/ckanext-geodatagov/blob/master/ckanext/geodatagov/harvesters/base.py#L174
        if source_type == 'single-doc':
            response = self.get(package['url'])
            if response.status_code == 200:
                try:
                    return normalize_metadata_scheme(response)
                except lxml.etree.XMLSyntaxError:
                    # Unparseable document: fall through and return None.
                    pass
        # @see https://github.com/GSA/ckanext-geodatagov/blob/master/ckanext/geodatagov/harvesters/waf_collection.py
        elif source_type == 'waf-collection':
            # @see https://github.com/GSA/ckanext-geodatagov/blob/master/ckanext/geodatagov/validation/__init__.py
            config = json.loads(next(extra['value'] for extra in package['extras'] if extra['key'] == 'config'))
            if config.get('validator_profiles'):
                if len(config['validator_profiles']) > 1:
                    # More than one profile: only warn, no type is returned.
                    self.warning('multiple validator_profiles for {}'.format(catalog.dataset_api_url(package)))
                else:
                    return 'waf-{}'.format(validators[config['validator_profiles'][0]])
            else:
                # No declared profile: derive the scheme from the
                # collection metadata document instead.
                response = self.get(config['collection_metadata_url'])
                if response.status_code == 200:
                    scheme = normalize_metadata_scheme(response)
                    if scheme:
                        return 'waf-{}'.format(scheme)
        else:
            normalized = normalize_source_type(package, source_type)
            if normalized:
                return normalized
    # BR
    else:
        try:
            # If the URL answers the CKAN 'site_read' action, treat the
            # source as another CKAN instance.
            if ckanapi.RemoteCKAN(package['url']).call_action('site_read'):
                return 'ckan'
        except ckanapi.errors.CKANAPIError:
            pass
def federation(self):
    """Count harvest sources per source type and division.

    For every catalog in ``self.catalogs`` whose scraper is a ``CKAN``
    subclass, harvest-type packages are classified with
    ``self.source_type``; when that search is empty, the
    ``harvest_source_list`` action is used instead. Catalogs whose scraper
    class is named ``'Socrata'`` are flagged when their page mentions
    ``federation_filter``.

    :returns: a ``pd.DataFrame`` built from the nested
        ``{source_type: {division_id: count}}`` mapping.
    """
    counts = defaultdict(lambda: defaultdict(int))

    for catalog in self.catalogs:
        if issubclass(catalog.scraper, CKAN):
            client = ckanapi.RemoteCKAN(catalog.url, get_only=catalog.get_only)
            # Assumes we don't need to paginate.
            harvest_packages = client.call_action(
                'package_search',
                {'fq': 'type:harvest', 'rows': 300000},
                verify=catalog.verify,
            )['results']

            if harvest_packages:
                for package in harvest_packages:
                    kind = self.source_type(catalog, package)
                    if not kind:
                        self.warning('could not determine source type of {}'.format(catalog.dataset_api_url(package)))
                        continue
                    counts[kind][catalog.division_id] += 1
                continue

            # GB
            try:
                for package in client.call_action('harvest_source_list', verify=catalog.verify):
                    if not package['active']:
                        continue
                    kind = normalize_source_type(package, package['type'])
                    if kind:
                        counts[kind][catalog.division_id] += 1
                    else:
                        self.warning('could not determine source type of {}'.format(catalog.harvest_api_url(package)))
            except ckanapi.errors.CKANAPIError:
                # The action is unavailable on this catalog: nothing to count.
                pass
        elif catalog.scraper.__name__ == 'Socrata':
            if 'federation_filter' in self.get(catalog.url).text:
                counts['socrata'][catalog.division_id] = 1

    return pd.DataFrame(counts)
def get_package_parameter(site, package_id, parameter=None, API_key=None):
    """Fetch one metadata field of a CKAN package, or all of its metadata.

    Args:
        site: Address of the CKAN instance.
        package_id: ID of the package to look up.
        parameter: Name of the metadata field; when None the complete
            metadata returned by ``package_show`` is handed back.
        API_key: Optional API key passed to the CKAN client.

    Returns:
        The whole metadata, the requested field, or None if the package
        has no such field.
    """
    # Some package parameters you can fetch from the WPRDC with this
    # function are:
    #   'geographic_unit', 'owner_org', 'maintainer', 'data_steward_email',
    #   'relationships_as_object', 'access_level_comment',
    #   'frequency_publishing', 'maintainer_email', 'num_tags', 'id',
    #   'metadata_created', 'group', 'metadata_modified', 'author',
    #   'author_email', 'state', 'version', 'department', 'license_id',
    #   'type', 'resources', 'num_resources', 'data_steward_name', 'tags',
    #   'title', 'frequency_data_change', 'private', 'groups',
    #   'creator_user_id', 'relationships_as_subject', 'data_notes',
    #   'name', 'isopen', 'url', 'notes', 'license_title',
    #   'temporal_coverage', 'related_documents', 'license_url',
    #   'organization', 'revision_id'
    client = ckanapi.RemoteCKAN(site, apikey=API_key)
    package_metadata = client.action.package_show(id=package_id)
    if parameter is None:
        return package_metadata
    return package_metadata.get(parameter)
def generate_datastore(self, fields, clear, first, wipe_data):
    """Prepare the datastore of ``self.resource_id`` and return the resource ID.

    Exactly one of three paths runs (or none):

    * ``wipe_data and first``: delete the records and reactivate the datastore.
    * ``clear and first``: delete the datastore and create it again from
      ``fields``.
    * ``self.resource_id is None``: create the resource, then its datastore.

    :param fields: field definitions passed to ``self.create_datastore``.
    :param clear: request a delete-and-recreate of the datastore.
    :param first: the wipe/clear paths only run when this is truthy
        (presumably marks the first chunk of a load - TODO confirm).
    :param wipe_data: request deletion of the records only.
    :returns: ``self.resource_id`` (possibly just assigned).
    :raises RuntimeError: when the delete status starts with 4 or 5 and is
        not 404.
    """
    if wipe_data and first:
        # Delete all the records in the datastore, preserving the schema.
        # NOTE(review): ``site`` is neither a parameter nor an attribute
        # here; it must come from module scope - confirm it is defined.
        ckan = ckanapi.RemoteCKAN(site, apikey=self.key)
        response = ckan.action.datastore_delete(id=self.resource_id, filters={}, force=True)
        # Deleting the records in the datastore also has the side effect of deactivating the
        # datastore, so we need to reactivate it.
        response2 = ckan.action.resource_patch(id=self.resource_id, datastore_active=True)
    elif clear and first:
        delete_status = self.delete_datastore(self.resource_id)
        # Only 4xx/5xx statuses need handling; 404 just means there was
        # nothing to delete.
        if str(delete_status)[0] in ['4', '5']:
            if str(delete_status) == '404':
                print( "The datastore currently doesn't exist, so let's create it!" )
            else:
                raise RuntimeError( 'Delete failed with status code {}.'.format( str(delete_status)))
        self.create_datastore(self.resource_id, fields)
    elif self.resource_id is None:
        self.resource_id = self.create_resource(self.package_id, self.resource_name)
        self.create_datastore(self.resource_id, fields)
    return self.resource_id
def get_data_dictionary(site, resource_id, API_key=None):
    """Return the field list of a resource's datastore.

    Args:
        site: Address of the CKAN instance.
        resource_id: ID of the resource to inspect.
        API_key: Optional API key passed to the CKAN client.

    Returns:
        The ``'fields'`` entry of the ``datastore_search`` response, or
        None when CKAN reports the resource as not found.
    """
    try:
        search_result = ckanapi.RemoteCKAN(site, apikey=API_key).action.datastore_search(resource_id=resource_id)
    except ckanapi.errors.NotFound:
        # Either the resource doesn't exist, or it doesn't have a datastore.
        return None
    return search_result['fields']
def updateCkanCount(portal, endpoint, orgName=None):
    """Refresh the stored dataset count of every organisation on a CKAN portal.

    Args:
        portal: Value matched against ``lga_datasets.data_portal``.
        endpoint: Address of the CKAN instance to query.
        orgName: When given, the portal is treated as a single-organisation
            CKAN: the count is the length of ``package_list`` and this name
            is used for every row.

    For each matching row the new count is written back to ``lga_datasets``
    and passed to ``updateDatasetCount``.
    """
    # NOTE(review): both SQL statements are assembled with string
    # interpolation; ``portal`` must come from a trusted source.
    orgs = cl.sql("select cartodb_id, data_portal_url, datasets from lga_datasets where data_portal='%s'" % portal)['rows']

    # The client does not depend on the row, so it is built once.
    ckan = ckanapi.RemoteCKAN(endpoint, user_agent='opencouncildata.org')
    ckan.get_only = True

    for row in orgs:
        if orgName is None:
            # The organisation slug is the last path segment of the URL.
            org = re.search('organization/([^/]+)/?$', row['data_portal_url']).group(1)
            # Warning: data.gov.au only returns first 10 datasets if using include_datasets=True.
            try:
                num_datasets = ckan.action.organization_show(id=org, include_datasets=False)['package_count']
            except Exception:
                # Best effort: report and record zero, but let
                # KeyboardInterrupt/SystemExit through.
                print("ERROR with organisation %s. Did its endpoint change?" % org)
                num_datasets = 0
        else:
            # Bleh. Just trying to find a way to handle single-organisation CKANs.
            num_datasets = len(ckan.action.package_list())
            org = orgName

        try:
            print("%s: %d (was %d)" % (org, num_datasets, row['datasets']))
        except TypeError:
            # The previous count may not be a number; the log line is optional.
            pass

        cl.sql("UPDATE lga_datasets SET datasets='%d' WHERE cartodb_id='%d'" % (num_datasets, row['cartodb_id']))
        updateDatasetCount(org, num_datasets)
def update_topic_terms(url, api_key):
    """Merge harvest terms into the matching CKAN groups.

    For every topic returned by ``get_terms()`` that has terms and exists
    as a group, the group's ``harvest_terms`` become the sorted union of
    the existing and new terms, and the group is updated.

    Args:
        url: Address of the CKAN instance.
        api_key: API key passed to the CKAN client.
    """
    ckan = ckanapi.RemoteCKAN(url, api_key)
    terms = get_terms()

    for topic in terms:
        new_terms = terms[topic]
        if not new_terms:
            continue

        try:
            topic_dict = ckan.action.group_show(id=topic)
        except ckanapi.errors.NotFound:
            # No group for this topic: nothing to update.
            continue

        existing = topic_dict.get('harvest_terms')
        if existing:
            combined = sorted(set(new_terms + existing))
        else:
            combined = sorted(new_terms)
        topic_dict['harvest_terms'] = combined

        try:
            ckan.action.group_update(**topic_dict)
            print('Update topic "{}" with terms {}'.format(topic, combined))
        except ckanapi.ValidationError as error:
            print('Error: %s' % error)
def init(self):
    """Return the CKAN client, creating and caching it on first use.

    A truthy ``self.ckan_api`` is returned unchanged; otherwise a
    ``RemoteCKAN`` client for https://datahub.io is built with the
    module-level ``ckan_api_key``, stored on ``self.ckan_api`` and returned.
    """
    if self.ckan_api:
        return self.ckan_api

    self.ckan_api = ckanapi.RemoteCKAN(
        'https://datahub.io',
        user_agent='ckanapiexample/1.0 (+http://data.wiserd.ac.uk)',
        apikey=ckan_api_key)
    return self.ckan_api
def de_dup2(site_url):
    """Print groups of packages whose first resource shares the same hash.

    All packages of the site are paged through with ``package_search``.
    For each one, the English name, URL and hash of its first resource are
    collected; every hash seen more than once is printed with its
    ``[name, url]`` pairs.

    Args:
        site_url: Address of the CKAN instance to scan.
    """
    rows = 50
    # remote site
    site = ckanapi.RemoteCKAN(site_url, apikey=None, user_agent='ckanapi-uploader/1.0')
    records = defaultdict(list)

    count = 0
    start = 0
    while True:
        # a list with a hard upper limit 1000, need to loop
        page = site.action.package_search(q='', use_default_schema=True, start=start, rows=rows)
        if count == 0:
            count = page['count']

        results = page['results']
        for package in results:
            res = package['resources'][0]
            name = json.loads(res['name_translated'])['en']
            records[res['hash']].append([name, res['url']])

        start += len(results)
        # An empty page makes no progress; stop instead of requesting the
        # same offset forever when the reported count is never reached.
        if not results or start >= count:
            break

    print('Total records  %s' % count)
    for duplicates in records.values():
        if len(duplicates) > 1:
            print(duplicates)
def setup_organizations(self, repo_name=None):
    """
    Make sure every organization named in the configuration exists on CKAN,
    creating the missing ones.

    :param repo_name: Only setup the organization for that repo config.
    """
    # Validate config
    check_cfg(self._cfg, ['repos', 'api_key', 'ckan_url'])
    api_key = self._cfg['api_key']

    for repo in self._cfg['repos']:
        check_cfg(repo, ['bucket', 'org_name', 'org_title'],
                  name='the repo config')
        if repo_name is not None and repo['bucket'] != repo_name:
            continue

        # Prepare a CKAN connection for use.
        ckan_host = self._cfg['ckan_url']
        org_name = repo['org_name']
        site = ckanapi.RemoteCKAN(ckan_host, apikey=api_key)

        if org_name in site.action.organization_list():
            self.logger.info(
                "Organization %s already exists, skipping setup" % (org_name))
            continue

        self.logger.info(
            "Organization %s does not exist yet, creating one..." % (org_name))
        site.action.organization_create(name=org_name,
                                        title=repo['org_title'],
                                        description=repo['org_title'])
def query_resource(site, query, API_key=None):
    """Run a SQL query through CKAN's datastore_search_sql endpoint.

    Note that this doesn't work for private datasets. The relevant CKAN
    GitHub issue has been closed: https://github.com/ckan/ckan/issues/1954

    Args:
        site: Address of the CKAN instance.
        query: SQL statement to send.
        API_key: Optional API key passed to the CKAN client.

    Returns:
        The ``'records'`` entry of the response.
    """
    # A typical response is a dictionary like this (abridged):
    # {u'fields': [{u'id': u'_id', u'type': u'int4'},
    #              {u'id': u'pin', u'type': u'text'},
    #              {u'id': u'total_amount', u'type': u'float8'}],
    #  u'records': [{u'_id': 1,
    #                u'pin': u'0001B00010000000',
    #                u'total_amount': 13585.47},
    #               {u'_id': 2,
    #                u'pin': u'0001C00058000000',
    #                u'total_amount': 7827.64}],
    #  u'sql': u'SELECT * FROM "d1e80180-5b2e-4dab-8ec3-be621628649e" LIMIT 3'}
    client = ckanapi.RemoteCKAN(site, apikey=API_key)
    return client.action.datastore_search_sql(sql=query)['records']
def main(local_git_path, remote_git_url, ckan_url, package_name):
    """Mirror the newest timetable archive published on CKAN into a git repo.

    The local repository is updated, the first resource of the CKAN package
    is downloaded and its checksum compared with the newest log entry. When
    the checksum differs (or no entry exists) the archive is extracted and
    committed, a new log entry is committed, and the result is pushed.

    Args:
        local_git_path: Path of the local git working copy.
        remote_git_url: URL the working copy is updated from.
        ckan_url: Address of the CKAN instance.
        package_name: ID of the CKAN package holding the timetable.
    """
    logger.info("Updating local repository at %s from %s.", local_git_path,
                remote_git_url)
    local_repo = update_local_git_repo(local_git_path, remote_git_url)
    logger.info("Done.")

    logger.info("Looking for newest log entry...")
    latest_log_entry = get_newest_log_entry(local_git_path)
    logger.info("Done: '%s'.", latest_log_entry)

    logger.info("Getting package url from CKAN...")
    ckan = ckanapi.RemoteCKAN(ckan_url)
    package_info = ckan.call_action('package_show', {"id": package_name})
    logger.info("Done: %s", package_info)

    timetable_url = package_info['resources'][0]['url']
    logger.info("Downloading %s...", timetable_url)
    temp_file_path, http_message = request.urlretrieve(timetable_url)
    logger.info("Done: %s, HTTP message: %s.", temp_file_path, http_message)

    logger.info("Calculating checksum of the downloaded file...")
    new_checksum = calculate_checksum(temp_file_path)
    logger.info("Done: '%s'.", new_checksum)

    if not latest_log_entry or new_checksum != latest_log_entry.checksum:
        logger.info(
            "NEW TIMETABLE DETECTED. Checksum of the downloaded data is different than the newest log entry."
        )

        logger.info("Extracting downloaded archive...")
        files_extracted = extract_new_file(temp_file_path, local_git_path)
        logger.info("Done: %s", files_extracted)

        current_utc_time = datetime.datetime.utcnow()
        local_repo.index.add(files_extracted)
        # Unstage *.txt files in the working copy that the new archive no
        # longer contains. rglob() yields paths lazily and has no filter
        # method, so the membership test is done in the loop.
        for now_missing_file in pathlib.Path(local_git_path).rglob("*.txt"):
            if str(now_missing_file) not in files_extracted:
                local_repo.index.remove([str(now_missing_file)])

        logger.info("Committing extracted files...")
        new_files_commit = local_repo.index.commit(
            "Nowy rozkład: {}".format(current_utc_time))
        logger.info("Done: %s.", new_files_commit)

        logger.info("Inserting and committing new log entry...")
        new_log_entry = UpdateLogEntry(current_utc_time, new_checksum,
                                       new_files_commit.hexsha)
        insert_log_entry_in_table(local_git_path, new_log_entry)
        local_repo.index.add([os.path.join(local_git_path, "README.md")])
        log_modification_commit = local_repo.index.commit(
            "Nowy wpis w logu {}".format(current_utc_time))
        logger.info("Done: %s.", log_modification_commit)

        logger.info("Pushing changes to %s...", remote_git_url)
        local_repo.remote().push()
        logger.info("Done")
    else:
        logger.info(
            "Checksum of the downloaded data is the same as the newest log entry. No action made."
        )
def datastore_exists(package_id, resource_name):
    """Check whether a datastore exists for the given package ID and resource name.

    If there should be a datastore but it's inactive, try to restore it.
    If restoration fails, send a notification.

    Returns False when no resource is found and True when the datastore is
    active. Returns None when the resource URL does not look like a
    datastore dump. Otherwise returns the ``datastore_active`` value
    reported after the restore attempt.
    """
    from engine.credentials import site, API_key

    resource_id = find_resource_id(package_id, resource_name)
    if resource_id is None:
        return False

    if get_resource_parameter(site, resource_id, 'datastore_active', API_key):
        return True

    url = get_resource_parameter(site, resource_id, 'url', API_key)
    if re.search('datastore/dump', url) is None:
        return None

    # This looks like a resource that has a datastore that is inactive.
    # Try restoring it.
    ckan = ckanapi.RemoteCKAN(site, apikey=API_key)
    response = ckan.action.resource_patch(id=resource_id, datastore_active=True)
    if response['datastore_active']:
        print("Restored inactive datastore.")
        return response['datastore_active']

    msg = f"Unable to restore inactive datastore for resource ID {resource_id}, resource name {resource_name} and package_id {package_id}!"
    # if (test_mode or not PRODUCTION) else "#etl-hell"
    # test_mode is not available to this function.
    channel = "@david"
    if channel != "@david":
        msg = f"@david {msg}"
    send_to_slack(msg, username='******', channel=channel, icon=':illuminati:')
    return response['datastore_active']
def set_data_dictionary(resource_id, old_fields):
    """Re-apply saved data-dictionary info to a resource's current fields.

    Here "old_fields" needs to be in the same format as the data dictionary
    returned by get_data_dictionary: a list of type dicts and info dicts.
    A leading '_id' field is dropped because it must be absent for this
    to work.

    Only fields that currently exist are sent. Each takes its present
    definition plus the first matching 'info' from ``old_fields``, so field
    additions, deletions and type changes are tolerated.

    Returns the response of ``datastore_create``.
    """
    from engine.credentials import site, API_key

    if old_fields[0]['id'] == '_id':
        old_fields = old_fields[1:]

    # Note that a subset can be sent, and they will update part of
    # the integrated data dictionary.
    ckan = ckanapi.RemoteCKAN(site, apikey=API_key)
    present_fields = get_data_dictionary(resource_id)

    new_fields = []
    for field in present_fields:
        if field['id'] == '_id':
            continue
        # First old field with the same id decides; its info may be None.
        definition = next((f.get('info', None) for f in old_fields if f['id'] == field['id']), None)
        if definition is None:
            continue
        new_fields.append(dict(field, info=definition))

    # The response without force=True is
    # ckanapi.errors.ValidationError: {'__type': 'Validation Error', 'read-only': ['Cannot edit read-only resource. Either pass"force=True" or change url-type to "datastore"']}
    # With force=True, it works.
    return ckan.action.datastore_create(resource_id=resource_id, fields=new_fields, force=True)
def main(datapackage, ckanapikey, dry, ckan):
    """Main dispatcher function for publishing a dataset.

    Args:
        datapackage: Path of the datapackage file; its first path segment
            is used as the package root directory.
        ckanapikey: API key for the CKAN instance.
        dry: When truthy, only print the upload object as JSON.
        ckan: Address of the CKAN instance.

    Raises:
        TypeError: if ``check_ckan_url`` rejects ``ckan``.
        Exception: if dataset creation does not answer with status 200;
            errors raised by ``create`` or ``upload_resource`` propagate
            unchanged.
    """
    if check_ckan_url(ckan) is False:
        message = "{} isn't a valid url".format(ckan)
        click.echo(message)
        raise TypeError(message)

    datapackage_json, upload_object = load_datapackage_file(datapackage)
    ctdata = ckanapi.RemoteCKAN(ckan, apikey=ckanapikey,
                                user_agent='CTData Publisher/1.0 (+http://ctdata.org)')
    package_root_dir = datapackage.split('/')[0]

    if dry:
        click.echo(json.dumps(upload_object, indent=4, separators=(',', ': ')))
        return

    # First we will create the new dataset or overwrite the existing dataset
    r = create(upload_object, ctdata, ckanapikey)
    if r.status_code != 200:
        raise Exception(
            "Dataset creation failed with status code {}".format(r.status_code))
    click.echo("{} Created".format(upload_object['title']))

    # Then we will upload the resource
    upload_resource(datapackage_json, ctdata, package_root_dir)
    click.echo("{} Uploaded".format(datapackage_json['resources'][0]['path']))
def get_ckanapi(config_ini_or_ckan_url, **kwargs):
    '''Given a config.ini filepath or a remote CKAN URL, returns a ckanapi
    instance that you can use to call action commands

    For a URL, the API key is read from the section named after that URL
    in ``~/.ckan``; if it cannot be read, instructions are printed and the
    process exits with status 1. Extra ``kwargs`` are passed to
    ``ckanapi.RemoteCKAN``.

    NOTE(review): the visible code only handles the URL case and never
    returns ``ckan``; the config.ini branch and the return statement are
    presumably missing from this excerpt - confirm against the full file.
    '''
    import ConfigParser
    print 'Connecting to CKAN...'
    import ckanapi
    import sys
    if config_ini_or_ckan_url.startswith('http'):
        # looks like a hostname e.g. https://data.gov.uk
        ckan_url = config_ini_or_ckan_url
        # Load the apikey from a config file
        config = ConfigParser.ConfigParser()
        config_filepath = '~/.ckan'
        try:
            config.read(os.path.expanduser(config_filepath))
            apikey = config.get(ckan_url, 'apikey')
        except ConfigParser.Error, e:
            # Missing section/option (or unparseable file): explain the
            # expected file layout and stop.
            print 'Error reading file with api keys configured: %s' % e
            print 'Ensure you have a file: %s' % config_filepath
            print 'With the api key of the ckan user "script", something like:'
            print ' [%s]' % ckan_url
            print ' apikey = fb3355-b55234-4549baac'
            sys.exit(1)
        ckan = ckanapi.RemoteCKAN(ckan_url, apikey=apikey, user_agent='dgu script', **kwargs)
def __init__(self, instance='dev'):
    """Create the CKAN client for the chosen instance.

    :param instance: ``'dev'`` or ``'prod'``; selects the JSON config file
        whose ``ckan.api`` and ``ckan.url`` entries configure the client
        stored on ``self.ckan``.
    """
    config_paths = {
        'dev': 'config/dev.json',
        'prod': 'config/prod.json',
    }
    settings = Load.loadJSONFile(config_paths.get(instance))
    api_key = settings['ckan']['api']
    address = settings['ckan']['url']
    self.ckan = ckanapi.RemoteCKAN(address=address, apikey=api_key,
                                   user_agent='ckanapi/1.0')
def loadTables():
    """
    Load all feather tables found in ``<table_dir>/feather``.

    When the local tables cannot be read (for example the directory does
    not exist), they are fetched from energydata.uct.ac.za instead.

    :returns: dict mapping table name to DataFrame.
    """
    dir_path = os.path.join(table_dir, 'feather')
    try:
        # Name and path come from the same directory entry, so a table can
        # never be stored under another file's name. os.listdir raises when
        # the directory is missing, which triggers the remote fallback.
        file_names = sorted(
            f for f in os.listdir(dir_path)
            if f.endswith('.feather') and not f.startswith('.')
        )
        tables = {}
        for file_name in file_names:
            table_name = file_name.rpartition('.')[0]
            tables[table_name] = feather.read_dataframe(
                os.path.join(dir_path, file_name))
    except Exception:
        # fetch tables from energydata.uct.ac.za
        ckan = ckanapi.RemoteCKAN('http://energydata.uct.ac.za/', get_only=True)
        resources = ckan.action.package_show(id='dlr-database-tables-94-14')
        tables = {}
        for resource in resources['resources']:
            # NOTE(review): datastore_search is called without a limit, so
            # CKAN's default page size applies and large tables may come
            # back truncated - confirm.
            records = ckan.action.datastore_search(resource_id=resource['id'])['records']
            tables[resource['name']] = pd.DataFrame(records)
    return tables
def write_register(register_name, data):
    """
    Write ``data`` as a new dataset named ``register_name`` on data.gov.uk.

    If the name is already in use and one of the matches is a register,
    nothing is written. If none is, the function retries with the name
    suffixed by the match count plus one (``data['name']`` is updated).
    """
    ckan = ckanapi.RemoteCKAN('https://test.data.gov.uk', apikey='')
    res = ckan.action.package_search(q='name:{}'.format(register_name))

    if res['count'] > 0:
        print('Dataset {} already exists, checking'.format(register_name))
        # We *should* only get one or zero results back, but every result
        # is inspected just to be sure; a single valid register is enough.
        # TODO: need to check extras here......
        if any(is_register(candidate) for candidate in res['results']):
            print(' Existing register {} valid'.format(register_name))
            return

        new_name = '{}-{}'.format(register_name, res['count'] + 1)
        data['name'] = new_name
        write_register(new_name, data)
        return

    print(' Creating dataset for register {}'.format(register_name))
    try:
        ckan.action.package_create(**data)
        print(' ... created')
    except ckanapi.errors.ValidationError:
        print(' Looks like the URL was already in use')
def get_record_counts(**kwargs):
    """Collect row counts and field lists for every datastore resource.

    Reads ``address`` and ``apikey`` from ``kwargs`` for the CKAN client
    and pops ``ti``, whose ``xcom_pull`` result for the
    ``get_all_packages`` task supplies the packages to inspect.

    Returns:
        A list with one dict per resource whose ``url_type`` is
        ``"datastore"``. Note that its ``package_id`` entry holds the
        package *title*.
    """
    ckan = ckanapi.RemoteCKAN(address=kwargs["address"], apikey=kwargs["apikey"])
    task_instance = kwargs.pop("ti")
    packages = task_instance.xcom_pull(task_ids="get_all_packages")["packages"]

    datastore_resources = []
    for p in packages:
        for r in p["resources"]:
            if r["url_type"] == "datastore":
                # limit=0 asks for the metadata (total, fields) only.
                res = ckan.action.datastore_search(id=r["id"], limit=0)
                summary = {
                    "package_id": p["title"],
                    "resource_id": r["id"],
                    "resource_name": r["name"],
                    "extract_job": r["extract_job"],
                    "row_count": res["total"],
                    "fields": res["fields"],
                }
                datastore_resources.append(summary)
                logging.info(
                    f'{p["name"]}: {r["name"]} - {res["total"]} records')

    logging.info(
        f"Identified {len(datastore_resources)} datastore resources")
    return datastore_resources
def _fetch_package_resources(ckan, package_id, headers, csv_dir, feather_dir):
    """Download each resource of a package as CSV and re-save it as feather."""
    package = ckan.action.package_show(id=package_id)
    for resource in package['resources']:
        name = resource['name']
        print('... fetching ' + name + ' from energydata.uct.ac.za')
        csv_path = os.path.join(csv_dir, name + '.csv')

        # Download resources from data portal
        request = urllib.request.Request(resource['url'], headers=headers)
        with urllib.request.urlopen(request) as response, open(csv_path, 'wb') as out_file:
            shutil.copyfileobj(response, out_file)

        # write the downloaded table to disk in feather format
        table = pd.read_csv(csv_path)
        feather.write_dataframe(table, os.path.join(feather_dir, name + '.feather'))


def appData():
    """Fetch the database tables and ADTD profiles from energydata.uct.ac.za.

    Prompts for an API key, then downloads every resource of the
    'dlr-database-tables-94-14' and 'dlr-seasonal-adtd-profiles' packages
    as CSV and stores a feather copy of each.
    """
    # fetch tables from energydata.uct.ac.za
    apikey = input('Enter your APIKEY from http://energydata.uct.ac.za/user/YOUR_USERNAME: ')
    # The same key authorises the direct resource downloads.
    headers = {'Authorization': apikey}
    ckan = ckanapi.RemoteCKAN('http://energydata.uct.ac.za/', apikey=apikey, get_only=True)

    _fetch_package_resources(ckan, 'dlr-database-tables-94-14', headers,
                             csv_table, feather_table)
    _fetch_package_resources(ckan, 'dlr-seasonal-adtd-profiles', headers,
                             csv_adtd, feather_adtd)
    return
def csv_view(request, resource_id, field, search_term):
    """Stream the rows matching a search as a CSV attachment.

    Rows are fetched from ``DEFAULT_SITE`` in chunks via
    ``get_and_write_next_rows`` until the number written reaches the
    reported total. The download is named ``<search_term>.csv``.
    """
    # Create the HttpResponse object with the appropriate CSV header.
    response = HttpResponse(content_type='text/csv')
    response['Content-Disposition'] = 'attachment; filename="{}.csv"'.format(
        search_term)
    writer = csv.writer(response)

    ckan = ckanapi.RemoteCKAN(DEFAULT_SITE)
    chunk_size = 30000
    offset = 0

    written, total = get_and_write_next_rows(
        ckan, resource_id, field, search_term, writer, chunk_size,
        offset=0, written=0)
    while written < total:
        offset += chunk_size
        written, total = get_and_write_next_rows(
            ckan, resource_id, field, search_term, writer, chunk_size,
            offset, written)

    return response
def main():
    """Create ``N`` generated users on a CKAN instance from the command line.

    Options select the CKAN base URL, the API key, the random seed used for
    name generation and an optional organization passed on to
    ``create_user``. A ``NotAuthorized`` error from CKAN is logged and ends
    the run.
    """
    parser = argparse.ArgumentParser(description='Generates CKAN datasets')
    parser.add_argument(
        '-b', '--base', default='http://localhost:5000',
        help='Base URL for CKAN API to post to [default: \'%(default)s\']')
    parser.add_argument(
        '-a', '--apikey', default='tester',
        help='API key to post with [default: \'%(default)s\']')
    parser.add_argument(
        '-s', '--seed', default=0, type=int,
        help='Random seed for user name generation [default: %(default)s]')
    parser.add_argument(
        '-o', '--organization', default=None,
        help='Join created users to organization')
    parser.add_argument(
        'num_users', metavar='N', type=int, help='Number of users')
    options = parser.parse_args()

    # A per-run timestamp identifies this invocation in the user agent.
    run_ident = 'generate_ckan_users-{:%Y%m%d%H%M%S%f}'.format(datetime.datetime.utcnow())
    api = ckanapi.RemoteCKAN(
        options.base, apikey=options.apikey,
        user_agent='avoindata_ckanapi_users/1.0 ({0})'.format(run_ident))

    random.seed(options.seed)
    names = name_generator()
    try:
        for _ in range(options.num_users):
            create_user(api, next(names), options.organization)
    except ckanapi.errors.NotAuthorized:
        logging.error('ERROR: Not authorized to create users. Check your API key, CKAN configuration and auth functions for user_create')
def get_package_parameter(site, package_id, parameter, API_key=None):
    """Return one metadata field of a CKAN package.

    Args:
        site: Address of the CKAN instance.
        package_id: ID of the package to look up.
        parameter: Name of the metadata field to return.
        API_key: Optional API key passed to the CKAN client.

    Returns:
        The value stored under ``parameter`` in the package metadata.

    Raises:
        RuntimeError: if the package cannot be fetched or has no such field.
    """
    # Some package parameters you can fetch from the WPRDC with
    # this function are:
    # 'geographic_unit', 'owner_org', 'maintainer', 'data_steward_email',
    # 'relationships_as_object', 'access_level_comment',
    # 'frequency_publishing', 'maintainer_email', 'num_tags', 'id',
    # 'metadata_created', 'group', 'metadata_modified', 'author',
    # 'author_email', 'state', 'version', 'department', 'license_id',
    # 'type', 'resources', 'num_resources', 'data_steward_name', 'tags',
    # 'title', 'frequency_data_change', 'private', 'groups',
    # 'creator_user_id', 'relationships_as_subject', 'data_notes',
    # 'name', 'isopen', 'url', 'notes', 'license_title',
    # 'temporal_coverage', 'related_documents', 'license_url',
    # 'organization', 'revision_id'
    try:
        ckan = ckanapi.RemoteCKAN(site, apikey=API_key)
        metadata = ckan.action.package_show(id=package_id)
        desired_string = metadata[parameter]
    except Exception:
        # Any API, transport or lookup failure is reported uniformly;
        # KeyboardInterrupt/SystemExit are no longer converted.
        raise RuntimeError(
            "Unable to obtain package parameter '{}' for package with ID {}".
            format(parameter, package_id))

    # The looked-up value is handed back; without this the getter would
    # always return None.
    return desired_string
def get_csv_resources(dataset_name):
    """Yield the URL of every resource of a dataset whose format is "CSV".

    This is a generator: the ``package_show`` request to ``CKAN_URL`` is
    only made once iteration starts.
    """
    client = ckanapi.RemoteCKAN(CKAN_URL)
    package = client.call_action("package_show", {"id": dataset_name})
    for entry in package["resources"]:
        if entry["format"] != "CSV":
            continue
        yield entry["url"]
def query_resource(site, query, API_key=None):
    """Use the datastore_search_sql API endpoint to query a CKAN resource.

    Args:
        site: Address of the CKAN instance.
        query: SQL statement to send.
        API_key: Optional API key passed to the CKAN client.

    Returns:
        The ``'records'`` entry of the response.
    """
    # A typical response is a dictionary like this (abridged):
    # {u'fields': [{u'id': u'_id', u'type': u'int4'},
    #              {u'id': u'pin', u'type': u'text'},
    #              {u'id': u'total_amount', u'type': u'float8'}],
    #  u'records': [{u'_id': 1,
    #                u'pin': u'0001B00010000000',
    #                u'total_amount': 13585.47},
    #               {u'_id': 2,
    #                u'pin': u'0001C00058000000',
    #                u'total_amount': 7827.64}],
    #  u'sql': u'SELECT * FROM "d1e80180-5b2e-4dab-8ec3-be621628649e" LIMIT 3'}
    sql_response = ckanapi.RemoteCKAN(site, apikey=API_key).action.datastore_search_sql(sql=query)
    return sql_response['records']
def main(admin):
    """Export up to 10000 HDX datasets (with resources) to ``datasets.csv``.

    Args:
        admin: When truthy, the API key is read from ``~/.hdxkey`` and used
            for the requests; otherwise the requests are anonymous.

    Datasets are requested in pages of at most 1000 through
    ``current_package_list_with_resources``, flattened, normalised and
    written as one CSV file.
    """
    # Local import: the block previously only relied on ``DataFrame``
    # being in scope, so ``concat`` is brought in here.
    from pandas import concat

    apikey = None
    if admin:
        with open(join(expanduser('~'), '.hdxkey'), 'rt') as f:
            apikey = f.read().replace('\n', '')
    remoteckan = ckanapi.RemoteCKAN('https://data.humdata.org/', apikey=apikey,
                                    user_agent='hdx-to-csv')

    # Pages are collected and concatenated once: DataFrame.append was
    # removed in pandas 2.0 and copied the whole frame on every call.
    frames = []
    start = 0
    total_rows = 10000
    for page in range(total_rows // 1000 + 1):
        pagetimes1000 = page * 1000
        rows = min(total_rows - pagetimes1000, 1000)
        data = {'offset': start + pagetimes1000, 'limit': rows}
        result = remoteckan.call_action('current_package_list_with_resources', data)
        if result:
            frames.append(json_normalize(flatten_json(result)))
            # A short page means the server has no more datasets.
            if len(result) < rows:
                break
        else:
            logger.debug(result)

    df = concat(frames) if frames else DataFrame()
    df.to_csv('datasets.csv', encoding='utf-8', index=False,
              date_format='%Y-%m-%d', float_format='%.0f')
def extract(self, working_folder, service_manifest):
    """Download the resource of a service manifest into its working folder.

    The file is stored as ``<working_folder>/<name>/<name>.<format>``,
    where the format comes from the resource metadata on data.gov.uk.
    When the ``DEV`` environment variable is set and the file already
    exists, the download is skipped.

    :returns: path of the target file.
    """
    wf = os.path.join(working_folder, service_manifest.name)
    if not os.path.exists(wf):
        os.mkdir(wf)

    print(" Extracting content to {}".format(wf))
    print(" Fetching resource ({}) metadata".format(service_manifest.resource))

    ckan = ckanapi.RemoteCKAN(
        'https://data.gov.uk',
        user_agent='dgu_api_etl/0.1 (+https://data.gov.uk)')
    # NOTE(review): certificate verification is switched off for both the
    # metadata call and the download below.
    resource = ckan.action.resource_show(id=service_manifest.resource,
                                         requests_kwargs={'verify': False})

    extension = resource['format'].lower()
    target_file = os.path.join(wf, service_manifest.name + "." + extension)

    # TODO: Remove this ...
    if os.environ.get('DEV') and os.path.exists(target_file):
        print(" Skipping file during dev")
        return target_file

    r = requests.get(resource['url'], stream=True, verify=False)
    with open(target_file, 'wb') as f:
        for chunk in r.iter_content(chunk_size=4096):
            if not chunk:
                continue
            f.write(chunk)
            f.flush()
    return target_file