Beispiel #1
0
 def test_remoteckan_validlocations(self, project_config_yaml):
     """Exercise the ways a RemoteCKAN client can reach ``Configuration``.

     The test covers, in order: attaching a client to an existing
     configuration through ``setup_remoteckan``, handing a client to
     ``Configuration._create``, and the ``ConfigurationError`` raised by
     ``remoteckan()`` once the client is cleared or the configuration is
     deleted.
     """
     # Case 1: configuration first, client attached afterwards.
     Configuration._create(hdx_site='prod', hdx_key='TEST_HDX_KEY',
                           hdx_config_dict={},
                           project_config_yaml=project_config_yaml)
     attached_client = ckanapi.RemoteCKAN(
         'http://lalala', apikey='12345',
         user_agent='HDXPythonLibrary/1.0')
     Configuration.read().setup_remoteckan(attached_client)
     assert Configuration.read().remoteckan() == attached_client

     # Case 2: client supplied while the configuration is being created.
     supplied_client = ckanapi.RemoteCKAN(
         'http://hahaha', apikey='54321',
         user_agent='HDXPythonLibrary/0.5')
     Configuration._create(remoteckan=supplied_client,
                           hdx_site='prod', hdx_key='TEST_HDX_KEY',
                           hdx_config_dict={},
                           project_config_yaml=project_config_yaml)
     assert Configuration.read().remoteckan() == supplied_client

     # Case 3: with the stored client cleared, the accessor must raise.
     Configuration.read()._remoteckan = None
     with pytest.raises(ConfigurationError):
         Configuration.read().remoteckan()

     # Case 4: after deleting the configuration the same call must raise too.
     Configuration.delete()
     with pytest.raises(ConfigurationError):
         Configuration.read().remoteckan()
Beispiel #2
0
def get_number_of_rows(site, resource_id, API_key=None):
    """Return the number of rows in a datastore, or None if it is unavailable.

    Note that even when there is a limit placed on the number of results a
    CKAN API call can return, this function will still give the true number
    of rows.

    Args:
        site: Base URL of the CKAN instance.
        resource_id: ID of the resource whose datastore is inspected.
        API_key: Optional CKAN API key.

    Returns:
        The value of ``['meta']['count']`` from the ``datastore_info``
        response, or None when the lookup fails for any reason.
    """
    # The lookup is made exactly once and entirely inside the try block, so
    # a failing request yields None instead of escaping the handler.
    try:
        ckan = ckanapi.RemoteCKAN(site, apikey=API_key)
        results_dict = ckan.action.datastore_info(id=resource_id)
        return results_dict['meta']['count']
    except Exception:
        # Best effort by design; Exception (rather than a bare except) lets
        # KeyboardInterrupt and SystemExit propagate.
        return None
Beispiel #3
0
    def source_type(self, catalog, package):
        """Determine the harvest source type of a CKAN package.

        Several strategies are tried depending on which keys the package
        carries; the country codes in the comments are the catalogs each
        strategy was written for.

        :param catalog: catalog the package belongs to (used for warnings
            and passed through on recursion)
        :param package: package dict as returned by the CKAN API
        :returns: a source type string, or None when none can be determined
        """
        # AU, FI, IE, IT, MX, PY: the package states its source type itself.
        declared = package.get('source_type')
        if declared:
            return normalize_source_type(package, declared)

        # IT: the URL points at another CKAN's REST API; resolve that package
        # and classify it instead.
        if '/api/rest/dataset/' in package['url']:
            base_url, dataset_name = package['url'].split('api/rest/dataset/', 1)
            remote_package = ckanapi.RemoteCKAN(base_url).call_action('package_show', {'id': dataset_name})
            return self.source_type(catalog, remote_package)

        extras = package.get('extras')
        if not extras:
            # BR: no metadata at all; probe whether the URL is itself a CKAN.
            try:
                if ckanapi.RemoteCKAN(package['url']).call_action('site_read'):
                    return 'ckan'
            except ckanapi.errors.CKANAPIError:
                pass
            return None

        # US
        # @see https://github.com/ckan/ckanext-spatial/blob/master/doc/harvesters.rst
        # @see https://github.com/GSA/ckanext-geodatagov/tree/master/ckanext/geodatagov/harvesters
        harvest_type = next(extra['value'] for extra in extras if extra['key'] == 'source_type')

        # @see https://github.com/GSA/ckanext-geodatagov/blob/master/ckanext/geodatagov/harvesters/base.py#L174
        if harvest_type == 'single-doc':
            response = self.get(package['url'])
            if response.status_code != 200:
                return None
            try:
                return normalize_metadata_scheme(response)
            except lxml.etree.XMLSyntaxError:
                return None

        # @see https://github.com/GSA/ckanext-geodatagov/blob/master/ckanext/geodatagov/harvesters/waf_collection.py
        if harvest_type == 'waf-collection':
            # @see https://github.com/GSA/ckanext-geodatagov/blob/master/ckanext/geodatagov/validation/__init__.py
            config = json.loads(next(extra['value'] for extra in extras if extra['key'] == 'config'))
            profiles = config.get('validator_profiles')
            if profiles:
                if len(profiles) > 1:
                    self.warning('multiple validator_profiles for {}'.format(catalog.dataset_api_url(package)))
                    return None
                return 'waf-{}'.format(validators[profiles[0]])

            # No validator profile: inspect the collection metadata document.
            response = self.get(config['collection_metadata_url'])
            if response.status_code == 200:
                scheme = normalize_metadata_scheme(response)
                if scheme:
                    return 'waf-{}'.format(scheme)
            return None

        normalized = normalize_source_type(package, harvest_type)
        if normalized:
            return normalized
        return None
Beispiel #4
0
    def federation(self):
        """Count harvest sources per source type and division.

        :returns: a ``pandas.DataFrame`` built from a nested mapping of
            source type -> division id -> count
        """
        tally = defaultdict(lambda: defaultdict(int))

        for catalog in self.catalogs:
            if not issubclass(catalog.scraper, CKAN):
                # Socrata catalogs only record whether federation is in use.
                if catalog.scraper.__name__ == 'Socrata' and 'federation_filter' in self.get(catalog.url).text:
                    tally['socrata'][catalog.division_id] = 1
                continue

            # Assumes we don't need to paginate.
            client = ckanapi.RemoteCKAN(catalog.url, get_only=catalog.get_only)
            search = client.call_action('package_search', {'fq': 'type:harvest', 'rows': 300000}, verify=catalog.verify)
            harvest_packages = search['results']

            if harvest_packages:
                for package in harvest_packages:
                    kind = self.source_type(catalog, package)
                    if kind:
                        tally[kind][catalog.division_id] += 1
                    else:
                        self.warning('could not determine source type of {}'.format(catalog.dataset_api_url(package)))
                continue

            # GB: no harvest packages in the search index; ask the harvest
            # extension for its source list instead.
            try:
                for source in client.call_action('harvest_source_list', verify=catalog.verify):
                    if not source['active']:
                        continue
                    kind = normalize_source_type(source, source['type'])
                    if kind:
                        tally[kind][catalog.division_id] += 1
                    else:
                        self.warning('could not determine source type of {}'.format(catalog.harvest_api_url(source)))
            except ckanapi.errors.CKANAPIError:
                pass

        return pd.DataFrame(tally)
Beispiel #5
0
def get_package_parameter(site, package_id, parameter=None, API_key=None):
    """Fetch one metadata field of a CKAN package, or all of its metadata.

    Args:
        site: Base URL of the CKAN instance.
        package_id: ID of the package to look up.
        parameter: Name of the metadata field to return. When None, the
            whole metadata dict from ``package_show`` is returned.
        API_key: Optional CKAN API key.

    Returns:
        The full metadata dict, the value stored under ``parameter``, or
        None when the package has no such field.

    Fields that can be fetched from the WPRDC this way include:
    'geographic_unit', 'owner_org', 'maintainer', 'data_steward_email',
    'relationships_as_object', 'access_level_comment',
    'frequency_publishing', 'maintainer_email', 'num_tags', 'id',
    'metadata_created', 'group', 'metadata_modified', 'author',
    'author_email', 'state', 'version', 'department', 'license_id',
    'type', 'resources', 'num_resources', 'data_steward_name', 'tags',
    'title', 'frequency_data_change', 'private', 'groups',
    'creator_user_id', 'relationships_as_subject', 'data_notes',
    'name', 'isopen', 'url', 'notes', 'license_title',
    'temporal_coverage', 'related_documents', 'license_url',
    'organization', 'revision_id'.
    """
    client = ckanapi.RemoteCKAN(site, apikey=API_key)
    package_metadata = client.action.package_show(id=package_id)
    if parameter is None:
        return package_metadata
    if parameter not in package_metadata:
        return None
    return package_metadata[parameter]
Beispiel #6
0
    def generate_datastore(self, fields, clear, first, wipe_data):
        """Prepare the datastore for loading and return the resource ID.

        Exactly one of three preparations runs, checked in this order:

        * ``wipe_data and first``: delete every record but keep the schema.
        * ``clear and first``: delete the datastore and create it again.
        * ``self.resource_id is None``: create the resource and a datastore.

        Otherwise nothing is changed.

        :param fields: field definitions handed to ``create_datastore``
        :param clear: whether the datastore should be dropped and rebuilt
        :param first: whether this is the first call of the run
        :param wipe_data: whether existing records should be deleted
        :returns: ``self.resource_id``
        :raises RuntimeError: if deleting the datastore fails with a 4xx/5xx
            status other than 404
        """
        if wipe_data and first:
            # Delete all the records in the datastore, preserving the schema.
            ckan = ckanapi.RemoteCKAN(site, apikey=self.key)
            ckan.action.datastore_delete(id=self.resource_id,
                                         filters={},
                                         force=True)
            # Deleting the records also deactivates the datastore as a side
            # effect, so it has to be switched back on.
            ckan.action.resource_patch(id=self.resource_id,
                                       datastore_active=True)
        elif clear and first:
            status = str(self.delete_datastore(self.resource_id))
            if status[0] in ['4', '5']:
                if status != '404':
                    raise RuntimeError(
                        'Delete failed with status code {}.'.format(status))
                print(
                    "The datastore currently doesn't exist, so let's create it!"
                )
            self.create_datastore(self.resource_id, fields)
        elif self.resource_id is None:
            self.resource_id = self.create_resource(self.package_id,
                                                    self.resource_name)
            self.create_datastore(self.resource_id, fields)

        return self.resource_id
Beispiel #7
0
def get_data_dictionary(site, resource_id, API_key=None):
    """Return the field list of a resource's datastore.

    Args:
        site: Base URL of the CKAN instance.
        resource_id: ID of the resource to inspect.
        API_key: Optional CKAN API key.

    Returns:
        The ``fields`` entry of the ``datastore_search`` response, or None
        when CKAN reports the resource or its datastore as not found.
    """
    client = ckanapi.RemoteCKAN(site, apikey=API_key)
    try:
        search_result = client.action.datastore_search(resource_id=resource_id)
    except ckanapi.errors.NotFound:
        # Either the resource doesn't exist, or it doesn't have a datastore.
        return None
    return search_result['fields']
def updateCkanCount(portal, endpoint, orgName=None):
    """Refresh the stored dataset count of every organisation on a portal.

    Rows of ``lga_datasets`` matching ``portal`` are read through ``cl.sql``.
    For each row the dataset count is fetched from the CKAN at ``endpoint``,
    written back to the table and passed to ``updateDatasetCount``.

    Args:
        portal: Value of the ``data_portal`` column to select rows by.
        endpoint: Base URL of the CKAN API to query.
        orgName: When given, the CKAN is treated as a single-organisation
            site: the count is the length of ``package_list`` and this name
            is used as the organisation. When None, the organisation slug is
            taken from each row's ``data_portal_url``.
    """
    # NOTE(review): both SQL statements are built with string formatting;
    # make sure `portal` never carries untrusted text.
    orgs = cl.sql("select cartodb_id, data_portal_url, datasets from lga_datasets where data_portal='%s'" % portal)['rows']

    # One read-only client serves every row; nothing about it is per-row.
    ckan = ckanapi.RemoteCKAN(endpoint, user_agent='opencouncildata.org', get_only=True)

    for row in orgs:
        if orgName is None:
            org = re.search('organization/([^/]+)/?$', row['data_portal_url']).group(1)
            # Warning: data.gov.au only returns first 10 datasets if using include_datasets=True.
            try:
                num_datasets = ckan.action.organization_show(id=org, include_datasets=False)['package_count']
            except Exception:
                # Best effort: report the problem and record a count of zero.
                print("ERROR with organisation %s. Did its endpoint change?" % org)
                num_datasets = 0
        else:
            num_datasets = len(ckan.action.package_list())
            org = orgName  # Bleh. Just trying to find a way to handle single-organisation CKANs.

        try:
            print("%s: %d (was %d)" % (org, num_datasets, row['datasets']))
        except TypeError:
            # The stored count may not be a number; the progress line is optional.
            pass
        cl.sql("UPDATE lga_datasets SET datasets='%d' WHERE cartodb_id='%d'" % (num_datasets, row['cartodb_id']))
        updateDatasetCount(org, num_datasets)
Beispiel #9
0
def update_topic_terms(url, api_key):
    """Merge harvested terms into the ``harvest_terms`` of each CKAN topic.

    For every topic returned by ``get_terms()`` that has terms and exists as
    a group on the CKAN site, the group's ``harvest_terms`` is replaced by
    the sorted union of its current terms and the new ones, and the group is
    updated. Validation errors are printed rather than raised.

    Args:
        url: Base URL of the CKAN instance.
        api_key: CKAN API key used for the update.
    """
    client = ckanapi.RemoteCKAN(url, api_key)

    for topic, new_terms in get_terms().items():
        if not new_terms:
            continue

        try:
            group = client.action.group_show(id=topic)
        except ckanapi.errors.NotFound:
            # Topics without a matching group are skipped.
            continue

        current_terms = group.get('harvest_terms')
        if current_terms:
            merged = sorted(set(new_terms + current_terms))
        else:
            merged = sorted(new_terms)
        group['harvest_terms'] = merged

        try:
            client.action.group_update(**group)
            print('Update topic "{}" with terms {}'.format(topic, merged))
        except ckanapi.ValidationError as error:
            print('Error: %s' % error)
Beispiel #10
0
 def init(self):
     """Return the CKAN client, creating and caching it on first use."""
     if self.ckan_api:
         return self.ckan_api

     self.ckan_api = ckanapi.RemoteCKAN(
         'https://datahub.io',
         user_agent='ckanapiexample/1.0 (+http://data.wiserd.ac.uk)',
         apikey=ckan_api_key)
     return self.ckan_api
Beispiel #11
0
def de_dup2(site_url):
    """Print groups of packages whose first resource shares the same hash.

    All packages of the remote site are paged through with
    ``package_search``. Each package's first resource is grouped by its
    ``hash``; after the total record count, every group holding more than
    one ``[name, url]`` pair is printed.

    Args:
        site_url: Base URL of the remote CKAN site.
    """
    page_size = 50
    # remote site
    site = ckanapi.RemoteCKAN(site_url,
                              apikey=None,
                              user_agent='ckanapi-uploader/1.0')
    records = defaultdict(list)
    count = 0
    start = 0
    while True:
        # a list with a hard upper limit 1000, need to loop
        page = site.action.package_search(q='',
                                          use_default_schema=True,
                                          start=start,
                                          rows=page_size)
        if count == 0:
            count = page['count']
        results = page['results']
        for package in results:
            if not package['resources']:
                # Nothing to hash for a package without resources.
                continue
            res = package['resources'][0]
            name = json.loads(res['name_translated'])['en']
            records[res['hash']].append([name, res['url']])
        start += len(results)
        # An empty page cannot advance `start`; stop instead of requesting
        # the same offset forever when the reported count is never reached.
        if not results or start >= count:
            break

    # Single-argument print calls produce the same output on Python 2 and 3.
    print('Total records  {}'.format(count))

    for duplicates in records.values():
        if len(duplicates) > 1:
            print(duplicates)
Beispiel #12
0
    def setup_organizations(self, repo_name=None):
        """Create any organization from the configuration that CKAN lacks.

        Every repo entry is validated; for each selected one the CKAN site
        is asked for its organization list and the repo's organization is
        created when it is not listed.

        :param repo_name: Only setup the organization for the repo whose
            ``bucket`` equals this value. All repos are processed when None.
        """
        # Validate config
        check_cfg(self._cfg, ['repos', 'api_key', 'ckan_url'])
        api_key = self._cfg['api_key']

        for repo in self._cfg['repos']:
            check_cfg(repo, ['bucket', 'org_name', 'org_title'],
                      name='the repo config')
            selected = repo_name is None or repo['bucket'] == repo_name
            if not selected:
                continue

            # Prepare a CKAN connection for use.
            ckan_host = self._cfg['ckan_url']
            org_name = repo['org_name']
            site = ckanapi.RemoteCKAN(ckan_host, apikey=api_key)

            if org_name in site.action.organization_list():
                self.logger.info(
                    "Organization %s already exists, skipping setup" %
                    (org_name))
                continue

            self.logger.info(
                "Organization %s does not exist yet, creating one..." %
                (org_name))
            site.action.organization_create(name=org_name,
                                            title=repo['org_title'],
                                            description=repo['org_title'])
Beispiel #13
0
def query_resource(site, query, API_key=None):
    """Run a SQL query through CKAN's ``datastore_search_sql`` endpoint.

    Note that this doesn't work for private datasets. The relevant CKAN
    GitHub issue has been closed:
    https://github.com/ckan/ckan/issues/1954

    Args:
        site: Base URL of the CKAN instance.
        query: SQL statement to execute.
        API_key: Optional CKAN API key.

    Returns:
        The ``records`` entry of the response.

    A typical response is a dictionary like this::

        {u'fields': [{u'id': u'_id', u'type': u'int4'},
                     {u'id': u'_full_text', u'type': u'tsvector'},
                     {u'id': u'pin', u'type': u'text'},
                     {u'id': u'number', u'type': u'int4'},
                     {u'id': u'total_amount', u'type': u'float8'}],
         u'records': [{u'_full_text': u"'0001b00010000000':1 '11':2 '13585.47':3",
                       u'_id': 1,
                       u'number': 11,
                       u'pin': u'0001B00010000000',
                       u'total_amount': 13585.47},
                      {u'_full_text': u"'0001c00058000000':3 '2':2 '7827.64':1",
                       u'_id': 2,
                       u'number': 2,
                       u'pin': u'0001C00058000000',
                       u'total_amount': 7827.64}],
         u'sql': u'SELECT * FROM "d1e80180-5b2e-4dab-8ec3-be621628649e" LIMIT 3'}
    """
    client = ckanapi.RemoteCKAN(site, apikey=API_key)
    return client.action.datastore_search_sql(sql=query)['records']
def main(local_git_path, remote_git_url, ckan_url, package_name):
    """Mirror the newest timetable published on CKAN into a git repository.

    The local repository is updated, the first resource of the CKAN package
    is downloaded and its checksum compared with the newest log entry. When
    the checksum differs (or no log entry exists) the archive is extracted
    into the repository, ``*.txt`` files that are no longer part of the
    archive are removed from the index, a new log entry is written, and both
    commits are pushed.

    Args:
        local_git_path: Path of the local git working copy.
        remote_git_url: URL of the remote the working copy is synced with.
        ckan_url: Base URL of the CKAN instance.
        package_name: ID or name of the CKAN package holding the timetable.
    """
    logger.info("Updating local repository at %s from %s.", local_git_path,
                remote_git_url)
    local_repo = update_local_git_repo(local_git_path, remote_git_url)
    logger.info("Done.")

    logger.info("Looking for newest log entry...")
    latest_log_entry = get_newest_log_entry(local_git_path)
    logger.info("Done: '%s'.", latest_log_entry)

    logger.info("Getting package url from CKAN...")
    ckan = ckanapi.RemoteCKAN(ckan_url)
    package_info = ckan.call_action('package_show', {"id": package_name})
    logger.info("Done: %s", package_info)

    timetable_url = package_info['resources'][0]['url']
    logger.info("Downloading %s...", timetable_url)
    temp_file_path, http_message = request.urlretrieve(timetable_url)
    logger.info("Done: %s, HTTP message: %s.", temp_file_path, http_message)

    logger.info("Calculating checksum of the downloaded file...")
    new_checksum = calculate_checksum(temp_file_path)
    logger.info("Done: '%s'.", new_checksum)

    if latest_log_entry and new_checksum == latest_log_entry.checksum:
        logger.info(
            "Checksum of the downloaded data is the same as the newest log entry. No action made."
        )
        return

    logger.info(
        "NEW TIMETABLE DETECTED. Checksum of the downloaded data is different than the newest log entry."
    )
    logger.info("Extracting downloaded archive...")
    files_extracted = extract_new_file(temp_file_path, local_git_path)
    logger.info("Done: %s", files_extracted)

    current_utc_time = datetime.datetime.utcnow()
    local_repo.index.add(files_extracted)

    # Drop text files that the new archive no longer contains. rglob()
    # yields a plain generator, so the filtering is done here; the path is
    # wrapped in a list because index.remove() takes a sequence of items.
    for candidate in pathlib.Path(local_git_path).rglob("*.txt"):
        if str(candidate) not in files_extracted:
            local_repo.index.remove([str(candidate)])

    logger.info("Committing extracted files...")
    new_files_commit = local_repo.index.commit(
        "Nowy rozkład: {}".format(current_utc_time))
    logger.info("Done: %s.", new_files_commit)

    logger.info("Inserting and committing new log entry...")
    new_log_entry = UpdateLogEntry(current_utc_time, new_checksum,
                                   new_files_commit.hexsha)
    insert_log_entry_in_table(local_git_path, new_log_entry)
    local_repo.index.add([os.path.join(local_git_path, "README.md")])
    log_modification_commit = local_repo.index.commit(
        "Nowy wpis w logu {}".format(current_utc_time))
    logger.info("Done: %s.", log_modification_commit)

    logger.info("Pushing changes to %s...", remote_git_url)
    local_repo.remote().push()
    logger.info("Done")
Beispiel #15
0
def datastore_exists(package_id, resource_name):
    """Check whether a datastore exists for the given package ID and resource name.

    If there should be a datastore but it's inactive, try to restore it. If
    restoration fails, send a notification.

    Args:
        package_id: ID of the package the resource belongs to.
        resource_name: Name of the resource within that package.

    Returns:
        False when the resource cannot be found or its URL does not look
        like a datastore dump, True when the datastore is already active,
        and otherwise the ``datastore_active`` value reported by CKAN after
        the restoration attempt.
    """
    from engine.credentials import site, API_key
    resource_id = find_resource_id(package_id, resource_name)
    if resource_id is None:
        return False

    if get_resource_parameter(site, resource_id, 'datastore_active', API_key):
        return True

    url = get_resource_parameter(site, resource_id, 'url', API_key)
    if not url or re.search('datastore/dump', url) is None:
        # No URL, or one that does not point at a datastore dump: there is
        # no datastore to restore. Answer with an explicit False rather than
        # falling off the end of the function.
        return False

    # This looks like a resource that has a datastore that is inactive.
    # Try restoring it.
    ckan = ckanapi.RemoteCKAN(site, apikey=API_key)
    response = ckan.action.resource_patch(id=resource_id,
                                          datastore_active=True)
    restored = response['datastore_active']
    if restored:
        print("Restored inactive datastore.")
    else:
        msg = f"Unable to restore inactive datastore for resource ID {resource_id}, resource name {resource_name} and package_id {package_id}!"
        channel = "@david"  #if (test_mode or not PRODUCTION) else "#etl-hell" # test_mode is not available to this function.
        # With the channel fixed above this mention is never added; it only
        # matters if the channel selection is made conditional again.
        if channel != "@david":
            msg = f"@david {msg}"
        send_to_slack(msg,
                      username='******',
                      channel=channel,
                      icon=':illuminati:')
    return restored
Beispiel #16
0
def set_data_dictionary(resource_id, old_fields):
    """Re-apply saved data-dictionary entries to a resource's datastore.

    Here ``old_fields`` needs to be in the same format as the data
    dictionary returned by ``get_data_dictionary``: a list of type dicts and
    info dicts. A leading ``'_id'`` field is dropped because it has to be
    removed for this to work.

    Args:
        resource_id: ID of the resource whose datastore is updated.
        old_fields: Previously saved field definitions; may be empty.

    Returns:
        The response of the ``datastore_create`` call.
    """
    from engine.credentials import site, API_key
    # Guard against an empty list before looking at its first element.
    if old_fields and old_fields[0]['id'] == '_id':
        old_fields = old_fields[1:]

    # Note that a subset can be sent, and they will update part of
    # the integrated data dictionary.
    ckan = ckanapi.RemoteCKAN(site, apikey=API_key)
    present_fields = get_data_dictionary(resource_id)
    new_fields = []
    # Attempt to restore data dictionary, taking into account the deletion
    # and addition of fields, and ignoring any changes in type: only fields
    # that exist now and had an 'info' entry before are sent.
    for field in present_fields:
        if field['id'] == '_id':
            continue
        definition = next((f.get('info', None)
                           for f in old_fields if f['id'] == field['id']),
                          None)
        if definition is not None:
            restored_field = dict(field)
            restored_field['info'] = definition
            new_fields.append(restored_field)

    # The response without force=True is
    # ckanapi.errors.ValidationError: {'__type': 'Validation Error', 'read-only': ['Cannot edit read-only resource. Either pass"force=True" or change url-type to "datastore"']}
    # With force=True, it works.
    results = ckan.action.datastore_create(resource_id=resource_id,
                                           fields=new_fields,
                                           force=True)
    return results
Beispiel #17
0
def main(datapackage, ckanapikey, dry, ckan):
    """Main dispatcher function for publishing a dataset.

    Args:
        datapackage: Path of the datapackage file; its first path component
            is used as the package root directory.
        ckanapikey: CKAN API key.
        dry: When true, only print the upload object as JSON.
        ckan: Base URL of the CKAN instance.

    Raises:
        TypeError: if ``check_ckan_url`` rejects ``ckan``.
        Exception: if creating the dataset does not answer with status 200;
            errors from ``create`` and ``upload_resource`` propagate as is.
    """
    if check_ckan_url(ckan) is False:
        invalid_url_message = "{} isn't a valid url".format(ckan)
        click.echo(invalid_url_message)
        raise TypeError(invalid_url_message)

    datapackage_json, upload_object = load_datapackage_file(datapackage)
    ctdata = ckanapi.RemoteCKAN(ckan, apikey=ckanapikey, user_agent='CTData Publisher/1.0 (+http://ctdata.org)')
    package_root_dir = datapackage.split('/')[0]

    if dry:
        click.echo(json.dumps(upload_object, indent=4, separators=(',', ': ')))
        return

    # First we will create the new dataset or overwrite the existing dataset.
    # Failures propagate unchanged; catching them only to re-raise adds nothing.
    r = create(upload_object, ctdata, ckanapikey)
    if r.status_code != 200:
        raise Exception(
            'Creating the dataset failed with status code {}'.format(r.status_code))
    click.echo("{} Created".format(upload_object['title']))

    # Then we will upload the resource
    upload_resource(datapackage_json, ctdata, package_root_dir)
    click.echo("{} Uploaded".format(datapackage_json['resources'][0]['path']))
Beispiel #18
0
def get_ckanapi(config_ini_or_ckan_url, **kwargs):
    '''Given a config.ini filepath or a remote CKAN URL, returns a ckanapi
    instance that you can use to call action commands.

    Only the remote-URL case is visible in this excerpt: the API key is read
    from the section of ``~/.ckan`` named after the URL, and the process
    exits with status 1 when that lookup raises a ConfigParser error.

    NOTE(review): the excerpt ends after the client is constructed; the
    config.ini branch and the return statement are not shown here.

    :param config_ini_or_ckan_url: a value starting with 'http' is treated
        as a remote CKAN URL
    :param kwargs: extra keyword arguments forwarded to ckanapi.RemoteCKAN
    '''
    import ConfigParser
    print 'Connecting to CKAN...'
    import ckanapi
    import sys
    if config_ini_or_ckan_url.startswith('http'):
        # looks like a hostname e.g. https://data.gov.uk
        ckan_url = config_ini_or_ckan_url
        # Load the apikey from a config file
        config = ConfigParser.ConfigParser()
        config_filepath = '~/.ckan'
        try:
            config.read(os.path.expanduser(config_filepath))
            # The section name is the CKAN URL itself.
            apikey = config.get(ckan_url, 'apikey')
        except ConfigParser.Error, e:
            # Explain the expected file layout, then give up.
            print 'Error reading file with api keys configured: %s' % e
            print 'Ensure you have a file: %s' % config_filepath
            print 'With the api key of the ckan user "script", something like:'
            print '  [%s]' % ckan_url
            print '  apikey = fb3355-b55234-4549baac'
            sys.exit(1)
        ckan = ckanapi.RemoteCKAN(ckan_url,
                                  apikey=apikey,
                                  user_agent='dgu script',
                                  **kwargs)
Beispiel #19
0
 def __init__(self, instance='dev'):
     """Build the CKAN client for the chosen instance.

     The instance name selects a JSON configuration file ('dev' or 'prod');
     the file's ``ckan`` section supplies the API key (``api``) and the site
     address (``url``). The client is stored as ``self.ckan``.
     """
     config_files = {'dev': 'config/dev.json', 'prod': 'config/prod.json'}
     settings = Load.loadJSONFile(config_files.get(instance))
     self.ckan = ckanapi.RemoteCKAN(apikey=settings['ckan']['api'],
                                    address=settings['ckan']['url'],
                                    user_agent='ckanapi/1.0')
Beispiel #20
0
def loadTables():
    """Load all tables into a dict of DataFrames keyed by table name.

    Feather files in ``<table_dir>/feather`` are read first. When none can
    be loaded (no files, or reading fails), the tables are fetched from
    energydata.uct.ac.za instead.

    Returns:
        dict mapping table name to ``pandas.DataFrame``.
    """
    dir_path = os.path.join(table_dir, 'feather')

    tables = {}
    try:
        for path in glob(os.path.join(dir_path, '*.feather')):
            # Derive the name from the very file being read, so names and
            # contents cannot get out of step.
            name = os.path.splitext(os.path.basename(path))[0]
            tables[name] = feather.read_dataframe(path)
    except Exception:
        # Any local failure discards the partial result and falls back to
        # the remote copy.
        tables = {}

    if tables:
        return tables

    # fetch tables from energydata.uct.ac.za
    ckan = ckanapi.RemoteCKAN('http://energydata.uct.ac.za/',
                              get_only=True)
    package = ckan.action.package_show(id='dlr-database-tables-94-14')
    for resource in package['resources']:
        records = ckan.action.datastore_search(resource_id=resource['id'])['records']
        tables[resource['name']] = pd.DataFrame(records)

    return tables
def write_register(register_name, data):
    """Write ``data`` as a new dataset named ``register_name``.

    Using the provided name, write the data parameter as a new dataset on
    data.gov.uk. If the name is in use, check if it is a register and
    abort/create as necessary: an existing register ends the call, otherwise
    the function retries with ``<register_name>-<count + 1>``.

    Args:
        register_name: Dataset name to look up and create.
        data: Keyword arguments for ``package_create``. Its ``'name'`` entry
            is overwritten in place when the function retries.
    """
    ckan = ckanapi.RemoteCKAN('https://test.data.gov.uk', apikey='')

    res = ckan.action.package_search(q='name:{}'.format(register_name))

    if res['count'] > 0:
        # Single-argument print calls behave the same on Python 2 and 3.
        print('Dataset {} already exists, checking'.format(register_name))

        # Iterate through all of the results looking to see if any are a
        # register at which point we will just happily return.  If not then
        # we'll create one by incrementing the number.  We *should* only get
        # one or zero back from this call, but just to be sure....
        for possible_register in res['results']:
            # TODO: need to check extras here......
            if is_register(possible_register):
                print('  Existing register {} valid'.format(register_name))
                return

        new_name = '{}-{}'.format(register_name, res['count'] + 1)
        data['name'] = new_name
        write_register(new_name, data)
    else:
        print('  Creating dataset for register {}'.format(register_name))
        try:
            ckan.action.package_create(**data)
            print('  ... created')
        except ckanapi.errors.ValidationError:
            print('  Looks like the URL was already in use')
Beispiel #22
0
    def get_record_counts(**kwargs):
        """Collect row counts and field lists of all datastore resources.

        Reads ``address`` and ``apikey`` from ``kwargs`` to build the CKAN
        client and pulls the package list from the ``get_all_packages``
        task through ``kwargs["ti"]``. Only resources whose ``url_type`` is
        ``"datastore"`` are queried.

        :returns: list of dicts with the keys ``package_id``,
            ``resource_id``, ``resource_name``, ``extract_job``,
            ``row_count`` and ``fields``
        """
        client = ckanapi.RemoteCKAN(address=kwargs["address"],
                                    apikey=kwargs["apikey"])

        task_instance = kwargs.pop("ti")
        packages = task_instance.xcom_pull(
            task_ids="get_all_packages")["packages"]

        summaries = []
        for package in packages:
            for resource in package["resources"]:
                if resource["url_type"] == "datastore":
                    # limit=0 asks for the metadata only, not the rows.
                    search = client.action.datastore_search(id=resource["id"],
                                                            limit=0)
                    summary = {
                        "package_id": package["title"],
                        "resource_id": resource["id"],
                        "resource_name": resource["name"],
                        "extract_job": resource["extract_job"],
                        "row_count": search["total"],
                        "fields": search["fields"],
                    }
                    summaries.append(summary)

                    logging.info(
                        f'{package["name"]}: {resource["name"]} - {search["total"]} records')

        logging.info(
            f"Identified {len(summaries)} datastore resources")

        return summaries
Beispiel #23
0
def _download_package_tables(ckan, package_id, headers, csv_dir, feather_dir):
    """Save every resource of a CKAN package as a CSV and a feather file."""
    package = ckan.action.package_show(id=package_id)
    for resource in package['resources']:
        name = resource['name']
        print('... fetching ' + name + ' from energydata.uct.ac.za')
        csv_path = os.path.join(csv_dir, name + '.csv')
        # Download resources from data portal
        request = urllib.request.Request(resource['url'], headers=headers)
        with urllib.request.urlopen(request) as response, open(csv_path, 'wb') as out_file:
            shutil.copyfileobj(response, out_file)
        # write the table to disk in feather format
        feather.write_dataframe(pd.read_csv(csv_path),
                                os.path.join(feather_dir, name + '.feather'))


def appData():
    """Download the database tables and ADTD profiles from the data portal.

    Prompts for an API key, which is sent as the ``Authorization`` header of
    every download. Resources of the ``dlr-database-tables-94-14`` package
    go to ``csv_table`` / ``feather_table``; resources of the
    ``dlr-seasonal-adtd-profiles`` package go to ``csv_adtd`` /
    ``feather_adtd``.
    """
    # fetch tables from energydata.uct.ac.za
    apikey = input('Enter your APIKEY from http://energydata.uct.ac.za/user/YOUR_USERNAME: ')
    headers = {'Authorization': apikey}
    ckan = ckanapi.RemoteCKAN('http://energydata.uct.ac.za/', apikey=apikey, get_only=True)

    _download_package_tables(ckan, 'dlr-database-tables-94-14', headers,
                             csv_table, feather_table)
    _download_package_tables(ckan, 'dlr-seasonal-adtd-profiles', headers,
                             csv_adtd, feather_adtd)
    return
Beispiel #24
0
def csv_view(request, resource_id, field, search_term):
    """Return the rows matching a search as a CSV attachment.

    The response is named after ``search_term``. Rows are written in chunks
    by ``get_and_write_next_rows`` until the number written reaches the
    total it reports.

    Args:
        request: The incoming HTTP request (not read here).
        resource_id: ID of the CKAN resource to query.
        field: Field the search is applied to.
        search_term: Value searched for; also used as the file name.

    Returns:
        The ``HttpResponse`` holding the CSV content.
    """
    ckan_site = DEFAULT_SITE
    # Create the HttpResponse object with the appropriate CSV header.
    csv_response = HttpResponse(content_type='text/csv')
    csv_response['Content-Disposition'] = 'attachment; filename="{}.csv"'.format(
        search_term)
    csv_writer = csv.writer(csv_response)

    rows_per_chunk = 30000
    position = 0
    client = ckanapi.RemoteCKAN(ckan_site)

    # First chunk.
    rows_written, rows_total = get_and_write_next_rows(
        client, resource_id, field, search_term, csv_writer, rows_per_chunk,
        offset=0, written=0)

    # Remaining chunks, one offset step at a time.
    while rows_written < rows_total:
        position += rows_per_chunk
        rows_written, rows_total = get_and_write_next_rows(
            client, resource_id, field, search_term, csv_writer,
            rows_per_chunk, position, rows_written)

    return csv_response
def main():
    """Command-line entry point that creates a batch of generated CKAN users.

    Parses the command line, builds a ``ckanapi.RemoteCKAN`` client with a
    timestamped user agent, seeds ``random`` with ``--seed`` and then calls
    ``create_user`` once per requested user with the next name taken from
    ``name_generator()``.

    A ``ckanapi.errors.NotAuthorized`` raised while creating users is logged
    as an error and ends the run; it is not re-raised.
    """
    # The description previously read "Generates CKAN datasets", which did not
    # match what this script does (it creates users).
    parser = argparse.ArgumentParser(description='Generates CKAN users')
    parser.add_argument('-b', '--base', default='http://localhost:5000',
                        help='Base URL for CKAN API to post to [default: \'%(default)s\']')
    parser.add_argument('-a', '--apikey', default='tester',
                        help='API key to post with [default: \'%(default)s\']')
    parser.add_argument('-s', '--seed', default=0, type=int,
                        help='Random seed for user name generation [default: %(default)s]')
    parser.add_argument('-o', '--organization', default=None,
                        help='Join created users to organization')
    parser.add_argument('num_users', metavar='N', type=int,
                        help='Number of users')
    args = parser.parse_args()

    # A microsecond-resolution timestamp makes each run's user agent unique.
    user_agent_ident = 'generate_ckan_users-{:%Y%m%d%H%M%S%f}'.format(datetime.datetime.utcnow())

    api = ckanapi.RemoteCKAN(args.base,
                             apikey=args.apikey,
                             user_agent='avoindata_ckanapi_users/1.0 ({0})'.format(user_agent_ident))

    num_users = args.num_users
    organization = args.organization

    # Seed before building the generator.
    # NOTE(review): presumably name_generator() draws on `random`, making the
    # names reproducible for a given seed -- confirm.
    random.seed(args.seed)
    names = name_generator()

    try:
        for _ in range(num_users):
            create_user(api, next(names), organization)
    except ckanapi.errors.NotAuthorized:
        logging.error('ERROR: Not authorized to create users. Check your API key, CKAN configuration and auth functions for user_create')
Beispiel #26
0
def get_package_parameter(site, package_id, parameter, API_key=None):
    """Return one metadata field of a CKAN package.

    Calls the ``package_show`` action for ``package_id`` and returns the
    value stored under ``parameter`` in the resulting metadata dictionary.

    Some package parameters you can fetch from the WPRDC with this function
    are: 'geographic_unit', 'owner_org', 'maintainer', 'data_steward_email',
    'relationships_as_object', 'access_level_comment',
    'frequency_publishing', 'maintainer_email', 'num_tags', 'id',
    'metadata_created', 'group', 'metadata_modified', 'author',
    'author_email', 'state', 'version', 'department', 'license_id',
    'type', 'resources', 'num_resources', 'data_steward_name', 'tags',
    'title', 'frequency_data_change', 'private', 'groups',
    'creator_user_id', 'relationships_as_subject', 'data_notes',
    'name', 'isopen', 'url', 'notes', 'license_title',
    'temporal_coverage', 'related_documents', 'license_url',
    'organization', 'revision_id'.

    Args:
        site: Base URL of the CKAN instance.
        package_id: ID passed to ``package_show``.
        parameter: Key to read from the package metadata.
        API_key: Optional API key forwarded to ``ckanapi.RemoteCKAN``.

    Returns:
        The value of ``parameter`` in the package metadata.

    Raises:
        RuntimeError: If the lookup fails for any reason (connection or API
            error, unknown package, missing key). The original exception is
            attached as the cause.
    """
    try:
        ckan = ckanapi.RemoteCKAN(site, apikey=API_key)
        metadata = ckan.action.package_show(id=package_id)
        desired_string = metadata[parameter]
    # Catch Exception rather than using a bare except, so KeyboardInterrupt
    # and SystemExit propagate instead of being reported as a lookup failure.
    except Exception as err:
        raise RuntimeError(
            "Unable to obtain package parameter '{}' for package with ID {}".
            format(parameter, package_id)) from err
    return desired_string
def get_csv_resources(dataset_name):
    """Yield the URL of each resource of a dataset whose format is "CSV".

    The dataset is looked up on ``CKAN_URL`` with the ``package_show``
    action. Because this is a generator, that request is only made once
    iteration starts. The format comparison is an exact, case-sensitive
    match.

    Args:
        dataset_name: Value passed as ``id`` to ``package_show``.

    Yields:
        The ``url`` entry of every matching resource, in listing order.
    """
    client = ckanapi.RemoteCKAN(CKAN_URL)
    package = client.call_action("package_show", {"id": dataset_name})

    for entry in package["resources"]:
        if entry["format"] != "CSV":
            continue
        yield entry["url"]
Beispiel #28
0
def query_resource(site, query, API_key=None):
    """Run a SQL query through CKAN's ``datastore_search_sql`` action.

    A typical response from the endpoint is a dictionary such as::

        {u'fields': [{u'id': u'_id', u'type': u'int4'},
                     {u'id': u'_full_text', u'type': u'tsvector'},
                     {u'id': u'pin', u'type': u'text'},
                     {u'id': u'number', u'type': u'int4'},
                     {u'id': u'total_amount', u'type': u'float8'}],
         u'records': [{u'_full_text': u"'0001b00010000000':1 '11':2 '13585.47':3",
                       u'_id': 1,
                       u'number': 11,
                       u'pin': u'0001B00010000000',
                       u'total_amount': 13585.47},
                      ...],
         u'sql': u'SELECT * FROM "d1e80180-5b2e-4dab-8ec3-be621628649e" LIMIT 3'}

    Only the ``records`` entry is handed back to the caller.

    Args:
        site: Base URL of the CKAN instance.
        query: SQL text sent as the ``sql`` argument of the action.
        API_key: Optional API key forwarded to ``ckanapi.RemoteCKAN``.

    Returns:
        The value stored under ``'records'`` in the response.
    """
    client = ckanapi.RemoteCKAN(site, apikey=API_key)
    reply = client.action.datastore_search_sql(sql=query)
    return reply['records']
Beispiel #29
0
def main(admin):
    """Dump HDX dataset metadata (with resources) to ``datasets.csv``.

    Pages through the ``current_package_list_with_resources`` action on
    https://data.humdata.org/ in batches of at most 1000 datasets, up to
    10000 in total. Each non-empty batch is flattened with ``flatten_json``
    and ``json_normalize``; the combined table is written to ``datasets.csv``
    in the current working directory.

    Paging stops early as soon as a batch comes back with fewer datasets
    than were requested. An empty batch is only logged at debug level.

    Args:
        admin: When truthy, the API key is read from ``~/.hdxkey`` (newlines
            removed) and sent with the requests; otherwise no key is used.
    """
    # Local import: DataFrame.append, used previously, no longer exists in
    # pandas 2.x, so the batches are combined with concat instead.
    from pandas import concat

    apikey = None
    if admin:
        with open(join(expanduser('~'), '.hdxkey'), 'rt') as f:
            apikey = f.read().replace('\n', '')

    remoteckan = ckanapi.RemoteCKAN('https://data.humdata.org/',
                                    apikey=apikey,
                                    user_agent='hdx-to-csv')
    start = 0
    total_rows = 10000
    page_size = 1000
    frames = []
    # Stepping by page_size yields exactly ceil(total_rows / page_size) pages.
    # The earlier `total_rows // 1000 + 1` added a trailing request with
    # limit=0 whenever total_rows was a multiple of the page size.
    for page_offset in range(0, total_rows, page_size):
        rows = min(total_rows - page_offset, page_size)
        data = {'offset': start + page_offset, 'limit': rows}
        result = remoteckan.call_action('current_package_list_with_resources',
                                        data)
        if result:
            flat = flatten_json(result)
            frames.append(json_normalize(flat))
            # A short page means the listing is exhausted.
            if len(result) < rows:
                break
        else:
            logger.debug(result)

    # Concatenate once at the end instead of growing a frame page by page.
    df = concat(frames) if frames else DataFrame()
    df.to_csv('datasets.csv',
              encoding='utf-8',
              index=False,
              date_format='%Y-%m-%d',
              float_format='%.0f')
Beispiel #30
0
    def extract(self, working_folder, service_manifest):
        """Download the manifest's CKAN resource into the working folder.

        Creates ``<working_folder>/<service_manifest.name>`` if it does not
        exist, looks up the resource ``service_manifest.resource`` on
        https://data.gov.uk and streams the file behind its ``url`` to
        ``<name>.<format>`` (format lower-cased) inside that folder.

        When the ``DEV`` environment variable is set and the target file
        already exists, the download is skipped.

        Args:
            working_folder: Existing directory under which the per-service
                folder is created.
            service_manifest: Object providing ``name`` and ``resource``.

        Returns:
            Path of the target file.

        Raises:
            requests.HTTPError: If the download URL answers with an HTTP
                error status.
        """
        wf = os.path.join(working_folder, service_manifest.name)
        if not os.path.exists(wf):
            os.mkdir(wf)

        # Call-form print with a single argument behaves the same on
        # Python 2 and Python 3; the statement form only parses on Python 2.
        print("  Extracting content to {}".format(wf))

        print("  Fetching resource ({}) metadata".format(
            service_manifest.resource))
        ckan = ckanapi.RemoteCKAN(
            'https://data.gov.uk',
            user_agent='dgu_api_etl/0.1 (+https://data.gov.uk)')
        # NOTE(review): confirm that the action shortcut forwards
        # requests_kwargs to requests rather than sending it as an action
        # parameter.
        resource = ckan.action.resource_show(id=service_manifest.resource,
                                             requests_kwargs={'verify': False})

        target_file = os.path.join(
            wf, service_manifest.name + "." + resource['format'].lower())

        # TODO: Remove this ...
        if os.environ.get('DEV') and os.path.exists(target_file):
            print("  Skipping file during dev")
        else:
            # NOTE(review): verify=False disables TLS certificate checks;
            # kept as in the original, but worth revisiting.
            r = requests.get(resource['url'], stream=True, verify=False)
            try:
                # Fail loudly instead of saving an error page as data.
                r.raise_for_status()
                with open(target_file, 'wb') as f:
                    for chunk in r.iter_content(chunk_size=4096):
                        if chunk:
                            f.write(chunk)
            finally:
                # A streamed response holds its connection until closed.
                r.close()

        return target_file