Beispiel #1
0
    def get_user_info(self, user_id):
        """ GET to CKAN API to read the information of one user
            https://docs.ckan.org/en/2.8/api/#ckan.logic.action.get.user_show

            Params:
             - user_id: value sent as the "id" query argument
            Returns the decoded JSON response. An API answer whose "success"
            is false is logged as an error but still returned.
            Raises:
             - the original requests error if the GET fails
             - Exception if the HTTP status code is 400 or higher
             - the original parsing error if the body is not valid JSON
        """
        url = '{}{}?id={}'.format(self.base_url, self.user_show_url, user_id)
        headers = self.get_request_headers(include_api_key=True)
        # NOTE(review): the headers are requested with include_api_key=True, so
        # this line presumably writes the API key to the log; confirm it is OK
        logger.info(f'GET {url} headers:{headers}')
        try:
            req = requests.get(url, headers=headers)
        except Exception as e:
            error = 'ERROR getting users information: {} [{}]'.format(url, e)
            # leave a trace before propagating the original exception
            logger.error(error)
            raise

        content = req.content

        if req.status_code >= 400:
            error = 'ERROR getting users information: {} \n\t Status code: {} \n\t content:{}'.format(
                url, req.status_code, content)
            logger.error(error)
            raise Exception(error)

        try:
            json_content = json.loads(content)
        except Exception as e:
            error = 'ERROR parsing JSON data from users information {} [{}]'.format(
                content, e)
            logger.error(error)
            raise

        # a failed API answer is only reported, the caller gets the full response
        if not json_content['success']:
            error = 'API response failed: {}'.format(
                json_content.get('error', None))
            logger.error(error)

        return json_content
Beispiel #2
0
    def validate_origin_dataset(self):
        """ Check that the origin dataset has the values required to build a package
            https://docs.ckan.org/en/2.8/api/#ckan.logic.action.create.package_create

            Every problem found is appended to self.errors.
            Returns True when nothing is missing, False otherwise.
        """
        # without an owner organization nothing else is checked
        if self.ckan_owner_org_id is None:
            self.errors.append('Owner organization ID is required')
            return False

        # the mandatory fields depend on the schema in use
        mandatory_fields = []
        if self.schema == 'usmetadata':
            mandatory_fields.extend([
                'accessLevel', 'identifier',
                'contactPoint__fn', 'programCode',
                'bureauCode', 'contactPoint__hasEmail',
                'publisher', 'modified', 'keyword',
            ])

        missing_fields = []
        for field_name in mandatory_fields:
            # identify_origin_element reads the field considering the __ separator
            value = self.identify_origin_element(raw_field=field_name)
            if value not in (None, ''):
                continue
            self.errors.append(
                f'"{field_name}" field could not be empty at origin dataset')
            missing_fields.append(field_name)

        if missing_fields:
            logger.info(f'requires failed on {self.original_dataset}: {self.errors}')
            return False
        return True
Beispiel #3
0
    def show_package(self, ckan_package_id_or_name):
        """ GET to CKAN API to show a package/dataset

            Params:
             - ckan_package_id_or_name: value sent as the "id" query argument
            Returns the decoded JSON response. An API answer whose "success"
            is false is logged as an error but still returned.
            Raises:
             - the original requests error if the GET fails
             - Exception if the HTTP status code is 400 or higher
             - the original parsing error if the body is not valid JSON
        """

        url = '{}{}'.format(self.base_url, self.package_show_url)
        headers = self.get_request_headers(include_api_key=True)
        data = {'id': ckan_package_id_or_name}
        # NOTE(review): the headers are requested with include_api_key=True, so
        # this line presumably writes the API key to the log; confirm it is OK
        logger.info(f'GET {url} headers:{headers} data:{data}')
        try:
            req = requests.get(url, params=data, headers=headers)
        except Exception as e:
            error = 'ERROR showing CKAN package: {} [{}]'.format(url, e)
            # leave a trace before propagating the original exception
            logger.error(error)
            raise

        content = req.content
        if req.status_code >= 400:
            error = 'ERROR showing CKAN package: {} \n\t Status code: {} \n\t content:{}'.format(
                url, req.status_code, content)
            logger.error(error)
            raise Exception(error)

        try:
            json_content = json.loads(content)
        except Exception as e:
            error = 'ERROR parsing JSON data from show_package: {} [{}]'.format(
                content, e)
            logger.error(error)
            raise

        # a failed API answer is only reported, the caller gets the full response
        if not json_content['success']:
            error = 'API response failed: {}'.format(
                json_content.get('error', None))
            logger.error(error)

        return json_content
Beispiel #4
0
    def transform_to_ckan_dataset(self, existing_resources=None):
        """ Build self.ckan_dataset from the "iso_values" of the original dataset.

            existing_resources is accepted but not used here.
            Returns self.ckan_dataset once it passes the final validation.
            Raises Exception if the origin or the final validation fails.
        """

        valid, error = self.validate_origin_dataset()
        if not valid:
            raise Exception(f'Error validating origin dataset: {error}')

        iso_values = self.original_dataset.get('iso_values', {})
        tag_names = clean_tags(iso_values.get('tags', []))
        self.ckan_dataset['tag_string'] = ','.join(tag_names)

        # copy every mapped field from the origin to its CKAN destination
        for source_field, target_field in self.mapped_fields.items():
            logger.debug(f'Connecting fields "{source_field}", "{target_field}"')
            value = self.identify_origin_element(raw_field=source_field)
            if value is None:
                logger.debug(f'No data in origin for "{source_field}"')
                continue
            self.set_destination_element(raw_field=target_field, new_value=value)
            logger.debug(f'Connected OK fields "{source_field}"="{value}"')

        self.infer_resources()
        self.ckan_dataset['resources'] = self.transform_resources()

        # custom changes, applied in this fixed order
        for custom_change in (self.fix_licence_url,
                              self.set_browse_graphic,
                              self.set_temporal_extent,
                              self.set_responsible_party,
                              self.set_bbox):
            custom_change()

        # names are unique in a CKAN instance, generate one when it is missing
        if self.ckan_dataset.get('name', '') == '':
            generated = self.generate_name(title=self.ckan_dataset['title'])
            self.ckan_dataset['name'] = generated

        # mandatory
        self.ckan_dataset['owner_org'] = self.ckan_owner_org_id

        # drop the keys without value, working on a copy of the dataset
        without_empties = self.ckan_dataset.copy()
        empty_keys = [key for key, value in self.ckan_dataset.items()
                      if value is None]
        for key in empty_keys:
            without_empties.pop(key)
        self.ckan_dataset = without_empties

        if not self.validate_final_dataset():
            raise Exception(f'Error validating final dataset: {self.errors} from {self.original_dataset}')

        identifier = self.original_dataset.get('identifier', '')
        logger.info('Dataset transformed {} OK'.format(identifier))
        return self.ckan_dataset
Beispiel #5
0
    def get_xml_tree(self):
        """ Return the XML tree parsed from self.xml_str.

            The tree is parsed on the first call and kept in self.xml_tree;
            later calls return that same object.
            Raises whatever letree.fromstring raises for an invalid document.
        """
        if self.xml_tree is not None:
            return self.xml_tree

        raw = self.xml_str
        if isinstance(raw, str):
            xml_source = raw
        else:
            logger.info('XML_STR is not str, is {}: {}'.format(type(raw), raw))
            if isinstance(raw, (bytes, bytearray)):
                # str(b'<a/>') is the text "b'<a/>'", which is not XML.
                # NOTE(review): letree is presumably lxml.etree, whose
                # fromstring accepts bytes directly; confirm against the imports
                xml_source = bytes(raw)
            else:
                xml_source = str(raw)

        # logger.debug(f'Parsing ISO XML {xml_source}')
        parser = letree.XMLParser(remove_blank_text=True)
        self.xml_tree = letree.fromstring(xml_source, parser=parser)
        return self.xml_tree
Beispiel #6
0
    def delete_all_harvest_sources(self,
                                   harvest_type='harvest',
                                   source_type='datajson'):
        """ Delete every local harvest source returned by search_harvest_packages

            Params:
             - harvest_type, source_type: filters forwarded to the search
            Returns the list with the names of the deleted harvest sources.
            Raises Exception as soon as one deletion is not successful.
        """
        logger.info(f'Deleting local harvest sources from {self.base_url}')
        deleted = []
        for harvest_sources in self.search_harvest_packages(
                harvest_type=harvest_type, source_type=source_type):
            for harvest_source in harvest_sources:

                harvest_source_name = harvest_source['name']
                if harvest_source_name in deleted:
                    # TODO fix duplicated
                    continue

                logger.info(f'Deleting local harvest {harvest_source_name}')
                res = self.delete_package(
                    ckan_package_id_or_name=harvest_source_name)
                if not res['success']:
                    raise Exception(f'Failed to delete {harvest_source_name}')

                logger.info(f'Deleted {harvest_source_name}')
                # "deleted" is a list of names; the previous "deleted += 1"
                # raised TypeError right after the first successful deletion
                deleted.append(harvest_source_name)

        logger.info(f'{len(deleted)} harvest sources deleted')
        return deleted
Beispiel #7
0
    def create_organization(self, organization, check_if_exists=True):
        """ POST to CKAN API to create a new organization
            organization is just a python dict
            https://docs.ckan.org/en/2.8/api/#ckan.logic.action.create.organization_create

            Params:
             - organization: dict sent as JSON; organization['name'] is read
               when check_if_exists is set
             - check_if_exists: ask show_organization first and return its
               answer, without creating anything, when it is successful
            Returns the decoded JSON response. An API answer whose "success"
            is false is logged as an error but still returned.
            Raises:
             - the original requests error if the POST fails
             - Exception if the HTTP status code is 400 or higher
             - the original parsing error if the body is not valid JSON
        """
        logger.info(f'**** Creating Organization {organization}')
        if check_if_exists:
            logger.info(f'Exists Organization? {organization}')
            res = self.show_organization(
                organization_id_or_name=organization['name'])
            if res['success']:
                # do not create
                logger.info(f'Avoid create Organization {organization}')
                return res

        url = '{}{}'.format(self.base_url, self.organization_create_url)
        headers = self.get_request_headers(include_api_key=True)

        headers['Content-Type'] = 'application/json'
        # keep the dict untouched and send its JSON version
        payload = json.dumps(organization)

        # NOTE(review): the headers are requested with include_api_key=True, so
        # this line presumably writes the API key to the log; confirm it is OK
        logger.info(f'POST {url} headers:{headers} data:{payload}')

        try:
            req = requests.post(url, data=payload, headers=headers)
        except Exception as e:
            error = 'ERROR creating [POST] organization: {} [{}]'.format(
                url, e)
            # leave a trace before propagating the original exception
            logger.error(error)
            raise

        content = req.content

        if req.status_code >= 400:

            error = ('ERROR creating [STATUS] organization: {}'
                     '\n\t Status code: {}'
                     '\n\t content:{}'
                     '\n\t Dataset {}'.format(url, req.status_code, content,
                                              payload))
            logger.error(error)
            raise Exception(error)

        try:
            json_content = json.loads(content)
        except Exception as e:
            error = 'ERROR parsing JSON data: {} [{}]'.format(content, e)
            logger.error(error)
            raise

        # a failed API answer is only reported, the caller gets the full response
        if not json_content['success']:
            error = 'API response failed: {}'.format(
                json_content.get('error', None))
            logger.error(error)

        return json_content
Beispiel #8
0
    def import_harvest_sources(
            self,
            catalog_url,
            method='GET',  # depend on CKAN version, GET for older versions
            on_duplicated='DELETE',
            harvest_type='harvest',
            source_type='datajson',
            delete_local_harvest_sources=True):
        """ import harvest sources from another CKAN open data portal

            Params:
             - catalog_url: base URL of the external portal
             - method: HTTP method used to search in the external portal
             - on_duplicated: forwarded to create_harvest_source
             - harvest_type, source_type: filters for the external search and
               for the local cleanup
             - delete_local_harvest_sources: delete the matching local harvest
               sources before importing
            Returns the number of harvest sources created.
            Raises Exception if one harvest source could not be created.
        """

        if delete_local_harvest_sources:
            # forward both filters so the cleanup targets the same kind of
            # sources that are about to be imported
            self.delete_all_harvest_sources(harvest_type=harvest_type,
                                            source_type=source_type)

        logger.info(f'Getting external harvest sources for {catalog_url}')
        external_portal = CKANPortalAPI(base_url=catalog_url)

        total_sources = 0
        search_external = external_portal.search_harvest_packages(
            method=method, harvest_type=harvest_type, source_type=source_type)
        for external_harvest_sources in search_external:
            for external_harvest_source in external_harvest_sources:
                name = external_harvest_source['name']

                organization = external_harvest_source['organization']
                logger.info(f'**** Importing Organization {organization}')
                # copy organization locally without the values assigned by the
                # external portal; a missing key is not an error
                for external_only in ('id', 'created', 'revision_id'):
                    organization.pop(external_only, None)
                self.create_organization(organization=organization)
                # the organization is referenced by its name
                owner_org_id = organization['name']

                config = external_harvest_source.get('config', {})
                logger.info(external_harvest_source)
                res = self.create_harvest_source(
                    title=external_harvest_source['title'],
                    url=external_harvest_source['url'],
                    owner_org_id=owner_org_id,
                    name=name,
                    config=config,
                    notes=external_harvest_source['notes'],
                    source_type=source_type,
                    frequency=external_harvest_source['frequency'],
                    on_duplicated=on_duplicated)

                if not res['success']:
                    raise Exception(f'Failed to import harvest source {name}')

                logger.info(f'Created {name}')
                total_sources += 1

        return total_sources
    def fetch(self, timeout=30):
        """ Download the data.json file from self.url

            The raw body of the response is stored in self.raw_data_json.
            Every failure is appended to self.errors and logged before raising:
             - Exception if there is no URL or the status code is 400 or higher
             - the original requests error if the download fails
        """

        def register(message):
            # every failure is kept in self.errors and sent to the log
            self.errors.append(message)
            logger.error(message)

        logger.info(f'Fetching data from {self.url}')
        if self.url is None:
            error = "No URL defined"
            register(error)
            raise Exception(error)

        try:
            response = requests.get(self.url, timeout=timeout)
        except Exception as e:
            register('ERROR Donwloading data: {} [{}]'.format(self.url, e))
            raise

        status = response.status_code
        logger.info(f'Data fetched status {status}')
        if status >= 400:
            error = '{} HTTP error: {}'.format(self.url, status)
            register(error)
            raise Exception(error)

        logger.info('Data fetched OK')
        self.raw_data_json = response.content
Beispiel #10
0
    def show_organization(
            self,
            organization_id_or_name,
            method='POST'):  # troubles using 2.3 and 2.8 CKAN versions):
        """ Ask the CKAN API to show an organization

            Params:
             - organization_id_or_name: value sent as "id"
             - method: 'POST' sends the id as form data, any other value
               uses GET with the id as query argument
            Returns the decoded JSON response. An API answer whose "success"
            is false is logged as an error but still returned; a 404 status
            does not raise, its body is parsed like any other answer.
            Raises:
             - the original requests error if the request fails
             - Exception if the status code is 400 or higher and is not 404
             - the original parsing error if the body is not valid JSON
        """

        url = '{}{}'.format(self.base_url, self.organization_show_url)
        headers = self.get_request_headers()
        data = {'id': organization_id_or_name}
        # log the HTTP verb that is really used below
        verb = 'POST' if method == 'POST' else 'GET'
        logger.info(f'{verb} {url} headers:{headers} data:{data}')
        try:
            if method == 'POST':
                req = requests.post(url, data=data, headers=headers)
            else:
                req = requests.get(url, params=data, headers=headers)
        except Exception as e:
            error = 'ERROR showing organization: {} [{}]'.format(url, e)
            # leave a trace before propagating the original exception
            logger.error(error)
            raise

        content = req.content

        if req.status_code >= 400 and req.status_code != 404:
            error = 'ERROR showing organization: {} \n\t Status code: {} \n\t content:{}'.format(
                url, req.status_code, content)
            logger.error(error)
            raise Exception(error)

        try:
            json_content = json.loads(content)
        except Exception as e:
            error = 'ERROR parsing JSON data from show_organization: {} [{}]'.format(
                content, e)
            logger.error(error)
            raise

        # a failed API answer is only reported, the caller gets the full response
        if not json_content['success']:
            error = 'API response failed: {}'.format(
                json_content.get('error', None))
            logger.error(error)

        return json_content
Beispiel #11
0
    def update_package(self, ckan_package):
        """ POST to CKAN API to update a package/dataset
            ckan_package is just a python dict
            https://docs.ckan.org/en/2.8/api/#ckan.logic.action.update.package_update

            Returns the decoded JSON response. An API answer whose "success"
            is false is logged as an error but still returned.
            Raises:
             - the original requests error if the POST fails
             - Exception if the HTTP status code is 400 or higher
             - the original parsing error if the body is not valid JSON
        """
        url = '{}{}'.format(self.base_url, self.package_update_url)
        headers = self.get_request_headers(include_api_key=True)

        headers['Content-Type'] = 'application/json'
        # keep the dict untouched and send its JSON version
        payload = json.dumps(ckan_package)

        # NOTE(review): the headers are requested with include_api_key=True, so
        # this line presumably writes the API key to the log; confirm it is OK
        logger.info(f'POST {url} headers:{headers} data:{payload}')
        try:
            req = requests.post(url, data=payload, headers=headers)
        except Exception as e:
            # this is the update call, the message used to say "creating"
            error = 'ERROR updating CKAN package: {} [{}]'.format(url, e)
            logger.error(error)
            raise

        content = req.content

        if req.status_code >= 400:
            error = 'ERROR updating CKAN package: {} \n\t Status code: {} \n\t content:{}'.format(
                url, req.status_code, content)
            logger.error(error)
            raise Exception(error)

        try:
            json_content = json.loads(content)
        except Exception as e:
            error = 'ERROR parsing JSON data: {} [{}]'.format(content, e)
            logger.error(error)
            raise

        # a failed API answer is only reported, the caller gets the full response
        if not json_content['success']:
            error = 'API response failed: {}'.format(
                json_content.get('error', None))
            logger.error(error)

        return json_content
Beispiel #12
0
    def transform_to_ckan_dataset(self, existing_resources=None):
        """ Transform the data.json dataset at self.original_dataset into a CKAN package

            Params:
             - existing_resources: when given, the new resources are combined
               with them through merge_resources (used when updating a dataset)
            Returns the CKAN dataset dict, or None if the origin or the final
            validation fails. Resources that carry an "error" key are moved
            out of the dataset and appended to self.errors.
        """
        # check how to parse
        # https://github.com/GSA/ckanext-datajson/blob/07ca20e0b6dc1898f4ca034c1e073e0c27de2015/ckanext/datajson/parse_datajson.py#L5
        # if we are updating existing dataset we need to merge resources

        logger.info('Transforming data.json dataset {}'.format(self.original_dataset.get('identifier', '')))
        valid = self.validate_origin_dataset()
        if not valid:
            return None

        datajson_dataset = self.original_dataset
        tags = datajson_dataset.get('keyword', [])
        cleaned_tags = clean_tags(tags)
        self.ckan_dataset['tag_string'] = ','.join(cleaned_tags)

        # previous transformations at origin
        for old_field, field_ckan in self.mapped_fields.items():
            logger.debug(f'Connecting fields "{old_field}", "{field_ckan}"')
            # identify origin and set value to destination
            origin = self.identify_origin_element(raw_field=old_field)
            if origin is None:
                logger.debug(f'No data in origin for "{old_field}"')
            else:
                self.set_destination_element(raw_field=field_ckan, new_value=origin)
                logger.debug(f'Connected OK fields "{old_field}"="{origin}"')

        # transform distribution into resources
        distribution = datajson_dataset.get('distribution', [])
        # if _distribution_ is empty then we try to create them from "accessURL" or "webService" URLs
        if distribution is None or distribution == []:
            distribution = self.infer_resources()

        self.ckan_dataset['resources'] = self.transform_resources(distribution)

        # move out the resources with validation errors
        # and log the error as a dataset error
        final_resources = []
        for resource in self.ckan_dataset['resources']:
            if 'error' in resource:
                self.errors.append(resource)
            else:
                final_resources.append(resource)
        self.ckan_dataset['resources'] = final_resources

        if existing_resources is not None:
            res = self.merge_resources(existing_resources=existing_resources, new_resources=self.ckan_dataset['resources'])
            self.ckan_dataset['resources'] = res

        # add custom extras
        # add source_datajson_identifier = {"key": "source_datajson_identifier", "value": True}
        self.set_destination_element(raw_field='extras__source_datajson_identifier', new_value=True)

        # define name (are uniques in CKAN instance)
        if 'name' not in self.ckan_dataset or self.ckan_dataset['name'] == '':
            name = self.generate_name(title=self.ckan_dataset['title'])
            self.ckan_dataset['name'] = name

        # mandatory
        self.ckan_dataset['owner_org'] = self.ckan_owner_org_id

        # check for license: the URL is compared without scheme and without
        # trailing slash against the keys of ckan_settings.LICENCES
        if datajson_dataset.get('license', None) not in [None, '']:
            original_license = datajson_dataset['license']
            original_license = original_license.replace('http://', '')
            original_license = original_license.replace('https://', '')
            original_license = original_license.rstrip('/')
            license_id = ckan_settings.LICENCES.get(original_license, "other-license-specified")
            self.ckan_dataset['license_id'] = license_id

        # define publisher as extras as we expect
        publisher = datajson_dataset.get('publisher', None)
        if publisher is not None:
            publisher_name = publisher.get('name', '')

            # TODO check which place we are going to use
            self.set_extra('publisher', publisher_name)

            parent_publisher = publisher.get('subOrganizationOf', None)
            if parent_publisher is not None:
                # walk up the subOrganizationOf chain collecting the names
                publisher_hierarchy = [publisher_name]
                while parent_publisher:
                    parent_name = parent_publisher.get('name', '')
                    parent_publisher = parent_publisher.get('subOrganizationOf', None)
                    publisher_hierarchy.append(parent_name)

                # the top organization goes first
                publisher_hierarchy.reverse()
                publisher_hierarchy = " > ".join(publisher_hierarchy)
                self.set_extra('publisher_hierarchy', publisher_hierarchy)

        # clean all empty unused values (can't pop keys while iterating)
        ckan_dataset_copy = self.ckan_dataset.copy()
        for k, v in self.ckan_dataset.items():
            if v is None:
                ckan_dataset_copy.pop(k)
        self.ckan_dataset = ckan_dataset_copy

        valid = self.validate_final_dataset()
        # any falsy result is a failed validation; "valid is None" let a
        # False result through and returned the invalid dataset
        if not valid:
            return None

        logger.info('Dataset transformed {} OK'.format(self.original_dataset.get('identifier', '')))
        return ckan_dataset_copy
Beispiel #13
0
    def test_create_harvest_source(self):
        """ Create a harvest source, attach a dataset to it through extras,
            check that both can be read and searched, and delete both.

            It talks to the CKAN instance at CKAN_BASE_URL using CKAN_API_KEY.
        """
        logger.info(f'Creating harvest source from {CKAN_BASE_URL}')
        cpa = CKANPortalAPI(base_url=CKAN_BASE_URL, api_key=CKAN_API_KEY)
        try:
            cpa.delete_all_harvest_sources(harvest_type='harvest',
                                           source_type='datajson')
        except Exception as e:
            # best effort: a failed cleanup must not stop the test
            logger.error(f'Error cleaning previous harvest soures {e}')

        title = 'Energy JSON test {}'.format(random.randint(1, 999999))
        url = 'http://www.energy.gov/data-{}.json'.format(
            random.randint(1, 999999))
        res = cpa.create_harvest_source(
            title=title,
            url=url,
            owner_org_id=CKAN_ORG_ID,
            source_type='datajson',
            notes='Some tests about local harvesting sources creation',
            frequency='WEEKLY')

        self.assertTrue(res['success'])
        harvest_source = res['result']
        logger.info('Created: {}'.format(res['success']))

        # read it
        res = cpa.show_package(ckan_package_id_or_name=harvest_source['id'])
        self.assertTrue(res['success'])
        # NOTE(review): the values below come from the create response, not
        # from the show_package result read just above; confirm the intent
        self.assertEqual(harvest_source['url'], url)
        self.assertEqual(harvest_source['title'], title)
        self.assertEqual(harvest_source['type'], 'harvest')
        self.assertEqual(harvest_source['source_type'], 'datajson')

        # search for it
        results = cpa.search_harvest_packages(rows=1000,
                                              harvest_type='harvest',
                                              source_type='datajson')

        created_ok = False

        for datasets in results:
            for dataset in datasets:
                if dataset['name'] == harvest_source['name']:
                    created_ok = True
                    logger.info('Found!')
                else:
                    logger.info('Other harvest source: {}'.format(
                        dataset['name']))

        self.assertTrue(created_ok)

        # create a dataset with this harvest_soure_id
        dataset_title = 'Dataset number {}'.format(random.randint(1, 999999))
        dataset_name = slugify(dataset_title)
        tags = [{'name': 'tag81'}, {'name': 'tag82'}]

        randval = random.randint(1, 999)
        extras = [
            {
                'key': 'harvest_source_id',
                'value': harvest_source['id']
            },
            {
                'key': 'harvest_source_title',
                'value': harvest_source['title']
            },
            # {'key': 'harvest_object_id', 'value': harvest_source['id']},  # ? not sure
            {
                'key': 'harvest_ng_source_id',
                'value': harvest_source['id']
            },
            {
                'key': 'harvest_ng_source_title',
                'value': harvest_source['title']
            },
            {
                'key': 'try_a_extra',
                'value': randval
            }
        ]

        package = {
            'name': dataset_name,
            'title': dataset_title,
            'owner_org': CKAN_ORG_ID,
            'tags': tags,
            'extras': extras
        }
        res2 = cpa.create_package(ckan_package=package)
        self.assertTrue(res2['success'])
        logger.info('Package with harvest source: {}'.format(res2['success']))

        # read full dataset
        res3 = cpa.show_package(ckan_package_id_or_name=dataset_name)
        self.assertTrue(res3['success'])
        ckan_dataset = res3['result']
        logger.info(
            'Package with harvest source readed: {}'.format(ckan_dataset))

        self.assertIn('extras', ckan_dataset)
        # the extra value is compared as text
        self.assertEqual([str(randval)], [
            extra['value'] for extra in ckan_dataset['extras']
            if extra['key'] == 'try_a_extra'
        ])
        # my custom ID (not connected to a real harvest ID)
        self.assertEqual([harvest_source['id']], [
            extra['value'] for extra in ckan_dataset['extras']
            if extra['key'] == 'harvest_ng_source_id'
        ])

        # check if this package is related to harvest source
        total_datasets_in_source = 0
        datasets_from_source = cpa.search_harvest_packages(
            harvest_source_id=harvest_source['id'])
        connected_ok = False
        for datasets in datasets_from_source:
            for dataset in datasets:
                total_datasets_in_source += 1
                if dataset['name'] == dataset_name:
                    connected_ok = True
                    logger.info('Found!')
                else:
                    # we just expect one dataset
                    error = '{} != {} ------ {}'.format(
                        dataset['name'], dataset_name, dataset)
                    logger.error(error)
                    self.fail(error)

        self.assertTrue(connected_ok)
        self.assertEqual(total_datasets_in_source, 1)
        logger.info(
            f' +++++++++++++ total_datasets_in_source={total_datasets_in_source}'
        )

        # checking the "harvest_source_id" extra the same way fails, the
        # harvest process is more complex than just adding an extra

        # delete both
        logger.info('Delete CKAN package: {}'.format(ckan_dataset['id']))
        res4 = cpa.delete_package(ckan_package_id_or_name=ckan_dataset['id'])
        self.assertTrue(res4['success'])

        logger.info('Delete Harvest source: {}'.format(harvest_source['id']))
        res5 = cpa.delete_package(ckan_package_id_or_name=harvest_source['id'])
        self.assertTrue(res5['success'])
Beispiel #14
0
    def search_harvest_packages(
            self,
            rows=1000,
            method='POST',  # POST work in CKAN 2.8, fails in 2.3
            harvest_source_id=None,  # just one harvest source
            harvest_type=None,  # harvest for harvest sources
            source_type=None):
        """ search harvested packages or harvest sources
            "rows" is the page size.
            You could search for an specific harvest_source_id

            This is a generator: it yields one list of results per page and
            stops at the first page without results. Every page is also added
            to self.package_list and counted in self.total_packages.
            Raises:
             - ValueError if the request fails, the body is not valid JSON or
               the API answers with a false "success"
             - Exception if the HTTP status code is 400 or higher
        """

        start = 0
        url = '{}{}'.format(self.base_url, self.package_search_url)
        page = 0
        # TODO check for a real paginated version
        while True:
            page += 1

            params = {'start': start, 'rows': rows}
            if harvest_source_id is not None:
                # our new extra is working
                params['fq'] = f'+harvest_ng_source_id:"{harvest_source_id}"'

            elif harvest_type is not None:
                # at my local instance I need this.
                # I not sure why, in another public instances is not needed
                params['fq'] = f'+dataset_type:{harvest_type}'
                if source_type is not None:
                    params[
                        'q'] = f'(type:{harvest_type} source_type:{source_type})'
                else:
                    params['q'] = f'(type:{harvest_type})'

            logger.info(
                f'Searching {url} PAGE:{page} start:{start}, rows:{rows} with params: {params}'
            )

            headers = self.get_request_headers()
            try:
                logger.info(f'Search harvest packages via {method}')
                if method == 'POST':  # depend on CKAN version
                    req = requests.post(url, data=params, headers=headers)
                else:
                    req = requests.get(url, params=params, headers=headers)

            except Exception as e:
                error = 'ERROR Donwloading package list: {} [{}]'.format(
                    url, e)
                # keep the details in the log and chained to the new error
                logger.error(error)
                raise ValueError(
                    'Failed to get package list at {}'.format(url)) from e

            content = req.content

            if req.status_code >= 400:
                error = ('ERROR searching CKAN package: {}'
                         '\n\t Status code: {}'
                         '\n\t Params: {}'
                         '\n\t content:{}'.format(url, req.status_code, params,
                                                  content))
                logger.error(error)
                raise Exception(error)

            try:
                json_content = json.loads(content)  # check for encoding errors
            except Exception as e:
                error = 'ERROR parsing JSON data: {} [{}]'.format(content, e)
                raise ValueError(error) from e

            if not json_content['success']:
                error = 'API response failed: {}'.format(
                    json_content.get('error', None))
                raise ValueError(error)

            # only the list of results is needed; the other keys of the answer
            # were read into unused locals and could fail when missing
            results = json_content['result']['results']
            real_results_count = len(results)
            self.total_packages += real_results_count
            logger.info(f'{real_results_count} results')

            if real_results_count == 0:
                # an empty page means there is nothing else to read
                return

            start += rows
            self.package_list += results
            yield results
Beispiel #15
0
    def create_package(
            self,
            ckan_package,
            on_duplicated='RAISE',  # if name already exists: 'RAISE' | 'SKIP' | 'DELETE'
    ):
        """ POST to CKAN API to create a new package/dataset.
            https://docs.ckan.org/en/2.8/api/#ckan.logic.action.create.package_create

            Params:
             - ckan_package (dict): a ready-to-save package. It is serialized
               as JSON and sent as the request body.
             - on_duplicated (str): action to take when CKAN answers 409 and
               the package already exists:
               + RAISE: raise an error
               + SKIP: return the show_package results for the existing one
               + DELETE: remove the package and try to create it again
                 (the retry uses RAISE, so it can not loop)

            Returns:
                The parsed JSON response (dict). When "success" is false the
                error is only logged and the response is still returned.

            Raises:
                Exception: on a duplicated package with on_duplicated='RAISE',
                    when the delete step of 'DELETE' fails, or on any
                    response with status code >= 400.
                Errors from the HTTP request or from JSON parsing are
                re-raised unchanged.
        """
        url = '{}{}'.format(self.base_url, self.package_create_url)
        headers = self.get_request_headers(include_api_key=True)

        headers['Content-Type'] = 'application/json'
        ckan_package_str = json.dumps(ckan_package)

        # NOTE(review): headers were requested with include_api_key=True, so
        # this line presumably writes the API key to the log - confirm and
        # consider redacting it.
        logger.info(f'POST {url} headers:{headers} data:{ckan_package}')

        try:
            req = requests.post(url, data=ckan_package_str, headers=headers)
        except Exception as e:
            error = 'ERROR creating [POST] CKAN package: {} [{}]'.format(
                url, e)
            logger.error(error)
            raise

        content = req.content
        try:
            json_content = json.loads(content)
        except Exception as e:
            error = 'ERROR parsing JSON data: {} [{}]'.format(content, e)
            logger.error(error)
            raise

        if req.status_code == 409:
            logger.info(f'409 json_content: {json_content}')
            # another possible [error] = {'owner_org': ['Organization does not exist']}

            api_errors = json_content.get('error')
            if not isinstance(api_errors, dict):
                api_errors = {}

            # Check for duplicates. Each kind of duplicate is reported under
            # its own key: "name" for datasets, "url" for harvest sources.
            name_errors = api_errors.get('name', [])
            dataset_exists = any(
                "That URL is already in use" in ne for ne in name_errors)

            url_errors = api_errors.get('url', [])
            harvest_exists = any(
                "There already is a Harvest Source for this URL" in ue
                for ue in url_errors)

            if dataset_exists or harvest_exists:
                logger.error(f'Already exists! ACTION: {on_duplicated}')
                if on_duplicated == 'SKIP':
                    # returns {'success': True, 'result': {the package}}
                    res = self.show_package(
                        ckan_package_id_or_name=ckan_package['name'])
                    logger.info(f'Skipped: {res}')
                    return res
                elif on_duplicated == 'DELETE':
                    delr = self.delete_package(
                        ckan_package_id_or_name=ckan_package['name'])
                    if not delr['success']:
                        raise Exception('Failed to delete {}'.format(
                            ckan_package['name']))
                    return self.create_package(ckan_package=ckan_package,
                                               on_duplicated='RAISE')
                elif on_duplicated == 'RAISE':
                    error = ('DUPLICATED CKAN package: {}'
                             '\n\t Status code: {}'
                             '\n\t content:{}'
                             '\n\t Dataset {}'.format(url, req.status_code,
                                                      content, ckan_package))
                    logger.error(error)
                    raise Exception(error)
                # any other on_duplicated value falls through to the
                # generic error handling below

        if req.status_code >= 400:
            error = ('ERROR creating CKAN package: {}'
                     '\n\t Status code: {}'
                     '\n\t content:{}'
                     '\n\t Dataset {}'.format(url, req.status_code, content,
                                              ckan_package))
            logger.error(error)
            raise Exception(error)

        if not json_content['success']:
            error = 'API response failed: {}'.format(
                json_content.get('error', None))
            logger.error(error)

        logger.info(f'Harvest source created: {json_content}')
        return json_content
Beispiel #16
0
    def search_packages(
        self,
        rows=1000,
        method='POST',  # POST works in CKAN 2.8, fails in 2.3
        search_params=None):
        """ Search packages, yielding one page of results at a time.

            Pages are requested with increasing "start" offsets until a
            page comes back empty. Every non-empty page is also appended to
            self.package_list and counted in self.total_packages.

            Params:
             - rows (int): the page size.
             - method (str): 'POST' sends the params as form data, any other
               value sends them as a GET query string.
             - search_params (dict): extra search parameters merged over
               "start" and "rows" (so they can override both). None means
               no extra parameters.

            Yields:
                list: the "results" list of each non-empty page.

            Raises:
                ValueError: if the request fails, the response is not valid
                    JSON or the API reports success=false.
                Exception: if the response status code is >= 400.
        """
        # A None default avoids sharing one dict object between calls.
        if search_params is None:
            search_params = {}

        url = '{}{}'.format(self.base_url, self.package_search_url)
        start = 0
        page = 0
        # TODO check for a real paginated version
        while True:
            page += 1

            params = {'start': start, 'rows': rows}
            params.update(search_params)
            logger.info(
                f'Searching packages {url} PAGE:{page} start:{start}, rows:{rows} with params: {params}'
            )

            headers = self.get_request_headers()
            try:
                if method == 'POST':  # depend on CKAN version
                    req = requests.post(url, data=params, headers=headers)
                else:
                    req = requests.get(url, params=params, headers=headers)
            except Exception as e:
                error = 'ERROR Donwloading package list: {} [{}]'.format(
                    url, e)
                logger.error(error)
                # chain the cause so the original failure is not lost
                raise ValueError(
                    'Failed to get package list at {}'.format(url)) from e

            content = req.content

            if req.status_code >= 400:
                error = ('ERROR searching CKAN package: {}'
                         '\n\t Status code: {}'
                         '\n\t Params: {}'
                         '\n\t content:{}'.format(url, req.status_code, params,
                                                  content))
                logger.error(error)
                raise Exception(error)

            try:
                json_content = json.loads(content)  # check for encoding errors
            except Exception as e:
                error = 'ERROR parsing JSON data: {} [{}]'.format(content, e)
                raise ValueError(error) from e

            if not json_content['success']:
                error = 'API response failed: {}'.format(
                    json_content.get('error', None))
                raise ValueError(error)

            results = json_content['result']['results']
            real_results_count = len(results)
            self.total_packages += real_results_count
            logger.info(f'{real_results_count} results')

            # an empty page marks the end of the result set
            if real_results_count == 0:
                return

            start += rows
            self.package_list += results
            logger.debug(f'datasets found: {results}')
            yield results