Ejemplo n.º 1
0
    def get_user_info(self, user_id):
        """ GET to CKAN API to get the information of one user
            https://docs.ckan.org/en/2.8/api/#ckan.logic.action.get.user_show

            Params:
             - user_id: value sent as the "id" query parameter
            Returns the decoded JSON response. A response with a false
              "success" value is logged as an error but still returned.
            Raises:
             - any exception raised by requests.get (logged and re-raised)
             - Exception if the HTTP status code is 400 or higher
             - any exception raised by json.loads (logged and re-raised)
        """
        url = '{}{}?id={}'.format(self.base_url, self.user_show_url, user_id)
        headers = self.get_request_headers(include_api_key=True)
        logger.info(f'GET {url} headers:{headers}')
        try:
            req = requests.get(url, headers=headers)
        except Exception as e:
            error = 'ERROR getting users information: {} [{}]'.format(url, e)
            # leave a trace with the URL before the original error propagates
            logger.error(error)
            raise

        content = req.content

        if req.status_code >= 400:
            error = 'ERROR getting users information: {} \n\t Status code: {} \n\t content:{}'.format(
                url, req.status_code, content)
            logger.error(error)
            raise Exception(error)

        try:
            json_content = json.loads(content)
        except Exception as e:
            error = 'ERROR parsing JSON data from users information {} [{}]'.format(
                content, e)
            logger.error(error)
            raise

        if not json_content['success']:
            # failed API responses are reported but handed back to the caller
            error = 'API response failed: {}'.format(
                json_content.get('error', None))
            logger.error(error)

        return json_content
Ejemplo n.º 2
0
    def show_package(self, ckan_package_id_or_name):
        """ GET to CKAN API to show a package/dataset

            Params:
             - ckan_package_id_or_name: value sent as the "id" parameter
            Returns the decoded JSON response. A response with a false
              "success" value is logged as an error but still returned.
            Raises:
             - any exception raised by requests.get (logged and re-raised)
             - Exception if the HTTP status code is 400 or higher
             - any exception raised by json.loads (logged and re-raised)
        """

        url = '{}{}'.format(self.base_url, self.package_show_url)
        headers = self.get_request_headers(include_api_key=True)
        data = {'id': ckan_package_id_or_name}
        logger.info(f'GET {url} headers:{headers} data:{data}')
        try:
            req = requests.get(url, params=data, headers=headers)
        except Exception as e:
            error = 'ERROR showing CKAN package: {} [{}]'.format(url, e)
            # leave a trace with the URL before the original error propagates
            logger.error(error)
            raise

        content = req.content
        if req.status_code >= 400:
            error = 'ERROR showing CKAN package: {} \n\t Status code: {} \n\t content:{}'.format(
                url, req.status_code, content)
            logger.error(error)
            raise Exception(error)

        try:
            json_content = json.loads(content)
        except Exception as e:
            error = 'ERROR parsing JSON data from show_package: {} [{}]'.format(
                content, e)
            logger.error(error)
            raise

        if not json_content['success']:
            # failed API responses are reported but handed back to the caller
            error = 'API response failed: {}'.format(
                json_content.get('error', None))
            logger.error(error)

        return json_content
Ejemplo n.º 3
0
    def validate(self, validator_schema):
        """ Validate the data.json source.
            We need to know which validator to use. The jsonschema
            definition for the full data.json is read from
                ./validation/schemas/{validator_schema}/catalog.json
            (when that file does not exist the schema check is skipped).

            Params:
             - validator_schema (str): a key of VALID_DATAJSON_SCHEMAS
            Returns True if the data is valid, False otherwise
              (each problem found is appended to self.errors).
            Raises Exception if validator_schema is unknown.
            Side effects: sets self.schema_version and, when
              self.raw_data_json is not None, self.data_json.
            """
        if validator_schema not in VALID_DATAJSON_SCHEMAS:
            raise Exception(f'Unknown validator_schema {validator_schema}')

        self.schema_version = VALID_DATAJSON_SCHEMAS[validator_schema]

        # Raw content is only present when something was downloaded/read;
        # when data_json was given directly there is nothing to parse.
        if self.raw_data_json is not None:
            try:
                self.data_json = json.loads(self.raw_data_json)
            except Exception as e:
                error = 'ERROR parsing JSON: {}. Data: {}'.format(
                    e, self.raw_data_json)
                self.errors.append(error)
                logger.error(error)
                return False

        error = None

        if self.data_json is None:
            error = 'No data json available'
        elif isinstance(self.data_json, list):
            error = 'Data.json is a simple list. We expect a dict'

        if error is not None:
            self.errors.append(error)
            logger.error(error)
            return False

        # validate with json schema
        schemas_folder = os.path.join(os.path.dirname(__file__), 'validation',
                                      'schemas', validator_schema)
        catalog_schema = os.path.join(schemas_folder, 'catalog.json')

        if os.path.isfile(catalog_schema):
            # context manager so the schema file is always closed
            with open(catalog_schema, 'r') as f:
                schema = json.load(f)

            try:
                jss.validate(instance=self.data_json, schema=schema)
            except Exception as e:
                error = "Error validating catalog: {} with schema {}".format(
                    e, schema)
                self.errors.append(error)
                # log it like every other failure path of this method
                logger.error(error)
                return False

        return True
Ejemplo n.º 4
0
    def fetch(self, clean_url=True, timeout=120):
        """ Connect to the CSW source and read its information.

            Params:
             - clean_url: when true use self.get_cleaned_url(),
               otherwise use self.url as it is
             - timeout: passed to CatalogueServiceWeb
            On connection failure the error is appended to self.errors,
            logged, and the original exception is re-raised.
        """
        if clean_url:
            csw_url = self.get_cleaned_url()
        else:
            csw_url = self.url

        try:
            self.csw = CatalogueServiceWeb(csw_url, timeout=timeout)
        except Exception as exc:
            message = f'Error connection CSW: {exc}'
            self.errors.append(message)
            logger.error(message)
            raise

        # connection is ready, load the service information
        self.read_csw_info()
Ejemplo n.º 5
0
def clean_tags(tags):
    """ Clean a list of tag names.

        For each tag: remove every character that is not a letter, digit,
        whitespace, "_", "-", "!" or "?", strip the result, cut it to
        settings.MAX_TAG_NAME_LENGTH, pad it with "_" up to
        settings.MIN_TAG_NAME_LENGTH, lowercase it and replace spaces
        with "-".
        Tags that are empty after cleaning are dropped (they are not
        padded into a tag made only of underscores).

        Params:
         - tags: iterable of strings
        Returns a list with the cleaned tags.
    """
    ret = []
    pattern = re.compile(r'[^A-Za-z0-9\s_\-!?]+')

    for tag in tags:
        tag = pattern.sub('', tag).strip()
        if tag == '':
            # nothing usable left: skip it BEFORE the padding step,
            # otherwise the emptiness check below could never be true
            continue

        if len(tag) > settings.MAX_TAG_NAME_LENGTH:
            logger.error('tag is long, cutting: {}'.format(tag))
            tag = tag[:settings.MAX_TAG_NAME_LENGTH]
        elif len(tag) < settings.MIN_TAG_NAME_LENGTH:
            logger.error('tag is short: {}'.format(tag))
            tag += '_' * (settings.MIN_TAG_NAME_LENGTH - len(tag))

        if tag != '':  # still possible if the cut left nothing
            ret.append(tag.lower().replace(' ', '-'))  # copying CKAN behaviour
    return ret
Ejemplo n.º 6
0
    def _is_wms(self, url):
        '''
        Checks if the provided URL actually points to a Web Map Service.
        Uses owslib WMS reader to parse the response.

        Returns True when the parsed service exposes a non-empty dict of
        contents. Any exception (request, parsing) is logged and the
        result is False.
        '''

        try:
            capabilities_url = wms.WMSCapabilitiesReader().capabilities_url(url)
            res = requests.get(capabilities_url, timeout=10)
            xml = res.text

            s = wms.WebMapService(url, xml=xml)
            # NOTE: a leftover debugging "raise" used to sit here and made
            # this method always return False.
            return isinstance(s.contents, dict) and s.contents != {}
        except Exception as e:
            logger.error('WMS check for %s failed with exception: %s' % (url, str(e)))
        return False
Ejemplo n.º 7
0
    def show_organization(
            self,
            organization_id_or_name,
            method='POST'):  # troubles using 2.3 and 2.8 CKAN versions
        """ Ask the CKAN API to show an organization

            Params:
             - organization_id_or_name: value sent as the "id" parameter
             - method (str): 'POST' sends a POST request, any other value
               sends a GET request
            Returns the decoded JSON response. A 404 status is not treated
              as an error here: its body is parsed and returned like any
              other response. A false "success" value is logged as an
              error but the response is still returned.
            Raises:
             - any exception raised by requests (logged and re-raised)
             - Exception if the status code is 400 or higher (except 404)
             - any exception raised by json.loads (logged and re-raised)
        """

        url = '{}{}'.format(self.base_url, self.organization_show_url)
        headers = self.get_request_headers()
        data = {'id': organization_id_or_name}
        # log the method really requested (it used to say POST always)
        logger.info(f'{method} {url} headers:{headers} data:{data}')
        try:
            if method == 'POST':
                req = requests.post(url, data=data, headers=headers)
            else:
                req = requests.get(url, params=data, headers=headers)
        except Exception as e:
            error = 'ERROR showing organization: {} [{}]'.format(url, e)
            logger.error(error)
            raise

        content = req.content

        if req.status_code >= 400 and req.status_code != 404:
            error = 'ERROR showing organization: {} \n\t Status code: {} \n\t content:{}'.format(
                url, req.status_code, content)
            logger.error(error)
            raise Exception(error)

        try:
            json_content = json.loads(content)
        except Exception as e:
            error = 'ERROR parsing JSON data from show_organization: {} [{}]'.format(
                content, e)
            logger.error(error)
            raise

        if not json_content['success']:
            error = 'API response failed: {}'.format(
                json_content.get('error', None))
            logger.error(error)

        return json_content
Ejemplo n.º 8
0
    def fetch(self, timeout=30):
        """ Download the data.json file from self.url.

            On success the raw response body is stored in
            self.raw_data_json. Every failure is appended to self.errors
            and logged before raising:
             - no URL defined -> Exception
             - request failure -> the original exception is re-raised
             - HTTP status 400 or higher -> Exception
        """
        source_url = self.url
        logger.info(f'Fetching data from {source_url}')

        if source_url is None:
            message = "No URL defined"
            self.errors.append(message)
            logger.error(message)
            raise Exception(message)

        try:
            response = requests.get(source_url, timeout=timeout)
        except Exception as exc:
            message = 'ERROR Donwloading data: {} [{}]'.format(source_url, exc)
            self.errors.append(message)
            logger.error(message)
            raise

        status = response.status_code
        logger.info(f'Data fetched status {status}')

        if status >= 400:
            message = '{} HTTP error: {}'.format(source_url, status)
            self.errors.append(message)
            logger.error(message)
            raise Exception(message)

        logger.info('Data fetched OK')
        self.raw_data_json = response.content
Ejemplo n.º 9
0
    def delete_package(self, ckan_package_id_or_name):
        """ POST to CKAN API to delete a package/dataset
            https://docs.ckan.org/en/2.8/api/#ckan.logic.action.delete.package_delete

            Params:
             - ckan_package_id_or_name: value sent as the "id" parameter
            Returns the decoded JSON response. A response with a false
              "success" value is logged as an error but still returned.
            Raises:
             - any exception raised by requests.post (logged and re-raised)
             - Exception if the HTTP status code is 400 or higher
             - any exception raised by json.loads (logged and re-raised)
        """
        url = '{}{}'.format(self.base_url, self.package_delete_url)
        headers = self.get_request_headers(include_api_key=True)
        data = {'id': ckan_package_id_or_name}
        # plain request trace: info level, same as the other API calls
        logger.info(f'POST {url} headers:{headers} data:{data}')
        try:
            req = requests.post(url, data=data, headers=headers)
        except Exception as e:
            error = 'ERROR deleting CKAN package: {} [{}]'.format(url, e)
            logger.error(error)
            raise

        content = req.content

        if req.status_code >= 400:
            error = 'ERROR deleting CKAN package: {} \n\t Status code: {} \n\t content:{}'.format(
                url, req.status_code, content)
            logger.error(error)
            raise Exception(error)

        try:
            json_content = json.loads(content)
        except Exception as e:
            error = 'ERROR parsing JSON data from delete_package: {} [{}]'.format(
                content, e)
            logger.error(error)
            raise

        if not json_content['success']:
            error = 'API response failed: {}'.format(
                json_content.get('error', None))
            logger.error(error)

        return json_content
Ejemplo n.º 10
0
    def validate(self, validator_schema):
        """ Validate self.data as a single dataset.

            The jsonschema definition is read from
                ./validation/schemas/{validator_schema}/dataset.json
            (when that file does not exist the schema check is skipped).
            For the 'federal-v1.1' and 'federal' schemas the bureau code
            is validated too (self.validate_bureau_code).

            Returns True if the dataset is valid, False otherwise.
            Schema errors are appended to self.errors and logged.
        """

        schemas_folder = os.path.join(os.path.dirname(__file__), 'validation',
                                      'schemas', validator_schema)
        dataset_schema = os.path.join(schemas_folder, 'dataset.json')
        if os.path.isfile(dataset_schema):
            # context manager so the schema file is always closed
            with open(dataset_schema, 'r') as f:
                schema = json.load(f)

            try:
                jss.validate(self.data, schema=schema)
            except Exception as e:
                error = "Error validating dataset: {}".format(e)
                self.errors.append(error)
                logger.error(error)
                return False

        if validator_schema in ['federal-v1.1', 'federal']:
            if not self.validate_bureau_code():
                return False

        return True
Ejemplo n.º 11
0
    def update_package(self, ckan_package):
        """ POST to CKAN API to update a package/dataset
            ckan_package is just a python dict
            https://docs.ckan.org/en/2.8/api/#ckan.logic.action.update.package_update

            The dict is serialized with json.dumps and sent with a
            "Content-Type: application/json" header.
            Returns the decoded JSON response. A response with a false
              "success" value is logged as an error but still returned.
            Raises:
             - any exception raised by requests.post (logged and re-raised)
             - Exception if the HTTP status code is 400 or higher
             - any exception raised by json.loads (logged and re-raised)
        """
        url = '{}{}'.format(self.base_url, self.package_update_url)
        headers = self.get_request_headers(include_api_key=True)

        headers['Content-Type'] = 'application/json'
        ckan_package = json.dumps(ckan_package)

        logger.info(f'POST {url} headers:{headers} data:{ckan_package}')
        try:
            req = requests.post(url, data=ckan_package, headers=headers)
        except Exception as e:
            # this is an update: the message used to say "creating"
            error = 'ERROR updating CKAN package: {} [{}]'.format(url, e)
            logger.error(error)
            raise

        content = req.content

        if req.status_code >= 400:
            error = 'ERROR updateing CKAN package: {} \n\t Status code: {} \n\t content:{}'.format(
                url, req.status_code, content)
            logger.error(error)
            raise Exception(error)

        try:
            json_content = json.loads(content)
        except Exception as e:
            error = 'ERROR parsing JSON data: {} [{}]'.format(content, e)
            logger.error(error)
            raise

        if not json_content['success']:
            error = 'API response failed: {}'.format(
                json_content.get('error', None))
            logger.error(error)

        return json_content
Ejemplo n.º 12
0
    def create_organization(self, organization, check_if_exists=True):
        """ POST to CKAN API to create a new organization
            organization is just a python dict
            https://docs.ckan.org/en/2.8/api/#ckan.logic.action.create.organization_create

            Params:
             - organization (dict): serialized with json.dumps and sent
               as JSON. Its "name" key is read when check_if_exists is on.
             - check_if_exists: when true, self.show_organization is
               called first and, if it reports success, its response is
               returned and nothing is created.
            Returns the decoded JSON response. A response with a false
              "success" value is logged as an error but still returned.
            Raises:
             - any exception raised by requests.post (logged and re-raised)
             - Exception if the HTTP status code is 400 or higher
             - any exception raised by json.loads (logged and re-raised)
        """
        logger.info(f'**** Creating Organization {organization}')
        if check_if_exists:
            logger.info(f'Exists Organization? {organization}')
            res = self.show_organization(
                organization_id_or_name=organization['name'])
            if res['success']:
                # do not create
                logger.info(f'Avoid create Organization {organization}')
                return res

        url = '{}{}'.format(self.base_url, self.organization_create_url)
        headers = self.get_request_headers(include_api_key=True)

        headers['Content-Type'] = 'application/json'
        organization = json.dumps(organization)

        logger.info(f'POST {url} headers:{headers} data:{organization}')

        try:
            req = requests.post(url, data=organization, headers=headers)
        except Exception as e:
            error = 'ERROR creating [POST] organization: {} [{}]'.format(
                url, e)
            # log it, as the JSON failure path below already does
            logger.error(error)
            raise

        content = req.content

        if req.status_code >= 400:

            error = ('ERROR creating [STATUS] organization: {}'
                     '\n\t Status code: {}'
                     '\n\t content:{}'
                     '\n\t Dataset {}'.format(url, req.status_code, content,
                                              organization))
            logger.error(error)
            raise Exception(error)

        try:
            json_content = json.loads(content)
        except Exception as e:
            error = 'ERROR parsing JSON data: {} [{}]'.format(content, e)
            logger.error(error)
            raise

        if not json_content['success']:
            error = 'API response failed: {}'.format(
                json_content.get('error', None))
            logger.error(error)

        return json_content
Ejemplo n.º 13
0
    def test_create_harvest_source(self):
        """ Integration test against the CKAN instance at CKAN_BASE_URL.

            Steps, in order:
             1. delete previous 'datajson' harvest sources (best effort)
             2. create a harvest source with a random title and URL
             3. read it with show_package and search for it
             4. create a dataset whose extras point to the harvest source
             5. read the dataset back and check its extras
             6. search the datasets of the harvest source (expects just one)
             7. delete the dataset and the harvest source
        """
        logger.info(f'Creating harvest source from {CKAN_BASE_URL}')
        cpa = CKANPortalAPI(base_url=CKAN_BASE_URL, api_key=CKAN_API_KEY)
        # best-effort clean-up: a failure here is logged and ignored
        try:
            cpa.delete_all_harvest_sources(harvest_type='harvest',
                                           source_type='datajson')
        except Exception as e:
            logger.error(f'Error cleaning previous harvest soures {e}')
            pass

        # random title/URL so each run uses different values
        title = 'Energy JSON test {}'.format(random.randint(1, 999999))
        url = 'http://www.energy.gov/data-{}.json'.format(
            random.randint(1, 999999))
        res = cpa.create_harvest_source(
            title=title,
            url=url,
            owner_org_id=CKAN_ORG_ID,
            source_type='datajson',
            notes='Some tests about local harvesting sources creation',
            frequency='WEEKLY')

        self.assertTrue(res['success'])
        harvest_source = res['result']
        logger.info('Created: {}'.format(res['success']))

        # read it
        # NOTE(review): only res['success'] comes from this read; the
        # assertEqual calls below inspect harvest_source, which is the
        # result of the create call above.
        res = cpa.show_package(ckan_package_id_or_name=harvest_source['id'])
        self.assertTrue(res['success'])
        self.assertEqual(harvest_source['url'], url)
        self.assertEqual(harvest_source['title'], title)
        self.assertEqual(harvest_source['type'], 'harvest')
        self.assertEqual(harvest_source['source_type'], 'datajson')

        # search for it
        results = cpa.search_harvest_packages(rows=1000,
                                              harvest_type='harvest',
                                              source_type='datajson')

        created_ok = False

        # the search yields lists (pages) of datasets
        for datasets in results:
            for dataset in datasets:
                # print('FOUND: {}'.format(dataset['name']))
                if dataset['name'] == harvest_source['name']:
                    created_ok = True
                    logger.info('Found!')
                else:
                    logger.info('Other harvest source: {}'.format(
                        dataset['name']))

        assert created_ok == True

        # create a dataset with this harvest_source_id
        dataset_title = 'Dataset number {}'.format(random.randint(1, 999999))
        dataset_name = slugify(dataset_title)
        tags = [{'name': 'tag81'}, {'name': 'tag82'}]

        randval = random.randint(1, 999)
        extras = [
            {
                'key': 'harvest_source_id',
                'value': harvest_source['id']
            },
            {
                'key': 'harvest_source_title',
                'value': harvest_source['title']
            },
            # {'key': 'harvest_object_id', 'value': harvest_source['id']},  # ? not sure
            {
                'key': 'harvest_ng_source_id',
                'value': harvest_source['id']
            },
            {
                'key': 'harvest_ng_source_title',
                'value': harvest_source['title']
            },
            {
                'key': 'try_a_extra',
                'value': randval
            }
        ]

        package = {
            'name': dataset_name,
            'title': dataset_title,
            'owner_org': CKAN_ORG_ID,
            'tags': tags,
            'extras': extras
        }
        res2 = cpa.create_package(ckan_package=package)
        self.assertTrue(res2['success'])
        logger.info('Package with harvest source: {}'.format(res2['success']))

        # read full dataset
        res3 = cpa.show_package(ckan_package_id_or_name=dataset_name)
        self.assertTrue(res3['success'])
        ckan_dataset = res3['result']
        logger.info(
            'Package with harvest source readed: {}'.format(ckan_dataset))

        assert 'extras' in ckan_dataset
        # the extra was sent as an int; the comparison expects it back
        # as a string
        assert [str(randval)] == [
            extra['value'] for extra in ckan_dataset['extras']
            if extra['key'] == 'try_a_extra'
        ]
        # my custom ID (not connected to a real harvest ID)
        assert [harvest_source['id']] == [
            extra['value'] for extra in ckan_dataset['extras']
            if extra['key'] == 'harvest_ng_source_id'
        ]

        # check if this package is related to harvest source
        total_datasets_in_source = 0
        datasets_from_source = cpa.search_harvest_packages(
            harvest_source_id=harvest_source['id'])
        connected_ok = False
        for datasets in datasets_from_source:
            for dataset in datasets:
                total_datasets_in_source += 1
                if dataset['name'] == dataset_name:
                    connected_ok = True
                    logger.info('Found!')
                else:
                    # we just expect one dataset
                    error = '{} != {} ------ {}'.format(
                        dataset['name'], dataset_name, dataset)
                    logger.error(error)
                    # a string never equals False: this always fails,
                    # aborting the test on any unexpected dataset
                    assert error == False

        assert connected_ok == True
        assert total_datasets_in_source == 1
        logger.info(
            f' +++++++++++++ total_datasets_in_source={total_datasets_in_source}'
        )

        # this fails, the harvest process is more complex than just adding an extra
        # assert [harvest_source['id']] == [extra['value'] for extra in ckan_dataset['extras'] if extra['key'] == 'harvest_source_id']

        # delete both
        logger.info('Delete CKAN package: {}'.format(ckan_dataset['id']))
        res4 = cpa.delete_package(ckan_package_id_or_name=ckan_dataset['id'])
        self.assertTrue(res4['success'])

        logger.info('Delete Harvest source: {}'.format(harvest_source['id']))
        res5 = cpa.delete_package(ckan_package_id_or_name=harvest_source['id'])
        self.assertTrue(res5['success'])
Ejemplo n.º 14
0
    def create_package(
            self,
            ckan_package,
            on_duplicated='RAISE',  # if name already exists 'RAISE' 'SKIP' | 'DELETE'
    ):
        """ POST to CKAN API to create a new package/dataset
            ckan_package is just a python dict
            https://docs.ckan.org/en/2.8/api/#ckan.logic.action.create.package_create
            Params:
             - ckan_package: a dict with a ready-to-save package
             - on_duplicated (str): action to take where the package already exists:
               + RAISE: raise an error
               + SKIP: returns show_package results
               + DELETE: remove the package and try to create again
            Returns the decoded JSON response (or the show_package
              response when a duplicate is skipped).
            Raises:
             - any exception raised by requests.post (logged and re-raised)
             - any exception raised by json.loads (logged and re-raised);
               the body is parsed before the status code is checked
             - Exception for duplicates with on_duplicated='RAISE', when
               deleting a duplicate fails, and for any status code of
               400 or higher not handled as a duplicate
        """
        url = '{}{}'.format(self.base_url, self.package_create_url)
        headers = self.get_request_headers(include_api_key=True)

        headers['Content-Type'] = 'application/json'
        ckan_package_str = json.dumps(ckan_package)

        logger.info(f'POST {url} headers:{headers} data:{ckan_package}')

        try:
            req = requests.post(url, data=ckan_package_str, headers=headers)
        except Exception as e:
            error = 'ERROR creating [POST] CKAN package: {} [{}]'.format(
                url, e)
            logger.error(error)
            raise

        content = req.content
        try:
            json_content = json.loads(content)
        except Exception as e:
            error = 'ERROR parsing JSON data: {} [{}]'.format(content, e)
            logger.error(error)
            raise

        if req.status_code == 409:
            logger.info(f'409 json_content: {json_content}')
            # another possible [error] = {'owner_org': ['Organization does not exist']}
            api_errors = json_content['error']

            # Check for duplicates.
            # The "name" errors must be read from the "name" key: reading
            # them from "url" raised KeyError when only "name" was reported
            # and never looked at the real name errors.
            name_errors = api_errors['name'] if 'name' in api_errors else []
            dataset_exists = any(
                "That URL is already in use" in ne for ne in name_errors)

            url_errors = api_errors['url'] if 'url' in api_errors else []
            harvest_exists = any(
                "There already is a Harvest Source for this URL" in ue
                for ue in url_errors)

            is_duplicated = dataset_exists or harvest_exists
            if is_duplicated:
                logger.error(f'Already exists! ACTION: {on_duplicated}')
                if on_duplicated == 'SKIP':
                    # returns {'success': True, 'result': {the package}}
                    res = self.show_package(
                        ckan_package_id_or_name=ckan_package['name'])
                    logger.info(f'Skipped: {res}')
                    return res
                elif on_duplicated == 'DELETE':
                    delr = self.delete_package(
                        ckan_package_id_or_name=ckan_package['name'])
                    if not delr['success']:
                        raise Exception('Failed to delete {}'.format(
                            ckan_package['name']))
                    # second attempt raises if it is still duplicated,
                    # so this can not loop forever
                    return self.create_package(ckan_package=ckan_package,
                                               on_duplicated='RAISE')
                elif on_duplicated == 'RAISE':
                    error = ('DUPLICATED CKAN package: {}'
                             '\n\t Status code: {}'
                             '\n\t content:{}'
                             '\n\t Dataset {}'.format(url, req.status_code,
                                                      content, ckan_package))
                    logger.error(error)
                    raise Exception(error)

        # any other error (including a 409 not handled above)
        if req.status_code >= 400:
            error = ('ERROR creating CKAN package: {}'
                     '\n\t Status code: {}'
                     '\n\t content:{}'
                     '\n\t Dataset {}'.format(url, req.status_code, content,
                                              ckan_package))
            logger.error(error)
            raise Exception(error)

        if not json_content['success']:
            error = 'API response failed: {}'.format(
                json_content.get('error', None))
            logger.error(error)

        logger.info(f'Harvest source created: {json_content}')
        return json_content
Ejemplo n.º 15
0
    def search_packages(
        self,
        rows=1000,
        method='POST',  # POST work in CKAN 2.8, fails in 2.3
        search_params=None):  # extra parameters merged into each request
        """ search packages.
            "rows" is the page size.

            Generator: yields one list of results per page and stops at
            the first page without results.
            Params:
             - rows: page size, also used to advance the "start" offset
             - method (str): 'POST' sends a POST request, any other value
               sends a GET request
             - search_params (dict): extra request parameters, they
               override "start" and "rows" when the same keys are given
            Side effects: each page is added to self.package_list and its
              size to self.total_packages.
            Raises:
             - ValueError if the request fails, the body is not valid
               JSON or the API response has a false "success" value
             - Exception if the HTTP status code is 400 or higher
            """
        if search_params is None:
            # None instead of a shared mutable {} default
            search_params = {}

        start = 0

        url = '{}{}'.format(self.base_url, self.package_search_url)
        page = 0
        # TODO check for a real paginated version
        while url:
            page += 1

            params = {'start': start, 'rows': rows}
            params.update(search_params)
            logger.info(
                f'Searching packages {url} PAGE:{page} start:{start}, rows:{rows} with params: {params}'
            )

            headers = self.get_request_headers()
            try:
                if method == 'POST':  # depend on CKAN version
                    req = requests.post(url, data=params, headers=headers)
                else:
                    req = requests.get(url, params=params, headers=headers)

            except Exception as e:
                error = 'ERROR Donwloading package list: {} [{}]'.format(
                    url, e)
                logger.error(error)
                raise ValueError(
                    'Failed to get package list at {}'.format(url)) from e

            content = req.content

            if req.status_code >= 400:
                error = ('ERROR searching CKAN package: {}'
                         '\n\t Status code: {}'
                         '\n\t Params: {}'
                         '\n\t content:{}'.format(url, req.status_code, params,
                                                  content))
                logger.error(error)
                raise Exception(error)

            try:
                json_content = json.loads(content)  # check for encoding errors
            except Exception as e:
                error = 'ERROR parsing JSON data: {} [{}]'.format(content, e)
                raise ValueError(error) from e

            if not json_content['success']:
                error = 'API response failed: {}'.format(
                    json_content.get('error', None))
                raise ValueError(error)

            result = json_content['result']
            results = result['results']
            real_results_count = len(results)
            self.total_packages += real_results_count
            logger.info(f'{real_results_count} results')

            if real_results_count == 0:
                # an empty page ends the pagination
                url = None
            else:
                start += rows
                self.package_list += results
                logger.debug(f'datasets found: {results}')
                yield (results)
Ejemplo n.º 16
0
    def search_harvest_packages(
            self,
            rows=1000,
            method='POST',  # POST work in CKAN 2.8, fails in 2.3
            harvest_source_id=None,  # just one harvest source
            harvest_type=None,  # harvest for harvest sources
            source_type=None):
        """ search harvested packages or harvest sources
            "rows" is the page size.
            You could search for an specific harvest_source_id

            Generator: yields one list of results per page and stops at
            the first page without results.
            Params:
             - rows: page size, also used to advance the "start" offset
             - method (str): 'POST' sends a POST request, any other value
               sends a GET request
             - harvest_source_id: when given, filter by the
               "harvest_ng_source_id" extra (harvest_type and source_type
               are ignored in that case)
             - harvest_type: when given, filter by dataset type
             - source_type: only used together with harvest_type
            Side effects: each page is added to self.package_list and its
              size to self.total_packages.
            Raises:
             - ValueError if the request fails, the body is not valid
               JSON or the API response has a false "success" value
             - Exception if the HTTP status code is 400 or higher
            """

        start = 0

        url = '{}{}'.format(self.base_url, self.package_search_url)
        page = 0
        # TODO check for a real paginated version
        while url:
            page += 1

            # no explicit "sort" is sent
            params = {'start': start, 'rows': rows}
            if harvest_source_id is not None:
                # our new extra is working
                params['fq'] = f'+harvest_ng_source_id:"{harvest_source_id}"'

            elif harvest_type is not None:
                # at my local instance I need this.
                # I'm not sure why, in other public instances it is not needed
                params['fq'] = f'+dataset_type:{harvest_type}'
                if source_type is not None:
                    params[
                        'q'] = f'(type:{harvest_type} source_type:{source_type})'
                else:
                    params['q'] = f'(type:{harvest_type})'

            logger.info(
                f'Searching {url} PAGE:{page} start:{start}, rows:{rows} with params: {params}'
            )

            headers = self.get_request_headers()
            try:
                logger.info(f'Search harvest packages via {method}')
                if method == 'POST':  # depend on CKAN version
                    req = requests.post(url, data=params, headers=headers)
                else:
                    req = requests.get(url, params=params, headers=headers)

            except Exception as e:
                error = 'ERROR Donwloading package list: {} [{}]'.format(
                    url, e)
                logger.error(error)
                raise ValueError(
                    'Failed to get package list at {}'.format(url)) from e

            content = req.content

            if req.status_code >= 400:
                error = ('ERROR searching CKAN package: {}'
                         '\n\t Status code: {}'
                         '\n\t Params: {}'
                         '\n\t content:{}'.format(url, req.status_code, params,
                                                  content))
                logger.error(error)
                raise Exception(error)

            try:
                json_content = json.loads(content)  # check for encoding errors
            except Exception as e:
                error = 'ERROR parsing JSON data: {} [{}]'.format(content, e)
                raise ValueError(error) from e

            if not json_content['success']:
                error = 'API response failed: {}'.format(
                    json_content.get('error', None))
                raise ValueError(error)

            # Only "results" is needed. The unused "count", "sort" and
            # "facets" lookups were dropped: they made the search fail
            # with KeyError on responses lacking any of those keys.
            results = json_content['result']['results']
            real_results_count = len(results)
            self.total_packages += real_results_count
            logger.info(f'{real_results_count} results')

            if real_results_count == 0:
                # an empty page ends the pagination
                url = None
            else:
                start += rows
                self.package_list += results
                yield (results)
Ejemplo n.º 17
0
    def transform_to_ckan_resource(self):
        """Build a CKAN resource dict from ``self.original_resource``.

        The origin record is validated first, then the locator data is mapped
        onto a copy of the base CKAN resource and the result is validated
        again. Two origin types are handled:

        * ``'resource_locator'``: ``data`` is a single locator dict.
        * ``'resource_locator_group_data_format'``: ``data`` is a
          ``(locator_group, data_format)`` pair, where
          ``locator_group['resource-locator']`` is a list of locator dicts
          and ``data_format`` is the fallback format used when none can be
          guessed from the URL.

        Locators whose URL is missing, ``None`` or blank are ignored.

        Returns:
            The base CKAN resource updated with the locator fields, or
            ``None`` (after logging an error) when no usable locator was
            found or the origin type is not handled.

        Raises:
            Exception: if the origin record or the final resource does not
                pass validation.
        """
        valid, error = self.validate_origin_distribution()
        if not valid:
            raise Exception(f'Error validating origin resource/record: {error}')

        original_resource = self.original_resource
        ckan_resource = self.get_base_ckan_resource()

        # TODO config.get('ckanext.spatial.harvest.validate_wms', False)
        validate_wms = True

        def clean_url(locator):
            # Treat a missing key, an explicit None and whitespace-only
            # values all as "no URL".
            return (locator.get('url') or '').strip()

        def build_resource(locator, url, resource_format):
            # Map one locator entry to the CKAN resource fields.
            built = {'format': resource_format}
            if resource_format == 'wms' and validate_wms:
                # Check if the service is a view service; the probe uses the
                # URL without its query string.
                if self._is_wms(url.split('?')[0]):
                    built['verified'] = True
                    built['verified_date'] = datetime.now().isoformat()
            built.update(
                {
                    'url': url,
                    'name': locator.get('name') or 'Unnamed resource',
                    'description': locator.get('description') or '',
                    'resource_locator_protocol': locator.get('protocol') or '',
                    'resource_locator_function': locator.get('function') or '',
                })
            return built

        resource = None
        resource_type = original_resource['type']

        if resource_type == 'resource_locator':
            # Sample data:
            # {
            #     'url': 'http://geonode.state.gov/geoserver/wms?layers=geonode%3ASyria_IDPSites_2015Jun11_HIU_USDoS&width=373&bbox=35.748%2C32.583%2C38.674%2C36.894&service=WMS&format=image%2Fjpeg&srs=EPSG%3A4326&request=GetMap&height=550',
            #     'function': '',
            #     'name': 'Syria_IDPSites_2015Jun11_HIU_USDoS',
            #     'description': 'Syria_IDPSites_2015Jun11_HIU_USDoS (JPEG Format)',
            #     'protocol': 'WWW:DOWNLOAD-1.0-http--download'
            # }
            resource_locator = original_resource['data']
            url = clean_url(resource_locator)
            if url:
                resource = build_resource(
                    resource_locator, url, self.guess_resource_format(url))

        elif resource_type == 'resource_locator_group_data_format':
            # Sample data:
            # ({
            #     'resource-locator': [{
            #         'url': 'http://geonode.state.gov/geoserver/wms?layers=geonode%3ASyria_IDPSites_2015Jun11_HIU_USDoS&width=373&bbox=35.748%2C32.583%2C38.674%2C36.894&service=WMS&format=image%2Fjpeg&srs=EPSG%3A4326&request=GetMap&height=550',
            #         'function': '',
            #         'name': 'Syria_IDPSites_2015Jun11_HIU_USDoS',
            #         'description': 'Syria_IDPSites_2015Jun11_HIU_USDoS (JPEG Format)',
            #         'protocol': 'WWW:DOWNLOAD-1.0-http--download'
            #     }]
            # }, None)
            resource_locator_group_data_format = original_resource['data']
            resource_locator_group = resource_locator_group_data_format[0]
            data_format = resource_locator_group_data_format[1]

            # NOTE(review): when several locators carry a URL only the last
            # one is kept, because a single resource is returned -- confirm
            # this is the intended behaviour.
            for resource_locator in resource_locator_group['resource-locator']:
                url = clean_url(resource_locator)
                if not url:
                    continue
                # Fall back to the declared data format when the URL gives
                # no hint about the format.
                resource_format = self.guess_resource_format(url) or data_format
                resource = build_resource(resource_locator, url, resource_format)

        if resource is None:
            logger.error(f'Unable to parse resource: {original_resource}')
            return None

        ckan_resource.update(**resource)
        valid, error = self.validate_final_resource(ckan_resource)
        if not valid:
            raise Exception(f'Error validating final resource/distribution: {error}')

        return ckan_resource