Example #1
0
    def get_user_info(self, user_id):
        """ GET to CKAN API to get list of admins
            https://docs.ckan.org/en/2.8/api/#ckan.logic.action.get.user_show
        """
        url = '{}{}?id={}'.format(self.base_url, self.user_show_url, user_id)
        headers = self.get_request_headers(include_api_key=True)
        logger.info(f'GET {url} headers:{headers}')
        try:
            response = requests.get(url, headers=headers)
        except Exception as exc:
            error = 'ERROR getting users information: {} [{}]'.format(url, exc)
            raise

        body = response.content

        # any 4xx/5xx answer is fatal for this call
        if response.status_code >= 400:
            error = 'ERROR getting users information: {} \n\t Status code: {} \n\t content:{}'.format(
                url, response.status_code, body)
            logger.error(error)
            raise Exception(error)

        try:
            json_content = json.loads(body)
        except Exception as exc:
            error = 'ERROR parsing JSON data from users information {} [{}]'.format(
                body, exc)
            raise

        # CKAN wraps every answer in a {"success": bool, ...} envelope;
        # a failed envelope is logged but still returned to the caller
        if not json_content['success']:
            error = 'API response failed: {}'.format(
                json_content.get('error', None))
            logger.error(error)

        return json_content
Example #2
0
def clean_duplicated_identifiers(rows):
    """ Clean duplicated dataset identifiers on a data.json source.

        Yields every row; rows whose 'identifier' was already seen are
        flagged with row['is_duplicate'] = 'True' so a later step can
        filter them out. Assumes identifiers are hashable (strings).
    """

    logger.info('Cleaning duplicates')
    # set instead of list: O(1) membership test vs O(n) per row
    unique_identifiers = set()
    duplicates = []
    processed = 0

    for row in rows:
        identifier = row['identifier']
        if identifier not in unique_identifiers:
            unique_identifiers.add(identifier)
            yield (row)
            processed += 1
        else:
            duplicates.append(identifier)
            row['is_duplicate'] = 'True'
            yield (row)
            # do not log all duplicates. Sometimes they are too many.
            if len(duplicates) < 10:
                logger.error('Duplicated {}'.format(identifier))
            elif len(duplicates) == 10:
                logger.error('... more duplicates not shown')
    logger.info('{} duplicates deleted. {} OK'.format(len(duplicates),
                                                      processed))
Example #3
0
    def show_package(self, ckan_package_id_or_name):
        """ GET to CKAN API to show a package/dataset
            https://docs.ckan.org/en/2.8/api/#ckan.logic.action.get.package_show
        """

        url = '{}{}'.format(self.base_url, self.package_show_url)
        headers = self.get_request_headers()
        data = {'id': ckan_package_id_or_name}
        # FIX: the request below is a GET; the log line said POST
        logger.info(f'GET {url} headers:{headers} data:{data}')
        try:
            req = requests.get(url, params=data, headers=headers)
        except Exception as e:
            error = 'ERROR showing CKAN package: {} [{}]'.format(url, e)
            raise

        # FIX: req.content was read and assigned twice; read it once
        content = req.content

        if req.status_code >= 400:
            error = 'ERROR showing CKAN package: {} \n\t Status code: {} \n\t content:{}'.format(
                url, req.status_code, content)
            logger.error(error)
            raise Exception(error)

        try:
            json_content = json.loads(content)
        except Exception as e:
            error = 'ERROR parsing JSON data from show_package: {} [{}]'.format(
                content, e)
            raise

        # success=False is logged but the payload is still returned
        if not json_content['success']:
            error = 'API response failed: {}'.format(
                json_content.get('error', None))
            logger.error(error)

        return json_content
Example #4
0
def clean_duplicated_identifiers(rows):
    """ Yield only rows with a not-yet-seen 'identifier'.

        Duplicated rows are dropped, logged and counted.
        Assumes identifiers are hashable (strings).
    """
    # set instead of list: O(1) membership test vs O(n) per row
    unique_identifiers = set()
    duplicates = []
    processed = 0
    for row in rows:
        identifier = row['identifier']
        if identifier not in unique_identifiers:
            unique_identifiers.add(identifier)
            yield(row)
            processed += 1
        else:
            duplicates.append(identifier)
            logger.error('Duplicated {}'.format(identifier))
    logger.info('{} duplicates deleted. {} OK'.format(len(duplicates), processed))
    def process_results(self):
        """ Aggregate per-dataset comparison results into self.final_results.

            Reads self.results (a list of dicts, each carrying a
            'comparison_results' dict) and fills self.final_results with
            per-action counters, validation errors, warnings and errors.

            Returns True on success, False when the results are malformed
            (not a list, or a result without 'comparison_results').
        """

        actions = {}  # per-action counters: create | delete | update
        validation_errors = []
        action_errors = []
        action_warnings = []

        # FIX: isinstance instead of type(...) != list (also accepts
        # list subclasses; idiomatic type check)
        if not isinstance(self.results, list):
            logger.error(f'Unexpected results: {self.results}')
            return False

        for result in self.results:

            comparison_results = result.get('comparison_results', None)
            if comparison_results is None:
                # this is bad. This source is broken
                return False
            action = comparison_results['action']
            if action not in actions:
                actions[action] = {'total': 0, 'success': 0, 'fails': 0}
            actions[action]['total'] += 1

            if action in ['create', 'update']:  # delete has no new_data
                new_data = comparison_results['new_data']
                if len(new_data.get('validation_errors', [])) > 0:
                    validation_errors += new_data['validation_errors']

            action_results = comparison_results.get('action_results', {})
            if action_results.get('success', False):
                actions[action]['success'] += 1
            else:
                actions[action]['fails'] += 1

            action_warnings += action_results.get('warnings', [])
            action_errors += action_results.get('errors', [])

        self.final_results['actions'] = actions
        self.final_results['validation_errors'] = validation_errors
        self.final_results['action_warnings'] = action_warnings
        self.final_results['action_errors'] = action_errors

        return True
Example #6
0
def get_data_json_from_url(url):
    """ Download, load and validate a data.json catalog from *url*.

        Yields each dataset dict with the catalog-level metadata attached
        under a 'headers' key. Download and load failures raise; schema
        validation failures are logged/emailed but do NOT stop the harvest.
    """
    logger.info(f'Geting data.json from {url}')

    datajson = DataJSON()
    datajson.url = url

    # hard failure: without the raw file there is nothing to harvest
    ret, info = datajson.download_data_json(timeout=90)
    if not ret:
        error = 'Error getting data: {}'.format(info)
        logger.error(error)
        raise Exception(error)
    logger.info('Downloaded OK')

    # hard failure: downloaded but not loadable as JSON. Persist the
    # errors and notify before raising.
    ret, info = datajson.load_data_json()
    if not ret:
        datajson.save_validation_errors(
            path=config.get_datajson_validation_errors_path())
        logger.error(datajson.validation_errors)
        # best-effort notification; a mail failure must not mask the real error
        try:
            build_validation_error_email()
        except Exception as e:
            logger.error('Error sending validation email: {}'.format(e))
        raise Exception(datajson.validation_errors)

    logger.info('JSON OK')
    # soft failure: invalid datasets are still harvested (see comment below)
    ret, info = datajson.validate_json()
    if not ret:
        logger.error(
            'Error validating data: {}\n----------------\n'.format(info))
        # continue  # USE invalid too
        logger.info('Validate FAILED: {} datasets'.format(
            len(datajson.datasets)))
    else:
        logger.info('Validate OK: {} datasets'.format(len(datajson.datasets)))

    # TODO move this as a DataJson function and add it to a validate function validate_data_json(data_json['dataset'])

    logger.info('VALID JSON, {} datasets found'.format(len(datajson.datasets)))

    # save data.json
    datajson.save_data_json(path=config.get_datajson_cache_path())
    # save headers errors
    datajson.save_validation_errors(
        path=config.get_datajson_validation_errors_path())

    # the real dataset list

    # optional cap for testing / partial harvests (0 means "no limit")
    if config.LIMIT_DATASETS > 0:
        datajson.datasets = datajson.datasets[:config.LIMIT_DATASETS]
    for dataset in datajson.datasets:
        # add headers (previously called catalog_values)
        dataset['headers'] = datajson.headers
        yield (dataset)
Example #7
0
    def delete_package(self, ckan_package_id_or_name):
        """ POST to CKAN API to delete a package/dataset
            https://docs.ckan.org/en/2.8/api/#ckan.logic.action.delete.package_delete
        """
        url = '{}{}'.format(self.base_url, self.package_delete_url)
        headers = self.get_request_headers(include_api_key=True)
        data = {'id': ckan_package_id_or_name}
        # FIX: this is a routine request trace, not an error; use info
        # level like the other API helpers in this class
        logger.info(f'POST {url} headers:{headers} data:{data}')
        try:
            req = requests.post(url, data=data, headers=headers)
        except Exception as e:
            error = 'ERROR deleting CKAN package: {} [{}]'.format(url, e)
            raise

        content = req.content

        if req.status_code >= 400:
            error = 'ERROR deleting CKAN package: {} \n\t Status code: {} \n\t content:{}'.format(
                url, req.status_code, content)
            logger.error(error)
            raise Exception(error)

        try:
            json_content = json.loads(content)
        except Exception as e:
            error = 'ERROR parsing JSON data from delete_package: {} [{}]'.format(
                content, e)
            raise

        # success=False is logged but the payload is still returned
        if not json_content['success']:
            error = 'API response failed: {}'.format(
                json_content.get('error', None))
            logger.error(error)

        return json_content
Example #8
0
    def show_organization(
            self,
            organization_id_or_name,
            method='POST'):  # troubles using 2.3 and 2.8 CKAN versions):
        """ Ask the CKAN API for a single organization.
            POST by default; GET is offered because CKAN 2.3 and 2.8
            behave differently for this endpoint.
        """

        url = '{}{}'.format(self.base_url, self.organization_show_url)
        headers = self.get_request_headers()
        payload = {'id': organization_id_or_name}
        logger.info(f'POST {url} headers:{headers} data:{payload}')
        try:
            if method == 'POST':
                response = requests.post(url, data=payload, headers=headers)
            else:
                response = requests.get(url, params=payload, headers=headers)
        except Exception as exc:
            error = 'ERROR showing organization: {} [{}]'.format(url, exc)
            raise

        body = response.content

        # 404 is tolerated: callers use this to probe for existence
        if response.status_code >= 400 and response.status_code != 404:
            error = 'ERROR showing organization: {} \n\t Status code: {} \n\t content:{}'.format(
                url, response.status_code, body)
            logger.error(error)
            raise Exception(error)

        try:
            json_content = json.loads(body)
        except Exception as exc:
            error = 'ERROR parsing JSON data from show_organization: {} [{}]'.format(
                body, exc)
            raise

        # a failed envelope is logged but still returned to the caller
        if not json_content['success']:
            error = 'API response failed: {}'.format(
                json_content.get('error', None))
            logger.error(error)

        return json_content
Example #9
0
def get_data_json_from_url(url):
    """ Download, load and validate a data.json catalog from *url*.
        Returns the DataJSON instance; raises on download/load errors.
    """
    datajson = DataJSON()
    datajson.url = url

    ok, info = datajson.download_data_json(timeout=90)
    if not ok:
        error = 'Error getting data: {}'.format(info)
        logger.error(error)
        raise Exception(error)
    logger.info('Downloaded OK')

    ok, info = datajson.load_data_json()
    if not ok:
        error = 'Error loading JSON data: {}'.format(info)
        logger.error(error)
        raise Exception(error)
    logger.info('JSON OK')

    # validation problems are logged only; invalid catalogs are still used
    ok, info = datajson.validate_json()
    dataset_count = len(datajson.datasets)
    if ok:
        logger.info('Validate OK: {} datasets'.format(dataset_count))
    else:
        logger.error('Error validating data: {}\n----------------\n'.format(info))
        # continue  # USE invalid too
        logger.info('Validate FAILED: {} datasets'.format(dataset_count))

    # logger.debug('JSONSchema: {}'.format(json.dumps(datajson.schema.json_content, indent=4)))
    return datajson
Example #10
0
def get_data_json_from_file(data_json_path):
    """ Load and validate a data.json catalog from a local file.

        Returns the DataJSON instance; raises on read/load errors.
        Validation errors are logged (truncated) but do not raise.
    """
    datajson = DataJSON()

    # FIX: the read result was silently ignored; fail early and loudly
    # if the local file could not be read at all
    ret, info = datajson.read_local_data_json(data_json_path=data_json_path)
    if not ret:
        error = 'Error reading data.json file: {}'.format(info)
        logger.error(error)
        raise Exception(error)

    ret, info = datajson.load_data_json()
    if not ret:
        error = 'Error loading JSON data: {}'.format(info)
        logger.error(error)
        raise Exception(error)

    logger.info('JSON OK')
    ret, errors = datajson.validate_json()
    if not ret:
        total_errors = len(errors)
        logger.error('{} Errors validating data'.format(total_errors))
        error = errors[0]
        if len(error) > 70:  # too long and verbose errors
            error = error[:70]
        logger.error('Error 1/{} validating data:\n\t{}'.format(total_errors, error))
        # continue  # USE invalid too
        logger.info('Validate FAILED: {} datasets'.format(len(datajson.datasets)))
    else:
        logger.info('Validate OK: {} datasets'.format(len(datajson.datasets)))

    # logger.debug('JSONSchema: {}'.format(json.dumps(datajson.schema.json_content, indent=4)))

    return datajson
    def process_all(self):
        """ Walk self.base_folder, process every harvest-source folder and
            accumulate counters in self.summary_data / data in self.all_data.
        """
        logger.info(f'Inspecting {self.base_folder} folder')
        for subdir, dirs, files in os.walk(self.base_folder):
            for name in dirs:
                if name == 'harvest_sources':
                    continue
                logger.info(f'Processing {name} folder')
                self.summary_data['harvest_sources_readed'] += 1

                hs = HarvestedSource(name=name)
                ret = hs.process_results()
                if not ret:
                    self.summary_data['harvest_sources_failed'] += 1
                    continue
                hs.render_template(save=True)

                data = hs.get_json_data()
                self.all_data.append(data)

                # FIX: "datasets" was unbound (NameError) when data_json was
                # neither a list nor a dict; default to [] so counting and
                # the zero-dataset check always work
                datasets = []
                if isinstance(data['data_json'], list):
                    logger.error(f'{name}: Data JSON Source is a list. Must be a dict')
                elif isinstance(data['data_json'], dict):
                    datasets = data['data_json'].get('dataset', [])
                if len(datasets) == 0:
                    logger.error(f'Source with 0 datasets {name}')
                self.summary_data['total_data_json_datasets'] += len(datasets)
                logger.info(' - Total datasets: {}'.format(self.summary_data['total_data_json_datasets']))

        harvest_sources_readed = self.summary_data['harvest_sources_readed']
        harvest_sources_failed = self.summary_data['harvest_sources_failed']
        total_data_json_datasets = self.summary_data['total_data_json_datasets']
        logger.info('''**************
                        Harvest sources readed: {}
                        Harvest sources failed: {}
                        Total datasets: {}'''.format(harvest_sources_readed,
                                                     harvest_sources_failed,
                                                     total_data_json_datasets))
Example #12
0
    def create_organization(self, organization, check_if_exists=True):
        """ POST to CKAN API to create a new organization
            organization is just a python dict
            https://docs.ckan.org/en/2.8/api/#ckan.logic.action.create.organization_create

            When check_if_exists is True, an already-existing organization
            is returned as-is instead of being created again.
        """
        logger.info(f'**** Creating Organization {organization}')
        if check_if_exists:
            logger.info(f'Exists Organization? {organization}')
            res = self.show_organization(
                organization_id_or_name=organization['name'])
            if res['success']:
                # do not create
                logger.info(f'Avoid create Organization {organization}')
                return res

        url = '{}{}'.format(self.base_url, self.organization_create_url)
        headers = self.get_request_headers(include_api_key=True)

        # the organization travels as a JSON request body
        headers['Content-Type'] = 'application/json'
        # NOTE: the parameter is rebound to its serialized form here; the
        # error message below therefore shows the JSON string
        organization = json.dumps(organization)

        logger.info(f'POST {url} headers:{headers} data:{organization}')

        try:
            req = requests.post(url, data=organization, headers=headers)
        except Exception as e:
            # NOTE(review): "error" is built but never logged before re-raising
            error = 'ERROR creating [POST] organization: {} [{}]'.format(
                url, e)
            raise

        content = req.content

        if req.status_code >= 400:

            error = ('ERROR creating [STATUS] organization: {}'
                     '\n\t Status code: {}'
                     '\n\t content:{}'
                     '\n\t Dataset {}'.format(url, req.status_code, content,
                                              organization))
            logger.error(error)
            raise Exception(error)

        try:
            json_content = json.loads(content)
        except Exception as e:
            error = 'ERROR parsing JSON data: {} [{}]'.format(content, e)
            logger.error(error)
            raise

        # a failed envelope is logged but still returned to the caller
        if not json_content['success']:
            error = 'API response failed: {}'.format(
                json_content.get('error', None))
            logger.error(error)

        return json_content
Example #13
0
    def update_package(self, ckan_package):
        """ POST to CKAN API to update a package/dataset
            ckan_package is just a python dict
            https://docs.ckan.org/en/2.8/api/#ckan.logic.action.update.package_update
        """
        url = '{}{}'.format(self.base_url, self.package_update_url)
        headers = self.get_request_headers(include_api_key=True)

        # the package travels as a JSON request body
        headers['Content-Type'] = 'application/json'
        ckan_package = json.dumps(ckan_package)

        # FIX: routine request trace downgraded from error to info level,
        # consistent with the other API helpers
        logger.info(f'POST {url} headers:{headers} data:{ckan_package}')
        try:
            req = requests.post(url, data=ckan_package, headers=headers)
        except Exception as e:
            # FIX: this path updates (not creates) a package
            error = 'ERROR updating CKAN package: {} [{}]'.format(url, e)
            raise

        content = req.content

        if req.status_code >= 400:
            # FIX: typo "updateing" -> "updating"
            error = 'ERROR updating CKAN package: {} \n\t Status code: {} \n\t content:{}'.format(
                url, req.status_code, content)
            logger.error(error)
            raise Exception(error)

        try:
            json_content = json.loads(content)
        except Exception as e:
            error = 'ERROR parsing JSON data: {} [{}]'.format(content, e)
            raise

        # success=False is logged but the payload is still returned
        if not json_content['success']:
            error = 'API response failed: {}'.format(
                json_content.get('error', None))
            logger.error(error)

        return json_content
Example #14
0
def compare_resources(rows):
    """ read the previous resource (CKAN API results)
        Yield any comparison result

        For each CKAN row, decides an action against the saved data.json
        packages on disk: error / delete / update / ignore, and finally
        yields a 'create' result for each data.json file with no CKAN row.
        """

    res_name = rows.res.name if hasattr(rows, 'res') else 'Fake res testing'
    logger.info(f'Rows from resource {res_name}')

    data_packages_path = config.get_data_packages_folder_path()
    default_tzinfo_for_naives_dates = pytz.UTC

    # Calculate minimum statistics
    total = 0

    no_extras = 0
    no_identifier_key_found = 0
    deleted = 0
    found_update = 0
    found_not_update = 0

    for row in rows:
        total += 1
        # check for identifier
        ckan_id = row['id']
        extras = row.get('extras', False)
        if not extras:
            # TODO learn why.
            logger.error(f'No extras! dataset: {ckan_id}')
            result = {
                'action': 'error',
                'ckan_id': ckan_id,
                'new_data': None,
                'reason': 'The CKAN dataset does not '
                'have the "extras" property'
            }
            row.update({'comparison_results': result})
            yield row
            no_extras += 1
            continue

        identifier = None
        for extra in extras:
            if extra['key'] == 'identifier':
                identifier = extra['value']

        if identifier is None:
            logger.error('No identifier '
                         '(extras[].key.identifier not exists). '
                         'Dataset.id: {}'.format(ckan_id))

            no_identifier_key_found += 1
            result = {
                'action': 'error',
                'ckan_id': ckan_id,
                'new_data': None,
                'reason': 'The CKAN dataset does not have an "identifier"'
            }
            row.update({'comparison_results': result})
            yield row
            continue

        # was parent in the previous harvest
        # if extras.get('collection_metadata', None) is not None:

        encoded_identifier = encode_identifier(identifier)
        expected_filename = f'data-json-{encoded_identifier}.json'
        expected_path = os.path.join(data_packages_path, expected_filename)

        # no file on disk -> the dataset disappeared from the source
        if not os.path.isfile(expected_path):
            logger.info((f'Dataset: {ckan_id} not in DATA.JSON.'
                         f'It was deleted?: {expected_path}'))
            deleted += 1
            result = {
                'action': 'delete',
                'ckan_id': ckan_id,
                'new_data': None,
                'reason': 'It no longer exists in the data.json source'
            }
            row.update({'comparison_results': result})
            yield row
            continue

        datajson_package = Package(expected_path)

        # TODO analyze this: https://github.com/ckan/ckanext-harvest/blob/master/ckanext/harvest/harvesters/base.py#L229

        # compare dates
        # at data.json: "modified": "2019-06-27 12:41:27",
        # at ckan results: "metadata_modified": "2019-07-02T17:20:58.334748",

        data_json = datajson_package.get_resource('inline')
        data_json_data = data_json.source
        data_json_modified = parse(
            data_json_data['modified'])  # It's a naive date

        ckan_json = row
        ckan_json_modified = parse(ckan_json['metadata_modified'])

        # un-naive datetimes (assume UTC when no tzinfo is present)
        if data_json_modified.tzinfo is None:
            data_json_modified = data_json_modified.replace(
                tzinfo=default_tzinfo_for_naives_dates)
        if ckan_json_modified.tzinfo is None:
            ckan_json_modified = ckan_json_modified.replace(
                tzinfo=default_tzinfo_for_naives_dates)

        diff_times = data_json_modified - ckan_json_modified

        seconds = diff_times.total_seconds()

        # TODO analyze this since we have a Naive date we are not sure
        if abs(seconds) > 86400:  # more than a day
            warning = '' if seconds > 0 else 'Data.json is older than CKAN'
            result = {
                'action': 'update',
                'ckan_id': ckan_id,
                'new_data': data_json_data,
                'reason': f'Changed: ~{seconds} seconds difference. {warning}'
            }
            found_update += 1
        else:
            result = {
                'action': 'ignore',
                'ckan_id': ckan_id,
                'new_data': None,  # do not need this data_json_data
                # BUG FIX: f-prefix was missing, so the literal text
                # "{seconds}" was emitted instead of the value
                'reason': f'Changed: ~{seconds} seconds difference'
            }
            found_not_update += 1
        row.update({'comparison_results': result})
        yield row

        # Delete the data.json file so the "create" pass below skips it
        os.remove(expected_path)

    # any data.json file still on disk has no CKAN row: it is new
    news = 0
    for name in glob.glob(f'{data_packages_path}/data-json-*.json'):
        news += 1
        package = Package(name)
        data_json = package.get_resource('inline')
        data_json_data = data_json.source

        result = {
            'action': 'create',
            'ckan_id': None,
            'new_data': data_json_data,
            'reason': 'Not found in the CKAN results'
        }

        # there is no real row here
        row = {'comparison_results': result}
        yield row

        # Delete the data.json file
        os.remove(name)

    found = found_not_update + found_update

    stats = f"""Total processed: {total}.
                {no_extras} fail extras.
                {no_identifier_key_found} fail identifier key.
                {deleted} deleted.
                {found} datasets found ({found_update} needs update, {found_not_update} are the same),
                {news} new datasets."""

    logger.info(stats)
Example #15
0
def compare_resources(rows):
    """ read the previous resource (CKAN API results)
        Yield any comparison result

        CSW variant: every CKAN row whose CSW package file exists on disk
        is marked for update (no timestamp comparison here); files with no
        CKAN row produce 'create' results at the end.
        """

    res_name = rows.res.name if hasattr(rows, 'res') else 'Fake res testing'
    logger.info(f'Rows from resource {res_name}')

    data_packages_path = config.get_data_packages_folder_path()

    # Calculate minimum statistics
    total = 0

    no_extras = 0
    no_identifier_key_found = 0
    deleted = 0
    found_update = 0
    found_not_update = 0

    for row in rows:
        total += 1
        # check for identifier
        ckan_id = row['id']
        extras = row.get('extras', False)
        if not extras:
            # TODO learn why.
            logger.error(f'No extras! dataset: {ckan_id}')
            result = {
                'action': 'error',
                'ckan_id': ckan_id,
                'new_data': None,
                'reason': 'The CKAN dataset does not '
                'have the "extras" property'
            }
            row.update({'comparison_results': result})
            yield row
            no_extras += 1
            continue

        identifier = None
        for extra in extras:
            if extra['key'] == 'identifier':
                identifier = extra['value']

        if identifier is None:
            logger.error('No identifier '
                         '(extras[].key.identifier not exists). '
                         'Dataset.id: {}'.format(ckan_id))

            no_identifier_key_found += 1
            result = {
                'action': 'error',
                'ckan_id': ckan_id,
                'new_data': None,
                'reason': 'The CKAN dataset does not have an "identifier"'
            }
            row.update({'comparison_results': result})
            yield row
            continue

        # was parent in the previous harvest
        # if extras.get('collection_metadata', None) is not None:

        encoded_identifier = encode_identifier(identifier)
        expected_filename = f'csw-{encoded_identifier}.json'
        expected_path = os.path.join(data_packages_path, expected_filename)

        if not os.path.isfile(expected_path):
            logger.info((f'Dataset: {ckan_id} not in CSW Source.'
                         f'It was deleted?: {expected_path}'))
            deleted += 1
            result = {
                'action': 'delete',
                'ckan_id': ckan_id,
                'new_data': None,
                'reason': 'It no longer exists in the CSW source'
            }
            row.update({'comparison_results': result})
            yield row
            continue

        # the file (and the identifier) exists

        csw_package = Package(expected_path)
        csw_json = csw_package.get_resource('inline')
        csw_json_data = csw_json.source

        # BUG FIX: the reason string referenced undefined "seconds" and
        # "warning" locals (copied from the data.json comparator) and
        # raised NameError at runtime
        result = {
            'action': 'update',
            'ckan_id': ckan_id,
            'new_data': csw_json_data,
            'reason': 'Found in the CSW source'
        }
        found_update += 1
        # BUG FIX: the result was never attached to the row before yielding,
        # so downstream steps saw no 'comparison_results' for updates
        row.update({'comparison_results': result})
        yield row

        # remove so next step not detect it as new
        os.remove(expected_path)

    news = 0
    for name in glob.glob(f'{data_packages_path}/csw-*.json'):
        news += 1
        package = Package(name)
        csw_json = package.get_resource('inline')
        csw_json_data = csw_json.source

        result = {
            'action': 'create',
            'ckan_id': None,
            'new_data': csw_json_data,
            'reason': 'Not found in the CKAN results'
        }

        # there is no real row here
        row = {'comparison_results': result}
        yield row

        # Delete the csw.json file
        os.remove(name)

    found = found_not_update + found_update

    stats = f"""Total processed: {total}.
                {no_extras} fail extras.
                {no_identifier_key_found} fail identifier key.
                {deleted} deleted.
                {found} datasets found ({found_update} needs update, {found_not_update} are the same),
                {news} new datasets."""

    logger.info(stats)
Example #16
0
    def create_package(
            self,
            ckan_package,
            on_duplicated='RAISE',  # if name already exists 'RAISE' 'SKIP' | 'DELETE'
    ):
        """ POST to CKAN API to create a new package/dataset
            ckan_package is just a python dict
            https://docs.ckan.org/en/2.8/api/#ckan.logic.action.create.package_create

            on_duplicated controls what happens on a 409 name conflict:
              'RAISE'  -> raise an Exception
              'SKIP'   -> return {'success': True} without creating
              'DELETE' -> delete the existing package and retry once
        """
        url = '{}{}'.format(self.base_url, self.package_create_url)
        headers = self.get_request_headers(include_api_key=True)

        # the package travels as a JSON request body
        headers['Content-Type'] = 'application/json'
        ckan_package_str = json.dumps(ckan_package)

        logger.info(f'POST {url} headers:{headers} data:{ckan_package}')

        try:
            req = requests.post(url, data=ckan_package_str, headers=headers)
        except Exception as e:
            # NOTE(review): "error" is built but never logged before re-raising
            error = 'ERROR creating [POST] CKAN package: {} [{}]'.format(
                url, e)
            raise

        content = req.content
        # parse before the status check: the 409 branch needs the JSON body
        try:
            json_content = json.loads(content)
        except Exception as e:
            error = 'ERROR parsing JSON data: {} [{}]'.format(content, e)
            logger.error(error)
            raise

        if req.status_code == 409:
            logger.info(f'409 json_content: {json_content}')
            # another possible [error] = {'owner_org': ['Organization does not exist']}

            # Check for duplicates
            if json_content['error'].get(
                    'name', None) == ["That URL is already in use."]:
                logger.error(
                    f'Package Already exists! ACTION: {on_duplicated}')
                if on_duplicated == 'SKIP':
                    return {'success': True}
                elif on_duplicated == 'DELETE':
                    delr = self.delete_package(
                        ckan_package_id_or_name=ckan_package['name'])
                    if not delr['success']:
                        raise Exception('Failed to delete {}'.format(
                            ckan_package['name']))
                    # retry once; 'RAISE' prevents infinite recursion
                    return self.create_package(ckan_package=ckan_package,
                                               on_duplicated='RAISE')
                elif on_duplicated == 'RAISE':
                    error = ('DUPLICATED CKAN package: {}'
                             '\n\t Status code: {}'
                             '\n\t content:{}'
                             '\n\t Dataset {}'.format(url, req.status_code,
                                                      content, ckan_package))
                    logger.error(error)
                    raise Exception(error)

        if req.status_code >= 400:
            error = ('ERROR creating CKAN package: {}'
                     '\n\t Status code: {}'
                     '\n\t content:{}'
                     '\n\t Dataset {}'.format(url, req.status_code, content,
                                              ckan_package))
            logger.error(error)
            raise Exception(error)

        # a failed envelope is logged but still returned to the caller
        if not json_content['success']:
            error = 'API response failed: {}'.format(
                json_content.get('error', None))
            logger.error(error)

        return json_content
Example #17
0
    def search_packages(
            self,
            rows=1000,
            method='POST',  # POST works in CKAN 2.8, fails in 2.3
            search_params=None):  # optional extra query params (dict)
        """ Search CKAN packages, yielding one page of results at a time.

            "rows" is the page size.
            "method" selects POST (CKAN 2.8) or GET (older versions).
            "search_params" is an optional dict merged into each request.
            Yields each page as a list of package dicts and accumulates
            totals in self.total_packages / self.package_list.
            Raises ValueError on download/JSON/API failures and Exception
            on HTTP status >= 400.
            """
        # Avoid the shared-mutable-default pitfall: never use {} as a
        # default argument; build a fresh dict per call instead.
        if search_params is None:
            search_params = {}

        start = 0

        url = '{}{}'.format(self.base_url, self.package_search_url)
        page = 0
        # TODO check for a real paginated version
        while url:
            page += 1

            params = {'start': start, 'rows': rows}
            params.update(search_params)
            logger.info(
                f'Searching packages {url} PAGE:{page} start:{start}, rows:{rows} with params: {params}'
            )

            headers = self.get_request_headers()
            try:
                if method == 'POST':  # depends on CKAN version
                    req = requests.post(url, data=params, headers=headers)
                else:
                    req = requests.get(url, params=params, headers=headers)
            except Exception as e:
                # chain the original exception so the root cause is visible
                raise ValueError(
                    'Failed to get package list at {}'.format(url)) from e

            content = req.content

            if req.status_code >= 400:
                error = ('ERROR searching CKAN package: {}'
                         '\n\t Status code: {}'
                         '\n\t Params: {}'
                         '\n\t content:{}'.format(url, req.status_code, params,
                                                  content))
                logger.error(error)
                raise Exception(error)

            try:
                json_content = json.loads(content)  # check for encoding errors
            except Exception as e:
                error = 'ERROR parsing JSON data: {} [{}]'.format(content, e)
                raise ValueError(error) from e

            if not json_content['success']:
                error = 'API response failed: {}'.format(
                    json_content.get('error', None))
                raise ValueError(error)

            result = json_content['result']
            results = result['results']
            real_results_count = len(results)
            self.total_packages += real_results_count
            logger.info(f'{real_results_count} results')

            if real_results_count == 0:
                # an empty page means pagination is finished
                url = None
            else:
                start += rows
                self.package_list += results
                logger.debug(f'datasets found: {results}')
                yield (results)
# Scan all harvest sources from the portal, dump each one to a local JSON
# file, and (below, truncated here) inspect their "config" field for
# filters/defaults. NOTE(review): the counters with_config* are tallied in
# code past this fragment — confirm against the full script.
colections_ids = set()
c = 0
urls = []
with_configs = 0
with_config_filters = 0
with_config_defaults = 0

for results in cpa.search_harvest_packages(harvest_type='harvest',
                                           method='GET'
                                           #,source_type='datajson'
                                           ):
    for local_harvest_source in results:

        url = local_harvest_source['url']
        # skip harvest sources whose URL was already processed
        if url in urls:
            logger.error(
                '------------------\n   ALREADY READED\n------------------')
            continue
        else:
            urls.append(url)

        c += 1
        name = local_harvest_source.get('name', 'UNNAMED')
        hspath = config.get_harvest_sources_path(hs_name=name)
        # use a context manager so the file handle is closed even if
        # json.dumps or write raises (the original leaked on error)
        with open(hspath, 'w') as f:
            f.write(json.dumps(local_harvest_source, indent=2))
        logger.info(f'{hspath} saved')

        # check for config.filters and config.defaults
        config_str = local_harvest_source.get('config', '{}')
        configs = json.loads(config_str)
    def test_create_harvest_source(self):
        """ End-to-end check against a live CKAN instance: create a datajson
            harvest source, create a dataset linked to it via extras, verify
            the link through search, then delete both. """
        logger.info('Creating harvest source')
        cpa = CKANPortalAPI(base_url=CKAN_BASE_URL, api_key=CKAN_API_KEY)
        # start from a clean slate so the search assertions below hold
        cpa.delete_all_harvest_sources(harvest_type='harvest',
                                       source_type='datajson')

        title = 'Energy JSON test {}'.format(random.randint(1, 999999))
        url = 'http://www.energy.gov/data-{}.json'.format(
            random.randint(1, 999999))
        res = cpa.create_harvest_source(
            title=title,
            url=url,
            owner_org_id=CKAN_ORG_ID,
            source_type='datajson',
            notes='Some tests about local harvesting sources creation',
            frequency='WEEKLY')

        self.assertTrue(res['success'])
        harvest_source = res['result']
        logger.info('Created: {}'.format(res['success']))

        # read it back and verify the stored fields
        res = cpa.show_package(ckan_package_id_or_name=harvest_source['id'])
        self.assertTrue(res['success'])
        self.assertEqual(harvest_source['url'], url)
        self.assertEqual(harvest_source['title'], title)
        self.assertEqual(harvest_source['type'], 'harvest')
        self.assertEqual(harvest_source['source_type'], 'datajson')

        # search for it among all datajson harvest sources
        results = cpa.search_harvest_packages(rows=1000,
                                              harvest_type='harvest',
                                              source_type='datajson')

        created_ok = False

        for datasets in results:
            for dataset in datasets:
                # print('FOUND: {}'.format(dataset['name']))
                if dataset['name'] == harvest_source['name']:
                    created_ok = True
                    logger.info('Found!')
                else:
                    logger.info('Other harvest source: {}'.format(
                        dataset['name']))

        # unittest helpers instead of bare asserts (stripped under -O)
        self.assertTrue(created_ok)

        # create a dataset with this harvest_soure_id
        dataset_title = 'Dataset number {}'.format(random.randint(1, 999999))
        dataset_name = slugify(dataset_title)
        tags = [{'name': 'tag81'}, {'name': 'tag82'}]

        randval = random.randint(1, 999)
        extras = [
            {
                'key': 'harvest_source_id',
                'value': harvest_source['id']
            },
            {
                'key': 'harvest_source_title',
                'value': harvest_source['title']
            },
            # {'key': 'harvest_object_id', 'value': harvest_source['id']},  # ? not sure
            {
                'key': 'harvest_ng_source_id',
                'value': harvest_source['id']
            },
            {
                'key': 'harvest_ng_source_title',
                'value': harvest_source['title']
            },
            {
                'key': 'try_a_extra',
                'value': randval
            }
        ]

        package = {
            'name': dataset_name,
            'title': dataset_title,
            'owner_org': CKAN_ORG_ID,
            'tags': tags,
            'extras': extras
        }
        res2 = cpa.create_package(ckan_package=package)
        self.assertTrue(res2['success'])
        logger.info('Package with harvest source: {}'.format(res2['success']))

        # read full dataset
        res3 = cpa.show_package(ckan_package_id_or_name=dataset_name)
        self.assertTrue(res3['success'])
        ckan_dataset = res3['result']
        logger.info(
            'Package with harvest source readed: {}'.format(ckan_dataset))

        self.assertIn('extras', ckan_dataset)
        # CKAN returns extra values as strings, hence str(randval)
        self.assertEqual([str(randval)], [
            extra['value'] for extra in ckan_dataset['extras']
            if extra['key'] == 'try_a_extra'
        ])
        # my custom ID (not connected to a real harvest ID)
        self.assertEqual([harvest_source['id']], [
            extra['value'] for extra in ckan_dataset['extras']
            if extra['key'] == 'harvest_ng_source_id'
        ])

        # check if this package is related to harvest source
        total_datasets_in_source = 0
        datasets_from_source = cpa.search_harvest_packages(
            harvest_source_id=harvest_source['id'])
        connected_ok = False
        for datasets in datasets_from_source:
            for dataset in datasets:
                total_datasets_in_source += 1
                if dataset['name'] == dataset_name:
                    connected_ok = True
                    logger.info('Found!')
                else:
                    # we just expect one dataset
                    error = '{} != {} ------ {}'.format(
                        dataset['name'], dataset_name, dataset)
                    logger.error(error)
                    # BUG FIX: original 'assert error == False' compared a
                    # non-empty string against False; self.fail() reports
                    # the message directly and cannot be optimized away.
                    self.fail(error)

        self.assertTrue(connected_ok)
        self.assertEqual(total_datasets_in_source, 1)
        logger.info(
            f' +++++++++++++ total_datasets_in_source={total_datasets_in_source}'
        )

        # this fails, harvest process is more complex that just add an extra
        # assert [harvest_source['id']] == [extra['value'] for extra in ckan_dataset['extras'] if extra['key'] == 'harvest_source_id']

        # delete both
        logger.info('Delete CKAN package: {}'.format(ckan_dataset['id']))
        res4 = cpa.delete_package(ckan_package_id_or_name=ckan_dataset['id'])
        self.assertTrue(res4['success'])

        logger.info('Delete Harvest source: {}'.format(harvest_source['id']))
        res5 = cpa.delete_package(ckan_package_id_or_name=harvest_source['id'])
        self.assertTrue(res5['success'])
Example #20
0
    def search_harvest_packages(
            self,
            rows=1000,
            method='POST',  # POST works in CKAN 2.8, fails in 2.3
            harvest_source_id=None,  # just one harvest source
            harvest_type=None,  # 'harvest' for harvest sources
            source_type=None):  # e.g. 'datajson'
        """ Search harvested packages or harvest sources, yielding one
            page of results at a time.
            "rows" is the page size.
            You could search for an specific harvest_source_id.
            Yields each page as a list of package dicts and accumulates
            totals in self.total_packages / self.package_list.
            Raises ValueError on download/JSON/API failures and Exception
            on HTTP status >= 400. """

        start = 0

        url = '{}{}'.format(self.base_url, self.package_search_url)
        page = 0
        # TODO check for a real paginated version
        while url:
            page += 1

            params = {'start': start, 'rows': rows}  # a 'sort' key could go here
            if harvest_source_id is not None:
                # CKAN can't search by arbitrary extras:
                # https://github.com/ckan/ckan/blob/30ca7aae2f2aca6a19a2e6ed29148f8428e25c86/ckan/logic/action/get.py#L1852
                # The ckanext-harvest style fq on harvest_source_id
                # (https://github.com/ckan/ckanext-harvest/blob/3a72337f1e619bf9ea3221037ca86615ec22ae2f/ckanext/harvest/helpers.py#L38)
                # did not work here, but our custom harvest_ng_source_id
                # extra is indexed and searchable.
                params['fq'] = f'+harvest_ng_source_id:"{harvest_source_id}"'

            elif harvest_type is not None:
                # at my local instance I need this.
                # I not sure why, in another public instances is not needed
                params['fq'] = f'+dataset_type:{harvest_type}'
                if source_type is not None:
                    params[
                        'q'] = f'(type:{harvest_type} source_type:{source_type})'
                else:
                    params['q'] = f'(type:{harvest_type})'

            logger.info(
                f'Searching {url} PAGE:{page} start:{start}, rows:{rows} with params: {params}'
            )

            headers = self.get_request_headers()
            try:
                if method == 'POST':  # depends on CKAN version
                    req = requests.post(url, data=params, headers=headers)
                else:
                    req = requests.get(url, params=params, headers=headers)
            except Exception as e:
                # chain the original exception so the root cause is visible
                raise ValueError(
                    'Failed to get package list at {}'.format(url)) from e

            content = req.content

            if req.status_code >= 400:
                error = ('ERROR searching CKAN package: {}'
                         '\n\t Status code: {}'
                         '\n\t Params: {}'
                         '\n\t content:{}'.format(url, req.status_code, params,
                                                  content))
                logger.error(error)
                raise Exception(error)

            try:
                json_content = json.loads(content)  # check for encoding errors
            except Exception as e:
                error = 'ERROR parsing JSON data: {} [{}]'.format(content, e)
                raise ValueError(error) from e

            if not json_content['success']:
                error = 'API response failed: {}'.format(
                    json_content.get('error', None))
                raise ValueError(error)

            result = json_content['result']
            results = result['results']
            real_results_count = len(results)
            self.total_packages += real_results_count
            logger.info(f'{real_results_count} results')

            if real_results_count == 0:
                # an empty page means pagination is finished
                url = None
            else:
                start += rows
                self.package_list += results
                logger.debug(f'datasets found: {results}')
                yield (results)