# Example #1
# 0
def get_current_ckan_resources_from_api(harvest_source_id):
    """Yield every dataset found in the given harvest source.

    Walks the CKAN search API one page of packages at a time, keeping a
    running count of resources for logging.  After the iteration is
    exhausted, the accumulated package list is written to the configured
    CKAN-results cache path.
    """
    results_json_path = config.get_ckan_results_cache_path()
    logger.info(f'Extracting from harvest source id: {harvest_source_id}')
    cpa = CKANPortalAPI(base_url=config.CKAN_CATALOG_URL)

    resources = 0
    page = 0
    for datasets in cpa.search_harvest_packages(
            harvest_source_id=harvest_source_id):
        # Each iteration of the outer loop is one page of packages.
        page += 1
        logger.info('PAGE {} from harvest source id: {}'.format(
            page, harvest_source_id))
        for dataset in datasets:
            resources += len(dataset['resources'])
            yield dataset

    logger.info('{} total resources in harvest source id: {}'.format(
        resources, harvest_source_id))
    cpa.save_packages_list(path=results_json_path)
# Ensure the local datapackages folder exists.  makedirs(exist_ok=True)
# replaces the previous isdir()-then-makedirs() sequence, which was
# race-prone (TOCTOU): the directory could appear between the check and
# the create and raise FileExistsError.
packages_folder_path = os.path.join(local_folder, 'datapackages')
os.makedirs(packages_folder_path, exist_ok=True)

api_results_path = os.path.join(local_folder, 'api_results.json')

# ----------------------------------------------------
# Get data.json if not here (or force)
# ----------------------------------------------------
if not os.path.isfile(api_results_path) or args.force_download:
    # No cached API results on disk (or a forced refresh was requested):
    # fetch the full package list from the remote CKAN and cache it.
    logger.info('Downloading')
    cpa = CKANPortalAPI(base_url=args.ckan_base_url)
    cpa.get_all_packages(harvest_source_id=args.harvest_source_id)
    cpa.save_packages_list(path=api_results_path)
else:
    # Reuse the package list cached by a previous run.
    # NOTE: fixed typo in the log message ("prevously" -> "previously").
    logger.info(f'Using data.json previously downloaded: {api_results_path}')
    cpa = CKANPortalAPI()
    cpa.read_local_packages(path=api_results_path)

# Summary totals over the package list just loaded into `cpa`
# (these module-level names are consumed by the summary log below).
packages = cpa.package_list
total_datasets = len(packages)
total_resources = cpa.count_resources()

logger.info('cleaning datasets')
# remove_duplicated_identifiers() appears to return the removed entries
# (only their count is used here) -- TODO confirm against CKANPortalAPI.
duplicates = cpa.remove_duplicated_identifiers()
total_duplicates = len(duplicates)
logger.info(
    f'Readed {total_datasets} datasets including {total_resources} resources. {total_duplicates} duplicated identifiers removed'