def get_current_ckan_resources_from_api(harvest_source_id):
    """Generator over the datasets currently held by a CKAN harvest source.

    Walks every page returned by the portal search for *harvest_source_id*,
    yields each dataset, keeps a running count of their resources, and
    finally caches the raw package list to disk.
    """
    results_json_path = config.get_ckan_results_cache_path()
    logger.info(f'Extracting from harvest source id: {harvest_source_id}')
    cpa = CKANPortalAPI(base_url=config.CKAN_CATALOG_URL)
    total_resources = 0
    # getting resources in pages of packages
    for page, package_page in enumerate(
            cpa.search_harvest_packages(harvest_source_id=harvest_source_id),
            start=1):
        logger.info('PAGE {} from harvest source id: {}'.format(
            page, harvest_source_id))
        for pkg in package_page:
            total_resources += len(pkg['resources'])
            yield pkg
        # we don't need to save this
        # save_dict_as_data_packages(data=package, path=config.get_data_packages_folder_path(),
        #                            prefix='ckan-result',
        #                            identifier_field='id')
    logger.info('{} total resources in harvest source id: {}'.format(
        total_resources, harvest_source_id))
    cpa.save_packages_list(path=results_json_path)
def test_load_from_url(self):
    """Smoke-test the live CKAN API: the first search page must not be empty.

    Fixes over the original: drops the unused ``resources`` local, and
    asserts after the loop that at least one page was actually produced —
    previously an empty generator made the test pass without checking
    anything.
    """
    cpa = CKANPortalAPI(base_url=CKAN_BASE_URL)
    page = 0
    for packages in cpa.search_harvest_packages(
            harvest_source_id=HARVEST_SOURCE_ID):
        page += 1
        print(f'API packages search page {page}')
        # has resources in the first page
        self.assertGreater(cpa.total_packages, 0)
        break  # do not need more
    # guard against a vacuous pass: the generator must have yielded a page
    self.assertEqual(page, 1)
def get_current_ckan_resources_from_api(harvest_source_id=None):
    """Yield every package the portal reports for *harvest_source_id*,
    logging page progress and, at the end, the grand total of resources."""
    logger.info('Extracting from harvest source id: {}'.format(harvest_source_id))
    cpa = CKANPortalAPI()
    total_resources = 0
    # getting resources in pages of packages
    for page, packages in enumerate(
            cpa.search_harvest_packages(harvest_source_id=harvest_source_id),
            start=1):
        logger.info('PAGE {} from harvest source id: {}'.format(
            page, harvest_source_id))
        for package in packages:
            total_resources += len(package['resources'])
            yield package
    logger.info('{} total resources'.format(total_resources))
def test_create_harvest_source(self):
    """End-to-end test against a live CKAN: create a harvest source, attach
    a dataset to it, verify the linkage via search, then delete both.

    Fixes over the original: all bare ``assert`` statements (which are
    stripped under ``python -O``) now use unittest assertion methods, and
    the always-failing ``assert error == False`` (a non-empty string never
    equals ``False``) is replaced by the intended ``self.fail(error)``.
    """
    logger.info('Creating harvest source')
    cpa = CKANPortalAPI(base_url=CKAN_BASE_URL, api_key=CKAN_API_KEY)
    cpa.delete_all_harvest_sources(harvest_type='harvest',
                                   source_type='datajson')
    title = 'Energy JSON test {}'.format(random.randint(1, 999999))
    url = 'http://www.energy.gov/data-{}.json'.format(
        random.randint(1, 999999))
    res = cpa.create_harvest_source(
        title=title,
        url=url,
        owner_org_id=CKAN_ORG_ID,
        source_type='datajson',
        notes='Some tests about local harvesting sources creation',
        frequency='WEEKLY')
    self.assertTrue(res['success'])
    harvest_source = res['result']
    logger.info('Created: {}'.format(res['success']))

    # read it back and confirm the stored fields
    res = cpa.show_package(ckan_package_id_or_name=harvest_source['id'])
    self.assertTrue(res['success'])
    self.assertEqual(harvest_source['url'], url)
    self.assertEqual(harvest_source['title'], title)
    self.assertEqual(harvest_source['type'], 'harvest')
    self.assertEqual(harvest_source['source_type'], 'datajson')

    # search for it among all datajson harvest sources
    results = cpa.search_harvest_packages(rows=1000,
                                          harvest_type='harvest',
                                          source_type='datajson')
    created_ok = False
    for datasets in results:
        for dataset in datasets:
            # print('FOUND: {}'.format(dataset['name']))
            if dataset['name'] == harvest_source['name']:
                created_ok = True
                logger.info('Found!')
            else:
                logger.info('Other harvest source: {}'.format(
                    dataset['name']))
    self.assertTrue(created_ok)

    # create a dataset with this harvest_soure_id
    dataset_title = 'Dataset number {}'.format(random.randint(1, 999999))
    dataset_name = slugify(dataset_title)
    tags = [{'name': 'tag81'}, {'name': 'tag82'}]
    randval = random.randint(1, 999)
    extras = [
        {
            'key': 'harvest_source_id',
            'value': harvest_source['id']
        },
        {
            'key': 'harvest_source_title',
            'value': harvest_source['title']
        },
        # {'key': 'harvest_object_id', 'value': harvest_source['id']},  # ? not sure
        {
            'key': 'harvest_ng_source_id',
            'value': harvest_source['id']
        },
        {
            'key': 'harvest_ng_source_title',
            'value': harvest_source['title']
        },
        {
            'key': 'try_a_extra',
            'value': randval
        }
    ]
    package = {
        'name': dataset_name,
        'title': dataset_title,
        'owner_org': CKAN_ORG_ID,
        'tags': tags,
        'extras': extras
    }
    res2 = cpa.create_package(ckan_package=package)
    self.assertTrue(res2['success'])
    logger.info('Package with harvest source: {}'.format(res2['success']))

    # read full dataset
    res3 = cpa.show_package(ckan_package_id_or_name=dataset_name)
    self.assertTrue(res3['success'])
    ckan_dataset = res3['result']
    logger.info(
        'Package with harvest source readed: {}'.format(ckan_dataset))
    self.assertIn('extras', ckan_dataset)
    # CKAN stores extra values as strings, so compare against str(randval)
    self.assertEqual([str(randval)], [
        extra['value'] for extra in ckan_dataset['extras']
        if extra['key'] == 'try_a_extra'
    ])
    # my custom ID (not connected to a real harvest ID)
    self.assertEqual([harvest_source['id']], [
        extra['value'] for extra in ckan_dataset['extras']
        if extra['key'] == 'harvest_ng_source_id'
    ])

    # check if this package is related to harvest source
    total_datasets_in_source = 0
    datasets_from_source = cpa.search_harvest_packages(
        harvest_source_id=harvest_source['id'])
    connected_ok = False
    for datasets in datasets_from_source:
        for dataset in datasets:
            total_datasets_in_source += 1
            if dataset['name'] == dataset_name:
                connected_ok = True
                logger.info('Found!')
            else:
                # we just expect one dataset
                error = '{} != {} ------ {}'.format(
                    dataset['name'], dataset_name, dataset)
                logger.error(error)
                self.fail(error)
    self.assertTrue(connected_ok)
    self.assertEqual(total_datasets_in_source, 1)
    logger.info(
        f' +++++++++++++ total_datasets_in_source={total_datasets_in_source}'
    )

    # this fails, harvest process is more complex that just add an extra
    # assert [harvest_source['id']] == [extra['value'] for extra in ckan_dataset['extras'] if extra['key'] == 'harvest_source_id']

    # delete both
    logger.info('Delete CKAN package: {}'.format(ckan_dataset['id']))
    res4 = cpa.delete_package(ckan_package_id_or_name=ckan_dataset['id'])
    self.assertTrue(res4['success'])
    logger.info('Delete Harvest source: {}'.format(harvest_source['id']))
    res5 = cpa.delete_package(ckan_package_id_or_name=harvest_source['id'])
    self.assertTrue(res5['success'])
# NOTE(review): fragment of a larger report-building routine — the enclosing
# `with open(...) as csvfile` / fieldnames list opens before this chunk and the
# harvest-source loop continues past it, so the code is left byte-identical.
# Presumably it writes one CSV row per harvest source and dumps each source to
# a per-source file via `config.get_harvest_sources_path` — TODO confirm
# against the full file (the `open(hspath, 'w')` here needs a matching close).
'schema_version', 'total_dataset', 'total_resources', 'dataset_types', 'resource_types' ] writer = csv.DictWriter(csvfile, fieldnames=fieldnames) writer.writeheader() results = [] colections_ids = set() c = 0 urls = [] with_configs = 0 with_config_filters = 0 with_config_defaults = 0 for results in cpa.search_harvest_packages(harvest_type='harvest', method='GET' #,source_type='datajson' ): for local_harvest_source in results: url = local_harvest_source['url'] if url in urls: logger.error( '------------------\n ALREADY READED\n------------------') continue else: urls.append(url) c += 1 name = local_harvest_source.get('name', 'UNNAMED') hspath = config.get_harvest_sources_path(hs_name=name) f = open(hspath, 'w')
# NOTE(review): tail of a CLI script — `parser` and the earlier arguments are
# defined before this chunk, so the code is left byte-identical. It imports
# harvest sources from a remote catalog, then re-searches locally and asserts
# the imported count matches the searched count (bare `assert` here is
# acceptable for a throwaway script, not for library code).
parser.add_argument("--source_type", type=str, default='datajson', help="Tipe of harvest source: datajson|csw|waf etc") parser.add_argument("--method", type=str, default='GET', help="POST fails on CKAN 2.3, now is working") args = parser.parse_args() cpa = CKANPortalAPI(base_url=CKAN_BASE_URL, api_key=CKAN_API_KEY) total_sources = cpa.import_harvest_sources(catalog_url=args.import_from_url, method=args.method, on_duplicated='DELETE', harvest_type=args.harvest_type, source_type=args.source_type, delete_local_harvest_sources=True) # search total_searched = 0 for harvest_sources in cpa.search_harvest_packages( method='POST', harvest_type=args.harvest_type, source_type=args.source_type): for harvest_source in harvest_sources: total_searched += 1 assert total_sources == total_searched
# NOTE(review): fragment — the loop body continues past the last visible line
# (`cpa`, `valid_frequencies` and `urls` are defined outside this view), so
# the code is left byte-identical. The Jinja-style template builds a shell
# command to run `harvest.py` per source; presumably it feeds an Airflow DAG
# or similar scheduler keyed on each source's `frequency` — TODO confirm.
templated_harvest_command = """ source {{ params.env_path }}/bin/activate cd {{ params.app_path }} python harvest.py \ --name {{ params.name }} \ --url {{ params.data_json_url }} \ --harvest_source_id {{ params.harvest_source_id }} \ --ckan_owner_org_id {{ params.ckan_org_id }} \ --catalog_url {{ params.catalog_url }} \ --ckan_api_key {{ params.ckan_api_key }} \ --limit_dataset 10 # limit for test, remove for production """ results = cpa.search_harvest_packages(rows=1000, harvest_type='harvest', source_type='datajson') for datasets in results: for harvest_source in datasets: frequency = harvest_source.get('frequency', 'MONTHLY').upper() if frequency not in valid_frequencies: raise Exception(f'Unknown frequency: {frequency}') url = harvest_source['url'] if url in urls: # avoid duplicates continue urls.append(url) organization = harvest_source['organization'] name = harvest_source['name']