def load_records(context, database, table, xml_dirpath, recursive=False, force_update=False):
    """Load metadata records from a directory of XML files into the database.

    :param context: passed through to ``repository.Repository`` and
        ``metadata.parse_record``
    :param database: database handle/URL given to ``repository.Repository``
        (also used in log messages)
    :param table: repository table name
    :param xml_dirpath: directory holding the ``*.xml`` files to load
    :param recursive: when true, walk ``xml_dirpath`` and all of its
        subdirectories; otherwise only files directly inside it are loaded
    :param force_update: when true, a record whose insert raises
        ``RuntimeError`` is updated instead of being reported as not inserted

    Files that cannot be parsed as XML are logged and skipped. Exceptions
    raised by ``repo.update`` are not caught here.
    """
    repo = repository.Repository(database, context, table=table)

    file_list = []
    if recursive:
        for root, _dirs, files in os.walk(xml_dirpath):
            file_list.extend(
                os.path.join(root, mfile)
                for mfile in files
                if mfile.endswith('.xml')
            )
    else:
        file_list.extend(glob(os.path.join(xml_dirpath, '*.xml')))

    total = len(file_list)

    # Sorted so that the processing order is deterministic across runs.
    for counter, recfile in enumerate(sorted(file_list), 1):
        LOGGER.info('Processing file %s (%d of %d)', recfile, counter, total)

        # read document
        try:
            exml = etree.parse(recfile)
        except Exception as err:
            LOGGER.warning('XML document is not well-formed: %s', str(err))
            continue

        record = metadata.parse_record(context, exml, repo)

        for rec in record:
            LOGGER.info('Inserting %s %s into database %s, table %s ....',
                        rec.typename, rec.identifier, database, table)

            # TODO: do this as CSW Harvest
            try:
                repo.insert(rec, 'local', util.get_today_and_now())
                LOGGER.info('Inserted')
            except RuntimeError as err:
                if force_update:
                    # A failed insert is treated as "record already exists".
                    LOGGER.info('Record exists. Updating.')
                    repo.update(rec)
                    LOGGER.info('Updated')
                else:
                    LOGGER.warning('ERROR: not inserted %s', err)
def load_records(database, table, xml_dirpath, recursive=False):
    '''Load metadata records from a directory of XML files into the database.

    Uses the module-level ``CONTEXT`` for the repository and the metadata
    parser. Progress and errors are written to standard output.

    :param database: database handle/URL given to ``repository.Repository``
    :param table: repository table name
    :param xml_dirpath: directory holding the ``*.xml`` files to load
    :param recursive: when true, walk ``xml_dirpath`` and all of its
        subdirectories; otherwise only files directly inside it are loaded

    Files that cannot be parsed as XML, and records whose insert raises, are
    reported and skipped.
    '''
    repo = repository.Repository(database, CONTEXT, table=table)

    file_list = []
    if recursive:
        for root, _dirs, files in os.walk(xml_dirpath):
            file_list.extend(
                os.path.join(root, mfile)
                for mfile in files
                if mfile.endswith('.xml')
            )
    else:
        file_list.extend(glob(os.path.join(xml_dirpath, '*.xml')))

    total = len(file_list)

    # print() with a single pre-formatted string behaves the same whether
    # print is a statement (Python 2) or a function (Python 3).
    for counter, recfile in enumerate(file_list, 1):
        print('Processing file %s (%d of %d)' % (recfile, counter, total))

        # read document
        try:
            doc = etree.parse(recfile)
        except Exception as err:
            print('XML document is not well-formed: %s' % str(err))
            continue

        record = metadata.parse_record(CONTEXT, doc, repo)

        for rec in record:
            print('Inserting %s %s into database %s, table %s ....' %
                  (rec.typename, rec.identifier, database, table))

            # TODO: do this as CSW Harvest
            try:
                repo.insert(rec, 'local', util.get_today_and_now())
                print('Inserted')
            except Exception as err:
                print('ERROR: not inserted %s' % err)
def get_record(context, repo, ckan_url, ckan_id, ckan_info): query = ckan_url + 'harvest/object/%s' url = query % ckan_info['harvest_object_id'] response = requests.get(url) if ckan_info['source'] == 'arcgis': return try: xml = etree.parse(io.BytesIO(response.content)) except Exception, err: log.error('Could not pass xml doc from %s, Error: %s' % (ckan_id, err)) return try: record = metadata.parse_record(context, xml, repo)[0] except Exception, err: log.error('Could not extract metadata from %s, Error: %s' % (ckan_id, err)) return if not record.identifier: record.identifier = ckan_id record.ckan_id = ckan_id record.ckan_modified = ckan_info['metadata_modified'] return record usage = ''' Manages the CKAN-pycsw integration
def get_record(context, repo, ckan_url, ckan_id, ckan_info): query = ckan_url + 'harvest/object/%s' url = query % ckan_info['harvest_object_id'] response = requests.get(url) if ckan_info['source'] == 'arcgis': return try: xml = etree.parse(io.BytesIO(response.content)) except Exception, err: log.error('Could not pass xml doc from %s, Error: %s' % (ckan_id, err)) return try: record = metadata.parse_record(context, xml, repo)[0] except Exception, err: log.error('Could not extract metadata from %s, Error: %s' % (ckan_id, err)) return if not record.identifier: record.identifier = ckan_id record.ckan_id = ckan_id record.organization = ckan_info['organization'] record.ckan_modified = ckan_info['metadata_modified'] return record usage = '''
def load(pycsw_config, ckan_url):
    """
    Take ISO 19139 XML data from a CKAN package and insert it into the PyCSW
    database. This function runs selectively, meaning that it will only
    return packages for resources in the CKAN datastore database.

    It builds a URL for querying the datastore, returns a list of the
    datastore resource IDs, builds URLs for querying the resources, runs a
    regular expression to determine what the package ID of a datastored
    resource is, builds a URL to scrape each package's ISO XML record and
    then inserts the XML as a record in the PyCSW database.

    @param pycsw_config: pycsw.cfg file that should have been configured upon
    installing PyCSW. Should contain auth information about the database to
    connect to.
    @param ckan_url: e.g http://127.0.0.1:5000
    NOTE(review): the helpers below append their paths to ckan_url with no
    separator, so the value apparently needs a trailing slash -- confirm
    against the caller.
    """

    def parse_datastore(ckan_url):
        """
        Scrape and return every resource ID in the datastore database,
        accessing the information through CKAN's REST API.

        @param ckan_url: e.g. http://127.0.0.1:5000
        @return: a list of datastored resource object IDs
        @raise RuntimeError: when the API response is not a JSON object
        """
        api_query = 'api/3/action/datastore_search?resource_id=_table_metadata'
        # Table names that are filtered out of the result.
        ignore_names = ['_table_metadata', 'geography_columns',
                        'geometry_columns', 'spatial_ref_sys']
        url = ckan_url + api_query
        response = requests.get(url)
        listing = response.json()
        if not isinstance(listing, dict):
            raise RuntimeError('Wrong API response: %s' % listing)
        results = listing['result']['records']
        return [result['name'] for result in results
                if result['name'] not in ignore_names]

    def parse_resource(resource_id, ckan_url):
        """
        CKAN's search API doesn't allow querying packages by their resources.
        Thankfully, each resource is returned with a URL which contains the
        package id between the paths "dataset" and "resource" (at least for
        datastore items), so we can use a RegEx to figure out what the
        package of a resource is.

        This is not an ideal solution, but it's the cleanest way to solve the
        problem until the CKAN team decides to organize their data in a less
        authoritative manner.

        @param resource_id: the id of a datastored resource object
        @param ckan_url: http://127.0.0.1:5000
        @return: the package id extracted from the resource URL
        @raise RuntimeError: when the API response is not a JSON object
        @raise IndexError: when the resource URL does not contain a
        "dataset/<id>/resource" segment
        """
        api_query = 'api/3/action/resource_show?id=%s' % resource_id
        url = ckan_url + api_query
        response = requests.get(url)
        listing = response.json()
        if not isinstance(listing, dict):
            raise RuntimeError('Wrong API response: %s' % listing)
        package_url = listing['result']['url']
        # Here's that RegEx. Ugh.
        package_id = re.findall(r'dataset/(.*?)/resource', package_url, re.DOTALL)
        return package_id[0]

    def get_record(context, repo, ckan_url, ckan_id, ckan_info):
        """
        Hit the CKAN REST API for an ISO 19139 XML representation of a
        package with data uploaded into the datastore.

        @param context: Vanilla-CKAN auth noise
        @param repo: PyCSW repository (database)
        @param ckan_url: e.g. http://127.0.0.1:5000
        @param ckan_id: Package ID
        @param ckan_info: Package data
        @return: ISO 19139 XML data
        """
        query = ckan_url + 'package_iso/object/%s'
        url = query % ckan_info['id']
        response = requests.get(url)
        try:
            xml = etree.parse(io.BytesIO(response.content))
        except Exception as err:
            log.error('Could not pass xml doc from %s, Error: %s' % (ckan_id, err))
            return
        try:
            # Only the first record of the parsed document is used.
            record = metadata.parse_record(context, xml, repo)[0]
        except Exception as err:
            log.error('Could not extract metadata from %s, Error: %s' % (ckan_id, err))
            return
def load(pycsw_config, ckan_url): """ Take ISO 19139 XML data from a CKAN package and insert it into the PyCSW database. This function runs selectively, meaning that it will only return packages for resources in the CKAN datastore database. It builds a URL for querying the datastore, returns a list of the datastore resource IDs, builds URLs for querying the resources, runs a regular expression to determine what the package ID of a datastored resource is, builds a URL to scrape each package's ISO XML record and then inserts the XML as a record in the PyCSW database. @param pycsw_config: pycsw.cfg file that should have been configured upon installing PyCSW. Should contain auth information about the database to connect to. @param ckan_url: e.g http://127.0.0.1:5000 """ def parse_datastore(ckan_url): """ Scrape and return every resource ID in the datastore database, accessing the information through CKAN's REST API. @param ckan_url: e.g. http://127.0.0.1:5000 @return: a list of datastored resource object IDs """ api_query = 'api/3/action/datastore_search?resource_id=_table_metadata' ignore_names = [ '_table_metadata', 'geography_columns', 'geometry_columns', 'spatial_ref_sys' ] url = ckan_url + api_query response = requests.get(url) listing = response.json() if not isinstance(listing, dict): raise RuntimeError, 'Wrong API response: %s' % listing results = listing['result']['records'] resource_names = [] # Should use a list/dict comprehension here for result in results: if not result['name'] in ignore_names: resource_names.append(result['name']) return resource_names def parse_resource(resource_id, ckan_url): """ CKAN's search API doesn't allow querying packages by their resources. Thankfully, each resource is returned with a URL which contains the package id between the paths "dataset" and "resource", (at least for datastore items) so we can use a RegEx to figure out what the package of a resource is. 
This is not an ideal solution, but it's the cleanest way to solve the problem until the CKAN team decides to organize their data in a less authoritative manner. @param resource_id: the id of a datastored resource object @param ckan_url: http://127.0.0.1:5000 """ api_query = 'api/3/action/resource_show?id=%s' % resource_id url = ckan_url + api_query response = requests.get(url) listing = response.json() if not isinstance(listing, dict): raise RuntimeError, 'Wrong API response: %s' % listing # skip Authorization Error, most likely due to deleted packages. if 'error' in listing: if ("Not Found Error" == listing['error']['__type']) or ( "Authorization Error" == listing['error']['__type']): return None log.info('listing is %r' % listing) if listing['result']: package_url = listing['result']['url'] else: return None # Here's that RegEx. Ugh. package_id = re.findall('dataset/(.*?)/resource', package_url, re.DOTALL) if package_id: return package_id[0] else: return None def get_record(context, repo, ckan_url, ckan_id, ckan_info): """ Hit the CKAN REST API for an ISO 19139 XML representation of a package with data uploaded into the datastore. @param context: Vanilla-CKAN auth noise @param repo: PyCSW repository (database) @param ckan_url: e.g. http://127.0.0.1:5000 @param ckan_id: Package ID @param ckan_info: Package data @return: ISO 19139 XML data """ query = ckan_url + 'package_iso/object/%s' url = query % ckan_info['id'] response = requests.get(url) try: xml = etree.parse(io.BytesIO(response.content)) except Exception, err: log.error('Could not pass xml doc from %s, Error: %s' % (ckan_id, err)) return try: record = metadata.parse_record(context, xml, repo)[0] except Exception, err: log.error('Could not extract metadata from %s, Error: %s' % (ckan_id, err)) return