def get_record(context, repo, ckan_url, ckan_id, ckan_info):
    """Fetch a harvested metadata object from CKAN and parse it into a repo record.

    :param context: pycsw configuration context
    :param repo: pycsw repository instance
    :param ckan_url: base URL of the CKAN instance (trailing slash expected)
    :param ckan_id: CKAN package id, used as a fallback identifier
    :param ckan_info: dict with at least 'harvest_object_id', 'source'
                      and 'metadata_modified' keys
    :returns: parsed record, or None when the source is arcgis or when
              fetching/parsing fails
    """
    # arcgis harvest objects carry no parseable XML; bail out before making
    # the HTTP request (previously the response was fetched and discarded).
    if ckan_info["source"] == "arcgis":
        return

    query = ckan_url + "harvest/object/%s"
    url = query % ckan_info["harvest_object_id"]
    response = requests.get(url)

    try:
        xml = etree.parse(io.BytesIO(response.content))
    except Exception as err:
        # message typo fixed: "pass" -> "parse"
        log.error("Could not parse xml doc from %s, Error: %s" % (ckan_id, err))
        return

    try:
        record = metadata.parse_record(context, xml, repo)[0]
    except Exception as err:
        log.error("Could not extract metadata from %s, Error: %s" % (ckan_id, err))
        return

    # Fall back to the CKAN package id when the metadata has no identifier.
    if not record.identifier:
        record.identifier = ckan_id

    record.ckan_id = ckan_id
    record.ckan_modified = ckan_info["metadata_modified"]

    return record
def _parse_and_upsert_metadata(self, md: str):
    """Parse an XML metadata string and insert it into the repository,
    or update it when a record with the same identifier already exists.

    Failures in parsing, insert or update are logged and re-raised.
    """
    logger.debug('Parsing XML')
    try:
        xml = etree.fromstring(md)
    except Exception as err:
        logger.error(f'XML parsing failed: {err}')
        raise

    logger.debug('Processing metadata')
    try:
        record = metadata.parse_record(self.context, xml, self.repo)[0]
        # parse_record hands back raw XML as bytes; store it as text
        record.xml = record.xml.decode()
        logger.info(f"identifier: {record.identifier}")
    except Exception as err:
        logger.error(f'Metadata parsing failed: {err}')
        raise

    # Upsert: update when the identifier is already present, else insert.
    already_present = bool(self.repo.query_ids([record.identifier]))
    if already_present:
        logger.info('Updating record')
        try:
            self.repo.update(record)
        except Exception as err:
            logger.error(f'record update failed: {err}')
            raise
        logger.info('record updated')
    else:
        logger.info('Inserting record')
        try:
            self.repo.insert(record, 'local', util.get_today_and_now())
        except Exception as err:
            logger.error(f'record insertion failed: {err}')
            raise
        logger.info('record inserted')
def load_records(repo, parsed_xml, context):
    """Load metadata records from directory of files to database

    Takes an already-parsed csw:Transaction document, extracts every child
    of its first csw:Insert element, parses each child as a metadata record
    and inserts it into the repository.
    """
    # First (and expected only) csw:Insert element; raises IndexError if absent.
    xml_records = parsed_xml.xpath('//csw:Insert', namespaces=context.namespaces)[0]
    parsed_records = [
        metadata.parse_record(context, child, repo)[0]
        for child in xml_records.xpath('child::*')
    ]
    # Plain loop instead of a throwaway list comprehension: the insert is
    # performed for its side effect, not to build a list.
    for record in parsed_records:
        repo.insert(record, 'local', record.insert_date)
def load_records(context, database, table, xml_dirpath,
                 recursive=False, force_update=False):
    """Load metadata records from directory of files to database

    :param context: pycsw configuration context (supplies the XML parser)
    :param database: database connection string
    :param table: name of the records table
    :param xml_dirpath: a single XML file or a directory containing XML files
    :param recursive: also walk subdirectories of xml_dirpath for *.xml files
    :param force_update: update existing records instead of reporting an error
    """
    repo = repository.Repository(database, context, table=table)

    # Build the list of files to process: a single file, a recursive walk,
    # or a flat glob of the directory.
    file_list = []

    if os.path.isfile(xml_dirpath):
        file_list.append(xml_dirpath)
    elif recursive:
        for root, _dirs, files in os.walk(xml_dirpath):
            for mfile in files:
                if mfile.endswith('.xml'):
                    file_list.append(os.path.join(root, mfile))
    else:
        for rec in glob(os.path.join(xml_dirpath, '*.xml')):
            file_list.append(rec)

    total = len(file_list)
    counter = 0

    for recfile in sorted(file_list):
        counter += 1
        LOGGER.info('Processing file %s (%d of %d)', recfile, counter, total)

        # read document
        try:
            exml = etree.parse(recfile, context.parser)
        except Exception as err:
            # Logger.warn is deprecated; warning() is the supported spelling
            LOGGER.warning('XML document is not well-formed: %s', str(err))
            continue

        record = metadata.parse_record(context, exml, repo)

        for rec in record:
            LOGGER.info('Inserting %s %s into database %s, table %s ....',
                        rec.typename, rec.identifier, database, table)

            # TODO: do this as CSW Harvest
            try:
                repo.insert(rec, 'local', util.get_today_and_now())
                LOGGER.info('Inserted')
            except RuntimeError as err:
                if force_update:
                    LOGGER.info('Record exists. Updating.')
                    repo.update(rec)
                    LOGGER.info('Updated')
                else:
                    LOGGER.warning('ERROR: not inserted %s', err)
def load_records(context, database, table, xml_dirpath,
                 recursive=False, force_update=False):
    """Load metadata records from directory of files to database

    :param context: pycsw configuration context (supplies the XML parser)
    :param database: database connection string
    :param table: name of the records table
    :param xml_dirpath: a single XML file or a directory containing XML files
    :param recursive: also walk subdirectories of xml_dirpath for *.xml files
    :param force_update: update existing records instead of reporting an error
    """
    repo = repository.Repository(database, context, table=table)

    # Build the list of files to process: a single file, a recursive walk,
    # or a flat glob of the directory.
    file_list = []

    if os.path.isfile(xml_dirpath):
        file_list.append(xml_dirpath)
    elif recursive:
        for root, _dirs, files in os.walk(xml_dirpath):
            for mfile in files:
                if mfile.endswith(".xml"):
                    file_list.append(os.path.join(root, mfile))
    else:
        for rec in glob(os.path.join(xml_dirpath, "*.xml")):
            file_list.append(rec)

    total = len(file_list)
    counter = 0

    for recfile in sorted(file_list):
        counter += 1
        LOGGER.info("Processing file %s (%d of %d)", recfile, counter, total)

        # read document
        try:
            exml = etree.parse(recfile, context.parser)
        except Exception as err:
            # Logger.warn is deprecated; warning() is the supported spelling
            LOGGER.warning("XML document is not well-formed: %s", str(err))
            continue

        record = metadata.parse_record(context, exml, repo)

        for rec in record:
            LOGGER.info(
                "Inserting %s %s into database %s, table %s ....",
                rec.typename, rec.identifier, database, table
            )

            # TODO: do this as CSW Harvest
            try:
                repo.insert(rec, "local", util.get_today_and_now())
                LOGGER.info("Inserted")
            except RuntimeError as err:
                if force_update:
                    LOGGER.info("Record exists. Updating.")
                    repo.update(rec)
                    LOGGER.info("Updated")
                else:
                    LOGGER.warning("ERROR: not inserted %s", err)
def load_records(context, database, table, xml_dirpath, recursive=False, force_update=False):
    """Load metadata records from directory of files to database"""
    # Local import keeps sqlalchemy optional at module import time.
    from sqlalchemy.exc import DBAPIError

    repo = repository.Repository(database, context, table=table)

    file_list = []
    # Tracks files whose record(s) were successfully inserted or updated;
    # returned to the caller at the end.
    loaded_files = set()

    # Build the list of files to process: a single file, a recursive walk,
    # or a flat glob of the directory.
    if os.path.isfile(xml_dirpath):
        file_list.append(xml_dirpath)
    elif recursive:
        for root, dirs, files in os.walk(xml_dirpath):
            for mfile in files:
                if mfile.endswith('.xml'):
                    file_list.append(os.path.join(root, mfile))
    else:
        for rec in glob(os.path.join(xml_dirpath, '*.xml')):
            file_list.append(rec)

    total = len(file_list)
    counter = 0

    for recfile in sorted(file_list):
        counter += 1
        LOGGER.info('Processing file %s (%d of %d)', recfile, counter, total)

        # read document
        try:
            exml = etree.parse(recfile, context.parser)
        except etree.XMLSyntaxError as err:
            # Known malformed-XML case: log with traceback and move on.
            LOGGER.error('XML document "%s" is not well-formed', recfile, exc_info=True)
            continue
        except Exception as err:
            # Any other read/parse failure: log with traceback and move on.
            LOGGER.exception('XML document "%s" is not well-formed', recfile)
            continue

        try:
            record = metadata.parse_record(context, exml, repo)
        except Exception as err:
            LOGGER.exception('Could not parse "%s" as an XML record', recfile)
            continue

        # parse_record may yield several records for a single file.
        for rec in record:
            LOGGER.info('Inserting %s %s into database %s, table %s ....',
                        rec.typename, rec.identifier, database, table)

            # TODO: do this as CSW Harvest
            try:
                repo.insert(rec, 'local', util.get_today_and_now())
                loaded_files.add(recfile)
                LOGGER.info('Inserted %s', recfile)
            except Exception as err:
                # Insert failure is assumed to mean "already exists" when
                # force_update is set; otherwise report the error.
                if force_update:
                    LOGGER.info('Record exists. Updating.')
                    repo.update(rec)
                    LOGGER.info('Updated %s', recfile)
                    loaded_files.add(recfile)
                else:
                    if isinstance(err, DBAPIError) and err.args:
                        # Pull a decent database error message and not the full SQL that was run
                        # since INSERT SQL statements are rather large.
                        LOGGER.error('ERROR: %s not inserted: %s', recfile, err.args[0], exc_info=True)
                    else:
                        LOGGER.error('ERROR: %s not inserted: %s', recfile, err, exc_info=True)

    return tuple(loaded_files)
def load_records(context, database, table, xml_dirpath, recursive=False, force_update=False):
    """Load metadata records from directory of files to database

    Returns a tuple of the file paths whose records were inserted or updated.
    """
    from sqlalchemy.exc import DBAPIError

    repo = repository.Repository(database, context, table=table)

    # Files whose record(s) made it into the database.
    loaded_files = set()

    # Work out which XML files to process.
    if os.path.isfile(xml_dirpath):
        file_list = [xml_dirpath]
    elif recursive:
        file_list = []
        for root, _dirs, files in os.walk(xml_dirpath):
            file_list.extend(
                os.path.join(root, name) for name in files if name.endswith('.xml')
            )
    else:
        file_list = list(glob(os.path.join(xml_dirpath, '*.xml')))

    total = len(file_list)

    for counter, recfile in enumerate(sorted(file_list), start=1):
        LOGGER.info('Processing file %s (%d of %d)', recfile, counter, total)

        # read document
        try:
            exml = etree.parse(recfile, context.parser)
        except etree.XMLSyntaxError:
            LOGGER.error('XML document "%s" is not well-formed', recfile)
            continue
        except Exception:
            LOGGER.exception('XML document "%s" is not well-formed', recfile)
            continue

        try:
            record = metadata.parse_record(context, exml, repo)
        except Exception:
            LOGGER.exception('Could not parse "%s" as an XML record', recfile)
            continue

        # A single file may yield multiple records.
        for rec in record:
            LOGGER.info('Inserting %s %s into database %s, table %s ....',
                        rec.typename, rec.identifier, database, table)

            # TODO: do this as CSW Harvest
            try:
                repo.insert(rec, 'local', util.get_today_and_now())
                loaded_files.add(recfile)
                LOGGER.info('Inserted %s', recfile)
            except Exception as err:
                if force_update:
                    LOGGER.info('Record exists. Updating.')
                    repo.update(rec)
                    LOGGER.info('Updated %s', recfile)
                    loaded_files.add(recfile)
                elif isinstance(err, DBAPIError) and err.args:
                    # Pull a decent database error message and not the full SQL that was run
                    # since INSERT SQL statements are rather large.
                    LOGGER.error('ERROR: %s not inserted: %s', recfile, err.args[0])
                else:
                    LOGGER.error('ERROR: %s not inserted: %s', recfile, err)

    return tuple(loaded_files)
def get_record(context, repo, ckan_url, ckan_id, ckan_info): query = ckan_url + 'harvest/object/%s' url = query % ckan_info['harvest_object_id'] response = requests.get(url) if ckan_info['source'] == 'arcgis': return try: xml = etree.parse(io.BytesIO(response.content)) except Exception, err: log.error('Could not pass xml doc from %s, Error: %s' % (ckan_id, err)) return try: record = metadata.parse_record(context, xml, repo)[0] except Exception, err: log.error('Could not extract metadata from %s, Error: %s' % (ckan_id, err)) return if not record.identifier: record.identifier = ckan_id record.ckan_id = ckan_id record.ckan_modified = ckan_info['metadata_modified'] return record usage = ''' Manages the CKAN-pycsw integration