def resolve(ctx, igsn_str, accept, url_only, show_steps, use_n2t):
    """Show results of resolving an IGSN.

    Args:
        ctx: The click context passed in from main
        igsn_str: The IGSN string to resolve
        accept: Optional Accept header value to send with the request
        url_only: If True, print only the final resolved URL
        show_steps: If True, print each redirect/response in the chain
        use_n2t: If True, resolve through the N2T resolver instead of the
            default IGSN resolver

    Returns:
        int: 0 on success, 1 on error. Response information is written
        to stdout.

    Examples::

        $ igsn resolve 10273/847000106
        https://app.geosamples.org/webservices/display.php?igsn=847000106
    """
    L = getLogger()
    if igsn_str is None:
        L.error("IGSN value is required")
        return 1
    # Only send an Accept header when the caller provided one.
    headers = None
    if accept is not None:
        headers = {
            "Accept": accept,
        }
    if use_n2t:
        # N2T accepts the raw identifier; prefix with "IGSN:" only when
        # the value normalizes to a recognizable IGSN.
        identifier = igsn_str
        igsn_val = igsn_lib.normalize(igsn_str)
        if igsn_val is not None:
            identifier = f"IGSN:{igsn_val}"
        responses = igsn_lib.resolveN2T(identifier, headers=headers)
    else:
        igsn_val = igsn_lib.normalize(igsn_str)
        L.info("Normalized IGSN = %s", igsn_val)
        if igsn_val is None:
            # Fixed typo in user-facing message ("recogized" -> "recognized")
            L.error("Provided identifier not recognized as an IGSN")
            return 1
        # Skip downloading the response body when only the URL is wanted.
        responses = igsn_lib.resolve(
            igsn_val, include_body=not url_only, headers=headers
        )
    if show_steps:
        nsteps = len(responses)
        print("History:")
        for step, r in enumerate(responses, start=1):
            print(f"Step {step}/{nsteps}:")
            dumpResponse(r, indent="  ")
    if url_only:
        # The last response in the chain is the final resolution target.
        print(f"{responses[-1].url}")
        return 0
    dumpResponseBody(responses[-1])
    return 0
async def _loadSesarEntries(session, max_count, start_from=None):
    """Harvest up to ``max_count`` SESAR identifier entries into the database.

    Iterates the SESAR sitemap identifiers, fetches each record on a thread
    pool, and persists the resulting things via ``session``. Failed fetches
    are retried up to 3 times before being abandoned.

    NOTE(review): declared ``async`` but contains no ``await`` — all
    concurrency here is via ThreadPoolExecutor; confirm the coroutine
    wrapper is intentional (e.g. required by the caller's interface).

    Args:
        session: Database session used for existence checks and commits
            (SQLAlchemy/SQLModel — presumably not thread-safe, which is
            why all commits happen on this coordinating thread).
        max_count: Maximum number of new entries to request.
        start_from: Optional start date passed to the sitemap iterator.
    """
    L = getLogger()
    futures = []      # outstanding executor futures
    working = {}      # igsn -> retry count for in-flight/failed fetches
    ids = isb_lib.sesar_adapter.SESARIdentifiersSitemap(
        max_entries=countThings(session) + max_count, date_start=start_from
    )
    total_requested = 0
    total_completed = 0
    more_work = True
    num_prepared = BACKLOG_SIZE  # Number of jobs to prepare for execution
    with concurrent.futures.ThreadPoolExecutor(
        max_workers=CONCURRENT_DOWNLOADS
    ) as executor:
        while more_work:
            # populate the futures list with work until the list is full
            # or there is no more work to get.
            while (
                len(futures) < BACKLOG_SIZE
                and total_requested < max_count
                and num_prepared > 0
            ):
                try:
                    _id = next(ids)
                    igsn = igsn_lib.normalize(_id[0])
                    existing_thing = sqlmodel_database.get_thing_with_id(session, isb_lib.sesar_adapter.fullIgsn(igsn))
                    if existing_thing is not None:
                        logging.info("Already have %s at %s", igsn, existing_thing)
                        # Pass the existing record so the worker can update
                        # rather than create.
                        future = executor.submit(wrapLoadThing, igsn, _id[1], existing_thing)
                    else:
                        future = executor.submit(wrapLoadThing, igsn, _id[1])
                    futures.append(future)
                    working[igsn] = 0
                    total_requested += 1
                except StopIteration as e:
                    # Sitemap exhausted: stop preparing new jobs.
                    L.info("Reached end of identifier iteration.")
                    num_prepared = 0
                if total_requested >= max_count:
                    num_prepared = 0
            L.debug("%s", working)
            try:
                # Short timeout so we periodically loop back to refill the
                # backlog even while downloads are still running.
                for fut in concurrent.futures.as_completed(futures, timeout=1):
                    igsn, tc, _thing = fut.result()
                    futures.remove(fut)
                    if not _thing is None:
                        try:
                            session.add(_thing)
                            session.commit()
                        except sqlalchemy.exc.IntegrityError as e:
                            session.rollback()
                            # NOTE(review): logs _id from the producer loop
                            # above, which may not correspond to this future's
                            # identifier — probably should log igsn instead.
                            logging.error("Item already exists: %s", _id[0])
                        # for _rel in _related:
                        #    try:
                        #        session.add(_rel)
                        #        session.commit()
                        #    except sqlalchemy.exc.IntegrityError as e:
                        #        L.debug(e)
                        working.pop(igsn)
                        total_completed += 1
                    else:
                        # Fetch failed (worker returned None): retry up to 3x.
                        if working.get(igsn, 0) < 3:
                            if not igsn in working:
                                working[igsn] = 1
                            else:
                                working[igsn] += 1
                            L.info(
                                "Failed to retrieve %s. Retry = %s", igsn, working[igsn]
                            )
                            future = executor.submit(wrapLoadThing, igsn, tc)
                            futures.append(future)
                        else:
                            L.error("Too many retries on %s", igsn)
                            working.pop(igsn)
            except concurrent.futures.TimeoutError:
                # Timeout just means no future finished within 1s; fall
                # through and refill the backlog.
                # L.info("No futures to process")
                pass
            if len(futures) == 0 and num_prepared == 0:
                more_work = False
            if total_completed >= max_count:
                more_work = False
            L.info(
                "requested, completed, current = %s, %s, %s",
                total_requested,
                total_completed,
                len(futures),
            )
def oaiRecordToDict(xml_string):
    '''
    Converts an OAI-PMH IGSN metadata record to a dict

    The IGSN schema is at
    https://doidb.wdc-terra.org/igsn/schemas/igsn.org/schema/1.0/igsn.xsd

    Times are returned as timezone aware python datetime, TZ=UTC.

    Args:
        xml_string: OAI-PMH record XML in IGSN format

    Returns:
        dict or None on failure

    Example:

    .. jupyter-execute::

       import pprint
       import igsn_lib.oai
       xml = """<?xml version="1.0"?>
       <record xmlns="http://www.openarchives.org/OAI/2.0/"
               xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
         <header>
           <identifier>oai:registry.igsn.org:6940929</identifier>
           <datestamp>2019-10-15T06:00:10Z</datestamp>
           <setSpec>IEDA</setSpec>
           <setSpec>IEDA.SESAR</setSpec>
         </header>
         <metadata>
           <sample xmlns="http://igsn.org/schema/kernel-v.1.0"
                   xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
                   xsi:schemaLocation="http://igsn.org/schema/kernel-v.1.0 http://doidb.wdc-terra.org/igsn/schemas/igsn.org/schema/1.0/igsn.xsd">
             <sampleNumber identifierType="igsn">10273/BSU0005JF</sampleNumber>
             <registrant>
               <registrantName>IEDA</registrantName>
             </registrant>
             <log>
               <logElement event="submitted" timeStamp="2019-10-15T04:00:09Z"/>
             </log>
           </sample>
         </metadata>
       </record>
       """
       data = igsn_lib.oai.oaiRecordToDict(xml)
       pprint.pprint(data, indent=2)
    '''
    _L = _getLogger()
    data = {
        "igsn_id": None,  # Value of the IGSN identifier
        "oai_id": None,  # Internal OAI-PMH identifier of this record
        "registrant": None,  # registrant name
        "oai_time": None,  # time stamp on the OAI record
        "igsn_time": None,  # submitted or registered time in the log
        "set_spec": [],  # list of setSpec entries for record
        "log": [],  # list of log entries
        "related": [],  # list of related identifiers
        "_source": {},  # raw parsed record, namespaces mapped to prefixes
    }
    # Parse with namespace processing so element keys carry stable
    # prefixes (e.g. "oai:", "igsn:") regardless of source prefixes.
    try:
        data["_source"] = xmltodict.parse(xml_string, process_namespaces=True, namespaces=IGSN_OAI_NAMESPACES)
    except Exception as e:
        # Any parse failure (malformed XML, etc.) yields None per contract.
        _L.error(e)
        return None
    # _L.debug(json.dumps(data["_source"], indent=2))
    data["oai_id"] = data["_source"]["oai:record"]["oai:header"][
        "oai:identifier"]
    # Always store time in UTC
    # NOTE(review): unlike the log timestamps below, this parse does not set
    # RETURN_AS_TIMEZONE_AWARE — confirm whether oai_time should also be
    # timezone aware as the docstring claims.
    data["oai_time"] = dateparser.parse(
        data["_source"]["oai:record"]["oai:header"]["oai:datestamp"],
        settings={"TIMEZONE": "+0000"},
    )
    _sample = data["_source"]["oai:record"]["oai:metadata"]["igsn:sample"]
    igsn_id = _sample["igsn:sampleNumber"]["#text"]
    data["igsn_id"] = igsn_lib.normalize(igsn_id)
    data["registrant"] = _sample["igsn:registrant"]["igsn:registrantName"]
    data["set_spec"] = data["_source"]["oai:record"]["oai:header"][
        "oai:setSpec"]
    # log 'events':
    # https://doidb.wdc-terra.org//igsn/schemas/igsn.org/schema/1.0/include/igsn-eventType-v1.0.xsd
    igsn_log = _sample["igsn:log"]["igsn:logElement"]
    # xmltodict returns a dict for a single element, a list for several;
    # normalize to a list.
    if isinstance(igsn_log, dict):
        igsn_log = [
            igsn_log,
        ]
    data["log"] = []
    igsn_time = None
    for _log in igsn_log:
        _event = _log["@event"].lower().strip()
        _time = dateparser.parse(
            _log["@timeStamp"],
            settings={
                "TIMEZONE": "+0000",
                "RETURN_AS_TIMEZONE_AWARE": True
            },
        )
        data["log"].append({
            "event": _event,
            "time": _time.strftime(igsn_lib.time.JSON_TIME_FORMAT)
        })
        # Preference order for igsn_time: submitted > registered > updated.
        if _event == "submitted":
            igsn_time = _time
        if _event == "registered":
            # Use registered time if submitted not available
            if igsn_time is None:
                igsn_time = _time
        if _event == "updated":
            # Fall back to updated time
            if igsn_time is None:
                igsn_time = _time
    data["igsn_time"] = igsn_time
    # Related identifiers are optional; same single-vs-list normalization.
    _related_ids = []
    try:
        _related_ids = _sample["igsn:relatedResourceIdentifiers"][
            "igsn:relatedIdentifier"]
        if isinstance(_related_ids, dict):
            _related_ids = [
                _related_ids,
            ]
    except KeyError:
        _L.debug("No related identifiers in record")
    for related_id in _related_ids:
        entry = {}
        entry["id"] = related_id.get("#text", "")
        entry["id_type"] = related_id.get("@relatedIdentifierType", "")
        entry["rel_type"] = related_id.get("@relationType", "")
        data["related"].append(entry)
    return data
def reloadThing(thing):
    """Re-fetch a thing from its origin source and return the fresh copy.

    Args:
        thing: An existing thing instance; its ``id`` is normalized to an
            IGSN and its ``tcreated`` timestamp is passed through.

    Returns:
        The result of ``loadThing`` for the normalized identifier.
    """
    logger = getLogger()
    logger.debug("reloadThing id=%s", thing.id)
    return loadThing(igsn_lib.normalize(thing.id), thing.tcreated)
def test_normalize(igsn_str, expected):
    """Verify igsn_lib.normalize maps each input string to its expected form."""
    assert igsn_lib.normalize(igsn_str) == expected
def parse(ctx, igsn_str):
    """Print the normalized form of an IGSN string.

    Args:
        ctx: The click context passed in from main
        igsn_str: The IGSN string to normalize

    Returns:
        int: 0 always; output goes to stdout.
    """
    normalized = igsn_lib.normalize(igsn_str)
    print(normalized)
    return 0