Example #1
0
def resolve(ctx, igsn_str, accept, url_only, show_steps, use_n2t):
    """
    Show results of resolving an IGSN

    Args:
        ctx: The click context passed in from main
        igsn_str: The IGSN string
        accept: optional accept header value
        url_only: when True, print only the final resolved URL
        show_steps: when True, print each response in the redirect chain
        use_n2t: when True, resolve through the N2T resolver

    Returns:
        0 on success, 1 on error; response information is printed to stdout

    Examples::

        $ igsn resolve 10273/847000106
        https://app.geosamples.org/webservices/display.php?igsn=847000106

    """
    L = getLogger()
    if igsn_str is None:
        L.error("IGSN value is required")
        return 1
    headers = None
    if accept is not None:
        headers = {
            "Accept": accept,
        }
    if use_n2t:
        # N2T accepts the raw string; prefer the normalized form when available.
        identifier = igsn_str
        igsn_val = igsn_lib.normalize(igsn_str)
        if igsn_val is not None:
            identifier = f"IGSN:{igsn_val}"
        responses = igsn_lib.resolveN2T(identifier, headers=headers)
    else:
        igsn_val = igsn_lib.normalize(igsn_str)
        L.info("Normalized IGSN = %s", igsn_val)
        if igsn_val is None:
            # FIX: corrected typo "recogized" in the user-facing error message.
            L.error("Provided identifier not recognized as an IGSN")
            return 1
        # Skip downloading the body when only the final URL will be shown.
        responses = igsn_lib.resolve(
            igsn_val, include_body=not url_only, headers=headers
        )
    if show_steps:
        nsteps = len(responses)
        print("History:")
        for step, r in enumerate(responses, start=1):
            print(f"Step {step}/{nsteps}:")
            dumpResponse(r, indent="  ")
    if url_only:
        print(f"{responses[-1].url}")
        return 0
    dumpResponseBody(responses[-1])
    return 0
Example #2
0
async def _loadSesarEntries(session, max_count, start_from=None):
    """
    Concurrently retrieve SESAR entries and persist them to the database.

    Pulls identifiers from the SESAR sitemap, dispatches downloads to a
    thread pool (up to CONCURRENT_DOWNLOADS workers, with at most
    BACKLOG_SIZE futures in flight), commits each retrieved thing, and
    retries a failed identifier up to 3 times before giving up on it.

    Args:
        session: database session used for existence checks and inserts
        max_count: maximum number of identifiers to request
        start_from: optional start date passed to the sitemap iterator

    Returns:
        None; progress is reported through logging.
    """
    L = getLogger()
    futures = []
    working = {}  # igsn -> retry count for work currently in flight
    ids = isb_lib.sesar_adapter.SESARIdentifiersSitemap(
        max_entries=countThings(session) + max_count, date_start=start_from
    )
    total_requested = 0
    total_completed = 0
    more_work = True
    num_prepared = BACKLOG_SIZE  # Number of jobs to prepare for execution
    with concurrent.futures.ThreadPoolExecutor(
        max_workers=CONCURRENT_DOWNLOADS
    ) as executor:
        while more_work:
            # Populate the futures list with work until the list is full
            # or there is no more work to get.
            while (
                len(futures) < BACKLOG_SIZE
                and total_requested < max_count
                and num_prepared > 0
            ):
                try:
                    _id = next(ids)
                    igsn = igsn_lib.normalize(_id[0])
                    existing_thing = sqlmodel_database.get_thing_with_id(
                        session, isb_lib.sesar_adapter.fullIgsn(igsn)
                    )
                    if existing_thing is not None:
                        L.info("Already have %s at %s", igsn, existing_thing)
                        future = executor.submit(
                            wrapLoadThing, igsn, _id[1], existing_thing
                        )
                    else:
                        future = executor.submit(wrapLoadThing, igsn, _id[1])
                    futures.append(future)
                    working[igsn] = 0
                    total_requested += 1
                except StopIteration:
                    L.info("Reached end of identifier iteration.")
                    num_prepared = 0
                if total_requested >= max_count:
                    num_prepared = 0
            L.debug("%s", working)
            try:
                # as_completed snapshots the list, so removing/appending
                # entries below does not affect this iteration.
                for fut in concurrent.futures.as_completed(futures, timeout=1):
                    igsn, tc, _thing = fut.result()
                    futures.remove(fut)
                    if _thing is not None:
                        try:
                            session.add(_thing)
                            session.commit()
                        except sqlalchemy.exc.IntegrityError:
                            session.rollback()
                            # BUG FIX: report the identifier of the completed
                            # future, not the most recently *prepared* _id.
                            L.error("Item already exists: %s", igsn)
                        working.pop(igsn)
                        total_completed += 1
                    else:
                        if working.get(igsn, 0) < 3:
                            if igsn not in working:
                                working[igsn] = 1
                            else:
                                working[igsn] += 1
                            L.info(
                                "Failed to retrieve %s. Retry = %s",
                                igsn,
                                working[igsn],
                            )
                            futures.append(executor.submit(wrapLoadThing, igsn, tc))
                        else:
                            L.error("Too many retries on %s", igsn)
                            working.pop(igsn)
            except concurrent.futures.TimeoutError:
                # No futures completed within the timeout; loop to refill work.
                pass
            if len(futures) == 0 and num_prepared == 0:
                more_work = False
            if total_completed >= max_count:
                more_work = False
            L.info(
                "requested, completed, current = %s, %s, %s",
                total_requested,
                total_completed,
                len(futures),
            )
Example #3
0
def oaiRecordToDict(xml_string):
    '''
    Converts an OAI-PMH IGSN metadata record to a dict

    The IGSN schema is at https://doidb.wdc-terra.org/igsn/schemas/igsn.org/schema/1.0/igsn.xsd

    Times are returned as timezone aware python datetime, TZ=UTC.

    Args:
        xml_string: OAI-PMH record XML in IGSN format

    Returns:
        dict or None on failure

    Example:

        .. jupyter-execute::

           import pprint
           import igsn_lib.oai

           xml = """<?xml version="1.0"?>
            <record xmlns="http://www.openarchives.org/OAI/2.0/"
                    xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
              <header>
                <identifier>oai:registry.igsn.org:6940929</identifier>
                <datestamp>2019-10-15T06:00:10Z</datestamp>
                <setSpec>IEDA</setSpec>
                <setSpec>IEDA.SESAR</setSpec>
              </header>
              <metadata>
                <sample xmlns="http://igsn.org/schema/kernel-v.1.0"
                        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
                        xsi:schemaLocation="http://igsn.org/schema/kernel-v.1.0 http://doidb.wdc-terra.org/igsn/schemas/igsn.org/schema/1.0/igsn.xsd">
                  <sampleNumber identifierType="igsn">10273/BSU0005JF</sampleNumber>
                  <registrant>
                    <registrantName>IEDA</registrantName>
                  </registrant>
                  <log>
                    <logElement event="submitted" timeStamp="2019-10-15T04:00:09Z"/>
                  </log>
                </sample>
              </metadata>
            </record>
           """
           data = igsn_lib.oai.oaiRecordToDict(xml)
           pprint.pprint(data, indent=2)
    '''
    _L = _getLogger()
    data = {
        "igsn_id": None,  # Value of the IGSN identifier
        "oai_id": None,  # Internal OAI-PMH identifier of this record
        "registrant": None,  # registrant name
        "oai_time": None,  # time stamp on the OAI record
        "igsn_time": None,  # submitted or registered time in the log
        "set_spec": [],  # list of setSpec entries for record
        "log": [],  # list of log entries
        "related": [],  # list of related identifiers
        "_source": {},
    }
    try:
        data["_source"] = xmltodict.parse(
            xml_string,
            process_namespaces=True,
            namespaces=IGSN_OAI_NAMESPACES,
        )
    except Exception as e:
        _L.error(e)
        return None
    _header = data["_source"]["oai:record"]["oai:header"]
    data["oai_id"] = _header["oai:identifier"]
    # Always store time in UTC.  FIX: RETURN_AS_TIMEZONE_AWARE so oai_time
    # honors the documented "timezone aware" contract like the log times do.
    data["oai_time"] = dateparser.parse(
        _header["oai:datestamp"],
        settings={"TIMEZONE": "+0000", "RETURN_AS_TIMEZONE_AWARE": True},
    )
    _sample = data["_source"]["oai:record"]["oai:metadata"]["igsn:sample"]
    igsn_id = _sample["igsn:sampleNumber"]["#text"]
    data["igsn_id"] = igsn_lib.normalize(igsn_id)
    data["registrant"] = _sample["igsn:registrant"]["igsn:registrantName"]
    # FIX: xmltodict yields a bare string when only one setSpec is present;
    # normalize to a list so set_spec is always "a list of setSpec entries".
    _set_spec = _header["oai:setSpec"]
    if isinstance(_set_spec, str):
        _set_spec = [
            _set_spec,
        ]
    data["set_spec"] = _set_spec
    # log 'events':
    #   https://doidb.wdc-terra.org//igsn/schemas/igsn.org/schema/1.0/include/igsn-eventType-v1.0.xsd
    igsn_log = _sample["igsn:log"]["igsn:logElement"]
    if isinstance(igsn_log, dict):
        # Single log element: xmltodict returns a dict, not a list.
        igsn_log = [
            igsn_log,
        ]
    data["log"] = []
    igsn_time = None
    for _log in igsn_log:
        _event = _log["@event"].lower().strip()
        _time = dateparser.parse(
            _log["@timeStamp"],
            settings={
                "TIMEZONE": "+0000",
                "RETURN_AS_TIMEZONE_AWARE": True
            },
        )
        data["log"].append({
            "event": _event,
            "time": _time.strftime(igsn_lib.time.JSON_TIME_FORMAT),
        })
        # Prefer "submitted" time; fall back to "registered", then "updated".
        if _event == "submitted":
            igsn_time = _time
        if _event == "registered":
            # Use registered time if submitted not available
            if igsn_time is None:
                igsn_time = _time
        if _event == "updated":
            # Fall back to updated time
            if igsn_time is None:
                igsn_time = _time
    data["igsn_time"] = igsn_time
    _related_ids = []
    try:
        _related_ids = _sample["igsn:relatedResourceIdentifiers"][
            "igsn:relatedIdentifier"]
        if isinstance(_related_ids, dict):
            # Single related identifier: normalize to a list.
            _related_ids = [
                _related_ids,
            ]
    except KeyError:
        _L.debug("No related identifiers in record")
    for related_id in _related_ids:
        entry = {}
        entry["id"] = related_id.get("#text", "")
        entry["id_type"] = related_id.get("@relatedIdentifierType", "")
        entry["rel_type"] = related_id.get("@relationType", "")
        data["related"].append(entry)
    return data
def reloadThing(thing):
    """Re-fetch *thing* from its origin source and return the fresh copy."""
    log = getLogger()
    log.debug("reloadThing id=%s", thing.id)
    normalized_id = igsn_lib.normalize(thing.id)
    return loadThing(normalized_id, thing.tcreated)
Example #5
0
def test_normalize(igsn_str, expected):
    """Check that igsn_lib.normalize maps igsn_str to the expected value."""
    assert igsn_lib.normalize(igsn_str) == expected
Example #6
0
def parse(ctx, igsn_str):
    """Print the normalized form of igsn_str to stdout; always returns 0."""
    normalized = igsn_lib.normalize(igsn_str)
    print(normalized)
    return 0