Example #1
0
def get_doc_prov(j, gcis_url, refList):
    """Generate PROV-ES JSON describing a GCIS person record.

    Builds a PROV-ES document containing a single agent for the person
    described by *j*, adds a delegation relationship for it, and returns
    the serialized document parsed back into a plain dict.
    """
    doc = ProvEsDocument()

    # Full name assembled from whichever name parts are present.
    name_fields = ('first_name', 'middle_name', 'last_name')
    name = " ".join(j[field] for field in name_fields
                    if j.get(field) is not None)

    attrs = [
        ("prov:type", 'gcis:Person'),
        ("prov:label", name),
        ("prov:location", "%s%s" % (gcis_url, j['uri'])),
        ("gcis:id", j['id']),
        ("gcis:orcid", j["orcid"]),
    ]
    doc.agent('bibo:%s' % j['id'], attrs)

    # NOTE(review): organization and activity are deliberately None here,
    # so the delegation id is derived from "<id>:None:None" — this matches
    # the original behavior; confirm it is intentional.
    del_id = GCIS["%s" % get_uuid("%s:%s:%s" % (j['id'], None, None))]
    doc.delegation('bibo:%s' % j['id'], None, None, del_id, None)

    return json.loads(doc.serialize())
Example #2
0
def fix_hadMember_ids(prov_es_json):
    """Fix the id's of hadMember relationships.

    Rewrites each hadMember relationship key of *prov_es_json* (in place)
    to a deterministic "hysds:<uuid>" id derived from the collection and
    entity the relationship links.
    """

    # BUGFIX: materialize the keys up front — in Python 3, dict.keys() is
    # a live view and deleting entries while iterating it raises
    # RuntimeError. Also avoid shadowing the builtin `id`.
    for old_id in list(prov_es_json.get('hadMember', {}).keys()):
        hm = copy.deepcopy(prov_es_json['hadMember'][old_id])
        new_id = "hysds:%s" % get_uuid("%s:%s" % (hm['prov:collection'], hm['prov:entity']))
        # BUGFIX: if the id is already in canonical form, re-adding then
        # deleting the same key would drop the entry — leave it alone.
        if new_id == old_id:
            continue
        prov_es_json['hadMember'][new_id] = hm
        del prov_es_json['hadMember'][old_id]
Example #3
0
def fix_hadMember_ids(prov_es_json):
    """Fix the id's of hadMember relationships.

    Rewrites each hadMember relationship key of *prov_es_json* (in place)
    to a deterministic "hysds:<uuid>" id derived from the collection and
    entity the relationship links.
    """

    # BUGFIX: materialize the keys up front — in Python 3, dict.keys() is
    # a live view and deleting entries while iterating it raises
    # RuntimeError. Also avoid shadowing the builtin `id`.
    for old_id in list(prov_es_json.get('hadMember', {}).keys()):
        hm = copy.deepcopy(prov_es_json['hadMember'][old_id])
        new_id = "hysds:%s" % get_uuid(
            "%s:%s" % (hm['prov:collection'], hm['prov:entity']))
        # BUGFIX: if the id is already in canonical form, re-adding then
        # deleting the same key would drop the entry — leave it alone.
        if new_id == old_id:
            continue
        prov_es_json['hadMember'][new_id] = hm
        del prov_es_json['hadMember'][old_id]
Example #4
0
def log_publish_prov_es(prov_es_info, prov_es_file, prod_path, pub_urls,
                        prod_metrics, objectid):
    """Log publish step in PROV-ES document.

    Builds a PROV-ES document recording publication of the local product
    at *prod_path* to its first published URL (*pub_urls[0]*), copies
    attributes over from the matching input entity in *prov_es_info*,
    and writes the resulting JSON to *prov_es_file*.
    """

    # create PROV-ES doc; reuse the namespace prefixes of the original doc
    doc = ProvEsDocument(namespaces=prov_es_info['prefix'])

    # get bundle
    #bndl = doc.bundle(bundle_id)
    bndl = None

    # add input entity: the product as it exists on this node's filesystem
    execute_node = socket.getfqdn()
    prod_url = "file://%s%s" % (execute_node, prod_path)
    input_id = "hysds:%s" % get_uuid(prod_url)
    input_ent = doc.granule(input_id,
                            None, [prod_url], [],
                            None,
                            None,
                            None,
                            label=os.path.basename(prod_url),
                            bundle=bndl)

    # add output entity: the first published location of the product
    output_id = "hysds:%s" % get_uuid(pub_urls[0])
    output_ent = doc.product(output_id,
                             None, [pub_urls[0]], [],
                             None,
                             None,
                             None,
                             label=objectid,
                             bundle=bndl)

    # software and algorithm: identify HySDS itself as the publisher
    algorithm = "eos:product_publishing"
    software_version = hysds.__version__
    software_title = "%s v%s" % (hysds.__description__, software_version)
    software = "eos:HySDS-%s" % software_version
    software_location = hysds.__url__
    doc.software(software, [algorithm],
                 software_version,
                 label=software_title,
                 location=software_location,
                 bundle=bndl)

    # create software agent for this publishing process
    pid = os.getpid()
    sa_label = "hysds:publish_dataset/%s/%d/%s" % (execute_node, pid,
                                                   prod_metrics['time_start'])
    sa_id = "hysds:%s" % get_uuid(sa_label)
    doc.softwareAgent(sa_id,
                      str(pid),
                      execute_node,
                      role="invoked",
                      label=sa_label,
                      bundle=bndl)

    # create processStep tying software, agent, input and output together
    job_id = "publish_dataset-%s" % os.path.basename(prod_path)
    doc.processStep("hysds:%s" % get_uuid(job_id),
                    prod_metrics['time_start'],
                    prod_metrics['time_end'], [software],
                    sa_id,
                    None, [input_id], [output_id],
                    label=job_id,
                    bundle=bndl,
                    prov_type="hysds:publish_dataset")

    # get json
    pd = json.loads(doc.serialize())

    # update input entity with any attributes already recorded for the
    # same entity in the original PROV-ES info
    orig_ent = prov_es_info.get('entity', {}).get(input_id, {})
    pd['entity'][input_id].update(orig_ent)

    # update output entity: carry over the same attributes, but keep the
    # output's own prov:location/prov:label/prov:type
    for attr in orig_ent:
        if attr in ('prov:location', 'prov:label', 'prov:type'):
            continue
        pd['entity'][output_id][attr] = orig_ent[attr]

    # write prov
    with open(prov_es_file, 'w') as f:
        json.dump(pd, f, indent=2)
Example #5
0
def log_prov_es(job, prov_es_info, prov_es_file):
    """Log PROV-ES document.

    Creates a temporary PROV-ES document to populate attributes that
    only the worker has access to (e.g. PID), merges those attributes
    into *prov_es_info*, and writes the merged document to
    *prov_es_file*.
    """

    # create PROV-ES doc to generate attributes that only verdi know
    ps_id = "hysds:%s" % get_uuid(job['job_id'])
    bundle_id = "hysds:%s" % get_uuid('bundle-%s' % job['job_id'])
    doc = ProvEsDocument()

    # get bundle
    #bndl = doc.bundle(bundle_id)
    bndl = None

    # create software agent identifying this wrapper process
    sa_label = "hysds:pge_wrapper/%s/%d/%s" % (job['job_info']['execute_node'],
                                               job['job_info']['pid'],
                                               datetime.utcnow().isoformat())
    sa_id = "hysds:%s" % get_uuid(sa_label)
    doc.softwareAgent(sa_id,
                      str(job['job_info']['pid']),
                      job['job_info']['execute_node'],
                      role=job.get('username', None),
                      label=sa_label,
                      bundle=bndl)

    # create processStep for the wrapped command
    doc.processStep(ps_id,
                    job['job_info']['cmd_start'],
                    job['job_info']['cmd_end'], [],
                    sa_id,
                    None, [], [],
                    bundle=bndl,
                    prov_type="hysds:%s" % job['type'])

    # get json
    pd = json.loads(doc.serialize())

    # update software agent and process step; the original document may
    # either wrap everything in a single bundle or be flat at top level
    if 'bundle' in prov_es_info:
        # only merge when there is exactly one bundle to merge into
        if len(prov_es_info['bundle']) == 1:
            bundle_id_orig = list(prov_es_info['bundle'].keys())[0]

            # update software agent
            prov_es_info['bundle'][bundle_id_orig].setdefault(
                'agent', {}).update(pd['bundle'][bundle_id]['agent'])

            # update wasAssociatedWith
            prov_es_info['bundle'][bundle_id_orig].setdefault(
                'wasAssociatedWith',
                {}).update(pd['bundle'][bundle_id]['wasAssociatedWith'])

            # update activity
            if 'activity' in prov_es_info['bundle'][bundle_id_orig]:
                if len(prov_es_info['bundle'][bundle_id_orig]
                       ['activity']) == 1:
                    # exactly one existing activity: enrich it in place
                    # with the worker-only attributes
                    ps_id_orig = list(prov_es_info['bundle'][bundle_id_orig]
                                      ['activity'].keys())[0]
                    prov_es_info['bundle'][bundle_id_orig]['activity'][
                        ps_id_orig]['prov:startTime'] = pd['bundle'][
                            bundle_id]['activity'][ps_id]['prov:startTime']
                    prov_es_info['bundle'][bundle_id_orig]['activity'][
                        ps_id_orig]['prov:endTime'] = pd['bundle'][bundle_id][
                            'activity'][ps_id]['prov:endTime']
                    prov_es_info['bundle'][bundle_id_orig]['activity'][
                        ps_id_orig]['hysds:job_id'] = job['job_id']
                    prov_es_info['bundle'][bundle_id_orig]['activity'][
                        ps_id_orig]['hysds:job_type'] = job['type']
                    prov_es_info['bundle'][bundle_id_orig]['activity'][
                        ps_id_orig]['hysds:job_url'] = job['job_info'][
                            'job_url']
                    prov_es_info['bundle'][bundle_id_orig]['activity'][
                        ps_id_orig]['hysds:mozart_url'] = app.conf.MOZART_URL
                    # only set prov:type if the original didn't declare one
                    if 'prov:type' not in prov_es_info['bundle'][
                            bundle_id_orig]['activity'][ps_id_orig]:
                        prov_es_info['bundle'][bundle_id_orig]['activity'][
                            ps_id_orig]['prov:type'] = pd['bundle'][bundle_id][
                                'activity'][ps_id]['prov:type']

                    # update wasAssociatedWith activity ids so our new
                    # associations point at the pre-existing activity id
                    for waw_id in prov_es_info['bundle'][bundle_id_orig][
                            'wasAssociatedWith']:
                        if prov_es_info['bundle'][bundle_id_orig][
                                'wasAssociatedWith'][waw_id][
                                    'prov:activity'] == ps_id:
                            prov_es_info['bundle'][bundle_id_orig][
                                'wasAssociatedWith'][waw_id][
                                    'prov:activity'] = ps_id_orig
                else:
                    # multiple activities: just add ours alongside them
                    prov_es_info['bundle'][bundle_id_orig]['activity'].update(
                        pd['bundle'][bundle_id]['activity'])
            else:
                # no activities yet: install ours wholesale
                prov_es_info['bundle'][bundle_id_orig]['activity'] = pd[
                    'bundle'][bundle_id]['activity']
    else:
        # flat (bundle-less) document: same merge logic at top level

        # update software agent
        prov_es_info.setdefault('agent', {}).update(pd['agent'])

        # update wasAssociatedWith
        prov_es_info.setdefault('wasAssociatedWith',
                                {}).update(pd['wasAssociatedWith'])

        # update process step
        if 'activity' in prov_es_info:
            if len(prov_es_info['activity']) == 1:
                ps_id_orig = list(prov_es_info['activity'].keys())[0]
                prov_es_info['activity'][ps_id_orig]['prov:startTime'] = pd[
                    'activity'][ps_id]['prov:startTime']
                prov_es_info['activity'][ps_id_orig]['prov:endTime'] = pd[
                    'activity'][ps_id]['prov:endTime']
                prov_es_info['activity'][ps_id_orig]['hysds:job_id'] = job[
                    'job_id']
                prov_es_info['activity'][ps_id_orig]['hysds:job_type'] = job[
                    'type']
                prov_es_info['activity'][ps_id_orig]['hysds:job_url'] = job[
                    'job_info']['job_url']
                prov_es_info['activity'][ps_id_orig][
                    'hysds:mozart_url'] = app.conf.MOZART_URL
                if 'prov:type' not in prov_es_info['activity'][ps_id_orig]:
                    prov_es_info['activity'][ps_id_orig]['prov:type'] = pd[
                        'activity'][ps_id]['prov:type']

                # update wasAssociatedWith activity ids
                for waw_id in prov_es_info['wasAssociatedWith']:
                    if prov_es_info['wasAssociatedWith'][waw_id][
                            'prov:activity'] == ps_id:
                        prov_es_info['wasAssociatedWith'][waw_id][
                            'prov:activity'] = ps_id_orig
            else:
                prov_es_info['activity'].update(pd['activity'])
        else:
            prov_es_info['activity'] = pd['activity']

    # write prov
    with open(prov_es_file, 'w') as f:
        json.dump(prov_es_info, f, indent=2)
Example #6
0
def create_prov_es_json(id, url, prod_dir, prov_file):
    """Create provenance JSON file.

    Builds a PROV-ES document describing the ingest of a COSMO-SkyMed
    (CSK) product — input granule at *url*, output granule at *prod_dir*,
    plus platform/instrument/collection context — and writes it as JSON
    to *prov_file*.
    """

    # get info: infer platform and data type from the first CSKS* filename
    csk_files = glob(os.path.join(prod_dir, "CSKS*"))
    pf = "CSKS?"
    dtype = "NA"
    repo_dir = "?"
    if len(csk_files) > 0:
        # CSK_RE is a module-level regex; assumed to capture
        # (platform, dtype) — TODO confirm against its definition
        match = CSK_RE.search(os.path.basename(csk_files[0]))
        if match: pf, dtype = match.groups()
        if dtype == "RAW":
            dtype = "RAW_B"
            repo_dir = "csk_rawb"
        elif dtype == "SCS":
            dtype = "SCS_B"
            repo_dir = "csk_scsb"
    # last character of the platform name is the satellite number
    platform = "eos:%s" % pf
    platform_title = "COSMO-SkyMed Satellite %s" % pf[-1]
    instrument = "eos:%s-SAR" % pf
    instrument_title = "%s-SAR" % pf
    level = "L0"
    version = "v1.0"
    collection = "eos:CSK-%s-%s" % (dtype, version)
    collection_shortname = "CSK-%s-%s" % (dtype, version)
    collection_label = "CSK %s Scenes %s" % (dtype, version)
    collection_loc = "https://aria-dav.jpl.nasa.gov/repository/products/%s/%s" % (
        repo_dir, version)
    sensor = "eos:SAR"
    sensor_title = "Synthetic-aperture radar (SAR)"
    gov_org = "eos:ASI"
    gov_org_title = "Agenzia Spaziale Italiana"
    software_version = "2.0.0_201604"
    software_title = "InSAR SCE (InSAR Scientific Computing Environment) v%s" % software_version
    software = "eos:ISCE-%s" % software_version
    software_location = "https://winsar.unavco.org/isce.html"
    algorithm = "eos:metadata_extraction"
    # turn the local product dir into a file:// URL on this host
    prod_dir = "file://%s%s" % (socket.getfqdn(), prod_dir)

    # put in fake start/end times so that prov:used and prov:generated
    # are properly created by the prov lib
    fake_time = datetime.utcnow().isoformat() + 'Z'
    job_id = "ingest-%s-%s" % (id, fake_time)
    bundle_id = "bundle-ingest-%s-%s" % (id, fake_time)

    doc = ProvEsDocument()
    #bndl = doc.bundle("hysds:%s" % get_uuid(bundle_id))
    bndl = None
    # input granule: the source URL of the product being ingested
    input_id = "hysds:%s" % get_uuid(url)
    input_ds = doc.granule(input_id,
                           None, [url], [instrument],
                           None,
                           level,
                           None,
                           label=os.path.basename(url),
                           bundle=bndl)
    doc.collection(collection,
                   None,
                   collection_shortname,
                   collection_label, [collection_loc], [instrument],
                   level,
                   version,
                   label=collection_label,
                   bundle=bndl)
    # output granule: the ingested product directory
    output_id = "hysds:%s" % get_uuid(prod_dir)
    output_ds = doc.granule(output_id,
                            None, [prod_dir], [instrument],
                            collection,
                            level,
                            version,
                            label=id,
                            bundle=bndl)
    # platform / instrument / sensor / organization context
    doc.governingOrganization(gov_org, label=gov_org_title, bundle=bndl)
    doc.platform(platform, [instrument], label=platform_title, bundle=bndl)
    doc.instrument(instrument,
                   platform, [sensor], [gov_org],
                   label=instrument_title,
                   bundle=bndl)
    doc.sensor(sensor, instrument, label=sensor_title, bundle=bndl)
    doc.software(software, [algorithm],
                 software_version,
                 label=software_title,
                 location=software_location,
                 bundle=bndl)
    # the ingest process step linking input to output
    doc.processStep("hysds:%s" % get_uuid(job_id),
                    fake_time,
                    fake_time, [software],
                    None,
                    None, [input_ds.identifier], [output_ds.identifier],
                    label=job_id,
                    bundle=bndl,
                    prov_type="hysds:ingest")

    with open(prov_file, 'w') as f:
        json.dump(json.loads(doc.serialize()), f, indent=2, sort_keys=True)
Example #7
0
def get_doc_prov(j, gcis_url):
    """Generate PROV-ES JSON from GCIS doc metadata.

    Builds a PROV-ES document linking a journal article *j* to its
    journal, its contributors (people and organizations), and the
    activity that generated it, then returns the serialized document
    parsed back into a dict.

    Parameters:
        j: GCIS article metadata dict; expects 'journal_identifier',
           'title', 'identifier', 'year', and optionally 'contributors'
           and 'doi'.
        gcis_url: base URL of the GCIS API.
    """

    # create doc
    gcis_ns = "http://data.globalchange.gov/gcis.owl#"
    doc = ProvEsDocument(namespaces={
        "gcis": gcis_ns,
        "bibo": "http://purl.org/ontology/bibo/"
    })
    bndl = None

    # fetch journal metadata from the GCIS API
    # NOTE(review): verify=False disables TLS certificate verification —
    # confirm this is intentional for this deployment.
    r = requests.get("%s/journal/%s.json" %
                     (gcis_url, j['journal_identifier']),
                     params={'all': 1},
                     verify=False)
    r.raise_for_status()
    journal_md = r.json()
    doc_attrs = [
        ("prov:type", 'gcis:Journal'),
        ("prov:label", j['title']),
    ]
    journal_id = GCIS[j['journal_identifier']]
    # journal_ids is a module-level cache so each journal entity is added
    # only once across calls
    if journal_id not in journal_ids:
        if journal_md.get('url', None) is not None:
            doc_attrs.append(("prov:location", journal_md['url']))
        if journal_md.get('online_issn', None) is not None:
            doc_attrs.append(("gcis:online_issn", journal_md['online_issn']))
        if journal_md.get('print_issn', None) is not None:
            doc_attrs.append(("gcis:print_issn", journal_md['print_issn']))
        doc.entity(journal_id, doc_attrs)
        journal_ids[journal_id] = True

    # create agents or organizations
    agent_ids = {}
    org_ids = {}
    for cont in j.get('contributors', []):
        # replace slashes because we get prov.model.ProvExceptionInvalidQualifiedName errors
        agent_id = GCIS["%s" % cont['uri'][1:].replace('/', '-')]

        # create person
        if len(cont['person']) > 0:
            # agent label from whichever name parts are present
            agent_name = " ".join([
                cont['person'][i]
                for i in ('first_name', 'middle_name', 'last_name')
                if cont['person'].get(i, None) is not None
            ])
            doc.agent(agent_id, [
                (PROV_TYPE, GCIS["Person"]),
                (PROV_LABEL, agent_name),
                (PROV_LOCATION, "%s%s" % (gcis_url, cont['uri'])),
            ])
            agent_ids[agent_id] = []

        # organization
        if cont['organization'] is not None and len(cont['organization']) > 0:
            org_id = GCIS["%s" % cont['organization']['identifier']]
            if org_id not in org_ids:
                doc.governingOrganization(org_id, cont['organization']['name'])
                org_ids[org_id] = True
            # remember the org so a worksAt delegation is created below
            if agent_id in agent_ids:
                agent_ids[agent_id].append(org_id)

    # create article
    article_id = 'bibo:%s' % j['identifier']
    doc_attrs = [
        ("prov:type", 'gcis:Article'),
        ("prov:label", j['title']),
        ("dcterms:isPartOf", journal_id),
    ]
    # BUGFIX: the original test was inverted (== "") — it appended
    # bibo:doi exactly when the DOI was empty or missing, raising
    # KeyError for articles without a 'doi' key and never recording a
    # real DOI. Only record the DOI when one is present.
    if j.get('doi', "") != "":
        doc_attrs.append(("bibo:doi", j['doi']))
    doc.entity(article_id, doc_attrs)

    # link article to its journal
    doc.hadMember(journal_id, article_id)

    # create activity; use the publication year as start/end when known
    if isinstance(j['year'], int):
        start_time = str(j['year'])
        end_time = str(j['year'])
    else:
        start_time = None
        end_time = None
    act_id = GCIS["generate-%s" % j['identifier'].replace('/', '-')]
    attrs = []
    for agent_id in agent_ids:
        waw_id = GCIS["%s" % get_uuid("%s:%s" % (act_id, agent_id))]
        doc.wasAssociatedWith(act_id, agent_id, None, waw_id,
                              {'prov:role': GCIS['Author']})
        for org_id in agent_ids[agent_id]:
            del_id = GCIS["%s" % get_uuid("%s:%s:%s" %
                                          (agent_id, org_id, act_id))]
            doc.delegation(agent_id, org_id, act_id, del_id,
                           {'prov:type': 'gcis:worksAt'})
    for org_id in org_ids:
        waw_id = GCIS["%s" % get_uuid("%s:%s" % (act_id, org_id))]
        doc.wasAssociatedWith(act_id, org_id, None, waw_id,
                              {'prov:role': GCIS['Contributor']})
    act = doc.activity(act_id, start_time, end_time, attrs)
    doc.wasGeneratedBy(article_id, act, end_time,
                       GCIS["%s" % get_uuid("%s:%s" % (article_id, act_id))])

    # serialize
    prov_json = json.loads(doc.serialize())

    return prov_json
Example #8
0
def create_prov_es_json(id, project, master_orbit_file, slave_orbit_file,
                        aria_dem_xml, aria_dem_file, work_dir, prov_file):
    """Create provenance JSON file.

    Builds a PROV-ES document for a Sentinel-1 interferogram run: input
    files (sentinel.ini, orbit files, zip products from the job context,
    DEM xml/file), the ISCE software, and the output interferogram
    granule, and writes the document as JSON to *prov_file*.
    """

    # get abs paths
    work_dir = os.path.abspath(work_dir)
    prod_dir = os.path.join(work_dir, id)

    # get context written by the job for this product
    ctx_file = os.path.join(prod_dir, "%s.context.json" % id)
    with open(ctx_file) as f:
        context = json.load(f)

    # put in fake start/end times so that prov:used and prov:generated
    # are properly created by the prov lib
    fake_time = datetime.utcnow().isoformat() + 'Z'
    job_id = "create_interferogram-%s" % fake_time
    bundle_id = "bundle-create_interferogram-%s" % fake_time

    # create PROV-ES doc
    doc = ProvEsDocument()
    #bndl = doc.bundle("hysds:%s" % get_uuid(bundle_id))
    bndl = None

    # input and output identifiers (used as sets; values are always True)
    input_ids = {}
    platform_ids = {}
    instrument_ids = {}

    # full url paths on this host
    work_url = "file://%s%s" % (socket.getfqdn(), work_dir)
    prod_url = "%s/%s" % (work_url, id)

    # add sentinel.ini file
    ini_ent = doc.file("hysds:%s" % get_uuid("%s/sentinel.ini" % work_url),
                       ["%s/sentinel.ini" % work_url],
                       label="sentinel.ini")
    input_ids[ini_ent.identifier] = True

    # add orbit files
    master_orbit_ent = doc.file(
        "hysds:%s" % get_uuid("%s/%s" % (work_url, master_orbit_file)),
        ["%s/%s" % (work_url, master_orbit_file)],
        label=os.path.basename(master_orbit_file))
    input_ids[master_orbit_ent.identifier] = True
    slave_orbit_ent = doc.file(
        "hysds:%s" % get_uuid("%s/%s" % (work_url, slave_orbit_file)),
        ["%s/%s" % (work_url, slave_orbit_file)],
        label=os.path.basename(slave_orbit_file))
    input_ids[slave_orbit_ent.identifier] = True

    # get list of S1A urls; register a product + platform + instrument
    # for each of the master/slave zip files found in the context
    level = "L0"
    version = "v1.0"
    sensor = "eos:SAR"
    sensor_title = "Synthetic-aperture radar (SAR)"
    gov_org = "eos:ESA"
    gov_org_title = "European Space Agency"
    doc.governingOrganization(gov_org, label=gov_org_title, bundle=bndl)
    instrument = ""
    for i, url in enumerate(
        [context.get('master_zip_url', ''),
         context.get('slave_zip_url', '')]):
        # PLATFORM_RE is a module-level regex; assumed to capture the
        # platform name from the zip URL — TODO confirm its definition
        match = PLATFORM_RE.search(url)
        if not match: continue
        pf = match.group(1)
        platform = "eos:%s" % pf
        platform_title = "Sentinel1A Satellite"
        instrument = "eos:%s-SAR" % pf
        instrument_title = "%s-SAR" % pf
        input_ds = doc.product("hysds:%s" % get_uuid(url),
                               None, [url], [instrument],
                               None,
                               level,
                               None,
                               label=os.path.basename(url),
                               bundle=bndl)
        input_ids[input_ds.identifier] = True
        # only declare each platform/instrument once
        if platform not in platform_ids:
            doc.platform(platform, [instrument],
                         label=platform_title,
                         bundle=bndl)
            platform_ids[platform] = True
        if instrument not in instrument_ids:
            doc.instrument(instrument,
                           platform, [sensor], [gov_org],
                           label=instrument_title,
                           bundle=bndl)
            doc.sensor(sensor, instrument, label=sensor_title, bundle=bndl)
            instrument_ids[instrument] = True

    # add dem xml, file and related provenance
    srtm_platform = "eos:SpaceShuttleEndeavour"
    srtm_platform_title = "USS Endeavour"
    srtm_instrument = "eos:SRTM"
    srtm_instrument_title = "Shuttle Radar Topography Mission (SRTM)"
    srtm_sensor = "eos:radar"
    srtm_sensor_title = "radar"
    srtm_gov_org = "eos:JPL"
    srtm_gov_org_title = "Jet Propulsion Laboratory"
    doc.governingOrganization(srtm_gov_org,
                              label=srtm_gov_org_title,
                              bundle=bndl)
    dem_xml_ent = doc.file("hysds:%s" % get_uuid("%s/%s" %
                                                 (work_url, aria_dem_xml)),
                           ["%s/%s" % (work_url, aria_dem_xml)],
                           label=os.path.basename(aria_dem_xml))
    input_ids[dem_xml_ent.identifier] = True
    dem_file_ent = doc.file("hysds:%s" % get_uuid("%s/%s" %
                                                  (work_url, aria_dem_file)),
                            ["%s/%s" % (work_url, aria_dem_file)],
                            label=os.path.basename(aria_dem_file))
    input_ids[dem_file_ent.identifier] = True
    doc.platform(srtm_platform, [srtm_instrument],
                 label=srtm_platform_title,
                 bundle=bndl)
    doc.instrument(srtm_instrument,
                   srtm_platform, [srtm_sensor], [srtm_gov_org],
                   label=srtm_instrument_title,
                   bundle=bndl)
    doc.sensor(srtm_sensor,
               srtm_instrument,
               label=srtm_sensor_title,
               bundle=bndl)
    instrument_ids[srtm_instrument] = True

    # software and algorithm
    algorithm = "eos:interferogram_generation"
    software_version = "2.0.0_201604"
    software_title = "InSAR SCE (InSAR Scientific Computing Environment) v%s" % software_version
    software = "eos:ISCE-%s" % software_version
    software_location = "https://winsar.unavco.org/isce.html"
    doc.software(software, [algorithm],
                 software_version,
                 label=software_title,
                 location=software_location,
                 bundle=bndl)

    # output interferogram collection and granule
    # NOTE(review): dict .keys() views are passed below — confirm
    # ProvEsDocument accepts views as well as lists.
    int_level = "L2"
    int_version = "v1.0"
    int_collection = "eos:S1A-interferograms-%s" % int_version
    int_collection_shortname = "S1A-interferograms-%s" % int_version
    int_collection_label = "ISCE generated S1A interferograms %s" % int_version
    int_collection_loc = "https://aria-dst-dav.jpl.nasa.gov/products/s1a_ifg/%s" % int_version
    doc.collection(int_collection,
                   None,
                   int_collection_shortname,
                   int_collection_label, [int_collection_loc],
                   instrument_ids.keys(),
                   int_level,
                   int_version,
                   label=int_collection_label,
                   bundle=bndl)
    output_ds = doc.granule("hysds:%s" % get_uuid(prod_url),
                            None, [prod_url],
                            instrument_ids.keys(),
                            int_collection,
                            int_level,
                            int_version,
                            label=id,
                            bundle=bndl)

    # runtime context
    rt_ctx_id = "hysds:runtimeContext-sentinel_ifg-%s" % project
    doc.runtimeContext(rt_ctx_id, [project], label=project, bundle=bndl)

    # create process step linking all inputs to the output granule
    doc.processStep("hysds:%s" % get_uuid(job_id),
                    fake_time,
                    fake_time, [software],
                    None,
                    rt_ctx_id,
                    input_ids.keys(), [output_ds.identifier],
                    label=job_id,
                    bundle=bndl,
                    prov_type="hysds:create_interferogram")

    # write
    with open(prov_file, 'w') as f:
        json.dump(json.loads(doc.serialize()), f, indent=2, sort_keys=True)
def get_doc_prov(j, gcis_url):
    """Generate PROV-ES JSON from GCIS doc metadata.

    Builds a PROV-ES document linking a journal article *j* to its
    journal, its contributors (people and organizations), and the
    activity that generated it, then returns the serialized document
    parsed back into a dict.

    Parameters:
        j: GCIS article metadata dict; expects 'journal_identifier',
           'title', 'identifier', 'year', and optionally 'contributors'
           and 'doi'.
        gcis_url: base URL of the GCIS API.
    """

    # create doc
    gcis_ns = "http://data.globalchange.gov/gcis.owl#"
    doc = ProvEsDocument(namespaces={ "gcis": gcis_ns, "bibo": "http://purl.org/ontology/bibo/" })
    bndl = None

    # fetch journal metadata from the GCIS API
    # NOTE(review): verify=False disables TLS certificate verification —
    # confirm this is intentional for this deployment.
    r = requests.get("%s/journal/%s.json" % (gcis_url, j['journal_identifier']), params={ 'all': 1 }, verify=False)
    r.raise_for_status()
    journal_md = r.json()
    doc_attrs = [
        ( "prov:type", 'gcis:Journal' ),
        ( "prov:label", j['title'] ),
    ]
    journal_id = GCIS[j['journal_identifier']]
    # journal_ids is a module-level cache so each journal entity is added
    # only once across calls
    if journal_id not in journal_ids:
        if journal_md.get('url', None) is not None:
            doc_attrs.append( ("prov:location", journal_md['url'] ) )
        if journal_md.get('online_issn', None) is not None:
            doc_attrs.append( ("gcis:online_issn", journal_md['online_issn'] ) )
        if journal_md.get('print_issn', None) is not None:
            doc_attrs.append( ("gcis:print_issn", journal_md['print_issn'] ) )
        doc.entity(journal_id, doc_attrs)
        journal_ids[journal_id] = True

    # create agents or organizations
    agent_ids = {}
    org_ids = {}
    for cont in j.get('contributors', []):
        # replace slashes because we get prov.model.ProvExceptionInvalidQualifiedName errors
        agent_id = GCIS["%s" % cont['uri'][1:].replace('/', '-')]

        # create person
        if len(cont['person']) > 0:
            # agent label from whichever name parts are present
            agent_name  = " ".join([cont['person'][i] for i in
                                    ('first_name', 'middle_name', 'last_name')
                                    if cont['person'].get(i, None) is not None])
            doc.agent(agent_id, [
                ( PROV_TYPE, GCIS["Person"] ),
                ( PROV_LABEL, agent_name ),
                ( PROV_LOCATION, "%s%s" % (gcis_url, cont['uri']) ),
            ])
            agent_ids[agent_id] = []

        # organization
        if cont['organization'] is not None and len(cont['organization']) > 0:
            org_id = GCIS["%s" % cont['organization']['identifier']]
            if org_id not in org_ids:
                doc.governingOrganization(org_id, cont['organization']['name'])
                org_ids[org_id] = True
            # remember the org so a worksAt delegation is created below
            if agent_id in agent_ids: agent_ids[agent_id].append(org_id)

    # create article
    article_id = 'bibo:%s' % j['identifier']
    doc_attrs = [
        ( "prov:type", 'gcis:Article' ),
        ( "prov:label", j['title'] ),
        ( "dcterms:isPartOf", journal_id ),
    ]
    # BUGFIX: the original test was inverted (== "") — it appended
    # bibo:doi exactly when the DOI was empty or missing, raising
    # KeyError for articles without a 'doi' key and never recording a
    # real DOI. Only record the DOI when one is present.
    if j.get('doi', "") != "":
        doc_attrs.append( ("bibo:doi", j['doi'] ) )
    doc.entity(article_id, doc_attrs)

    # link article to its journal
    doc.hadMember(journal_id, article_id)

    # create activity; use the publication year as start/end when known
    if isinstance(j['year'], int):
        start_time = str(j['year'])
        end_time = str(j['year'])
    else:
        start_time = None
        end_time = None
    act_id = GCIS["generate-%s" % j['identifier'].replace('/', '-')]
    attrs = []
    for agent_id in agent_ids:
        waw_id = GCIS["%s" % get_uuid("%s:%s" % (act_id, agent_id))]
        doc.wasAssociatedWith(act_id, agent_id, None, waw_id, {'prov:role': GCIS['Author']})
        for org_id in agent_ids[agent_id]:
            del_id = GCIS["%s" % get_uuid("%s:%s:%s" % (agent_id, org_id, act_id))]
            doc.delegation(agent_id, org_id, act_id, del_id, {'prov:type': 'gcis:worksAt'})
    for org_id in org_ids:
        waw_id = GCIS["%s" % get_uuid("%s:%s" % (act_id, org_id))]
        doc.wasAssociatedWith(act_id, org_id, None, waw_id, {'prov:role': GCIS['Contributor']})
    act = doc.activity(act_id, start_time, end_time, attrs)
    doc.wasGeneratedBy(article_id, act, end_time, GCIS["%s" % get_uuid("%s:%s" % (article_id, act_id))])

    # serialize
    prov_json = json.loads(doc.serialize())

    return prov_json
Example #10
0
def log_prov_es(job, prov_es_info, prov_es_file):
    """Log PROV-ES document. Create temp PROV-ES document to populate
    attributes that only the worker has access to (e.g. PID).

    Builds a small temporary PROV-ES doc containing the worker-side software
    agent and process step, then merges those runtime attributes (agent,
    wasAssociatedWith, activity start/end times, job id/type/url, mozart url)
    into the PGE-produced document `prov_es_info`, and writes the merged
    result to `prov_es_file`.

    Args:
        job: HySDS job dict; reads job["job_id"], job["type"],
            job.get("username") and job["job_info"] (execute_node, pid,
            cmd_start, cmd_end, job_url).
        prov_es_info: PROV-ES JSON dict produced by the PGE; mutated in place.
        prov_es_file: path the merged PROV-ES JSON is written to.
    """

    # create PROV-ES doc to generate attributes that only the worker (verdi) knows
    ps_id = "hysds:%s" % get_uuid(job["job_id"])
    bundle_id = "hysds:%s" % get_uuid("bundle-%s" % job["job_id"])
    doc = ProvEsDocument()

    # get bundle
    # bndl = doc.bundle(bundle_id)
    bndl = None

    # create software agent identifying this pge_wrapper run (node/pid/time)
    sa_label = "hysds:pge_wrapper/%s/%d/%s" % (
        job["job_info"]["execute_node"],
        job["job_info"]["pid"],
        datetime.utcnow().isoformat(),
    )
    sa_id = "hysds:%s" % get_uuid(sa_label)
    doc.softwareAgent(
        sa_id,
        str(job["job_info"]["pid"]),
        job["job_info"]["execute_node"],
        role=job.get("username", None),
        label=sa_label,
        bundle=bndl,
    )

    # create processStep carrying the actual command start/end times
    doc.processStep(
        ps_id,
        job["job_info"]["cmd_start"],
        job["job_info"]["cmd_end"],
        [],
        sa_id,
        None,
        [],
        [],
        bundle=bndl,
        prov_type="hysds:%s" % job["type"],
    )

    # get json
    pd = json.loads(doc.serialize())

    # update software agent and process step
    # NOTE(review): the bundle branch indexes pd["bundle"][bundle_id] even
    # though the temp doc was built with bundle=None -- confirm the serializer
    # still emits a bundle keyed by bundle_id in that case.
    if "bundle" in prov_es_info:
        # merge only when the PGE document has exactly one bundle
        if len(prov_es_info["bundle"]) == 1:
            bundle_id_orig = list(prov_es_info["bundle"].keys())[0]

            # update software agent
            prov_es_info["bundle"][bundle_id_orig].setdefault(
                "agent", {}).update(pd["bundle"][bundle_id]["agent"])

            # update wasAssociatedWith
            prov_es_info["bundle"][bundle_id_orig].setdefault(
                "wasAssociatedWith",
                {}).update(pd["bundle"][bundle_id]["wasAssociatedWith"])

            # update activity: if the PGE doc has exactly one activity, graft
            # the worker's timing/job attributes onto it; otherwise just merge
            if "activity" in prov_es_info["bundle"][bundle_id_orig]:
                if len(prov_es_info["bundle"][bundle_id_orig]
                       ["activity"]) == 1:
                    ps_id_orig = list(prov_es_info["bundle"][bundle_id_orig]
                                      ["activity"].keys())[0]
                    prov_es_info["bundle"][bundle_id_orig]["activity"][
                        ps_id_orig]["prov:startTime"] = pd["bundle"][
                            bundle_id]["activity"][ps_id]["prov:startTime"]
                    prov_es_info["bundle"][bundle_id_orig]["activity"][
                        ps_id_orig]["prov:endTime"] = pd["bundle"][bundle_id][
                            "activity"][ps_id]["prov:endTime"]
                    prov_es_info["bundle"][bundle_id_orig]["activity"][
                        ps_id_orig]["hysds:job_id"] = job["job_id"]
                    prov_es_info["bundle"][bundle_id_orig]["activity"][
                        ps_id_orig]["hysds:job_type"] = job["type"]
                    prov_es_info["bundle"][bundle_id_orig]["activity"][
                        ps_id_orig]["hysds:job_url"] = job["job_info"][
                            "job_url"]
                    prov_es_info["bundle"][bundle_id_orig]["activity"][
                        ps_id_orig]["hysds:mozart_url"] = app.conf.MOZART_URL
                    # only fill in prov:type if the PGE didn't set one
                    if ("prov:type" not in prov_es_info["bundle"]
                        [bundle_id_orig]["activity"][ps_id_orig]):
                        prov_es_info["bundle"][bundle_id_orig]["activity"][
                            ps_id_orig]["prov:type"] = pd["bundle"][bundle_id][
                                "activity"][ps_id]["prov:type"]

                    # update wasAssociatedWith activity ids: repoint the
                    # worker's associations at the PGE's activity id
                    for waw_id in prov_es_info["bundle"][bundle_id_orig][
                            "wasAssociatedWith"]:
                        if (prov_es_info["bundle"][bundle_id_orig]
                            ["wasAssociatedWith"][waw_id]["prov:activity"] ==
                                ps_id):
                            prov_es_info["bundle"][bundle_id_orig][
                                "wasAssociatedWith"][waw_id][
                                    "prov:activity"] = ps_id_orig
                else:
                    prov_es_info["bundle"][bundle_id_orig]["activity"].update(
                        pd["bundle"][bundle_id]["activity"])
            else:
                prov_es_info["bundle"][bundle_id_orig]["activity"] = pd[
                    "bundle"][bundle_id]["activity"]
    else:
        # no bundle: same merge logic at the document's top level

        # update software agent
        prov_es_info.setdefault("agent", {}).update(pd["agent"])

        # update wasAssociatedWith
        prov_es_info.setdefault("wasAssociatedWith",
                                {}).update(pd["wasAssociatedWith"])

        # update process step
        if "activity" in prov_es_info:
            if len(prov_es_info["activity"]) == 1:
                ps_id_orig = list(prov_es_info["activity"].keys())[0]
                prov_es_info["activity"][ps_id_orig]["prov:startTime"] = pd[
                    "activity"][ps_id]["prov:startTime"]
                prov_es_info["activity"][ps_id_orig]["prov:endTime"] = pd[
                    "activity"][ps_id]["prov:endTime"]
                prov_es_info["activity"][ps_id_orig]["hysds:job_id"] = job[
                    "job_id"]
                prov_es_info["activity"][ps_id_orig]["hysds:job_type"] = job[
                    "type"]
                prov_es_info["activity"][ps_id_orig]["hysds:job_url"] = job[
                    "job_info"]["job_url"]
                prov_es_info["activity"][ps_id_orig][
                    "hysds:mozart_url"] = app.conf.MOZART_URL
                if "prov:type" not in prov_es_info["activity"][ps_id_orig]:
                    prov_es_info["activity"][ps_id_orig]["prov:type"] = pd[
                        "activity"][ps_id]["prov:type"]

                # update wasAssociatedWith activity ids
                for waw_id in prov_es_info["wasAssociatedWith"]:
                    if (prov_es_info["wasAssociatedWith"][waw_id]
                        ["prov:activity"] == ps_id):
                        prov_es_info["wasAssociatedWith"][waw_id][
                            "prov:activity"] = ps_id_orig
            else:
                prov_es_info["activity"].update(pd["activity"])
        else:
            prov_es_info["activity"] = pd["activity"]

    # write merged prov
    with open(prov_es_file, "w") as f:
        json.dump(prov_es_info, f, indent=2)
Example #11
0
def get_doc_prov(j, gcis_url, refList, personList, reportList, webpageList):
    """Generate PROV-ES JSON from GCIS dataset metadata.

    Fetches the full dataset record from ``j['href']``, creates a dataset
    entity, person agents and governing organizations for its contributors,
    links them through a single ``generate-*`` activity, and records
    ``wasDerivedFrom`` relations for publications that cite the dataset.

    Args:
        j: GCIS dataset summary dict (needs 'href', 'identifier', 'name',
            'uri').
        gcis_url: base URL of the GCIS instance.
        refList, personList, reportList, webpageList: unused here; kept for
            signature compatibility with the other get_doc_prov variants.

    Returns:
        PROV-ES document as a JSON-serializable dict.
    """
    doc = ProvEsDocument()

    # dataset entity
    dataset = requests.get(j['href']).json()
    datasetID = 'bibo:%s' % j['identifier']
    doc_attrs = [
        ("prov:type", 'gcis:Dataset'),
        ("prov:label", j['name']),
        ("prov:location", "%s%s" % (gcis_url, j['uri'])),
    ]
    doc.entity(datasetID, doc_attrs)

    # contributors: person agent id -> list of affiliated organization ids
    agent_ids = {}
    # organization id -> True (used as an ordered set)
    org_ids = {}
    if 'contributors' in dataset:
        for contributor in dataset['contributors']:
            personID = None
            if contributor['person_uri'] is not None:
                # full name from whichever name parts are present
                name = " ".join([contributor['person'][i]
                    for i in ('first_name', 'middle_name', 'last_name')
                    if contributor['person'].get(i, None) is not None])
                personAttrs = [
                        ("prov:type", 'gcis:Person'),
                        ("prov:label", "%s" % name),
                        ("prov:location", "%s%s" % (gcis_url, contributor['person_uri'])),
                        ("gcis:id", str(contributor['person_id'])),
                        ("gcis:orcid", contributor['person']['orcid'])
                        ]
                personID = 'bibo:%s' % contributor['person_id']
                agent_ids[personID] = []
                doc.agent(personID, personAttrs)
            if contributor['organization'] is not None:
                orgID = 'bibo:%s' % contributor['organization']['identifier']
                doc.governingOrganization(orgID, contributor['organization']['name'])
                org_ids[orgID] = True
                # affiliate the person (if any) with this organization
                if personID in agent_ids:
                    agent_ids[personID].append(orgID)

    # create activity linking the dataset to its contributors
    if dataset['start_time'] is not None:
        start_time = str(dataset['start_time'])
    else:
        start_time = ""
    if dataset['end_time'] is not None:
        end_time = str(dataset['end_time'])
    else:
        end_time = ""
    act_id = GCIS["generate-%s" % j['identifier'].replace('/', '-')]
    attrs = []
    for agent_id in agent_ids:
        waw_id = GCIS["%s" % get_uuid("%s:%s" % (act_id, agent_id))]
        doc.wasAssociatedWith(act_id, agent_id, None, waw_id, {'prov:role': GCIS['Author']})
        for org_id in agent_ids[agent_id]:
            del_id = GCIS["%s" % get_uuid("%s:%s:%s" % (agent_id, org_id, act_id))]
            doc.delegation(agent_id, org_id, act_id, del_id, {'prov:type': 'gcis:worksAt'})
    for org_id in org_ids:
        waw_id = GCIS["%s" % get_uuid("%s:%s" % (act_id, org_id))]
        doc.wasAssociatedWith(act_id, org_id, None, waw_id, {'prov:role': GCIS['Contributor']})
    act = doc.activity(act_id, start_time, end_time, attrs)
    doc.wasGeneratedBy(datasetID, act, end_time, GCIS["%s" % get_uuid("%s:%s" % (datasetID, act_id))])

    # TODO: aliases
    # TODO: instrument measurements

    # cited by: link publications that cite this dataset
    if 'cited_by' in dataset:
        for citation in dataset['cited_by']:
            if 'publication' in citation:
                # publication uri looks like "/<type>/<identifier>"
                itemType = citation['publication'].split("/")[1]

                # NOTE(review): `dump_dir` is not defined in this function;
                # presumably a module-level global -- confirm.
                itemList = get_itemList(dump_dir, itemType)
                # single scan instead of the former any() + next() double scan
                item = next((obj for obj in itemList
                             if obj['uri'] == citation['publication']), None)
                if item is not None:
                    item_id = 'bibo:%s' % item['identifier']
                    doc.wasDerivedFrom(item_id, datasetID)

    prov_json = json.loads(doc.serialize())

    return prov_json
Example #12
0
def create_prov_es_json(ctx_file, id, prod_dir, prov_file):
    """Create provenance JSON file.

    Builds a PROV-ES document for the ingest of a Sentinel-1 product: the
    input granule, its collection, the output granule, the platform /
    instrument / sensor / governing organization, the ISCE software, and an
    ingest process step. The serialized document is written to `prov_file`.

    Args:
        ctx_file: path to the job context JSON (reads 'file' and
            'localize_urls').
        id: product identifier used for labels and the job id.
        prod_dir: product directory (made absolute, then turned into a
            file:// URL).
        prov_file: output path for the PROV-ES JSON.
    """
    prod_dir = os.path.abspath(prod_dir)

    with open(ctx_file) as ctx_in:
        context = json.load(ctx_in)

    # mission character (e.g. "A"/"B") parsed from the context's file name
    mission = MISSION_RE.search(context.get('file')).group(1)
    mission_lc = mission.lower()

    # first localized url is the input product
    input_url = context.get('localize_urls', [{'url': None}])[0]['url']

    # derive product family / data type / repo dir from the first tiff file
    pf = f"S1{mission}"
    dtype = "NA"
    repo_dir = "?"
    s1_files = glob(os.path.join(prod_dir, f"s1{mission_lc}-*.tiff"))
    if len(s1_files) > 0:
        m = S1_RE.search(os.path.basename(s1_files[0]))
        if m:
            pf, swathnum, dtype = m.groups()
        if dtype == "raw":
            dtype, repo_dir = "RAW", f"s1{mission_lc}_raw"
        elif dtype == "slc":
            dtype, repo_dir = "SLC", f"s1{mission_lc}_slc"

    # identifiers and labels for the PROV-ES entities
    platform = f"eos:{pf}"
    platform_title = f"Sentinel1{mission} Satellite"
    instrument = f"eos:{pf}-SAR"
    instrument_title = f"{pf}-SAR"
    level = "L0"
    version = "v1.0"
    collection = f"eos:S1{mission}-{dtype}-{version}"
    collection_shortname = f"S1{mission}-{dtype}-{version}"
    collection_label = f"S1{mission} {dtype} Scenes {version}"
    collection_loc = ("https://aria-dst-dav.jpl.nasa.gov/repository/products/"
                      f"{repo_dir}/{version}")
    sensor = "eos:SAR"
    sensor_title = "Synthetic-aperture radar (SAR)"
    gov_org = "eos:ESA"
    gov_org_title = "European Space Agency"
    software_version = "2.0.0_201604"
    software_title = (f"InSAR SCE (InSAR Scientific Computing Environment) "
                      f"v{software_version}")
    software = f"eos:ISCE-{software_version}"
    software_location = "https://winsar.unavco.org/isce.html"
    algorithm = "eos:metadata_extraction"
    prod_dir = f"file://{socket.getfqdn()}{prod_dir}"

    # put in fake start/end times so that prov:used and prov:generated
    # are properly created by the prov lib
    fake_time = datetime.utcnow().isoformat() + 'Z'
    job_id = f"ingest-{id}-{fake_time}"
    bundle_id = f"bundle-ingest-{id}-{fake_time}"

    doc = ProvEsDocument()
    #bndl = doc.bundle("hysds:%s" % get_uuid(bundle_id))
    bndl = None

    # input granule (the localized source product)
    input_id = f"hysds:{get_uuid(input_url)}"
    input_ds = doc.granule(input_id, None, [input_url], [instrument], None,
                           level, None, label=os.path.basename(input_url),
                           bundle=bndl)

    # collection the output granule belongs to
    doc.collection(collection, None, collection_shortname, collection_label,
                   [collection_loc], [instrument], level, version,
                   label=collection_label, bundle=bndl)

    # output granule (the ingested product directory)
    output_id = f"hysds:{get_uuid(prod_dir)}"
    output_ds = doc.granule(output_id, None, [prod_dir], [instrument],
                            collection, level, version, label=id, bundle=bndl)

    # platform / instrument / sensor / org / software entities
    doc.governingOrganization(gov_org, label=gov_org_title, bundle=bndl)
    doc.platform(platform, [instrument], label=platform_title, bundle=bndl)
    doc.instrument(instrument, platform, [sensor], [gov_org],
                   label=instrument_title, bundle=bndl)
    doc.sensor(sensor, instrument, label=sensor_title, bundle=bndl)
    doc.software(software, [algorithm], software_version,
                 label=software_title, location=software_location,
                 bundle=bndl)

    # ingest process step tying input to output
    doc.processStep(f"hysds:{get_uuid(job_id)}", fake_time, fake_time,
                    [software], None, None, [input_ds.identifier],
                    [output_ds.identifier], label=job_id, bundle=bndl,
                    prov_type="hysds:ingest")

    with open(prov_file, 'w') as out:
        json.dump(json.loads(doc.serialize()), out, indent=2, sort_keys=True)
Example #13
0
def create_prov_es_json(id, netsel_file, jobdesc_file, project, aria_dem_xml,
                        aria_dem_file, prod_dir, work_dir, prov_file):
    """Create provenance JSON file.

    Builds a PROV-ES document for interferogram creation: network-selector
    and job-description input files, the CSK input granules listed in the
    network selector, the SRTM DEM inputs, the ISCE software, the output
    interferogram granule, and the process step linking them. The serialized
    document is written to `prov_file`.

    Args:
        id: output product identifier (used as the output granule label).
        netsel_file: network selector file (relative to `work_dir`).
        jobdesc_file: job description file (relative to `work_dir`).
        project: project name used for the runtime context.
        aria_dem_xml: DEM XML file (relative to `work_dir`).
        aria_dem_file: DEM data file (relative to `work_dir`).
        prod_dir: product directory name under `work_dir`.
        work_dir: working directory on this host.
        prov_file: output path for the PROV-ES JSON.
    """

    # put in fake start/end times so that prov:used and prov:generated
    # are properly created by the prov lib
    fake_time = datetime.utcnow().isoformat() + 'Z'
    job_id = "create_interferogram-%s" % fake_time
    bundle_id = "bundle-create_interferogram-%s" % fake_time

    # create PROV-ES doc
    doc = ProvEsDocument()
    #bndl = doc.bundle("hysds:%s" % get_uuid(bundle_id))
    bndl = None

    # input and output identifiers (dicts used as ordered sets)
    input_ids = {}
    platform_ids = {}
    instrument_ids = {}

    # full url paths
    work_url = "file://%s%s" % (socket.getfqdn(), work_dir)
    prod_url = "%s/%s" % (work_url, prod_dir)

    # add network selector file
    #netsel_ent = bndl.entity("hysds:%s" % get_uuid("%s/%s" % (work_url, netsel_file)),
    netsel_ent = doc.file("hysds:%s" % get_uuid("%s/%s" % (work_url, netsel_file)),
                          ["%s/%s" % (work_url, netsel_file)],
                          label=os.path.basename(netsel_file))
    input_ids[netsel_ent.identifier] = True

    # add job description file
    #jobdesc_ent = bndl.entity("hysds:%s" % get_uuid("%s/%s" % (work_url, jobdesc_file)),
    jobdesc_ent = doc.file("hysds:%s" % get_uuid("%s/%s" % (work_url, jobdesc_file)),
                           ["%s/%s" % (work_url, jobdesc_file)],
                           label=os.path.basename(jobdesc_file))
    input_ids[jobdesc_ent.identifier] = True

    # get list of CSK urls; register each as an input granule and record its
    # platform/instrument (deduplicated via platform_ids/instrument_ids)
    level = "L0"
    version = "v1.0"
    sensor = "eos:SAR"
    sensor_title = "Synthetic-aperture radar (SAR)"
    gov_org = "eos:ASI"
    gov_org_title = "Agenzia Spaziale Italiana"
    doc.governingOrganization(gov_org, label=gov_org_title, bundle=bndl)
    instrument = ""
    for i, url in enumerate(get_netsel_urls(netsel_file)):
        match = PLATFORM_RE.search(url)
        if not match: continue
        pf = match.group(1)
        platform = "eos:%s" % pf
        platform_title = "COSMO-SkyMed Satellite %s" % pf[-1]
        instrument = "eos:%s-SAR" % pf
        instrument_title = "%s-SAR" % pf
        input_ds = doc.product("hysds:%s" % get_uuid(url), None,
                               [url], [instrument], None, level, version,
                               label=os.path.basename(url), bundle=bndl)
        input_ids[input_ds.identifier] = True
        if platform not in platform_ids:
            doc.platform(platform, [instrument], label=platform_title,
                         bundle=bndl)
            platform_ids[platform] = True
        if instrument not in instrument_ids:
            doc.instrument(instrument, platform, [sensor], [gov_org],
                           label=instrument_title, bundle=bndl)
            doc.sensor(sensor, instrument, label=sensor_title, bundle=bndl)
            instrument_ids[instrument] = True

    # add dem xml, file and related provenance (SRTM mission entities)
    srtm_platform = "eos:SpaceShuttleEndeavour"
    srtm_platform_title = "USS Endeavour"
    srtm_instrument = "eos:SRTM"
    srtm_instrument_title = "Shuttle Radar Topography Mission (SRTM)"
    srtm_sensor = "eos:radar"
    srtm_sensor_title = "radar"
    srtm_gov_org = "eos:JPL"
    srtm_gov_org_title = "Jet Propulsion Laboratory"
    doc.governingOrganization(srtm_gov_org, label=srtm_gov_org_title, bundle=bndl)
    #dem_xml_ent = bndl.entity("hysds:%s" % get_uuid("%s/%s" % (work_url, aria_dem_xml)),
    dem_xml_ent = doc.file("hysds:%s" % get_uuid("%s/%s" % (work_url, aria_dem_xml)),
                           ["%s/%s" % (work_url, aria_dem_xml)],
                           label=os.path.basename(aria_dem_xml))
    input_ids[dem_xml_ent.identifier] = True
    #dem_file_ent = bndl.entity("hysds:%s" % get_uuid("%s/%s" % (work_url, aria_dem_file)),
    dem_file_ent = doc.file("hysds:%s" % get_uuid("%s/%s" % (work_url, aria_dem_file)),
                            ["%s/%s" % (work_url, aria_dem_file)],
                            label=os.path.basename(aria_dem_file))
    input_ids[dem_file_ent.identifier] = True
    doc.platform(srtm_platform, [srtm_instrument], label=srtm_platform_title,
                 bundle=bndl)
    doc.instrument(srtm_instrument, srtm_platform, [srtm_sensor], [srtm_gov_org],
                   label=srtm_instrument_title, bundle=bndl)
    doc.sensor(srtm_sensor, srtm_instrument, label=srtm_sensor_title, bundle=bndl)
    instrument_ids[srtm_instrument] = True

    # software and algorithm
    algorithm = "eos:interferogram_generation"
    software_version = "2.0.0_201604"
    software_title = "InSAR SCE (InSAR Scientific Computing Environment) v%s" % software_version
    software = "eos:ISCE-%s" % software_version
    software_location = "https://winsar.unavco.org/isce.html"
    doc.software(software, [algorithm], software_version, label=software_title,
                 location=software_location, bundle=bndl)

    # output granule and its collection
    int_level = "L2"
    int_version = "v1.0"
    int_collection = "eos:CSK-interferograms-%s" % int_version
    int_collection_shortname = "CSK-interferograms-%s" % int_version
    int_collection_label = "ISCE generated CSK interferograms %s" % int_version
    int_collection_loc = "https://aria-dav.jpl.nasa.gov/repository/products/interferogram/%s" % int_version
    doc.collection(int_collection, None, int_collection_shortname,
                   int_collection_label, [int_collection_loc],
                   instrument_ids.keys(), int_level, int_version,
                   label=int_collection_label, bundle=bndl)
    output_ds = doc.granule("hysds:%s" % get_uuid(prod_url), None, [prod_url],
                            instrument_ids.keys(), int_collection, int_level,
                            int_version, label=id, bundle=bndl)

    # runtime context
    rt_ctx_id = "hysds:runtimeContext-ariamh-%s" % project
    doc.runtimeContext(rt_ctx_id, [project], label=project, bundle=bndl)

    # create process step tying all inputs to the output granule
    doc.processStep("hysds:%s" % get_uuid(job_id), fake_time, fake_time,
                    [software], None, rt_ctx_id, input_ids.keys(),
                    [output_ds.identifier], label=job_id, bundle=bndl,
                    prov_type="hysds:create_interferogram")

    # write
    with open(prov_file, 'w') as f:
        json.dump(json.loads(doc.serialize()), f, indent=2, sort_keys=True)
Example #14
0
def get_image_prov(j, gcis_url):
    """Generate PROV-ES JSON from GCIS image metadata.

    Creates entities for the image and for every report, chapter, finding and
    figure it appears in (fetched live from GCIS), agents for contributors,
    and a derivation activity for each parent dataset. hadMember relations
    are post-processed to carry a gcis prov:type.

    Args:
        j: GCIS image metadata dict (uri, title, files, figures,
            contributors, parents, create_dt).
        gcis_url: base URL of the GCIS instance.

    Returns:
        PROV-ES document as a JSON-serializable dict.

    Raises:
        requests.HTTPError: if a report/finding/figure request fails
            (a failed chapter request is skipped instead).
    """

    # create doc
    doc = ProvEsDocument()
    bndl = None

    # create image entity; prefer the file's own URL over the GCIS page
    img_id = GCIS["%s" % j['uri'][1:].replace('/', '-')]
    img_title = j['title']
    img_url = None
    img_thumbnail_url = None
    # NOTE(review): if multiple files are listed, the last one wins -- confirm
    for file_md in j.get('files', []):
        img_url = file_md['href']
        img_thumbnail_url = file_md['thumbnail_href']
    img_attrs = [
        ( PROV_TYPE, GCIS['Image'] ),
        ( PROV_LABEL, img_title ),
    ]
    if img_url is None:
        img_attrs.append(( PROV_LOCATION, "%s%s" % (gcis_url, j['uri']) ))
    else:
        img_attrs.append(( PROV_LOCATION, img_url ))
    # BUG FIX: condition was inverted (`is None`), so the thumbnail attribute
    # was only ever added with a None value; record it when a thumbnail exists
    if img_thumbnail_url is not None:
        img_attrs.append(( HYSDS['thumbnail'], img_thumbnail_url ))
    doc.entity(img_id, img_attrs)

    # create report, chapter, finding and figure entities for each figure
    # the image appears in; the lists deduplicate and are reused below to
    # type the hadMember relations
    reports = []
    chapters = []
    findings = []
    figures = []
    for figure in j.get('figures', []):
        report_uri = "/report/%s" % figure['report_identifier']
        chapter_uri = "/chapter/%s" % figure['chapter_identifier']
        figure_uri = "/figure/%s" % figure['identifier']

        # create report
        r = requests.get('%s%s.json' % (gcis_url, report_uri))
        r.raise_for_status()
        report = r.json()
        report_id = GCIS["%s" % report_uri[1:].replace('/', '-')]
        if report_id not in reports:
            doc.entity(report_id, [
                ( PROV_TYPE, GCIS['Report'] ),
                ( PROV_LABEL, report['title'] ),
                ( PROV_LOCATION, report['url'] ),
            ])
            reports.append(report_id)

        # create chapter; some chapters are missing, so skip on non-200
        r = requests.get('%s%s%s.json' % (gcis_url, report_uri, chapter_uri))
        if r.status_code != 200:
            print("Failed with %d code: %s" % (r.status_code, r.content))
            continue
        chapter = r.json()
        chapter_id = GCIS["%s" % chapter_uri[1:].replace('/', '-')]
        if chapter_id not in chapters:
            doc.entity(chapter_id, [
                ( PROV_TYPE, GCIS['Chapter'] ),
                ( PROV_LABEL, chapter['title'] ),
                ( PROV_LOCATION, chapter['url'] ),
            ])
            chapters.append(chapter_id)
        doc.hadMember(report_id, chapter_id)

        # create findings and link them to both report and chapter
        r = requests.get('%s%s%s/finding.json' % (gcis_url, report_uri, chapter_uri))
        r.raise_for_status()
        for f in r.json():
            finding_id = GCIS["%s" % f['identifier']]
            if finding_id not in findings:
                doc.entity(finding_id, [
                    ( PROV_TYPE, GCIS['Finding'] ),
                    ( PROV_LABEL, f['identifier'] ),
                    ( PROV_LOCATION, f['href'] ),
                ])
                findings.append(finding_id)
            doc.hadMember(report_id, finding_id)
            doc.hadMember(chapter_id, finding_id)

        # create figure and link the image into it
        r = requests.get('%s%s%s%s.json' % (gcis_url, report_uri, chapter_uri, figure_uri))
        r.raise_for_status()
        figure_md = r.json()
        figure_id = GCIS["%s" % figure_uri[1:].replace('/', '-')]
        if figure_id not in figures:
            doc.entity(figure_id, [
                ( PROV_TYPE, GCIS['Figure'] ),
                ( PROV_LABEL, figure_md['title'] ),
                ( PROV_LOCATION, "%s%s" % (gcis_url, figure_md['uri']) ),
            ])
            figures.append(figure_id)
            doc.hadMember(chapter_id, figure_id)
        doc.hadMember(figure_id, img_id)

    # create agents (people) and governing organizations for contributors;
    # agent_ids maps agent id -> list of affiliated org ids
    agent_ids = {}
    org_ids = {}
    for cont in j.get('contributors', []):
        # replace slashes because we get prov.model.ProvExceptionInvalidQualifiedName errors
        agent_id = GCIS["%s" % cont['uri'][1:].replace('/', '-')]

        # create person
        if len(cont['person']) > 0:
            # full name from whichever name parts are present
            agent_name = " ".join([cont['person'][i] for i in
                                   ('first_name', 'middle_name', 'last_name')
                                   if cont['person'].get(i, None) is not None])
            doc.agent(agent_id, [
                ( PROV_TYPE, GCIS["Person"] ),
                ( PROV_LABEL, agent_name ),
                ( PROV_LOCATION, "%s%s" % (gcis_url, cont['uri']) ),
            ])
            agent_ids[agent_id] = []

        # organization
        if len(cont['organization']) > 0:
            org_id = GCIS["%s" % cont['organization']['identifier']]
            if org_id not in org_ids:
                doc.governingOrganization(org_id, cont['organization']['name'])
                org_ids[org_id] = True
            if agent_id in agent_ids: agent_ids[agent_id].append(org_id)

    # create one derivation activity per parent dataset
    start_time = j['create_dt']
    end_time = j['create_dt']
    for parent in j.get('parents', []):
        input_id = GCIS["%s" % parent['url'][1:].replace('/', '-')]
        input_name = parent['label']
        doc.entity(input_id, [
            ( PROV_TYPE, GCIS["Dataset"] ),
            ( PROV_LABEL, input_name ),
            ( PROV_LOCATION, "%s%s" % (gcis_url, parent['url']) ),
        ])
        # some activity uri's are null
        if parent['activity_uri'] is None:
            act_id = GCIS["derive-from-%s" % input_id]
        else:
            act_id = GCIS["%s" % parent['activity_uri'][1:].replace('/', '-')]
        attrs = []
        for agent_id in agent_ids:
            waw_id = GCIS["%s" % get_uuid("%s:%s" % (act_id, agent_id))]
            doc.wasAssociatedWith(act_id, agent_id, None, waw_id, {'prov:role': GCIS['Contributor']})
            for org_id in agent_ids[agent_id]:
                del_id = GCIS["%s" % get_uuid("%s:%s:%s" % (agent_id, org_id, act_id))]
                doc.delegation(agent_id, org_id, act_id, del_id, {'prov:type': GCIS['worksAt']})
        for org_id in org_ids:
            waw_id = GCIS["%s" % get_uuid("%s:%s" % (act_id, org_id))]
            doc.wasAssociatedWith(act_id, org_id, None, waw_id, {'prov:role': GCIS['Funder']})
        act = doc.activity(act_id, start_time, end_time, attrs)
        doc.used(act, input_id, start_time, GCIS["%s" % get_uuid("%s:%s" % (act_id, input_id))])
        doc.wasGeneratedBy(img_id, act, end_time, GCIS["%s" % get_uuid("%s:%s" % (img_id, act_id))])

    # serialize
    prov_json = json.loads(doc.serialize())

    # for hadMember relations, add prov:type based on collection/entity kind
    for hm_id in prov_json.get('hadMember', {}):
        hm = prov_json['hadMember'][hm_id]
        col = hm['prov:collection']
        ent = hm['prov:entity']
        if col in reports and ent in chapters:
            hm['prov:type'] = GCIS['hasChapter']
        elif col in chapters and ent in figures:
            hm['prov:type'] = GCIS['hasFigure']
        elif col in figures and ent == img_id:
            hm['prov:type'] = GCIS['hasImage']

    return prov_json
Example #15
0
def get_doc_prov(j, gcis_url, refList, journalList, organizationList, personList, dump_dir):
    """Generate PROV-ES JSON from GCIS article metadata.

    Fetches the full article record from the GCIS API (via ``j['href']``),
    then builds PROV-ES entities/agents/activities for the article, its
    journal, publisher organization, contributors, and any publications
    that cite it.

    Args:
        j: GCIS article stub; must contain 'href', 'identifier', 'year'.
        gcis_url: base URL prepended to GCIS uri fields for prov:location.
        refList: unused here; kept for call-signature compatibility with
            the other get_doc_prov variants in this file.
        journalList: list of journal dicts from the GCIS dump (matched on
            'identifier' against the article's 'journal_identifier').
        organizationList: list of organization dicts (matched on 'name'
            against the journal's 'publisher').
        personList: unused here; kept for call-signature compatibility.
        dump_dir: directory holding per-type GCIS dump files, passed to
            get_itemList() when resolving 'cited_by' publications.

    Returns:
        dict: the PROV-ES document serialized to JSON and re-parsed.
    """
    gcis_ns = "https://gcis-search-stage.jpl.net:3000/gcis.owl#"
    doc = ProvEsDocument()
    bndl = None

    article = requests.get(j['href']).json()

    journalID = None

    # find this article's journal in the dump (single scan instead of the
    # previous any()+next() double scan)
    journal = next((jour for jour in journalList
                    if jour['identifier'] == article['journal_identifier']), None)
    if journal is not None:
        journalAttrs = [
                ("prov:type", 'gcis:Journal'),
                ("prov:label", journal['title']),
                ("prov:location", "%s%s"%(gcis_url,journal['uri'])),
                ("gcis:online_issn", journal['online_issn']),
                ("gcis:print_issn", journal['print_issn']),
                ("gcis:publisher", journal['publisher']),
                ]
        journalID = 'bibo:%s'%journal['identifier']
        doc.entity(journalID, journalAttrs)

    # get organization/publisher if any.
    # NOTE: guarded on `journal is not None` — previously `journal` was
    # dereferenced unconditionally and raised NameError when no journal
    # in journalList matched the article.
    if journal is not None and journal['publisher'] is not None:
        organization = next((org for org in organizationList
                             if org['name'] == journal['publisher']), None)
        if organization is not None:
            org_attrs = [
                    ("prov:type", 'gcis:organization'),
                    ("prov:label", organization['name']),
                    ("prov:location", "%s%s"%(gcis_url,organization['uri'])),
                    ("gcis:organization_type_identifier", organization['organization_type_identifier']),
                    ("gcis:country_code", organization['country_code']),
                    ]
            org_id = 'bibo:%s'%organization['identifier']
            doc.entity(org_id, org_attrs)

            doc.governingOrganization(org_id, organization['name'])

    # make the article entity itself
    articleAttrs = [
        ("prov:type", 'gcis:Article'),
        ("prov:label", article['title']),
        ("prov:location","%s%s"%(gcis_url, article['uri'])),
        ("dcterms:isPartOf", journalID),
    ]
    articleID = 'bibo:%s'%article['identifier']
    doc.entity(articleID, articleAttrs)

    # link journal to article
    if journalID is not None:
        doc.hadMember(journalID, articleID)

    # agent_ids maps person agent id -> list of org ids that person works at;
    # org_ids collects contributor organizations for wasAssociatedWith below
    agent_ids = {}
    org_ids = {}
    if 'contributors' in article:
        for contributor in article['contributors']:
            personID = None
            if contributor['person_uri'] is not None:
                # full name from whichever of first/middle/last are present
                name = " ".join([contributor['person'][i]
                                for i in ('first_name', 'middle_name', 'last_name')
                                if contributor['person'].get(i, None) is not None])
                personAttrs = [
                        ("prov:type", 'gcis:Person'),
                        ("prov:label", "%s"%name),
                        ("prov:location", "%s%s"%(gcis_url,contributor['person_uri'])),
                        ("gcis:id", str(contributor['person_id'])),
                        ("gcis:orcid", contributor['person']['orcid'])
                        ]
                personID = 'bibo:%s'%contributor['person_id']
                agent_ids[personID] = []
                doc.agent(personID, personAttrs)
            if contributor['organization'] is not None:
                orgID = 'bibo:%s'%contributor['organization']['identifier']
                # NOTE(review): no doc.entity() is created for this orgID
                # (only governingOrganization), matching prior behavior —
                # confirm whether an entity was intended here.
                doc.governingOrganization(orgID, contributor['organization']['name'])
                org_ids[orgID] = True
                if personID in agent_ids:
                    agent_ids[personID].append(orgID)

    # create the generation activity; GCIS only records a year, so start
    # and end times are both that year (or None when unknown)
    if isinstance(j['year'], int):
        start_time = str(j['year'])
        end_time = str(j['year'])
    else:
        start_time = None
        end_time = None
    act_id = GCIS["generate-%s"%j['identifier'].replace('/', '-')]
    attrs = []
    for agent_id in agent_ids:
        # deterministic association ids derived from (activity, agent)
        waw_id = GCIS["%s"%get_uuid("%s:%s"%(act_id, agent_id))]
        doc.wasAssociatedWith(act_id, agent_id, None, waw_id, {'prov:role':GCIS['Author']})
        for org_id in agent_ids[agent_id]:
            del_id = GCIS["%s"%get_uuid("%s:%s:%s"%(agent_id, org_id, act_id))]
            doc.delegation(agent_id, org_id, act_id, del_id, {'prov:type':'gcis:worksAt'})
    for org_id in org_ids:
        waw_id = GCIS["%s"%get_uuid("%s:%s"%(act_id, org_id))]
        doc.wasAssociatedWith(act_id, org_id, None, waw_id, {'prov:role':GCIS['Contributor']})
    act = doc.activity(act_id, start_time, end_time, attrs)
    doc.wasGeneratedBy(articleID, act, end_time, GCIS["%s"%get_uuid("%s:%s"%(articleID, act_id))])

    # link publications that cite this article via wasDerivedFrom
    if 'cited_by' in article:
        for citation in article['cited_by']:
            if 'publication' in citation:
                # uri looks like "/<type>/<id>"; load the dump for that type
                itemType = citation['publication'].split("/")[1]
                itemList = get_itemList(dump_dir, itemType)
                item = next((obj for obj in itemList
                             if obj['uri'] == citation['publication']), None)
                if item is not None:
                    item_id = 'bibo:%s'%item['identifier']
                    doc.wasDerivedFrom(item_id, articleID)
                    # was a py2-only `print articleID` statement; the
                    # function form is equivalent in py2 and valid in py3
                    print(articleID)

    prov_json = json.loads(doc.serialize())

    return prov_json