def get_doc_prov(j, gcis_url, refList):
    """Generate PROV-ES JSON from GCIS doc metadata."""

    doc = ProvEsDocument()
    name = " ".join(j[i] for i in ('first_name', 'middle_name', 'last_name')
                    if j.get(i, None) is not None)
    doc_attrs = [
        ("prov:type", 'gcis:Person'),
        ("prov:label", name),
        ("prov:location", "%s%s" % (gcis_url, j['uri'])),
        ("gcis:id", j['id']),
        ("gcis:orcid", j["orcid"]),
    ]
    doc.agent('bibo:%s' % j['id'], doc_attrs)

    del_id = GCIS["%s" % get_uuid("%s:%s:%s" % (j['id'], None, None))]
    doc.delegation('bibo:%s' % j['id'], None, None, del_id, None)

    prov_json = json.loads(doc.serialize())

    return prov_json
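# `get_uuid` and the `GCIS` namespace used above are defined elsewhere in this
# module's imports. A minimal sketch of the assumed helper: a deterministic
# UUID derived from the input string, so the same GCIS identifiers always map
# to the same PROV-ES ids. This is an illustrative assumption, not the
# project's actual implementation.
from uuid import uuid5, NAMESPACE_URL

def get_uuid(s):
    """Return a deterministic UUID string for `s` (assumed behavior)."""
    return str(uuid5(NAMESPACE_URL, s))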
def fix_hadMember_ids(prov_es_json):
    """Fix the ids of hadMember relationships."""

    # materialize the key view so we can safely delete entries while
    # iterating (a plain .keys() view would raise RuntimeError in Python 3)
    hm_ids = list(prov_es_json.get('hadMember', {}).keys())
    for hm_id in hm_ids:
        hm = copy.deepcopy(prov_es_json['hadMember'][hm_id])
        new_id = "hysds:%s" % get_uuid(
            "%s:%s" % (hm['prov:collection'], hm['prov:entity']))
        prov_es_json['hadMember'][new_id] = hm
        del prov_es_json['hadMember'][hm_id]
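# A minimal usage sketch (assumed workflow): serialize a ProvEsDocument,
# rewrite the hadMember relationship ids in place, then persist the result.
# `doc` here is a hypothetical ProvEsDocument built by one of the functions
# in this module.
prov_json = json.loads(doc.serialize())
fix_hadMember_ids(prov_json)
with open("prov_es.json", "w") as f:
    json.dump(prov_json, f, indent=2)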
def log_publish_prov_es(prov_es_info, prov_es_file, prod_path, pub_urls,
                        prod_metrics, objectid):
    """Log publish step in PROV-ES document."""

    # create PROV-ES doc
    doc = ProvEsDocument(namespaces=prov_es_info['prefix'])

    # get bundle
    #bndl = doc.bundle(bundle_id)
    bndl = None

    # add input entity
    execute_node = socket.getfqdn()
    prod_url = "file://%s%s" % (execute_node, prod_path)
    input_id = "hysds:%s" % get_uuid(prod_url)
    input_ent = doc.granule(input_id, None, [prod_url], [], None, None, None,
                            label=os.path.basename(prod_url), bundle=bndl)

    # add output entity
    output_id = "hysds:%s" % get_uuid(pub_urls[0])
    output_ent = doc.product(output_id, None, [pub_urls[0]], [], None, None,
                             None, label=objectid, bundle=bndl)

    # software and algorithm
    algorithm = "eos:product_publishing"
    software_version = hysds.__version__
    software_title = "%s v%s" % (hysds.__description__, software_version)
    software = "eos:HySDS-%s" % software_version
    software_location = hysds.__url__
    doc.software(software, [algorithm], software_version,
                 label=software_title, location=software_location,
                 bundle=bndl)

    # create software agent
    pid = os.getpid()
    sa_label = "hysds:publish_dataset/%s/%d/%s" % (
        execute_node, pid, prod_metrics['time_start'])
    sa_id = "hysds:%s" % get_uuid(sa_label)
    doc.softwareAgent(sa_id, str(pid), execute_node, role="invoked",
                      label=sa_label, bundle=bndl)

    # create processStep
    job_id = "publish_dataset-%s" % os.path.basename(prod_path)
    doc.processStep("hysds:%s" % get_uuid(job_id),
                    prod_metrics['time_start'], prod_metrics['time_end'],
                    [software], sa_id, None, [input_id], [output_id],
                    label=job_id, bundle=bndl,
                    prov_type="hysds:publish_dataset")

    # get json
    pd = json.loads(doc.serialize())

    # update input entity
    orig_ent = prov_es_info.get('entity', {}).get(input_id, {})
    pd['entity'][input_id].update(orig_ent)

    # update output entity
    for attr in orig_ent:
        if attr in ('prov:location', 'prov:label', 'prov:type'):
            continue
        pd['entity'][output_id][attr] = orig_ent[attr]

    # write prov
    with open(prov_es_file, 'w') as f:
        json.dump(pd, f, indent=2)
def log_prov_es(job, prov_es_info, prov_es_file):
    """Log PROV-ES document. Create temp PROV-ES document to populate
    attributes that only the worker has access to (e.g. PID)."""

    # create PROV-ES doc to generate attributes that only verdi knows
    ps_id = "hysds:%s" % get_uuid(job['job_id'])
    bundle_id = "hysds:%s" % get_uuid('bundle-%s' % job['job_id'])
    doc = ProvEsDocument()

    # get bundle
    #bndl = doc.bundle(bundle_id)
    bndl = None

    # create software agent
    sa_label = "hysds:pge_wrapper/%s/%d/%s" % (
        job['job_info']['execute_node'], job['job_info']['pid'],
        datetime.utcnow().isoformat())
    sa_id = "hysds:%s" % get_uuid(sa_label)
    doc.softwareAgent(sa_id, str(job['job_info']['pid']),
                      job['job_info']['execute_node'],
                      role=job.get('username', None), label=sa_label,
                      bundle=bndl)

    # create processStep
    doc.processStep(ps_id, job['job_info']['cmd_start'],
                    job['job_info']['cmd_end'], [], sa_id, None, [], [],
                    bundle=bndl, prov_type="hysds:%s" % job['type'])

    # get json
    pd = json.loads(doc.serialize())

    # update software agent and process step
    if 'bundle' in prov_es_info:
        if len(prov_es_info['bundle']) == 1:
            bundle_id_orig = list(prov_es_info['bundle'].keys())[0]

            # update software agent
            prov_es_info['bundle'][bundle_id_orig].setdefault(
                'agent', {}).update(pd['bundle'][bundle_id]['agent'])

            # update wasAssociatedWith
            prov_es_info['bundle'][bundle_id_orig].setdefault(
                'wasAssociatedWith',
                {}).update(pd['bundle'][bundle_id]['wasAssociatedWith'])

            # update activity
            if 'activity' in prov_es_info['bundle'][bundle_id_orig]:
                if len(prov_es_info['bundle'][bundle_id_orig]
                       ['activity']) == 1:
                    ps_id_orig = list(prov_es_info['bundle'][bundle_id_orig]
                                      ['activity'].keys())[0]
                    act_orig = prov_es_info['bundle'][bundle_id_orig][
                        'activity'][ps_id_orig]
                    act_new = pd['bundle'][bundle_id]['activity'][ps_id]
                    act_orig['prov:startTime'] = act_new['prov:startTime']
                    act_orig['prov:endTime'] = act_new['prov:endTime']
                    act_orig['hysds:job_id'] = job['job_id']
                    act_orig['hysds:job_type'] = job['type']
                    act_orig['hysds:job_url'] = job['job_info']['job_url']
                    act_orig['hysds:mozart_url'] = app.conf.MOZART_URL
                    if 'prov:type' not in act_orig:
                        act_orig['prov:type'] = act_new['prov:type']

                    # update wasAssociatedWith activity ids
                    for waw_id in prov_es_info['bundle'][bundle_id_orig][
                            'wasAssociatedWith']:
                        waw = prov_es_info['bundle'][bundle_id_orig][
                            'wasAssociatedWith'][waw_id]
                        if waw['prov:activity'] == ps_id:
                            waw['prov:activity'] = ps_id_orig
                else:
                    prov_es_info['bundle'][bundle_id_orig]['activity'].update(
                        pd['bundle'][bundle_id]['activity'])
            else:
                prov_es_info['bundle'][bundle_id_orig]['activity'] = pd[
                    'bundle'][bundle_id]['activity']
    else:
        # update software agent
        prov_es_info.setdefault('agent', {}).update(pd['agent'])

        # update wasAssociatedWith
        prov_es_info.setdefault('wasAssociatedWith',
                                {}).update(pd['wasAssociatedWith'])

        # update process step
        if 'activity' in prov_es_info:
            if len(prov_es_info['activity']) == 1:
                ps_id_orig = list(prov_es_info['activity'].keys())[0]
                act_orig = prov_es_info['activity'][ps_id_orig]
                act_new = pd['activity'][ps_id]
                act_orig['prov:startTime'] = act_new['prov:startTime']
                act_orig['prov:endTime'] = act_new['prov:endTime']
                act_orig['hysds:job_id'] = job['job_id']
                act_orig['hysds:job_type'] = job['type']
                act_orig['hysds:job_url'] = job['job_info']['job_url']
                act_orig['hysds:mozart_url'] = app.conf.MOZART_URL
                if 'prov:type' not in act_orig:
                    act_orig['prov:type'] = act_new['prov:type']

                # update wasAssociatedWith activity ids
                for waw_id in prov_es_info['wasAssociatedWith']:
                    if prov_es_info['wasAssociatedWith'][waw_id][
                            'prov:activity'] == ps_id:
                        prov_es_info['wasAssociatedWith'][waw_id][
                            'prov:activity'] = ps_id_orig
            else:
                prov_es_info['activity'].update(pd['activity'])
        else:
            prov_es_info['activity'] = pd['activity']

    # write prov
    with open(prov_es_file, 'w') as f:
        json.dump(prov_es_info, f, indent=2)
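# Hypothetical invocation sketch showing the job fields log_prov_es actually
# reads; every value below is a placeholder, not real HySDS output, and
# `prov_es_info` is assumed to be a previously loaded PROV-ES JSON dict.
job = {
    "job_id": "job-20160401T000000",
    "type": "job-example",
    "username": None,
    "job_info": {
        "execute_node": "worker.example.com",
        "pid": 12345,
        "cmd_start": "2016-04-01T00:00:00Z",
        "cmd_end": "2016-04-01T00:05:00Z",
        "job_url": "http://worker.example.com/jobs/job-20160401T000000",
    },
}
log_prov_es(job, prov_es_info, "_prov_es.json")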
def create_prov_es_json(id, url, prod_dir, prov_file):
    """Create provenance JSON file."""

    # get info
    csk_files = glob(os.path.join(prod_dir, "CSKS*"))
    pf = "CSKS?"
    dtype = "NA"
    repo_dir = "?"
    if len(csk_files) > 0:
        match = CSK_RE.search(os.path.basename(csk_files[0]))
        if match:
            pf, dtype = match.groups()
            if dtype == "RAW":
                dtype = "RAW_B"
                repo_dir = "csk_rawb"
            elif dtype == "SCS":
                dtype = "SCS_B"
                repo_dir = "csk_scsb"
    platform = "eos:%s" % pf
    platform_title = "COSMO-SkyMed Satellite %s" % pf[-1]
    instrument = "eos:%s-SAR" % pf
    instrument_title = "%s-SAR" % pf
    level = "L0"
    version = "v1.0"
    collection = "eos:CSK-%s-%s" % (dtype, version)
    collection_shortname = "CSK-%s-%s" % (dtype, version)
    collection_label = "CSK %s Scenes %s" % (dtype, version)
    collection_loc = "https://aria-dav.jpl.nasa.gov/repository/products/%s/%s" % (
        repo_dir, version)
    sensor = "eos:SAR"
    sensor_title = "Synthetic-aperture radar (SAR)"
    gov_org = "eos:ASI"
    gov_org_title = "Agenzia Spaziale Italiana"
    software_version = "2.0.0_201604"
    software_title = "InSAR SCE (InSAR Scientific Computing Environment) v%s" % software_version
    software = "eos:ISCE-%s" % software_version
    software_location = "https://winsar.unavco.org/isce.html"
    algorithm = "eos:metadata_extraction"
    prod_dir = "file://%s%s" % (socket.getfqdn(), prod_dir)

    # put in fake start/end times so that prov:used and prov:generated
    # are properly created by the prov lib
    fake_time = datetime.utcnow().isoformat() + 'Z'
    job_id = "ingest-%s-%s" % (id, fake_time)
    bundle_id = "bundle-ingest-%s-%s" % (id, fake_time)

    doc = ProvEsDocument()
    #bndl = doc.bundle("hysds:%s" % get_uuid(bundle_id))
    bndl = None
    input_id = "hysds:%s" % get_uuid(url)
    input_ds = doc.granule(input_id, None, [url], [instrument], None, level,
                           None, label=os.path.basename(url), bundle=bndl)
    doc.collection(collection, None, collection_shortname, collection_label,
                   [collection_loc], [instrument], level, version,
                   label=collection_label, bundle=bndl)
    output_id = "hysds:%s" % get_uuid(prod_dir)
    output_ds = doc.granule(output_id, None, [prod_dir], [instrument],
                            collection, level, version, label=id, bundle=bndl)
    doc.governingOrganization(gov_org, label=gov_org_title, bundle=bndl)
    doc.platform(platform, [instrument], label=platform_title, bundle=bndl)
    doc.instrument(instrument, platform, [sensor], [gov_org],
                   label=instrument_title, bundle=bndl)
    doc.sensor(sensor, instrument, label=sensor_title, bundle=bndl)
    doc.software(software, [algorithm], software_version,
                 label=software_title, location=software_location,
                 bundle=bndl)
    doc.processStep("hysds:%s" % get_uuid(job_id), fake_time, fake_time,
                    [software], None, None, [input_ds.identifier],
                    [output_ds.identifier], label=job_id, bundle=bndl,
                    prov_type="hysds:ingest")

    with open(prov_file, 'w') as f:
        json.dump(json.loads(doc.serialize()), f, indent=2, sort_keys=True)
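# `CSK_RE` is defined elsewhere in this module. A sketch of the assumed
# pattern, reconstructed only from how its two groups are consumed above
# (pf like "CSKS4", dtype "RAW" or "SCS"); the real regex may differ.
import re

CSK_RE = re.compile(r'^(CSKS\d)_(RAW|SCS)_B')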
def get_doc_prov(j, gcis_url):
    """Generate PROV-ES JSON from GCIS doc metadata."""

    # create doc
    gcis_ns = "http://data.globalchange.gov/gcis.owl#"
    doc = ProvEsDocument(namespaces={
        "gcis": gcis_ns,
        "bibo": "http://purl.org/ontology/bibo/"
    })
    bndl = None

    # create journal
    r = requests.get("%s/journal/%s.json" % (gcis_url,
                                             j['journal_identifier']),
                     params={'all': 1}, verify=False)
    r.raise_for_status()
    journal_md = r.json()
    doc_attrs = [
        ("prov:type", 'gcis:Journal'),
        ("prov:label", j['title']),
    ]
    journal_id = GCIS[j['journal_identifier']]
    if journal_id not in journal_ids:
        if journal_md.get('url', None) is not None:
            doc_attrs.append(("prov:location", journal_md['url']))
        if journal_md.get('online_issn', None) is not None:
            doc_attrs.append(("gcis:online_issn", journal_md['online_issn']))
        if journal_md.get('print_issn', None) is not None:
            doc_attrs.append(("gcis:print_issn", journal_md['print_issn']))
        doc.entity(journal_id, doc_attrs)
        journal_ids[journal_id] = True

    # create agents or organizations
    agent_ids = {}
    org_ids = {}
    for cont in j.get('contributors', []):
        # replace slashes because we get
        # prov.model.ProvExceptionInvalidQualifiedName errors
        agent_id = GCIS["%s" % cont['uri'][1:].replace('/', '-')]

        # create person
        if len(cont['person']) > 0:
            agent_name = " ".join([
                cont['person'][i]
                for i in ('first_name', 'middle_name', 'last_name')
                if cont['person'].get(i, None) is not None
            ])
            doc.agent(agent_id, [
                (PROV_TYPE, GCIS["Person"]),
                (PROV_LABEL, agent_name),
                (PROV_LOCATION, "%s%s" % (gcis_url, cont['uri'])),
            ])
            agent_ids[agent_id] = []

        # organization
        if cont['organization'] is not None and len(cont['organization']) > 0:
            org_id = GCIS["%s" % cont['organization']['identifier']]
            if org_id not in org_ids:
                doc.governingOrganization(org_id,
                                          cont['organization']['name'])
                org_ids[org_id] = True
            if agent_id in agent_ids:
                agent_ids[agent_id].append(org_id)

    # create article
    article_id = 'bibo:%s' % j['identifier']
    doc_attrs = [
        ("prov:type", 'gcis:Article'),
        ("prov:label", j['title']),
        ("dcterms:isPartOf", journal_id),
    ]
    # only record a DOI when one is present (the original test was inverted)
    if j.get('doi', "") != "":
        doc_attrs.append(("bibo:doi", j['doi']))
    doc.entity(article_id, doc_attrs)

    # link
    doc.hadMember(journal_id, article_id)

    # create activity
    if isinstance(j['year'], int):
        start_time = str(j['year'])
        end_time = str(j['year'])
    else:
        start_time = None
        end_time = None
    act_id = GCIS["generate-%s" % j['identifier'].replace('/', '-')]
    attrs = []
    for agent_id in agent_ids:
        waw_id = GCIS["%s" % get_uuid("%s:%s" % (act_id, agent_id))]
        doc.wasAssociatedWith(act_id, agent_id, None, waw_id,
                              {'prov:role': GCIS['Author']})
        for org_id in agent_ids[agent_id]:
            del_id = GCIS["%s" % get_uuid("%s:%s:%s" %
                                          (agent_id, org_id, act_id))]
            doc.delegation(agent_id, org_id, act_id, del_id,
                           {'prov:type': 'gcis:worksAt'})
    for org_id in org_ids:
        waw_id = GCIS["%s" % get_uuid("%s:%s" % (act_id, org_id))]
        doc.wasAssociatedWith(act_id, org_id, None, waw_id,
                              {'prov:role': GCIS['Contributor']})
    act = doc.activity(act_id, start_time, end_time, attrs)
    doc.wasGeneratedBy(article_id, act, end_time,
                       GCIS["%s" % get_uuid("%s:%s" % (article_id, act_id))])

    # serialize
    prov_json = json.loads(doc.serialize())

    return prov_json
def create_prov_es_json(id, project, master_orbit_file, slave_orbit_file,
                        aria_dem_xml, aria_dem_file, work_dir, prov_file):
    """Create provenance JSON file."""

    # get abs paths
    work_dir = os.path.abspath(work_dir)
    prod_dir = os.path.join(work_dir, id)

    # get context
    ctx_file = os.path.join(prod_dir, "%s.context.json" % id)
    with open(ctx_file) as f:
        context = json.load(f)

    # put in fake start/end times so that prov:used and prov:generated
    # are properly created by the prov lib
    fake_time = datetime.utcnow().isoformat() + 'Z'
    job_id = "create_interferogram-%s" % fake_time
    bundle_id = "bundle-create_interferogram-%s" % fake_time

    # create PROV-ES doc
    doc = ProvEsDocument()
    #bndl = doc.bundle("hysds:%s" % get_uuid(bundle_id))
    bndl = None

    # input and output identifiers
    input_ids = {}
    platform_ids = {}
    instrument_ids = {}

    # full url paths
    work_url = "file://%s%s" % (socket.getfqdn(), work_dir)
    prod_url = "%s/%s" % (work_url, id)

    # add sentinel.ini file
    ini_ent = doc.file("hysds:%s" % get_uuid("%s/sentinel.ini" % work_url),
                       ["%s/sentinel.ini" % work_url], label="sentinel.ini")
    input_ids[ini_ent.identifier] = True

    # add orbit files
    master_orbit_ent = doc.file(
        "hysds:%s" % get_uuid("%s/%s" % (work_url, master_orbit_file)),
        ["%s/%s" % (work_url, master_orbit_file)],
        label=os.path.basename(master_orbit_file))
    input_ids[master_orbit_ent.identifier] = True
    slave_orbit_ent = doc.file(
        "hysds:%s" % get_uuid("%s/%s" % (work_url, slave_orbit_file)),
        ["%s/%s" % (work_url, slave_orbit_file)],
        label=os.path.basename(slave_orbit_file))
    input_ids[slave_orbit_ent.identifier] = True

    # get list of S1A urls
    level = "L0"
    version = "v1.0"
    sensor = "eos:SAR"
    sensor_title = "Synthetic-aperture radar (SAR)"
    gov_org = "eos:ESA"
    gov_org_title = "European Space Agency"
    doc.governingOrganization(gov_org, label=gov_org_title, bundle=bndl)
    instrument = ""
    for i, url in enumerate([context.get('master_zip_url', ''),
                             context.get('slave_zip_url', '')]):
        match = PLATFORM_RE.search(url)
        if not match:
            continue
        pf = match.group(1)
        platform = "eos:%s" % pf
        platform_title = "Sentinel1A Satellite"
        instrument = "eos:%s-SAR" % pf
        instrument_title = "%s-SAR" % pf
        input_ds = doc.product("hysds:%s" % get_uuid(url), None, [url],
                               [instrument], None, level, None,
                               label=os.path.basename(url), bundle=bndl)
        input_ids[input_ds.identifier] = True
        if platform not in platform_ids:
            doc.platform(platform, [instrument], label=platform_title,
                         bundle=bndl)
            platform_ids[platform] = True
        if instrument not in instrument_ids:
            doc.instrument(instrument, platform, [sensor], [gov_org],
                           label=instrument_title, bundle=bndl)
            doc.sensor(sensor, instrument, label=sensor_title, bundle=bndl)
            instrument_ids[instrument] = True

    # add dem xml, file and related provenance
    srtm_platform = "eos:SpaceShuttleEndeavour"
    srtm_platform_title = "USS Endeavour"
    srtm_instrument = "eos:SRTM"
    srtm_instrument_title = "Shuttle Radar Topography Mission (SRTM)"
    srtm_sensor = "eos:radar"
    srtm_sensor_title = "radar"
    srtm_gov_org = "eos:JPL"
    srtm_gov_org_title = "Jet Propulsion Laboratory"
    doc.governingOrganization(srtm_gov_org, label=srtm_gov_org_title,
                              bundle=bndl)
    dem_xml_ent = doc.file(
        "hysds:%s" % get_uuid("%s/%s" % (work_url, aria_dem_xml)),
        ["%s/%s" % (work_url, aria_dem_xml)],
        label=os.path.basename(aria_dem_xml))
    input_ids[dem_xml_ent.identifier] = True
    dem_file_ent = doc.file(
        "hysds:%s" % get_uuid("%s/%s" % (work_url, aria_dem_file)),
        ["%s/%s" % (work_url, aria_dem_file)],
        label=os.path.basename(aria_dem_file))
    input_ids[dem_file_ent.identifier] = True
    doc.platform(srtm_platform, [srtm_instrument],
                 label=srtm_platform_title, bundle=bndl)
    doc.instrument(srtm_instrument, srtm_platform, [srtm_sensor],
                   [srtm_gov_org], label=srtm_instrument_title, bundle=bndl)
    doc.sensor(srtm_sensor, srtm_instrument, label=srtm_sensor_title,
               bundle=bndl)
    instrument_ids[srtm_instrument] = True

    # software and algorithm
    algorithm = "eos:interferogram_generation"
    software_version = "2.0.0_201604"
    software_title = "InSAR SCE (InSAR Scientific Computing Environment) v%s" % software_version
    software = "eos:ISCE-%s" % software_version
    software_location = "https://winsar.unavco.org/isce.html"
    doc.software(software, [algorithm], software_version,
                 label=software_title, location=software_location,
                 bundle=bndl)

    # output
    int_level = "L2"
    int_version = "v1.0"
    int_collection = "eos:S1A-interferograms-%s" % int_version
    int_collection_shortname = "S1A-interferograms-%s" % int_version
    int_collection_label = "ISCE generated S1A interferograms %s" % int_version
    int_collection_loc = "https://aria-dst-dav.jpl.nasa.gov/products/s1a_ifg/%s" % int_version
    doc.collection(int_collection, None, int_collection_shortname,
                   int_collection_label, [int_collection_loc],
                   list(instrument_ids.keys()), int_level, int_version,
                   label=int_collection_label, bundle=bndl)
    output_ds = doc.granule("hysds:%s" % get_uuid(prod_url), None, [prod_url],
                            list(instrument_ids.keys()), int_collection,
                            int_level, int_version, label=id, bundle=bndl)

    # runtime context
    rt_ctx_id = "hysds:runtimeContext-sentinel_ifg-%s" % project
    doc.runtimeContext(rt_ctx_id, [project], label=project, bundle=bndl)

    # create process
    doc.processStep("hysds:%s" % get_uuid(job_id), fake_time, fake_time,
                    [software], None, rt_ctx_id, list(input_ids.keys()),
                    [output_ds.identifier], label=job_id, bundle=bndl,
                    prov_type="hysds:create_interferogram")

    # write
    with open(prov_file, 'w') as f:
        json.dump(json.loads(doc.serialize()), f, indent=2, sort_keys=True)
def get_doc_prov(j, gcis_url, refList, personList, reportList, webpageList):
    """Generate PROV-ES JSON from GCIS dataset metadata."""

    gcis_ns = "http://data.globalchange.gov/gcis.owl#"
    doc = ProvEsDocument()
    dataset = requests.get(j['href']).json()
    datasetID = 'bibo:%s' % j['identifier']
    doc_attrs = [
        ("prov:type", 'gcis:Dataset'),
        ("prov:label", j['name']),
        ("prov:location", "%s%s" % (gcis_url, j['uri'])),
    ]
    doc.entity(datasetID, doc_attrs)

    agent_ids = {}
    org_ids = {}

    # contributors
    if 'contributors' in dataset:
        for contributor in dataset['contributors']:
            personID = None
            if contributor['person_uri'] is not None:
                name = " ".join([
                    contributor['person'][i]
                    for i in ('first_name', 'middle_name', 'last_name')
                    if contributor['person'].get(i, None) is not None
                ])
                personAttrs = [
                    ("prov:type", 'gcis:Person'),
                    ("prov:label", name),
                    ("prov:location",
                     "%s%s" % (gcis_url, contributor['person_uri'])),
                    ("gcis:id", str(contributor['person_id'])),
                    ("gcis:orcid", contributor['person']['orcid']),
                ]
                personID = 'bibo:%s' % contributor['person_id']
                agent_ids[personID] = []
                doc.agent(personID, personAttrs)
            if contributor['organization'] is not None:
                # make org
                orgID = 'bibo:%s' % contributor['organization']['identifier']
                doc.governingOrganization(
                    orgID, contributor['organization']['name'])
                org_ids[orgID] = True
                if personID in agent_ids:
                    agent_ids[personID].append(orgID)

    # create activity
    if dataset['start_time'] is not None:
        start_time = str(dataset['start_time'])
    else:
        start_time = ""
    if dataset['end_time'] is not None:
        end_time = str(dataset['end_time'])
    else:
        end_time = ""
    act_id = GCIS["generate-%s" % j['identifier'].replace('/', '-')]
    attrs = []
    for agent_id in agent_ids:
        waw_id = GCIS["%s" % get_uuid("%s:%s" % (act_id, agent_id))]
        doc.wasAssociatedWith(act_id, agent_id, None, waw_id,
                              {'prov:role': GCIS['Author']})
        for org_id in agent_ids[agent_id]:
            del_id = GCIS["%s" % get_uuid("%s:%s:%s" %
                                          (agent_id, org_id, act_id))]
            doc.delegation(agent_id, org_id, act_id, del_id,
                           {'prov:type': 'gcis:worksAt'})
    for org_id in org_ids:
        waw_id = GCIS["%s" % get_uuid("%s:%s" % (act_id, org_id))]
        doc.wasAssociatedWith(act_id, org_id, None, waw_id,
                              {'prov:role': GCIS['Contributor']})
    act = doc.activity(act_id, start_time, end_time, attrs)
    doc.wasGeneratedBy(datasetID, act, end_time,
                       GCIS["%s" % get_uuid("%s:%s" % (datasetID, act_id))])

    # cited by
    if 'cited_by' in dataset:
        for citation in dataset['cited_by']:
            if 'publication' in citation:
                itemType = citation['publication'].split("/")[1]
                itemList = get_itemList(dump_dir, itemType)
                if any(item['uri'] == citation['publication']
                       for item in itemList):
                    item = next(obj for obj in itemList
                                if obj['uri'] == citation['publication'])
                    item_id = 'bibo:%s' % item['identifier']
                    doc.wasDerivedFrom(item_id, datasetID)

    prov_json = json.loads(doc.serialize())

    return prov_json
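# `get_itemList` and `dump_dir` come from the surrounding module. A sketch of
# the assumed behavior: load the GCIS dump for an item type (e.g. "report",
# "article") and return its records, each carrying at least 'uri' and
# 'identifier'. The file layout is an assumption for illustration.
import json
import os

def get_itemList(dump_dir, itemType):
    """Load the dumped GCIS records for `itemType` (assumed layout)."""
    with open(os.path.join(dump_dir, "%s.json" % itemType)) as f:
        return json.load(f)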
def create_prov_es_json(ctx_file, id, prod_dir, prov_file):
    """Create provenance JSON file."""

    # get abs path
    prod_dir = os.path.abspath(prod_dir)

    # get context
    with open(ctx_file) as f:
        context = json.load(f)

    # get mission char
    mis_char = MISSION_RE.search(context.get('file')).group(1)
    mis_char_lc = mis_char.lower()

    # get input url
    input_url = context.get('localize_urls', [{'url': None}])[0]['url']

    # get info
    s1_files = glob(os.path.join(prod_dir, "s1%s-*.tiff" % mis_char_lc))
    pf = "S1%s" % mis_char
    dtype = "NA"
    repo_dir = "?"
    if len(s1_files) > 0:
        match = S1_RE.search(os.path.basename(s1_files[0]))
        if match:
            pf, swathnum, dtype = match.groups()
            if dtype == "raw":
                dtype = "RAW"
                repo_dir = "s1%s_raw" % mis_char_lc
            elif dtype == "slc":
                dtype = "SLC"
                repo_dir = "s1%s_slc" % mis_char_lc
    platform = "eos:%s" % pf
    platform_title = "Sentinel1%s Satellite" % mis_char
    instrument = "eos:%s-SAR" % pf
    instrument_title = "%s-SAR" % pf
    level = "L0"
    version = "v1.0"
    collection = "eos:S1%s-%s-%s" % (mis_char, dtype, version)
    collection_shortname = "S1%s-%s-%s" % (mis_char, dtype, version)
    collection_label = "S1%s %s Scenes %s" % (mis_char, dtype, version)
    collection_loc = "https://aria-dst-dav.jpl.nasa.gov/repository/products/%s/%s" % (
        repo_dir, version)
    sensor = "eos:SAR"
    sensor_title = "Synthetic-aperture radar (SAR)"
    gov_org = "eos:ESA"
    gov_org_title = "European Space Agency"
    software_version = "2.0.0_201604"
    software_title = "InSAR SCE (InSAR Scientific Computing Environment) v%s" % software_version
    software = "eos:ISCE-%s" % software_version
    software_location = "https://winsar.unavco.org/isce.html"
    algorithm = "eos:metadata_extraction"
    prod_dir = "file://%s%s" % (socket.getfqdn(), prod_dir)

    # put in fake start/end times so that prov:used and prov:generated
    # are properly created by the prov lib
    fake_time = datetime.utcnow().isoformat() + 'Z'
    job_id = "ingest-%s-%s" % (id, fake_time)
    bundle_id = "bundle-ingest-%s-%s" % (id, fake_time)

    doc = ProvEsDocument()
    #bndl = doc.bundle("hysds:%s" % get_uuid(bundle_id))
    bndl = None
    input_id = "hysds:%s" % get_uuid(input_url)
    input_ds = doc.granule(input_id, None, [input_url], [instrument], None,
                           level, None, label=os.path.basename(input_url),
                           bundle=bndl)
    doc.collection(collection, None, collection_shortname, collection_label,
                   [collection_loc], [instrument], level, version,
                   label=collection_label, bundle=bndl)
    output_id = "hysds:%s" % get_uuid(prod_dir)
    output_ds = doc.granule(output_id, None, [prod_dir], [instrument],
                            collection, level, version, label=id, bundle=bndl)
    doc.governingOrganization(gov_org, label=gov_org_title, bundle=bndl)
    doc.platform(platform, [instrument], label=platform_title, bundle=bndl)
    doc.instrument(instrument, platform, [sensor], [gov_org],
                   label=instrument_title, bundle=bndl)
    doc.sensor(sensor, instrument, label=sensor_title, bundle=bndl)
    doc.software(software, [algorithm], software_version,
                 label=software_title, location=software_location,
                 bundle=bndl)
    doc.processStep("hysds:%s" % get_uuid(job_id), fake_time, fake_time,
                    [software], None, None, [input_ds.identifier],
                    [output_ds.identifier], label=job_id, bundle=bndl,
                    prov_type="hysds:ingest")

    with open(prov_file, 'w') as f:
        json.dump(json.loads(doc.serialize()), f, indent=2, sort_keys=True)
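# `MISSION_RE` and `S1_RE` are defined elsewhere in this module. Hypothetical
# reconstructions from how their groups are consumed above: MISSION_RE pulls
# the mission character from a name like "S1A_...", and S1_RE yields
# (pf, swathnum, dtype) from measurement tiff names like
# "s1a-iw1-slc-...tiff". The real patterns may differ.
import re

MISSION_RE = re.compile(r'^S1(\w)')
S1_RE = re.compile(r'^(s1\w)-iw(\d)-(raw|slc)-')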
def create_prov_es_json(id, netsel_file, jobdesc_file, project, aria_dem_xml,
                        aria_dem_file, prod_dir, work_dir, prov_file):
    """Create provenance JSON file."""

    # put in fake start/end times so that prov:used and prov:generated
    # are properly created by the prov lib
    fake_time = datetime.utcnow().isoformat() + 'Z'
    job_id = "create_interferogram-%s" % fake_time
    bundle_id = "bundle-create_interferogram-%s" % fake_time

    # create PROV-ES doc
    doc = ProvEsDocument()
    #bndl = doc.bundle("hysds:%s" % get_uuid(bundle_id))
    bndl = None

    # input and output identifiers
    input_ids = {}
    platform_ids = {}
    instrument_ids = {}

    # full url paths
    work_url = "file://%s%s" % (socket.getfqdn(), work_dir)
    prod_url = "%s/%s" % (work_url, prod_dir)

    # add network selector file
    netsel_ent = doc.file(
        "hysds:%s" % get_uuid("%s/%s" % (work_url, netsel_file)),
        ["%s/%s" % (work_url, netsel_file)],
        label=os.path.basename(netsel_file))
    input_ids[netsel_ent.identifier] = True

    # add job description file
    jobdesc_ent = doc.file(
        "hysds:%s" % get_uuid("%s/%s" % (work_url, jobdesc_file)),
        ["%s/%s" % (work_url, jobdesc_file)],
        label=os.path.basename(jobdesc_file))
    input_ids[jobdesc_ent.identifier] = True

    # get list of CSK urls
    level = "L0"
    version = "v1.0"
    sensor = "eos:SAR"
    sensor_title = "Synthetic-aperture radar (SAR)"
    gov_org = "eos:ASI"
    gov_org_title = "Agenzia Spaziale Italiana"
    doc.governingOrganization(gov_org, label=gov_org_title, bundle=bndl)
    instrument = ""
    for i, url in enumerate(get_netsel_urls(netsel_file)):
        match = PLATFORM_RE.search(url)
        if not match:
            continue
        pf = match.group(1)
        platform = "eos:%s" % pf
        platform_title = "COSMO-SkyMed Satellite %s" % pf[-1]
        instrument = "eos:%s-SAR" % pf
        instrument_title = "%s-SAR" % pf
        input_ds = doc.product("hysds:%s" % get_uuid(url), None, [url],
                               [instrument], None, level, version,
                               label=os.path.basename(url), bundle=bndl)
        input_ids[input_ds.identifier] = True
        if platform not in platform_ids:
            doc.platform(platform, [instrument], label=platform_title,
                         bundle=bndl)
            platform_ids[platform] = True
        if instrument not in instrument_ids:
            doc.instrument(instrument, platform, [sensor], [gov_org],
                           label=instrument_title, bundle=bndl)
            doc.sensor(sensor, instrument, label=sensor_title, bundle=bndl)
            instrument_ids[instrument] = True

    # add dem xml, file and related provenance
    srtm_platform = "eos:SpaceShuttleEndeavour"
    srtm_platform_title = "USS Endeavour"
    srtm_instrument = "eos:SRTM"
    srtm_instrument_title = "Shuttle Radar Topography Mission (SRTM)"
    srtm_sensor = "eos:radar"
    srtm_sensor_title = "radar"
    srtm_gov_org = "eos:JPL"
    srtm_gov_org_title = "Jet Propulsion Laboratory"
    doc.governingOrganization(srtm_gov_org, label=srtm_gov_org_title,
                              bundle=bndl)
    dem_xml_ent = doc.file(
        "hysds:%s" % get_uuid("%s/%s" % (work_url, aria_dem_xml)),
        ["%s/%s" % (work_url, aria_dem_xml)],
        label=os.path.basename(aria_dem_xml))
    input_ids[dem_xml_ent.identifier] = True
    dem_file_ent = doc.file(
        "hysds:%s" % get_uuid("%s/%s" % (work_url, aria_dem_file)),
        ["%s/%s" % (work_url, aria_dem_file)],
        label=os.path.basename(aria_dem_file))
    input_ids[dem_file_ent.identifier] = True
    doc.platform(srtm_platform, [srtm_instrument],
                 label=srtm_platform_title, bundle=bndl)
    doc.instrument(srtm_instrument, srtm_platform, [srtm_sensor],
                   [srtm_gov_org], label=srtm_instrument_title, bundle=bndl)
    doc.sensor(srtm_sensor, srtm_instrument, label=srtm_sensor_title,
               bundle=bndl)
    instrument_ids[srtm_instrument] = True

    # software and algorithm
    algorithm = "eos:interferogram_generation"
    software_version = "2.0.0_201604"
    software_title = "InSAR SCE (InSAR Scientific Computing Environment) v%s" % software_version
    software = "eos:ISCE-%s" % software_version
    software_location = "https://winsar.unavco.org/isce.html"
    doc.software(software, [algorithm], software_version,
                 label=software_title, location=software_location,
                 bundle=bndl)

    # output
    int_level = "L2"
    int_version = "v1.0"
    int_collection = "eos:CSK-interferograms-%s" % int_version
    int_collection_shortname = "CSK-interferograms-%s" % int_version
    int_collection_label = "ISCE generated CSK interferograms %s" % int_version
    int_collection_loc = "https://aria-dav.jpl.nasa.gov/repository/products/interferogram/%s" % int_version
    doc.collection(int_collection, None, int_collection_shortname,
                   int_collection_label, [int_collection_loc],
                   list(instrument_ids.keys()), int_level, int_version,
                   label=int_collection_label, bundle=bndl)
    output_ds = doc.granule("hysds:%s" % get_uuid(prod_url), None, [prod_url],
                            list(instrument_ids.keys()), int_collection,
                            int_level, int_version, label=id, bundle=bndl)

    # runtime context
    rt_ctx_id = "hysds:runtimeContext-ariamh-%s" % project
    doc.runtimeContext(rt_ctx_id, [project], label=project, bundle=bndl)

    # create process
    doc.processStep("hysds:%s" % get_uuid(job_id), fake_time, fake_time,
                    [software], None, rt_ctx_id, list(input_ids.keys()),
                    [output_ds.identifier], label=job_id, bundle=bndl,
                    prov_type="hysds:create_interferogram")

    # write
    with open(prov_file, 'w') as f:
        json.dump(json.loads(doc.serialize()), f, indent=2, sort_keys=True)
def get_image_prov(j, gcis_url):
    """Generate PROV-ES JSON from GCIS image metadata."""

    # create doc
    doc = ProvEsDocument()
    bndl = None

    # create image, figure, chapter and report entities
    img_id = GCIS["%s" % j['uri'][1:].replace('/', '-')]
    img_title = j['title']
    img_url = None
    img_thumbnail_url = None
    for file_md in j.get('files', []):
        img_url = file_md['href']
        img_thumbnail_url = file_md['thumbnail_href']
    img_attrs = [
        (PROV_TYPE, GCIS['Image']),
        (PROV_LABEL, img_title),
    ]
    if img_url is None:
        img_attrs.append((PROV_LOCATION, "%s%s" % (gcis_url, j['uri'])))
    else:
        img_attrs.append((PROV_LOCATION, img_url))
    # only record a thumbnail when one exists (the original test was inverted)
    if img_thumbnail_url is not None:
        img_attrs.append((HYSDS['thumbnail'], img_thumbnail_url))
    doc.entity(img_id, img_attrs)

    reports = []
    chapters = []
    findings = []
    figures = []
    for figure in j.get('figures', []):
        report_uri = "/report/%s" % figure['report_identifier']
        chapter_uri = "/chapter/%s" % figure['chapter_identifier']
        figure_uri = "/figure/%s" % figure['identifier']

        # create report
        r = requests.get('%s%s.json' % (gcis_url, report_uri))
        r.raise_for_status()
        report = r.json()
        report_id = GCIS["%s" % report_uri[1:].replace('/', '-')]
        if report_id not in reports:
            doc.entity(report_id, [
                (PROV_TYPE, GCIS['Report']),
                (PROV_LABEL, report['title']),
                (PROV_LOCATION, report['url']),
            ])
            reports.append(report_id)

        # create chapter
        r = requests.get('%s%s%s.json' % (gcis_url, report_uri, chapter_uri))
        if r.status_code != 200:
            print("Failed with %d code: %s" % (r.status_code, r.content))
            continue
        r.raise_for_status()
        chapter = r.json()
        chapter_id = GCIS["%s" % chapter_uri[1:].replace('/', '-')]
        if chapter_id not in chapters:
            doc.entity(chapter_id, [
                (PROV_TYPE, GCIS['Chapter']),
                (PROV_LABEL, chapter['title']),
                (PROV_LOCATION, chapter['url']),
            ])
            chapters.append(chapter_id)
        doc.hadMember(report_id, chapter_id)

        # create findings
        r = requests.get('%s%s%s/finding.json' % (gcis_url, report_uri,
                                                  chapter_uri))
        r.raise_for_status()
        for f in r.json():
            finding_id = GCIS["%s" % f['identifier']]
            if finding_id not in findings:
                doc.entity(finding_id, [
                    (PROV_TYPE, GCIS['Finding']),
                    (PROV_LABEL, f['identifier']),
                    (PROV_LOCATION, f['href']),
                ])
                findings.append(finding_id)
            doc.hadMember(report_id, finding_id)
            doc.hadMember(chapter_id, finding_id)

        # create figure
        r = requests.get('%s%s%s%s.json' % (gcis_url, report_uri, chapter_uri,
                                            figure_uri))
        r.raise_for_status()
        figure_md = r.json()
        figure_id = GCIS["%s" % figure_uri[1:].replace('/', '-')]
        if figure_id not in figures:
            doc.entity(figure_id, [
                (PROV_TYPE, GCIS['Figure']),
                (PROV_LABEL, figure_md['title']),
                (PROV_LOCATION, "%s%s" % (gcis_url, figure_md['uri'])),
            ])
            figures.append(figure_id)
        doc.hadMember(chapter_id, figure_id)
        doc.hadMember(figure_id, img_id)

    # create agents or organizations
    agent_ids = {}
    org_ids = {}
    for cont in j.get('contributors', []):
        # replace slashes because we get
        # prov.model.ProvExceptionInvalidQualifiedName errors
        agent_id = GCIS["%s" % cont['uri'][1:].replace('/', '-')]

        # create person
        if len(cont['person']) > 0:
            agent_name = " ".join([
                cont['person'][i]
                for i in ('first_name', 'middle_name', 'last_name')
                if cont['person'].get(i, None) is not None
            ])
            doc.agent(agent_id, [
                (PROV_TYPE, GCIS["Person"]),
                (PROV_LABEL, agent_name),
                (PROV_LOCATION, "%s%s" % (gcis_url, cont['uri'])),
            ])
            agent_ids[agent_id] = []

        # organization
        if len(cont['organization']) > 0:
            org_id = GCIS["%s" % cont['organization']['identifier']]
            if org_id not in org_ids:
                doc.governingOrganization(org_id,
                                          cont['organization']['name'])
                org_ids[org_id] = True
            if agent_id in agent_ids:
                agent_ids[agent_id].append(org_id)

    # create activity
    start_time = j['create_dt']
    end_time = j['create_dt']
    for parent in j.get('parents', []):
        input_id = GCIS["%s" % parent['url'][1:].replace('/', '-')]
        input_name = parent['label']
        doc.entity(input_id, [
            (PROV_TYPE, GCIS["Dataset"]),
            (PROV_LABEL, input_name),
            (PROV_LOCATION, "%s%s" % (gcis_url, parent['url'])),
        ])

        # some activity uri's are null
        if parent['activity_uri'] is None:
            act_id = GCIS["derive-from-%s" % input_id]
        else:
            act_id = GCIS["%s" % parent['activity_uri'][1:].replace('/', '-')]
        attrs = []
        for agent_id in agent_ids:
            waw_id = GCIS["%s" % get_uuid("%s:%s" % (act_id, agent_id))]
            doc.wasAssociatedWith(act_id, agent_id, None, waw_id,
                                  {'prov:role': GCIS['Contributor']})
            for org_id in agent_ids[agent_id]:
                del_id = GCIS["%s" % get_uuid("%s:%s:%s" %
                                              (agent_id, org_id, act_id))]
                doc.delegation(agent_id, org_id, act_id, del_id,
                               {'prov:type': GCIS['worksAt']})
        for org_id in org_ids:
            waw_id = GCIS["%s" % get_uuid("%s:%s" % (act_id, org_id))]
            doc.wasAssociatedWith(act_id, org_id, None, waw_id,
                                  {'prov:role': GCIS['Funder']})
        act = doc.activity(act_id, start_time, end_time, attrs)
        doc.used(act, input_id, start_time,
                 GCIS["%s" % get_uuid("%s:%s" % (act_id, input_id))])
        doc.wasGeneratedBy(img_id, act, end_time,
                           GCIS["%s" % get_uuid("%s:%s" % (img_id, act_id))])

    # serialize
    prov_json = json.loads(doc.serialize())

    # for hadMember relations, add prov:type
    for hm_id in prov_json.get('hadMember', {}):
        hm = prov_json['hadMember'][hm_id]
        col = hm['prov:collection']
        ent = hm['prov:entity']
        if col in reports and ent in chapters:
            hm['prov:type'] = GCIS['hasChapter']
        elif col in chapters and ent in figures:
            hm['prov:type'] = GCIS['hasFigure']
        elif col in figures and ent == img_id:
            hm['prov:type'] = GCIS['hasImage']

    return prov_json
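# Hypothetical driver: fetch a single image record from the GCIS API and build
# its PROV-ES JSON. The host and image identifier below are placeholders, not
# values taken from this module.
gcis_url = "https://data.globalchange.gov"
image_id = "some-image-identifier"  # placeholder
r = requests.get("%s/image/%s.json" % (gcis_url, image_id))
r.raise_for_status()
image_prov = get_image_prov(r.json(), gcis_url)
with open("image_prov.json", "w") as f:
    json.dump(image_prov, f, indent=2)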
def get_doc_prov(j, gcis_url, refList, journalList, organizationList,
                 personList, dump_dir):
    """Generate PROV-ES JSON from GCIS doc metadata."""

    gcis_ns = "https://gcis-search-stage.jpl.net:3000/gcis.owl#"
    doc = ProvEsDocument()
    bndl = None
    article = requests.get(j['href']).json()
    journalID = None

    # make journal
    if any(journal['identifier'] == article['journal_identifier']
           for journal in journalList):
        journal = next(jour for jour in journalList
                       if jour['identifier'] == article['journal_identifier'])
        journalAttrs = [
            ("prov:type", 'gcis:Journal'),
            ("prov:label", journal['title']),
            ("prov:location", "%s%s" % (gcis_url, journal['uri'])),
            ("gcis:online_issn", journal['online_issn']),
            ("gcis:print_issn", journal['print_issn']),
            ("gcis:publisher", journal['publisher']),
        ]
        journalID = 'bibo:%s' % journal['identifier']
        doc.entity(journalID, journalAttrs)

        # get organization/publisher if any
        if journal['publisher'] is not None:
            if any(organization['name'] == journal['publisher']
                   for organization in organizationList):
                organization = next(org for org in organizationList
                                    if org['name'] == journal['publisher'])
                org_attrs = [
                    ("prov:type", 'gcis:organization'),
                    ("prov:label", organization['name']),
                    ("prov:location",
                     "%s%s" % (gcis_url, organization['uri'])),
                    ("gcis:organization_type_identifier",
                     organization['organization_type_identifier']),
                    ("gcis:country_code", organization['country_code']),
                ]
                org_id = 'bibo:%s' % organization['identifier']
                doc.entity(org_id, org_attrs)
                doc.governingOrganization(org_id, organization['name'])

    # make article
    articleAttrs = [
        ("prov:type", 'gcis:Article'),
        ("prov:label", article['title']),
        ("prov:location", "%s%s" % (gcis_url, article['uri'])),
        ("dcterms:isPartOf", journalID),
    ]
    articleID = 'bibo:%s' % article['identifier']
    doc.entity(articleID, articleAttrs)

    # link journal to article
    if journalID is not None:
        doc.hadMember(journalID, articleID)

    agent_ids = {}
    org_ids = {}

    # contributors
    if 'contributors' in article:
        for contributor in article['contributors']:
            personID = None
            if contributor['person_uri'] is not None:
                name = " ".join([
                    contributor['person'][i]
                    for i in ('first_name', 'middle_name', 'last_name')
                    if contributor['person'].get(i, None) is not None
                ])
                personAttrs = [
                    ("prov:type", 'gcis:Person'),
                    ("prov:label", name),
                    ("prov:location",
                     "%s%s" % (gcis_url, contributor['person_uri'])),
                    ("gcis:id", str(contributor['person_id'])),
                    ("gcis:orcid", contributor['person']['orcid']),
                ]
                personID = 'bibo:%s' % contributor['person_id']
                agent_ids[personID] = []
                doc.agent(personID, personAttrs)
            if contributor['organization'] is not None:
                # make org
                orgID = 'bibo:%s' % contributor['organization']['identifier']
                doc.governingOrganization(
                    orgID, contributor['organization']['name'])
                org_ids[orgID] = True
                if personID in agent_ids:
                    agent_ids[personID].append(orgID)

    # create activity
    if isinstance(j['year'], int):
        start_time = str(j['year'])
        end_time = str(j['year'])
    else:
        start_time = None
        end_time = None
    act_id = GCIS["generate-%s" % j['identifier'].replace('/', '-')]
    attrs = []
    for agent_id in agent_ids:
        waw_id = GCIS["%s" % get_uuid("%s:%s" % (act_id, agent_id))]
        doc.wasAssociatedWith(act_id, agent_id, None, waw_id,
                              {'prov:role': GCIS['Author']})
        for org_id in agent_ids[agent_id]:
            del_id = GCIS["%s" % get_uuid("%s:%s:%s" %
                                          (agent_id, org_id, act_id))]
            doc.delegation(agent_id, org_id, act_id, del_id,
                           {'prov:type': 'gcis:worksAt'})
    for org_id in org_ids:
        waw_id = GCIS["%s" % get_uuid("%s:%s" % (act_id, org_id))]
        doc.wasAssociatedWith(act_id, org_id, None, waw_id,
                              {'prov:role': GCIS['Contributor']})
    act = doc.activity(act_id, start_time, end_time, attrs)
    doc.wasGeneratedBy(articleID, act, end_time,
                       GCIS["%s" % get_uuid("%s:%s" % (articleID, act_id))])

    # cited by?
    if 'cited_by' in article:
        for citation in article['cited_by']:
            if 'publication' in citation:
                itemType = citation['publication'].split("/")[1]
                itemList = get_itemList(dump_dir, itemType)
                if any(item['uri'] == citation['publication']
                       for item in itemList):
                    item = next(obj for obj in itemList
                                if obj['uri'] == citation['publication'])
                    item_id = 'bibo:%s' % item['identifier']
                    doc.wasDerivedFrom(item_id, articleID)

    print(articleID)

    prov_json = json.loads(doc.serialize())

    return prov_json