def get_doc_prov(j, gcis_url, refList, personList, reportList, webpageList):
    """Generate PROV-ES JSON from GCIS dataset metadata.

    The full dataset record is fetched from ``j['href']``; the dataset,
    its contributors (persons and their organizations), a "generate"
    activity and any citing publications are then recorded in a
    ``ProvEsDocument``.

    Args:
        j: dataset summary record; the keys read here are ``href``,
            ``identifier``, ``name`` and ``uri``.
        gcis_url: prefix prepended to GCIS URIs to build ``prov:location``.
        refList, personList, reportList, webpageList: accepted for
            interface compatibility; not used by this function.

    Returns:
        The result of ``json.loads(doc.serialize())``.

    NOTE(review): ``dump_dir`` is not a parameter of this function and is
    never assigned in it, so the ``cited_by`` handling below requires a
    module-level ``dump_dir``; otherwise it raises ``NameError``.
    """
    doc = ProvEsDocument()
    dataset = requests.get(j['href']).json()

    # dataset entity
    datasetID = 'bibo:%s' % j['identifier']
    doc_attrs = [
        ("prov:type", 'gcis:Dataset'),
        ("prov:label", j['name']),
        ("prov:location", "%s%s" % (gcis_url, j['uri'])),
    ]
    doc.entity(datasetID, doc_attrs)

    agent_ids = {}  # person id -> list of organization ids for that person
    org_ids = {}    # organization ids seen on any contributor

    # contributors
    for contributor in dataset.get('contributors') or []:
        personID = None
        if contributor['person_uri'] is not None:
            person = contributor['person']
            name = " ".join([person[part]
                             for part in ('first_name', 'middle_name', 'last_name')
                             if person.get(part) is not None])
            personAttrs = [
                ("prov:type", 'gcis:Person'),
                ("prov:label", name),
                ("prov:location", "%s%s" % (gcis_url, contributor['person_uri'])),
                ("gcis:id", str(contributor['person_id'])),
                ("gcis:orcid", person['orcid']),
            ]
            personID = 'bibo:%s' % contributor['person_id']
            # NOTE(review): this resets the organization list when the same
            # person appears in more than one contributor entry -- confirm
            # that only the last entry's organization should be kept.
            agent_ids[personID] = []
            doc.agent(personID, personAttrs)

        if contributor['organization'] is not None:
            organization = contributor['organization']
            orgID = 'bibo:%s' % organization['identifier']
            doc.governingOrganization(orgID, organization['name'])
            org_ids[orgID] = True
            # personID is None when this entry has no person attached.
            if personID in agent_ids:
                agent_ids[personID].append(orgID)

    # create activity; missing/None times are recorded as empty strings
    start_time = dataset.get('start_time')
    start_time = str(start_time) if start_time is not None else ""
    end_time = dataset.get('end_time')
    end_time = str(end_time) if end_time is not None else ""

    act_id = GCIS["generate-%s" % j['identifier'].replace('/', '-')]
    attrs = []

    for agent_id in agent_ids:
        waw_id = GCIS["%s" % get_uuid("%s:%s" % (act_id, agent_id))]
        doc.wasAssociatedWith(act_id, agent_id, None, waw_id,
                              {'prov:role': GCIS['Author']})
        for org_id in agent_ids[agent_id]:
            del_id = GCIS["%s" % get_uuid("%s:%s:%s" % (agent_id, org_id, act_id))]
            doc.delegation(agent_id, org_id, act_id, del_id,
                           {'prov:type': 'gcis:worksAt'})

    for org_id in org_ids:
        waw_id = GCIS["%s" % get_uuid("%s:%s" % (act_id, org_id))]
        doc.wasAssociatedWith(act_id, org_id, None, waw_id,
                              {'prov:role': GCIS['Contributor']})

    act = doc.activity(act_id, start_time, end_time, attrs)
    doc.wasGeneratedBy(datasetID, act, end_time,
                       GCIS["%s" % get_uuid("%s:%s" % (datasetID, act_id))])

    # cited by
    item_lists = {}  # item type -> item list, so each list is fetched once per call
    for citation in dataset.get('cited_by') or []:
        if 'publication' not in citation:
            continue
        publication = citation['publication']
        # The item type is taken from the first path segment of the URI.
        itemType = publication.split("/")[1]
        if itemType not in item_lists:
            item_lists[itemType] = get_itemList(dump_dir, itemType)
        item = next((obj for obj in item_lists[itemType]
                     if obj['uri'] == publication), None)
        if item is not None:
            doc.wasDerivedFrom('bibo:%s' % item['identifier'], datasetID)

    prov_json = json.loads(doc.serialize())
    return prov_json
def get_doc_prov(j, gcis_url, refList, journalList, organizationList, personList, dump_dir):
    """Generate PROV-ES JSON from GCIS article metadata.

    The full article record is fetched from ``j['href']``; the article,
    its journal (and the journal's publisher organization, when both can
    be matched), its contributors, a "generate" activity and any citing
    publications are then recorded in a ``ProvEsDocument``.

    Args:
        j: article summary record; the keys read here are ``href``,
            ``identifier`` and ``year``.
        gcis_url: prefix prepended to GCIS URIs to build ``prov:location``.
        refList: accepted for interface compatibility; not used here.
        journalList: journal records, matched on ``identifier`` against
            the article's ``journal_identifier``.
        organizationList: organization records, matched on ``name``
            against the journal's ``publisher``.
        personList: accepted for interface compatibility; not used here.
        dump_dir: passed through to ``get_itemList`` when resolving
            ``cited_by`` publications.

    Returns:
        The result of ``json.loads(doc.serialize())``.
    """
    doc = ProvEsDocument()
    article = requests.get(j['href']).json()

    # make journal
    journalID = None
    journal = next((jour for jour in journalList
                    if jour['identifier'] == article['journal_identifier']), None)
    if journal is not None:
        journalAttrs = [
            ("prov:type", 'gcis:Journal'),
            ("prov:label", journal['title']),
            ("prov:location", "%s%s" % (gcis_url, journal['uri'])),
            ("gcis:online_issn", journal['online_issn']),
            ("gcis:print_issn", journal['print_issn']),
            ("gcis:publisher", journal['publisher']),
        ]
        journalID = 'bibo:%s' % journal['identifier']
        doc.entity(journalID, journalAttrs)

        # get organization/publisher if any
        if journal['publisher'] is not None:
            organization = next((org for org in organizationList
                                 if org['name'] == journal['publisher']), None)
            if organization is not None:
                org_attrs = [
                    ("prov:type", 'gcis:organization'),
                    ("prov:label", organization['name']),
                    ("prov:location", "%s%s" % (gcis_url, organization['uri'])),
                    ("gcis:organization_type_identifier",
                     organization['organization_type_identifier']),
                    ("gcis:country_code", organization['country_code']),
                ]
                org_id = 'bibo:%s' % organization['identifier']
                doc.entity(org_id, org_attrs)
                doc.governingOrganization(org_id, organization['name'])

    # make article
    articleAttrs = [
        ("prov:type", 'gcis:Article'),
        ("prov:label", article['title']),
        ("prov:location", "%s%s" % (gcis_url, article['uri'])),
        # journalID stays None when no journal matched above.
        ("dcterms:isPartOf", journalID),
    ]
    articleID = 'bibo:%s' % article['identifier']
    doc.entity(articleID, articleAttrs)

    # link journal to article
    if journalID is not None:
        doc.hadMember(journalID, articleID)

    agent_ids = {}  # person id -> list of organization ids for that person
    org_ids = {}    # organization ids seen on any contributor

    # contributors
    for contributor in article.get('contributors') or []:
        personID = None
        if contributor['person_uri'] is not None:
            person = contributor['person']
            name = " ".join([person[part]
                             for part in ('first_name', 'middle_name', 'last_name')
                             if person.get(part) is not None])
            personAttrs = [
                ("prov:type", 'gcis:Person'),
                ("prov:label", name),
                ("prov:location", "%s%s" % (gcis_url, contributor['person_uri'])),
                ("gcis:id", str(contributor['person_id'])),
                ("gcis:orcid", person['orcid']),
            ]
            personID = 'bibo:%s' % contributor['person_id']
            # NOTE(review): this resets the organization list when the same
            # person appears in more than one contributor entry -- confirm
            # that only the last entry's organization should be kept.
            agent_ids[personID] = []
            doc.agent(personID, personAttrs)

        if contributor['organization'] is not None:
            organization = contributor['organization']
            orgID = 'bibo:%s' % organization['identifier']
            doc.governingOrganization(orgID, organization['name'])
            org_ids[orgID] = True
            # personID is None when this entry has no person attached.
            if personID in agent_ids:
                agent_ids[personID].append(orgID)

    # create activity; the publication year doubles as start and end time
    if isinstance(j['year'], int):
        start_time = str(j['year'])
        end_time = str(j['year'])
    else:
        start_time = None
        end_time = None

    act_id = GCIS["generate-%s" % j['identifier'].replace('/', '-')]
    attrs = []

    for agent_id in agent_ids:
        waw_id = GCIS["%s" % get_uuid("%s:%s" % (act_id, agent_id))]
        doc.wasAssociatedWith(act_id, agent_id, None, waw_id,
                              {'prov:role': GCIS['Author']})
        for org_id in agent_ids[agent_id]:
            del_id = GCIS["%s" % get_uuid("%s:%s:%s" % (agent_id, org_id, act_id))]
            doc.delegation(agent_id, org_id, act_id, del_id,
                           {'prov:type': 'gcis:worksAt'})

    for org_id in org_ids:
        waw_id = GCIS["%s" % get_uuid("%s:%s" % (act_id, org_id))]
        doc.wasAssociatedWith(act_id, org_id, None, waw_id,
                              {'prov:role': GCIS['Contributor']})

    act = doc.activity(act_id, start_time, end_time, attrs)
    doc.wasGeneratedBy(articleID, act, end_time,
                       GCIS["%s" % get_uuid("%s:%s" % (articleID, act_id))])

    # cited by
    item_lists = {}  # item type -> item list, so each list is fetched once per call
    for citation in article.get('cited_by') or []:
        if 'publication' not in citation:
            continue
        publication = citation['publication']
        # The item type is taken from the first path segment of the URI.
        itemType = publication.split("/")[1]
        if itemType not in item_lists:
            item_lists[itemType] = get_itemList(dump_dir, itemType)
        item = next((obj for obj in item_lists[itemType]
                     if obj['uri'] == publication), None)
        if item is not None:
            doc.wasDerivedFrom('bibo:%s' % item['identifier'], articleID)

    # Single-argument call form prints identically on Python 2 and 3.
    print(articleID)
    prov_json = json.loads(doc.serialize())
    return prov_json