コード例 #1
0
def get_doc_prov(j, gcis_url, refList, personList, reportList, webpageList):
    """Generate PROV-ES JSON from GCIS dataset metadata.

    Fetches the full dataset record from ``j['href']`` and builds a PROV-ES
    document holding the dataset entity, its contributing people and
    organizations, a "generate" activity associating them with the dataset,
    and a ``wasDerivedFrom`` link for every citing publication that can be
    found through ``get_itemList``.

    Args:
        j: Dataset summary dict; ``href``, ``identifier``, ``name`` and
            ``uri`` are read from it.
        gcis_url: Base URL prepended to the relative GCIS URIs.
        refList: Unused; kept so existing callers keep working.
        personList: Unused; kept so existing callers keep working.
        reportList: Unused; kept so existing callers keep working.
        webpageList: Unused; kept so existing callers keep working.

    Returns:
        The result of ``json.loads`` applied to the serialized document.

    Note:
        ``dump_dir`` is not a parameter of this function; it is looked up
        in the enclosing module scope when citations are processed.
        NOTE(review): confirm the module actually defines it.
    """
    doc = ProvEsDocument()

    dataset = requests.get(j['href']).json()
    datasetID = 'bibo:%s' % j['identifier']
    doc_attrs = [
        ("prov:type", 'gcis:Dataset'),
        ("prov:label", j['name']),
        ("prov:location", "%s%s" % (gcis_url, j['uri'])),
    ]
    doc.entity(datasetID, doc_attrs)

    # person id -> list of organization ids that person is attached to
    agent_ids = {}
    # organization ids seen on any contributor entry (dict used as a set)
    org_ids = {}

    # contributors
    for contributor in dataset.get('contributors') or ():
        personID = None
        if contributor['person_uri'] is not None:
            person = contributor['person']
            name = " ".join(
                person[part]
                for part in ('first_name', 'middle_name', 'last_name')
                if person.get(part) is not None)
            personAttrs = [
                ("prov:type", 'gcis:Person'),
                ("prov:label", name),
                ("prov:location", "%s%s" % (gcis_url, contributor['person_uri'])),
                ("gcis:id", str(contributor['person_id'])),
                ("gcis:orcid", person['orcid']),
            ]
            personID = 'bibo:%s' % contributor['person_id']
            # setdefault (not plain assignment): the same person may appear
            # in several contributor entries, and resetting the list would
            # drop the organizations collected from the earlier entries.
            agent_ids.setdefault(personID, [])
            doc.agent(personID, personAttrs)

        organization = contributor['organization']
        if organization is not None:
            orgID = 'bibo:%s' % organization['identifier']
            doc.governingOrganization(orgID, organization['name'])
            org_ids[orgID] = True
            # Only link when this entry also named a person; skip repeats so
            # a delegation id is never emitted twice for the same pair.
            if personID in agent_ids and orgID not in agent_ids[personID]:
                agent_ids[personID].append(orgID)

    # create activity; missing times are passed on as empty strings
    if dataset['start_time'] is not None:
        start_time = str(dataset['start_time'])
    else:
        start_time = ""
    if dataset['end_time'] is not None:
        end_time = str(dataset['end_time'])
    else:
        end_time = ""

    act_id = GCIS["generate-%s" % j['identifier'].replace('/', '-')]
    attrs = []
    for agent_id in agent_ids:
        waw_id = GCIS["%s" % get_uuid("%s:%s" % (act_id, agent_id))]
        doc.wasAssociatedWith(act_id, agent_id, None, waw_id,
                              {'prov:role': GCIS['Author']})
        for org_id in agent_ids[agent_id]:
            del_id = GCIS["%s" % get_uuid("%s:%s:%s" % (agent_id, org_id, act_id))]
            doc.delegation(agent_id, org_id, act_id, del_id,
                           {'prov:type': 'gcis:worksAt'})
    for org_id in org_ids:
        waw_id = GCIS["%s" % get_uuid("%s:%s" % (act_id, org_id))]
        doc.wasAssociatedWith(act_id, org_id, None, waw_id,
                              {'prov:role': GCIS['Contributor']})
    act = doc.activity(act_id, start_time, end_time, attrs)
    doc.wasGeneratedBy(datasetID, act, end_time,
                       GCIS["%s" % get_uuid("%s:%s" % (datasetID, act_id))])

    # TODO: aliases and instrument measurements are not represented yet.

    # cited by
    # Item lists are cached per type so each type is fetched at most once
    # per call, however many citations share it.
    item_lists = {}
    for citation in dataset.get('cited_by') or ():
        if 'publication' not in citation:
            continue
        pub_uri = citation['publication']
        # URIs look like "/<type>/<identifier>"; element 1 is the type.
        itemType = pub_uri.split("/")[1]
        if itemType not in item_lists:
            item_lists[itemType] = get_itemList(dump_dir, itemType)
        item = next(
            (obj for obj in item_lists[itemType] if obj['uri'] == pub_uri),
            None)
        if item is not None:
            doc.wasDerivedFrom('bibo:%s' % item['identifier'], datasetID)

    prov_json = json.loads(doc.serialize())

    return prov_json
コード例 #2
0
def get_doc_prov(j, gcis_url, refList, journalList, organizationList, personList, dump_dir):
    """Generate PROV-ES JSON from GCIS article metadata.

    Fetches the full article record from ``j['href']`` and builds a PROV-ES
    document holding the article entity, its journal (and the journal's
    publisher organization when both can be matched), the contributing
    people and organizations, a "generate" activity associating them with
    the article, and a ``wasDerivedFrom`` link for every citing publication
    that can be found through ``get_itemList``.

    Args:
        j: Article summary dict; ``href``, ``identifier`` and ``year`` are
            read from it.
        gcis_url: Base URL prepended to the relative GCIS URIs.
        refList: Unused; kept so existing callers keep working.
        journalList: Journal dicts searched by ``identifier``.
        organizationList: Organization dicts searched by ``name`` to
            resolve the journal's publisher.
        personList: Unused; kept so existing callers keep working.
        dump_dir: Passed through to ``get_itemList`` when resolving citing
            publications.

    Returns:
        The result of ``json.loads`` applied to the serialized document.
    """
    doc = ProvEsDocument()

    article = requests.get(j['href']).json()

    journalID = None

    # make journal
    journal_identifier = article.get('journal_identifier')
    journal = next(
        (jour for jour in journalList
         if jour['identifier'] == journal_identifier),
        None)
    if journal is not None:
        journalAttrs = [
            ("prov:type", 'gcis:Journal'),
            ("prov:label", journal['title']),
            ("prov:location", "%s%s" % (gcis_url, journal['uri'])),
            ("gcis:online_issn", journal['online_issn']),
            ("gcis:print_issn", journal['print_issn']),
            ("gcis:publisher", journal['publisher']),
        ]
        journalID = 'bibo:%s' % journal['identifier']
        doc.entity(journalID, journalAttrs)

        # get organization/publisher if any
        # This lookup must stay inside the "journal found" branch: when no
        # journal matches there is no journal record to read a publisher
        # from (previously this raised UnboundLocalError).
        publisher = journal['publisher']
        if publisher is not None:
            organization = next(
                (org for org in organizationList if org['name'] == publisher),
                None)
            if organization is not None:
                org_attrs = [
                    ("prov:type", 'gcis:organization'),
                    ("prov:label", organization['name']),
                    ("prov:location", "%s%s" % (gcis_url, organization['uri'])),
                    ("gcis:organization_type_identifier",
                     organization['organization_type_identifier']),
                    ("gcis:country_code", organization['country_code']),
                ]
                org_id = 'bibo:%s' % organization['identifier']
                doc.entity(org_id, org_attrs)
                doc.governingOrganization(org_id, organization['name'])

    # make article
    articleAttrs = [
        ("prov:type", 'gcis:Article'),
        ("prov:label", article['title']),
        ("prov:location", "%s%s" % (gcis_url, article['uri'])),
        ("dcterms:isPartOf", journalID),
    ]
    articleID = 'bibo:%s' % article['identifier']
    doc.entity(articleID, articleAttrs)

    # link journal to article
    if journalID is not None:
        doc.hadMember(journalID, articleID)

    # person id -> list of organization ids that person is attached to
    agent_ids = {}
    # organization ids seen on any contributor entry (dict used as a set)
    org_ids = {}

    # contributors
    for contributor in article.get('contributors') or ():
        personID = None
        if contributor['person_uri'] is not None:
            person = contributor['person']
            name = " ".join(
                person[part]
                for part in ('first_name', 'middle_name', 'last_name')
                if person.get(part) is not None)
            personAttrs = [
                ("prov:type", 'gcis:Person'),
                ("prov:label", name),
                ("prov:location", "%s%s" % (gcis_url, contributor['person_uri'])),
                ("gcis:id", str(contributor['person_id'])),
                ("gcis:orcid", person['orcid']),
            ]
            personID = 'bibo:%s' % contributor['person_id']
            # setdefault (not plain assignment): the same person may appear
            # in several contributor entries, and resetting the list would
            # drop the organizations collected from the earlier entries.
            agent_ids.setdefault(personID, [])
            doc.agent(personID, personAttrs)

        organization = contributor['organization']
        if organization is not None:
            orgID = 'bibo:%s' % organization['identifier']
            doc.governingOrganization(orgID, organization['name'])
            org_ids[orgID] = True
            # Only link when this entry also named a person; skip repeats so
            # a delegation id is never emitted twice for the same pair.
            if personID in agent_ids and orgID not in agent_ids[personID]:
                agent_ids[personID].append(orgID)

    # create activity; the publication year doubles as start and end time
    if isinstance(j['year'], int):
        start_time = str(j['year'])
        end_time = str(j['year'])
    else:
        start_time = None
        end_time = None

    act_id = GCIS["generate-%s" % j['identifier'].replace('/', '-')]
    attrs = []
    for agent_id in agent_ids:
        waw_id = GCIS["%s" % get_uuid("%s:%s" % (act_id, agent_id))]
        doc.wasAssociatedWith(act_id, agent_id, None, waw_id,
                              {'prov:role': GCIS['Author']})
        for org_id in agent_ids[agent_id]:
            del_id = GCIS["%s" % get_uuid("%s:%s:%s" % (agent_id, org_id, act_id))]
            doc.delegation(agent_id, org_id, act_id, del_id,
                           {'prov:type': 'gcis:worksAt'})
    for org_id in org_ids:
        waw_id = GCIS["%s" % get_uuid("%s:%s" % (act_id, org_id))]
        doc.wasAssociatedWith(act_id, org_id, None, waw_id,
                              {'prov:role': GCIS['Contributor']})
    act = doc.activity(act_id, start_time, end_time, attrs)
    doc.wasGeneratedBy(articleID, act, end_time,
                       GCIS["%s" % get_uuid("%s:%s" % (articleID, act_id))])

    # cited by
    # Item lists are cached per type so each type is fetched at most once
    # per call, however many citations share it.
    item_lists = {}
    for citation in article.get('cited_by') or ():
        if 'publication' not in citation:
            continue
        pub_uri = citation['publication']
        # URIs look like "/<type>/<identifier>"; element 1 is the type.
        itemType = pub_uri.split("/")[1]
        if itemType not in item_lists:
            item_lists[itemType] = get_itemList(dump_dir, itemType)
        item = next(
            (obj for obj in item_lists[itemType] if obj['uri'] == pub_uri),
            None)
        if item is not None:
            doc.wasDerivedFrom('bibo:%s' % item['identifier'], articleID)
            # Progress trace; the parenthesized form prints the same text
            # under Python 2 and also parses under Python 3.
            print(articleID)

    prov_json = json.loads(doc.serialize())

    return prov_json