Ejemplo n.º 1
0
def get_doc_prov(j, gcis_url, refList, reportList):
    """Generate PROV-ES JSON from GCIS figure metadata.

    Args:
        j: GCIS figure summary dict; the keys 'href', 'identifier',
            'title' and 'uri' are read.
        gcis_url: Base URL of the GCIS server, prepended to ``j['uri']``.
        refList: Unused here; kept so existing callers keep working.
        reportList: Unused here; kept so existing callers keep working.

    Returns:
        The serialized PROV-ES document parsed back into a dict.

    Raises:
        requests.HTTPError: If fetching ``j['href']`` returns an error status.
    """
    doc = ProvEsDocument()

    # Fetch the full figure record; 'report_identifier' and
    # 'chapter_identifier' are read from it below. Fail loudly on an HTTP
    # error instead of trying to parse an error page as JSON.
    r = requests.get(j['href'])
    r.raise_for_status()
    figure = r.json()

    # figure entity
    figureID = 'bibo:%s' % j['identifier']
    doc_attrs = [
        ("prov:type", 'gcis:Figure'),
        ("prov:label", j['title']),
        ("prov:location", "%s%s" % (gcis_url, j['uri'])),
    ]
    doc.entity(figureID, doc_attrs)

    # create connection: report -> chapter membership, report uses figure
    reportID = 'bibo:%s' % figure['report_identifier']
    chapterID = 'bibo:%s' % figure['chapter_identifier']
    doc.hadMember(reportID, chapterID)
    doc.used(reportID, figureID)

    return json.loads(doc.serialize())
Ejemplo n.º 2
0
def get_doc_prov(j, gcis_url):
    """Generate PROV-ES JSON from GCIS article metadata.

    Builds a PROV-ES document containing the journal, the article, its
    contributors (persons and organizations) and the activity that
    generated the article.

    Args:
        j: GCIS article dict. Reads 'journal_identifier', 'title',
            'identifier' and 'year', plus 'contributors' and 'doi' when
            present.
        gcis_url: Base URL of the GCIS server.

    Returns:
        The serialized PROV-ES document parsed back into a dict.

    Raises:
        requests.HTTPError: If the journal lookup returns an error status.
    """

    # create doc
    gcis_ns = "http://data.globalchange.gov/gcis.owl#"
    doc = ProvEsDocument(namespaces={
        "gcis": gcis_ns,
        "bibo": "http://purl.org/ontology/bibo/"
    })

    # create journal
    # NOTE(review): verify=False disables TLS certificate verification;
    # confirm this is intended for the target server.
    r = requests.get("%s/journal/%s.json" %
                     (gcis_url, j['journal_identifier']),
                     params={'all': 1},
                     verify=False)
    r.raise_for_status()
    journal_md = r.json()
    # NOTE(review): the journal is labelled with the article's title
    # (j['title']); confirm whether the journal's own title was intended.
    doc_attrs = [
        ("prov:type", 'gcis:Journal'),
        ("prov:label", j['title']),
    ]
    journal_id = GCIS[j['journal_identifier']]
    # journal_ids is defined outside this function; the journal entity is
    # only declared the first time its id is seen there.
    if journal_id not in journal_ids:
        if journal_md.get('url', None) is not None:
            doc_attrs.append(("prov:location", journal_md['url']))
        if journal_md.get('online_issn', None) is not None:
            doc_attrs.append(("gcis:online_issn", journal_md['online_issn']))
        if journal_md.get('print_issn', None) is not None:
            doc_attrs.append(("gcis:print_issn", journal_md['print_issn']))
        doc.entity(journal_id, doc_attrs)
        journal_ids[journal_id] = True

    # create agents or organizations
    agent_ids = {}   # person agent id -> list of org ids the person works at
    org_ids = {}     # org id -> True, used as an ordered "seen" set
    for cont in j.get('contributors', []):
        # replace slashes because we get prov.model.ProvExceptionInvalidQualifiedName errors
        agent_id = GCIS["%s" % cont['uri'][1:].replace('/', '-')]

        # create person (truthiness check also tolerates a None person)
        person = cont['person']
        if person:
            agent_name = " ".join([
                person[i]
                for i in ('first_name', 'middle_name', 'last_name')
                if person.get(i, None) is not None
            ])
            doc.agent(agent_id, [
                (PROV_TYPE, GCIS["Person"]),
                (PROV_LABEL, agent_name),
                (PROV_LOCATION, "%s%s" % (gcis_url, cont['uri'])),
            ])
            agent_ids[agent_id] = []

        # organization
        org = cont['organization']
        if org:
            org_id = GCIS["%s" % org['identifier']]
            if org_id not in org_ids:
                doc.governingOrganization(org_id, org['name'])
                org_ids[org_id] = True
            if agent_id in agent_ids:
                agent_ids[agent_id].append(org_id)

    # create article
    article_id = 'bibo:%s' % j['identifier']
    doc_attrs = [
        ("prov:type", 'gcis:Article'),
        ("prov:label", j['title']),
        ("dcterms:isPartOf", journal_id),
    ]
    # Record the DOI only when one is actually present. The previous test
    # was inverted: it skipped real DOIs and raised KeyError when the key
    # was missing.
    doi = j.get('doi')
    if doi:
        doc_attrs.append(("bibo:doi", doi))
    doc.entity(article_id, doc_attrs)

    # link
    doc.hadMember(journal_id, article_id)

    # create activity; only an integer year is usable as a timestamp
    if isinstance(j['year'], int):
        start_time = str(j['year'])
        end_time = str(j['year'])
    else:
        start_time = None
        end_time = None
    act_id = GCIS["generate-%s" % j['identifier'].replace('/', '-')]
    attrs = []
    for agent_id in agent_ids:
        waw_id = GCIS["%s" % get_uuid("%s:%s" % (act_id, agent_id))]
        doc.wasAssociatedWith(act_id, agent_id, None, waw_id,
                              {'prov:role': GCIS['Author']})
        for org_id in agent_ids[agent_id]:
            del_id = GCIS["%s" % get_uuid("%s:%s:%s" %
                                          (agent_id, org_id, act_id))]
            doc.delegation(agent_id, org_id, act_id, del_id,
                           {'prov:type': 'gcis:worksAt'})
    for org_id in org_ids:
        waw_id = GCIS["%s" % get_uuid("%s:%s" % (act_id, org_id))]
        doc.wasAssociatedWith(act_id, org_id, None, waw_id,
                              {'prov:role': GCIS['Contributor']})
    act = doc.activity(act_id, start_time, end_time, attrs)
    doc.wasGeneratedBy(article_id, act, end_time,
                       GCIS["%s" % get_uuid("%s:%s" % (article_id, act_id))])

    # serialize
    prov_json = json.loads(doc.serialize())

    return prov_json
Ejemplo n.º 3
0
def get_doc_prov(j, gcis_url):
    """Generate PROV-ES JSON from GCIS article metadata.

    The resulting document describes the journal, the article, the
    contributing persons and organizations, and the generating activity.

    Args:
        j: GCIS article dict. Reads 'journal_identifier', 'title',
            'identifier' and 'year', plus 'contributors' and 'doi' when
            present.
        gcis_url: Base URL of the GCIS server.

    Returns:
        The serialized PROV-ES document parsed back into a dict.

    Raises:
        requests.HTTPError: If the journal lookup returns an error status.
    """

    # create doc
    gcis_ns = "http://data.globalchange.gov/gcis.owl#"
    doc = ProvEsDocument(namespaces={
        "gcis": gcis_ns,
        "bibo": "http://purl.org/ontology/bibo/",
    })

    # create journal
    # NOTE(review): verify=False disables TLS certificate verification;
    # confirm this is intended for the target server.
    journal_url = "%s/journal/%s.json" % (gcis_url, j['journal_identifier'])
    r = requests.get(journal_url, params={'all': 1}, verify=False)
    r.raise_for_status()
    journal_md = r.json()
    # NOTE(review): the journal is labelled with the article's title
    # (j['title']); confirm whether the journal's own title was intended.
    doc_attrs = [
        ("prov:type", 'gcis:Journal'),
        ("prov:label", j['title']),
    ]
    journal_id = GCIS[j['journal_identifier']]
    # journal_ids is defined outside this function; the journal entity is
    # only declared the first time its id is seen there.
    if journal_id not in journal_ids:
        if journal_md.get('url', None) is not None:
            doc_attrs.append(("prov:location", journal_md['url']))
        if journal_md.get('online_issn', None) is not None:
            doc_attrs.append(("gcis:online_issn", journal_md['online_issn']))
        if journal_md.get('print_issn', None) is not None:
            doc_attrs.append(("gcis:print_issn", journal_md['print_issn']))
        doc.entity(journal_id, doc_attrs)
        journal_ids[journal_id] = True

    # create agents or organizations
    agent_ids = {}   # person agent id -> list of org ids the person works at
    org_ids = {}     # org id -> True, used as an ordered "seen" set
    for cont in j.get('contributors', []):
        # replace slashes because we get prov.model.ProvExceptionInvalidQualifiedName errors
        agent_id = GCIS["%s" % cont['uri'][1:].replace('/', '-')]

        # create person (truthiness check also tolerates a None person)
        person = cont['person']
        if person:
            name_parts = [person[i]
                          for i in ('first_name', 'middle_name', 'last_name')
                          if person.get(i, None) is not None]
            agent_name = " ".join(name_parts)
            doc.agent(agent_id, [
                (PROV_TYPE, GCIS["Person"]),
                (PROV_LABEL, agent_name),
                (PROV_LOCATION, "%s%s" % (gcis_url, cont['uri'])),
            ])
            agent_ids[agent_id] = []

        # organization
        org = cont['organization']
        if org:
            org_id = GCIS["%s" % org['identifier']]
            if org_id not in org_ids:
                doc.governingOrganization(org_id, org['name'])
                org_ids[org_id] = True
            if agent_id in agent_ids:
                agent_ids[agent_id].append(org_id)

    # create article
    article_id = 'bibo:%s' % j['identifier']
    doc_attrs = [
        ("prov:type", 'gcis:Article'),
        ("prov:label", j['title']),
        ("dcterms:isPartOf", journal_id),
    ]
    # Record the DOI only when one is actually present. The previous test
    # was inverted: it skipped real DOIs and raised KeyError when the key
    # was missing.
    doi = j.get('doi')
    if doi:
        doc_attrs.append(("bibo:doi", doi))
    doc.entity(article_id, doc_attrs)

    # link
    doc.hadMember(journal_id, article_id)

    # create activity; only an integer year is usable as a timestamp
    if isinstance(j['year'], int):
        start_time = str(j['year'])
        end_time = str(j['year'])
    else:
        start_time = None
        end_time = None
    act_id = GCIS["generate-%s" % j['identifier'].replace('/', '-')]
    attrs = []
    for agent_id in agent_ids:
        waw_id = GCIS["%s" % get_uuid("%s:%s" % (act_id, agent_id))]
        doc.wasAssociatedWith(act_id, agent_id, None, waw_id, {'prov:role': GCIS['Author']})
        for org_id in agent_ids[agent_id]:
            del_id = GCIS["%s" % get_uuid("%s:%s:%s" % (agent_id, org_id, act_id))]
            doc.delegation(agent_id, org_id, act_id, del_id, {'prov:type': 'gcis:worksAt'})
    for org_id in org_ids:
        waw_id = GCIS["%s" % get_uuid("%s:%s" % (act_id, org_id))]
        doc.wasAssociatedWith(act_id, org_id, None, waw_id, {'prov:role': GCIS['Contributor']})
    act = doc.activity(act_id, start_time, end_time, attrs)
    doc.wasGeneratedBy(article_id, act, end_time, GCIS["%s" % get_uuid("%s:%s" % (article_id, act_id))])

    # serialize
    prov_json = json.loads(doc.serialize())

    return prov_json
Ejemplo n.º 4
0
def get_image_prov(j, gcis_url):
    """Generate PROV-ES JSON from GCIS image metadata.

    Declares the image entity, the reports/chapters/findings/figures it
    belongs to, its contributors, and one derivation activity per parent
    dataset. After serialization, hadMember relations are tagged with a
    prov:type describing the kind of membership.

    Args:
        j: GCIS image dict. Reads 'uri', 'title' and 'create_dt', plus
            'files', 'figures', 'contributors' and 'parents' when present.
        gcis_url: Base URL of the GCIS server.

    Returns:
        The serialized PROV-ES document parsed back into a dict, with
        prov:type added to matching hadMember records.

    Raises:
        requests.HTTPError: If a report, finding or figure lookup returns an
            error status. A failed chapter lookup is only printed and the
            figure is skipped.
    """

    # create doc
    doc = ProvEsDocument()

    # create image, figure, chapter and report entities
    img_id = GCIS["%s" % j['uri'][1:].replace('/', '-')]
    img_title = j['title']
    img_url = None
    img_thumbnail_url = None
    # when several files are attached, the last one wins
    for file_md in j.get('files', []):
        img_url = file_md['href']
        img_thumbnail_url = file_md['thumbnail_href']
    img_attrs = [
        (PROV_TYPE, GCIS['Image']),
        (PROV_LABEL, img_title),
    ]
    # fall back to the GCIS page when no file URL is available
    if img_url is None:
        img_attrs.append((PROV_LOCATION, "%s%s" % (gcis_url, j['uri'])))
    else:
        img_attrs.append((PROV_LOCATION, img_url))
    # Attach the thumbnail only when one exists. The previous test was
    # inverted and only ever attached a None thumbnail.
    if img_thumbnail_url is not None:
        img_attrs.append((HYSDS['thumbnail'], img_thumbnail_url))
    doc.entity(img_id, img_attrs)

    # ids already declared; also consulted after serialization below
    reports = []
    chapters = []
    findings = []
    figures = []
    for figure in j.get('figures', []):
        report_uri = "/report/%s" % figure['report_identifier']
        chapter_uri = "/chapter/%s" % figure['chapter_identifier']
        figure_uri = "/figure/%s" % figure['identifier']

        # create report
        r = requests.get('%s%s.json' % (gcis_url, report_uri))
        r.raise_for_status()
        report = r.json()
        report_id = GCIS["%s" % report_uri[1:].replace('/', '-')]
        if report_id not in reports:
            doc.entity(report_id, [
                (PROV_TYPE, GCIS['Report']),
                (PROV_LABEL, report['title']),
                (PROV_LOCATION, report['url']),
            ])
            reports.append(report_id)

        # create chapter; a failed lookup skips the rest of this figure
        # (no raise_for_status needed: only status 200 gets past this check)
        r = requests.get('%s%s%s.json' % (gcis_url, report_uri, chapter_uri))
        if r.status_code != 200:
            print("Failed with %d code: %s" % (r.status_code, r.content))
            continue
        chapter = r.json()
        chapter_id = GCIS["%s" % chapter_uri[1:].replace('/', '-')]
        if chapter_id not in chapters:
            doc.entity(chapter_id, [
                (PROV_TYPE, GCIS['Chapter']),
                (PROV_LABEL, chapter['title']),
                (PROV_LOCATION, chapter['url']),
            ])
            chapters.append(chapter_id)
        doc.hadMember(report_id, chapter_id)

        # create findings
        r = requests.get('%s%s%s/finding.json' % (gcis_url, report_uri, chapter_uri))
        r.raise_for_status()
        for f in r.json():
            finding_id = GCIS["%s" % f['identifier']]
            if finding_id not in findings:
                doc.entity(finding_id, [
                    (PROV_TYPE, GCIS['Finding']),
                    (PROV_LABEL, f['identifier']),
                    (PROV_LOCATION, f['href']),
                ])
                findings.append(finding_id)
            doc.hadMember(report_id, finding_id)
            doc.hadMember(chapter_id, finding_id)

        # create figure
        r = requests.get('%s%s%s%s.json' % (gcis_url, report_uri, chapter_uri, figure_uri))
        r.raise_for_status()
        figure_md = r.json()
        figure_id = GCIS["%s" % figure_uri[1:].replace('/', '-')]
        if figure_id not in figures:
            doc.entity(figure_id, [
                (PROV_TYPE, GCIS['Figure']),
                (PROV_LABEL, figure_md['title']),
                (PROV_LOCATION, "%s%s" % (gcis_url, figure_md['uri'])),
            ])
            figures.append(figure_id)
            doc.hadMember(chapter_id, figure_id)
        doc.hadMember(figure_id, img_id)

    # create agents or organizations
    agent_ids = {}   # person agent id -> list of org ids the person works at
    org_ids = {}     # org id -> True, used as an ordered "seen" set
    for cont in j.get('contributors', []):
        # replace slashes because we get prov.model.ProvExceptionInvalidQualifiedName errors
        agent_id = GCIS["%s" % cont['uri'][1:].replace('/', '-')]

        # create person (truthiness check also tolerates a None person)
        person = cont['person']
        if person:
            agent_name = " ".join([person[i] for i in
                                   ('first_name', 'middle_name', 'last_name')
                                   if person.get(i, None) is not None])
            doc.agent(agent_id, [
                (PROV_TYPE, GCIS["Person"]),
                (PROV_LABEL, agent_name),
                (PROV_LOCATION, "%s%s" % (gcis_url, cont['uri'])),
            ])
            agent_ids[agent_id] = []

        # organization (a None organization is skipped instead of crashing
        # in len())
        org = cont['organization']
        if org:
            org_id = GCIS["%s" % org['identifier']]
            if org_id not in org_ids:
                doc.governingOrganization(org_id, org['name'])
                org_ids[org_id] = True
            if agent_id in agent_ids:
                agent_ids[agent_id].append(org_id)

    # create one activity per parent dataset
    start_time = j['create_dt']
    end_time = j['create_dt']
    for parent in j.get('parents', []):
        input_id = GCIS["%s" % parent['url'][1:].replace('/', '-')]
        input_name = parent['label']
        doc.entity(input_id, [
            (PROV_TYPE, GCIS["Dataset"]),
            (PROV_LABEL, input_name),
            (PROV_LOCATION, "%s%s" % (gcis_url, parent['url'])),
        ])
        # some activity uri's are null
        if parent['activity_uri'] is None:
            act_id = GCIS["derive-from-%s" % input_id]
        else:
            act_id = GCIS["%s" % parent['activity_uri'][1:].replace('/', '-')]
        attrs = []
        for agent_id in agent_ids:
            waw_id = GCIS["%s" % get_uuid("%s:%s" % (act_id, agent_id))]
            doc.wasAssociatedWith(act_id, agent_id, None, waw_id, {'prov:role': GCIS['Contributor']})
            for org_id in agent_ids[agent_id]:
                del_id = GCIS["%s" % get_uuid("%s:%s:%s" % (agent_id, org_id, act_id))]
                doc.delegation(agent_id, org_id, act_id, del_id, {'prov:type': GCIS['worksAt']})
        for org_id in org_ids:
            waw_id = GCIS["%s" % get_uuid("%s:%s" % (act_id, org_id))]
            doc.wasAssociatedWith(act_id, org_id, None, waw_id, {'prov:role': GCIS['Funder']})
        act = doc.activity(act_id, start_time, end_time, attrs)
        doc.used(act, input_id, start_time, GCIS["%s" % get_uuid("%s:%s" % (act_id, input_id))])
        doc.wasGeneratedBy(img_id, act, end_time, GCIS["%s" % get_uuid("%s:%s" % (img_id, act_id))])

    # serialize
    prov_json = json.loads(doc.serialize())

    # for hadMember relations, add prov:type
    for hm_id in prov_json.get('hadMember', {}):
        hm = prov_json['hadMember'][hm_id]
        col = hm['prov:collection']
        ent = hm['prov:entity']
        if col in reports and ent in chapters:
            hm['prov:type'] = GCIS['hasChapter']
        elif col in chapters and ent in figures:
            hm['prov:type'] = GCIS['hasFigure']
        elif col in figures and ent == img_id:
            hm['prov:type'] = GCIS['hasImage']

    return prov_json
Ejemplo n.º 5
0
def get_doc_prov(j, gcis_url, refList, journalList, organizationList, personList, dump_dir):
    """Generate PROV-ES JSON from GCIS article metadata.

    Fetches the full article record from ``j['href']`` and links it to its
    journal, the journal's publisher organization, its contributors, the
    generating activity, and the publications that cite it.

    Args:
        j: GCIS article summary dict; reads 'href', 'year' and 'identifier'.
        gcis_url: Base URL of the GCIS server, prepended to relative URIs.
        refList: Unused here; kept so existing callers keep working.
        journalList: Iterable of journal dicts, matched on 'identifier'.
        organizationList: Iterable of organization dicts, matched on 'name'
            against the journal's publisher.
        personList: Unused here; kept so existing callers keep working.
        dump_dir: Passed through to ``get_itemList`` when resolving citing
            publications.

    Returns:
        The serialized PROV-ES document parsed back into a dict.

    Raises:
        requests.HTTPError: If fetching ``j['href']`` returns an error status.
    """
    doc = ProvEsDocument()

    # Fail loudly on an HTTP error instead of parsing an error page as JSON.
    r = requests.get(j['href'])
    r.raise_for_status()
    article = r.json()

    journalID = None

    #make journal
    journal = next((jour for jour in journalList
                    if jour['identifier'] == article['journal_identifier']),
                   None)
    if journal is not None:
        journalAttrs = [
            ("prov:type", 'gcis:Journal'),
            ("prov:label", journal['title']),
            ("prov:location", "%s%s" % (gcis_url, journal['uri'])),
            ("gcis:online_issn", journal['online_issn']),
            ("gcis:print_issn", journal['print_issn']),
            ("gcis:publisher", journal['publisher']),
        ]
        journalID = 'bibo:%s' % journal['identifier']
        doc.entity(journalID, journalAttrs)

        #get organization/publisher if any
        # Only reachable when a journal matched: the publisher lookup used
        # to run unconditionally and raised NameError when none did.
        publisher = journal['publisher']
        if publisher is not None:
            organization = next((org for org in organizationList
                                 if org['name'] == publisher), None)
            if organization is not None:
                org_attrs = [
                    ("prov:type", 'gcis:organization'),
                    ("prov:label", organization['name']),
                    ("prov:location", "%s%s" % (gcis_url, organization['uri'])),
                    ("gcis:organization_type_identifier", organization['organization_type_identifier']),
                    ("gcis:country_code", organization['country_code']),
                ]
                org_id = 'bibo:%s' % organization['identifier']
                doc.entity(org_id, org_attrs)

                doc.governingOrganization(org_id, organization['name'])

    #make article
    articleAttrs = [
        ("prov:type", 'gcis:Article'),
        ("prov:label", article['title']),
        ("prov:location", "%s%s" % (gcis_url, article['uri'])),
    ]
    # only reference the journal when one was actually resolved
    if journalID is not None:
        articleAttrs.append(("dcterms:isPartOf", journalID))
    articleID = 'bibo:%s' % article['identifier']
    doc.entity(articleID, articleAttrs)

    #link journal to article
    if journalID is not None:
        doc.hadMember(journalID, articleID)

    agent_ids = {}   # person id -> list of org ids the person works at
    org_ids = {}     # org id -> True, used as an ordered "seen" set
    #contributors
    for contributor in article.get('contributors', []):
        personID = None
        if contributor['person_uri'] is not None:
            person = contributor['person']
            name = " ".join([person[i]
                             for i in ('first_name', 'middle_name', 'last_name')
                             if person.get(i, None) is not None])
            personAttrs = [
                ("prov:type", 'gcis:Person'),
                ("prov:label", "%s" % name),
                ("prov:location", "%s%s" % (gcis_url, contributor['person_uri'])),
                ("gcis:id", str(contributor['person_id'])),
                ("gcis:orcid", person['orcid']),
            ]
            personID = 'bibo:%s' % contributor['person_id']
            agent_ids[personID] = []
            doc.agent(personID, personAttrs)
        if contributor['organization'] is not None:
            #make org
            orgID = 'bibo:%s' % contributor['organization']['identifier']
            doc.governingOrganization(orgID, contributor['organization']['name'])
            org_ids[orgID] = True
            if personID in agent_ids:
                agent_ids[personID].append(orgID)

    #create activity; only an integer year is usable as a timestamp
    if isinstance(j['year'], int):
        start_time = str(j['year'])
        end_time = str(j['year'])
    else:
        start_time = None
        end_time = None
    act_id = GCIS["generate-%s" % j['identifier'].replace('/', '-')]
    attrs = []
    for agent_id in agent_ids:
        waw_id = GCIS["%s" % get_uuid("%s:%s" % (act_id, agent_id))]
        doc.wasAssociatedWith(act_id, agent_id, None, waw_id, {'prov:role': GCIS['Author']})
        for org_id in agent_ids[agent_id]:
            del_id = GCIS["%s" % get_uuid("%s:%s:%s" % (agent_id, org_id, act_id))]
            doc.delegation(agent_id, org_id, act_id, del_id, {'prov:type': 'gcis:worksAt'})
    for org_id in org_ids:
        waw_id = GCIS["%s" % get_uuid("%s:%s" % (act_id, org_id))]
        doc.wasAssociatedWith(act_id, org_id, None, waw_id, {'prov:role': GCIS['Contributor']})
    act = doc.activity(act_id, start_time, end_time, attrs)
    doc.wasGeneratedBy(articleID, act, end_time, GCIS["%s" % get_uuid("%s:%s" % (articleID, act_id))])

    #cited by?
    for citation in article.get('cited_by', []):
        if 'publication' not in citation:
            continue
        # the second path segment is used as the item type,
        # e.g. "/report/foo" -> "report"
        itemType = citation['publication'].split("/")[1]

        itemList = get_itemList(dump_dir, itemType)
        item = next((obj for obj in itemList
                     if obj['uri'] == citation['publication']), None)
        if item is not None:
            item_id = 'bibo:%s' % item['identifier']
            doc.wasDerivedFrom(item_id, articleID)
            # parenthesized form prints the same under Python 2 and 3
            print(articleID)
    prov_json = json.loads(doc.serialize())

    return prov_json