def _storeValue(datasetId, data, key, value):
    """Store ``value`` under ``key`` in ``data``.

    Keys listed in ``arrayProps`` accumulate every value in a list. Any other
    key holds a single value: when a second value shows up for such a key the
    collision is logged and the shorter of the two values is kept.
    """
    if key in arrayProps:
        data.setdefault(key, []).append(value)
        return

    if key not in data:
        data[key] = value
        return

    log.warning('Unexpected creation of array for: %s - %s - %s', datasetId, key, value)
    log.warning('Existing value for this key : %s - %s - %s', datasetId, key, data[key])
    log.warning('----- Will use the shortest value -----')
    if len(value) < len(data[key]):
        data[key] = value


def populateValue(g, datasetId, ds, data, p, o, iriCache):
    """Record the object ``o`` of predicate ``p`` into ``data``.

    Args:
        g: Graph handed to ``iri_lookup`` and ``parseMeasure``.
        datasetId: Dataset identifier, used in log messages and passed to
            ``parseMeasure``.
        ds: Dataset entry; resolved terms are registered in ``ds['term']``
            under their curie.
        data: Dict that receives the value, keyed by ``strip_iri(p.strip())``.
        p: Predicate of the triple.
        o: Object of the triple (URIRef, Literal or BNode).
        iriCache: Cache handed to ``iri_lookup``.

    Raises:
        Exception: If ``o`` is not a URIRef, Literal or BNode.
    """
    # These predicates are handled separately (getResearchers, getProtocols, etc.)
    skipIri = (
        term.URIRef('http://uri.interlex.org/temp/uris/contributorTo'),
        term.URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#type'),
        term.URIRef('http://uri.interlex.org/temp/uris/hasUriApi'),
        term.URIRef('http://uri.interlex.org/temp/uris/hasUriHuman'),
        term.URIRef('http://uri.interlex.org/temp/uris/hasProtocol'),
        term.URIRef('http://uri.interlex.org/temp/uris/wasUpdatedAtTime'),
    )
    if p in skipIri:
        return

    key = strip_iri(p.strip())

    if isinstance(o, term.URIRef):
        value = iri_lookup(g, o.strip(), iriCache)
        if not value:
            # Nothing is stored when the lookup yields no result.
            return
        if isinstance(value, dict) and 'curie' in value:
            # Keep the full term record on the dataset and reference it by curie.
            ds['term'][value['curie']] = value
            value = value['curie']
        _storeValue(datasetId, data, key, value)
    elif isinstance(o, term.Literal):
        _storeValue(datasetId, data, key, strip_iri(o.strip()))
    elif isinstance(o, term.BNode):
        # Blank nodes always overwrite the key; no array handling is applied.
        data[key] = parseMeasure(datasetId, g, o, {'value': '', 'unit': ''})
    else:
        raise Exception('Unknown RDF term: %s' % type(o))
def getResearchers(gNew, gDelta, output, iriCache):
    """Collect researcher properties for every dataset a researcher contributes to.

    Iterates the ``contributorTo`` triples of ``gNew`` (subject: researcher,
    object: dataset). For each pair, the researcher's predicate/object pairs
    in ``gDelta`` are fed through ``populateValue`` into a fresh dict, which is
    stored at ``output[datasetId]['researcher'][user]`` when it is non-empty.

    Args:
        gNew: Graph scanned for ``contributorTo`` triples.
        gDelta: Graph whose triples about the researcher are recorded.
        output: Dict of dataset entries keyed by dataset id; updated in place.
        iriCache: Cache forwarded to ``populateValue``.
    """
    contributorTo = URIRef('http://uri.interlex.org/temp/uris/contributorTo')
    for s, o in gNew.subject_objects(contributorTo):
        m = re.search(r".*(?P<ds>N:dataset:[:\w-]+)", o)
        if not m:
            # re.search returns None when the object carries no dataset id;
            # skip the triple instead of failing on m.group().
            log.warning('Skipping contributorTo object without a dataset id: %s', o)
            continue

        datasetId = strip_iri(m.group(0).strip())
        user = strip_iri(s)

        newEntry = {}
        for p2, o2 in gDelta.predicate_objects(s):
            populateValue(gDelta, datasetId, output[datasetId], newEntry, p2, o2, iriCache)

        if newEntry:
            output[datasetId]['researcher'][user] = newEntry
def parseMeasure(dsId, g, node, values):
    """Fill ``values`` with the value and unit described by blank node ``node``.

    Two node shapes are recognised:

    * a node with a triple whose object is ``sparc/Measurement``: ``unit`` comes
      from ``hasUnit`` and ``value`` from ``rdf:value``;
    * a node with a triple whose object is ``rdfs:Datatype``: ``unit`` comes
      from ``owl:onDatatype`` and ``value`` is formatted as ``"<min>-<max>"``
      from the first two items of the ``owl:withRestrictions`` list.

    Any other node only triggers a warning. A warning is also logged when the
    resulting unit equals ``'dimensionless'``.

    Args:
        dsId: Dataset identifier, used only in log messages.
        g: Graph that holds the triples about ``node``.
        node: Blank node to inspect.
        values: Dict updated in place (keys ``'unit'`` and ``'value'``).

    Returns:
        The same ``values`` dict that was passed in.
    """
    measurementClass = URIRef('http://uri.interlex.org/tgbugs/uris/readable/sparc/Measurement')
    datatypeClass = URIRef('http://www.w3.org/2000/01/rdf-schema#Datatype')
    noUnitMessage = "Measurement with no unit (value: {}) in {}"

    if (node, None, measurementClass) in g:
        hasUnit = URIRef('http://uri.interlex.org/temp/uris/hasUnit')
        rdfValue = URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#value')

        values['unit'] = strip_iri(g.value(subject=node, predicate=hasUnit))
        values['value'] = str(g.value(subject=node, predicate=rdfValue))

        if values['unit'] == 'dimensionless':
            log.warning(noUnitMessage.format(values['value'], dsId))
        return values

    if (node, None, datatypeClass) in g:
        rdfFirst = URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#first')
        rdfRest = URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#rest')

        # The datatype IRI is passed through strip_iri twice.
        onDatatype = g.value(subject=node, predicate=URIRef('http://www.w3.org/2002/07/owl#onDatatype'))
        values['unit'] = strip_iri(strip_iri(onDatatype))

        restrictions = g.value(subject=node, predicate=URIRef('http://www.w3.org/2002/07/owl#withRestrictions'))

        # Lower bound: first item of the restriction list.
        lowerNode = g.value(subject=restrictions, predicate=rdfFirst)
        lowerBound = g.value(subject=lowerNode, predicate=URIRef('http://www.w3.org/2001/XMLSchema#minInclusive'))

        # Upper bound: second item of the restriction list.
        tail = g.value(subject=restrictions, predicate=rdfRest)
        upperNode = g.value(subject=tail, predicate=rdfFirst)
        upperBound = g.value(subject=upperNode, predicate=URIRef('http://www.w3.org/2001/XMLSchema#maxInclusive'))

        values['value'] = "{}-{}".format(str(lowerBound), str(upperBound))

        if values['unit'] == 'dimensionless':
            log.warning(noUnitMessage.format(values['value'], dsId))
        return values

    log.warning("Encountered a B-Node that is not a measurement in {}".format(dsId))
    return values
def getProtocols(gNew, gDelta, output, iriCache):
    """Collect protocol properties for every dataset that references a protocol.

    Iterates the ``hasProtocol`` triples of ``gNew`` (subject: dataset,
    object: protocol). For each pair, the protocol's predicate/object pairs in
    ``gDelta`` are fed through ``populateValue`` into a fresh dict, which is
    stored at ``output[datasetId]['protocol'][str(protocol)]`` when it is
    non-empty.

    Args:
        gNew: Graph scanned for ``hasProtocol`` triples.
        gDelta: Graph whose triples about the protocol are recorded.
        output: Dict of dataset entries keyed by dataset id; updated in place.
        iriCache: Cache forwarded to ``populateValue``.
    """
    hasProtocol = URIRef('http://uri.interlex.org/temp/uris/hasProtocol')
    for s, o in gNew.subject_objects(hasProtocol):
        m = re.search(r".*(?P<ds>N:dataset:[:\w-]+)", s)
        if not m:
            # re.search returns None when the subject carries no dataset id;
            # skip the triple instead of failing on m.group().
            log.warning('Skipping hasProtocol subject without a dataset id: %s', s)
            continue

        datasetId = strip_iri(m.group(0).strip())
        url = str(o)

        newEntry = {}
        for p2, o2 in gDelta.predicate_objects(o):
            populateValue(gDelta, datasetId, output[datasetId], newEntry, p2, o2, iriCache)

        if newEntry:
            output[datasetId]['protocol'][url] = newEntry
def getDatasets(gNew, gDelta, output, iriCache):
    """Create an entry per dataset and populate its summary.

    Iterates every subject of ``gNew`` typed as ``sparc/Dataset``. For each
    one, ``addEntry(output, datasetId)`` is called (presumably creating
    ``output[datasetId]`` -- confirm against ``addEntry``), then every
    predicate/object pair of the dataset in ``gDelta`` is fed through
    ``populateValue`` into ``output[datasetId]['summary']``. Objects of
    ``hasAwardNumber`` are additionally registered through ``getAwards``.

    Args:
        gNew: Graph scanned for dataset subjects.
        gDelta: Graph whose triples about the dataset are recorded.
        output: Dict of dataset entries keyed by dataset id; updated in place.
        iriCache: Cache forwarded to ``populateValue``.
    """
    datasetClass = URIRef('http://uri.interlex.org/tgbugs/uris/readable/sparc/Dataset')
    # Loop-invariant predicate, built once instead of once per triple.
    hasAwardNumber = URIRef("http://uri.interlex.org/temp/uris/hasAwardNumber")

    for ds in gNew.subjects(RDF.type, datasetClass):
        log.info(ds)
        m = re.search(r".*(?P<ds>N:dataset:[:\w-]+)", ds)
        if not m:
            # re.search returns None when the subject carries no dataset id;
            # skip it instead of failing on m.group().
            log.warning('Skipping Dataset subject without a dataset id: %s', ds)
            continue

        datasetId = strip_iri(m.group(0).strip())
        addEntry(output, datasetId)

        for p, o in gDelta.predicate_objects(ds):
            if p == hasAwardNumber:
                getAwards(o, datasetId, output)
            populateValue(gDelta, datasetId, output[datasetId], output[datasetId]['summary'], p, o, iriCache)
def getTags(gNew, gDelta, output, iriCache):
    """Append tags to ``output[datasetId]['tag']`` for each tagged dataset.

    Iterates the ``IAO_0000136`` triples of ``gNew``. Subjects that do not
    contain a dataset id are ignored. A URIRef object is resolved through
    ``iri_lookup`` and contributes its first label (the triple is skipped when
    the lookup yields nothing); any other object contributes ``str(object)``.
    A tag is appended only if it is not already in the dataset's tag list.

    Args:
        gNew: Graph scanned for tag triples and used for the IRI lookup.
        gDelta: Not used by this function.
        output: Dict of dataset entries keyed by dataset id; updated in place.
        iriCache: Cache handed to ``iri_lookup``.
    """
    tagPredicate = URIRef('http://purl.obolibrary.org/obo/IAO_0000136')
    for subject, obj in gNew.subject_objects(tagPredicate):
        match = re.search(r".*(?P<ds>N:dataset:[:\w-]+)", subject)
        if not match:
            continue

        if isinstance(obj, term.URIRef):
            resolved = iri_lookup(gNew, obj, iriCache)
            if not resolved:
                continue
            label = resolved['labels'][0]
        else:
            label = str(obj)

        knownTags = output[strip_iri(match.group(0).strip())]['tag']
        if label not in knownTags:
            knownTags.append(label)
def getAwards(awardIdURI, dsId, output):
    """Register an award on the dataset entry ``output[dsId]``.

    The award id is ``strip_iri(awardIdURI)``; it is used both as the key in
    ``output[dsId]['award']`` and as the ``'awardId'`` field of the stored
    record. An existing record under the same key is replaced.

    Args:
        awardIdURI: Award identifier as found in the graph.
        dsId: Dataset identifier whose entry is updated.
        output: Dict of dataset entries keyed by dataset id; updated in place.
    """
    strippedId = strip_iri(awardIdURI)
    awards = output[dsId]['award']
    awards[strippedId] = {'awardId': strippedId}