def make_authorship_rdf(pub_uri, author_uri, rank, corresponding=False):
    """
    Build the RDF for a single authorship linking an author to a publication

    A new uri is requested for the authorship and six assertions are made about it:
    two rdf:type assertions (owl:Thing and vivo:Authorship), the linked author, the
    linked information resource, the author rank and the corresponding author flag.

    :param pub_uri: uri asserted as vivo:linkedInformationResource
    :param author_uri: uri asserted as vivo:linkedAuthor
    :param rank: value asserted as vivo:authorRank
    :param corresponding: converted with str() and lower-cased, then asserted as vivo:isCorrespondingAuthor
    :return: a two element list: the concatenated RDF and the uri of the new authorship
    """
    from vivopump import new_uri

    authorship_uri = new_uri()

    # Each helper returns a fragment of RDF. The fragments are gathered in assertion order.
    fragments = [
        assert_resource_property(authorship_uri, "rdf:type", untag_predicate("owl:Thing")),
        assert_resource_property(authorship_uri, "rdf:type", untag_predicate("vivo:Authorship")),
        assert_resource_property(authorship_uri, "vivo:linkedAuthor", author_uri),
        assert_resource_property(authorship_uri, "vivo:linkedInformationResource", pub_uri),
        assert_data_property(authorship_uri, "vivo:authorRank", rank),
        assert_data_property(authorship_uri, "vivo:isCorrespondingAuthor", str(corresponding).lower()),
    ]
    return ["".join(fragments), authorship_uri]
def do_two_step_update(row, column_name, uri, uri_prefix, column_def, data_update, intra, enum, update_graph, debug=False): """ In a two step update, identify intermediate entity that might need to be created, and end path objects that might not yet exist or might need to be created. Cases are: Predicate Single Predicate Multiple VIVO has 0 values Add, do_the Add intermediate, do_the VIVO has 1 value do_the Set compare through intermediate VIVO has >1 value WARNING, do_the Set compare through intermediate :return: alterations in update graph """ from rdflib import RDF, RDFS, Literal, URIRef from vivopump import new_uri step_def = column_def[0] # Find all the intermediate entities in VIVO and then process cases related to count and defs step_uris = [o for s, p, o in get_step_triples(update_graph, uri, step_def, debug)] if len(step_uris) == 0: # VIVO has no values for intermediate, so add a new intermediate and do_the_update on the leaf step_uri = URIRef(new_uri(uri_prefix)) update_graph.add((uri, step_def['predicate']['ref'], step_uri)) update_graph.add((step_uri, RDF.type, step_def['object']['type'])) if 'label' in step_def['object']: update_graph.add((step_uri, RDFS.label, Literal(step_def['object']['label'], datatype=step_def['object'].get('datatype', None), lang=step_def['object'].get('lang', None)))) uri = step_uri step_def = column_def[1] vivo_objs = {unicode(o): o for s, p, o in get_step_triples(update_graph, uri, step_def)} column_values = prepare_column_values(data_update[column_name], intra, step_def, enum, row, column_name) do_the_update(row, column_name, uri, step_def, column_values, vivo_objs, update_graph, debug=debug) elif step_def['predicate']['single']: # VIVO has 1 or more values, so we need to see if the predicate is expected to be single step_uri = step_uris[0] if len(step_uris) > 1: print "WARNING: Single predicate", column_name, "has", len(step_uris), "values: ", \ step_uris, "using", step_uri uri = step_uri step_def = column_def[1] vivo_objs = 
{unicode(o): o for s, p, o in get_step_triples(update_graph, uri, step_def)} column_values = prepare_column_values(data_update[column_name], intra, step_def, enum, row, column_name) do_the_update(row, column_name, uri, step_def, column_values, vivo_objs, update_graph, debug=debug) else: # TODO: Implement set compare through multiple intermediate case -- medium print "WARNING: Updating multi-valued multi-step predicates such as ", column_name, " not yet implemented" return None
def __do_three_step_update(self, row, column_name, uri, path, data_update): """ Given the current state in the update, and a path length three column_def, add, change or delete intermediate and end objects as necessary to perform the requested update :param row: row number of the update. For printing :param column_name: column_name of the update. For printing :param uri: uri of the entity at the head of the path :param path: the column definition :param data_update: the data provided for the update :return: Changes in the update_graph """ from rdflib import RDF, RDFS, Literal, URIRef from vivopump import new_uri, get_step_triples step_def = path[0] step_uris = [ o for s, p, o in get_step_triples(self.update_graph, uri, column_name, step_def, self.query_parms, self.verbose) ] if len(step_uris) == 0: # VIVO has no values for first intermediate, so add new intermediate and do a two step update on it step_uri = URIRef(new_uri(self.query_parms)) self.update_graph.add( (uri, step_def['predicate']['ref'], step_uri)) self.update_graph.add( (step_uri, RDF.type, step_def['object']['type'])) if 'label' in step_def['object']: self.update_graph.add( (step_uri, RDFS.label, Literal(step_def['object']['label'], datatype=step_def['object'].get('datatype', None), lang=step_def['object'].get('lang', None)))) self.__do_two_step_update(row, column_name, step_uri, path[1:], data_update) elif step_def['predicate']['single'] == True: # VIVO has 1 or more values for first intermediate, so we need to see if the predicate # is expected to be single step_uri = step_uris[0] if len(step_uris) > 1: print "WARNING: Single predicate", path[0]['object']['name'], "has", len(step_uris), "values: ", \ step_uris, "using", step_uri self.__do_two_step_update(row, column_name, step_uri, path[1:], data_update) return None
def do_three_step_update(row, column_name, uri, uri_prefix, path, data_update, intra, enum, update_graph, debug=False): """ Given the current state in the update, and a path length three column_def, ad, change or delete intermediate and end objects as necessary to perform the requested update :param row: row number of the update. For printing :param column_name: column_name of the update. For printing :param uri: uri of the entity at the head of the path :param path: the column definition :param data_update: the data provided for the update :param enum: the enumerations :param update_graph: the update graph :param debug: debug status. For printing. :return: Changes in the update_graph """ from rdflib import RDF, RDFS, Literal, URIRef from vivopump import new_uri step_def = path[0] step_uris = [o for s, p, o in get_step_triples(update_graph, uri, step_def, debug)] if len(step_uris) == 0: # VIVO has no values for first intermediate, so add new intermediate and do a two step update on it step_uri = URIRef(new_uri(uri_prefix)) update_graph.add((uri, step_def['predicate']['ref'], step_uri)) update_graph.add((step_uri, RDF.type, step_def['object']['type'])) if 'label' in step_def['object']: update_graph.add((step_uri, RDFS.label, Literal(step_def['object']['label'], datatype=step_def['object'].get('datatype', None), lang=step_def['object'].get('lang', None)))) do_two_step_update(row, column_name, step_uri, uri_prefix, path[1:], data_update, intra, enum, update_graph, debug=debug) elif step_def['predicate']['single']: # VIVO has 1 or more values for first intermediate, so we need to see if the predicate is expected to be single step_uri = step_uris[0] if len(step_uris) > 1: print "WARNING: Single predicate", path[0]['object']['name'], "has", len(step_uris), "values: ", \ step_uris, "using", step_uri do_two_step_update(row, column_name, step_uri, uri_prefix, path[1:], data_update, intra, enum, update_graph, debug=debug) return None
def test(self):
    """
    Produce a string report regarding testing the configuration of the pump

    The report lists the configuration values held by the pump, then attempts to obtain a new
    uri from VIVO using new_uri. The outcome of that attempt, success or failure, is appended
    to the report. Failures are reported in the text, not raised.

    :return: the string test report
    :rtype: basestring
    """
    from vivopump import new_uri
    from SPARQLWrapper import SPARQLExceptions
    import urllib2

    # NOTE(review): the SPARQL API password is written to the report in clear text -- confirm this is intended
    report_lines = [
        str(datetime.now()) + " Test results",
        "Update definition\t" + self.json_def_filename + " read.",
        "Source file name\t" + self.out_filename + ".",
        "Enumerations read.",
        "Filters\t" + str(self.filter),
        "Verbose\t" + str(self.verbose),
        "Intra field separator\t" + self.intra,
        "Inter field separator\t" + self.inter,
        "VIVO SPARQL API URI\t" + self.query_parms['queryuri'],
        "VIVO SPARQL API username\t" + self.query_parms['username'],
        "VIVO SPARQL API password\t" + self.query_parms['password'],
        "VIVO SPARQL API prefix\t" + self.query_parms['prefix'],
        "Prefix for RDF file names\t" + self.rdfprefix,
        "Uriprefix for new uri\t" + self.query_parms['uriprefix'],
    ]
    result = "\n".join(report_lines) + "\n"

    try:
        uri = new_uri(self.query_parms)
        result += "Sample new uri\t" + uri + "\n" + \
            "Simple VIVO is ready for use.\n"
    except urllib2.HTTPError as herror:
        # HTTPError is a subclass of URLError, so it must be handled before URLError
        result += "Connection to VIVO failed\t" + str(herror) + "\n" + \
            "Check your Simple VIVO configuration and your VIVO permissions.\n"
    except (SPARQLExceptions.EndPointNotFound, urllib2.URLError) as failure:
        result += "Connection to VIVO failed\t" + str(failure) + "\n" + \
            "Check your Simple VIVO configuration and your VIVO API.\n"
    except Exception:
        # Any other error is reported as a failed connection. KeyboardInterrupt and SystemExit
        # are deliberately not caught here so the user can still interrupt the test.
        result += "Connection to VIVO failed\t" + "\n" + \
            "Check your Simple VIVO configuration and your VIVO API.\n"

    result += str(datetime.now()) + " Test end"
    return result
def __do_three_step_update(self, row, column_name, uri, path, data_update):
    """
    Perform an update through a path of length three

    Resolves the first step of the path and passes the remaining two steps to
    __do_two_step_update. A missing first intermediate is created in the update graph.
    Existing intermediates are used only when the first predicate is marked single; in that
    case the first intermediate found is used and a warning is logged if there are several.

    :param row: row number of the update. For logger messages
    :param column_name: column_name of the update. For logger messages
    :param uri: uri of the entity at the head of the path
    :param path: the column definition
    :param data_update: the data provided for the update
    :return: None. Changes are made in the update_graph
    """
    from rdflib import RDF, RDFS, Literal, URIRef
    from vivopump import new_uri, get_step_triples

    first_def = path[0]
    existing = [o for s, p, o in get_step_triples(self.update_graph, uri, first_def, self.query_parms)]

    if not existing:
        # VIVO has no values for first intermediate, so add new intermediate and do a two step update on it
        step_uri = URIRef(new_uri(self.query_parms))
        self.update_graph.add((uri, first_def['predicate']['ref'], step_uri))
        self.update_graph.add((step_uri, RDF.type, first_def['object']['type']))
        if 'label' in first_def['object']:
            label = Literal(first_def['object']['label'],
                            datatype=first_def['object'].get('datatype', None),
                            lang=first_def['object'].get('lang', None))
            self.update_graph.add((step_uri, RDFS.label, label))
        self.__do_two_step_update(row, column_name, step_uri, path[1:], data_update)
        return None

    # VIVO has 1 or more values for first intermediate, so we need to see if the predicate
    # is expected to be single
    if first_def['predicate']['single'] == True:
        step_uri = existing[0]
        if len(existing) > 1:
            logger.warning(u"WARNING: Single predicate {} has {} values: {}. Using {}".
                           format(first_def['object']['name'], len(existing), existing, step_uri))
        self.__do_two_step_update(row, column_name, step_uri, path[1:], data_update)
    return None
def test_parameters(self):
    # new_uri called with no arguments must return something of non-zero length
    generated = new_uri()
    uri_length = len(generated)
    self.assertTrue(uri_length > 0)
def get_pubmed(pmid, author_uris=None): """ Given a pubmid identifer, return a structure containing the elements of the publication of interest to VIVO. Optionally, provide a set of author_uris for use in disambiguation. When find_author returns a set of size > 1, the author_uris will be examined for matches to assist with disambiguation. """ from vivopump import new_uri ardf = "" record = get_entrez_record(pmid) if record is None: return ["", None] pub = document_from_pubmed(record) if pub['page_end'] == '': pub['page_end'] = pub['page_start'] if pub['date']['month'] == '': pub['date']['month'] = '1' if pub['date']['day'] == '': pub['date']['day'] = '1' pub['pub_uri'] = new_uri() pub['date_harvested'] = str(datetime.now()) pub['harvested_by'] = "Python PubMed Add " + __version__ journal_uri = find_vivo_uri("bibo:issn", pub['issn']) if journal_uri is None: [add, journal_uri] = make_journal_rdf(pub['journal'], pub['issn']) ardf = ardf + add pub['journal_uri'] = journal_uri pub_date = datetime.strptime(pub['date']['month'] + '/' + pub['date']['day'] + \ '/' + pub['date']['year'], "%m/%d/%Y") if pub_date in date_dictionary: pub['date_uri'] = date_dictionary[pub_date] else: [add, pub_date_uri] = make_datetime_rdf(pub_date.isoformat()) date_dictionary[pub_date] = pub_date_uri pub['date_uri'] = pub_date_uri ardf = ardf + add # Turn each author into a URI reference to an authorship pub['authorship_uris'] = [] for key, author in sorted(pub['authors'].items(), key=lambda x: x[0]): try: author_uri_set = find_author(author) except: print "No last name for author", author print "Pub\n", pub print "Record\n", record continue if len(author_uri_set) == 0: [add, author_uri] = make_author_rdf(author) ardf = ardf + add print pmid, "Add", author, "at", author_uri elif len(author_uri_set) == 1: author_uri = list(author_uri_set)[0] print pmid, "Found", author, author_uri else: if author_uris is None: author_uri = list(author_uri_set)[0] print pmid, "Disambiguate", author, "from", 
author_uri_set else: possible_uri_set = author_uri_set.intersection(author_uris) if len(possible_uri_set) == 1: author_uri = list(possible_uri_set)[0] else: author_uri = list(possible_uri_set)[0] print pmid, "Disambiguate", author, "from", possible_uri_set print "Disambiguate:" print " Possible authors in VIVO", author_uri_set print " Possible authors in Source", author_uris print " Selected author", author_uri [add, authorship_uri] = make_authorship_rdf(pub['pub_uri'], author_uri, key, corresponding=False) pub['authorship_uris'].append(authorship_uri) ardf = ardf + add return [ardf, pub]
def do_update(self): """ read updates from a spreadsheet filename. Compare to data in VIVO. Generate add and sub rdf as necessary to process requested changes """ from rdflib import URIRef, RDF from vivopump import new_uri for row, data_update in self.update_data.items(): uri = URIRef(data_update['uri']) if 'remove' in data_update.keys() and data_update['remove'].lower( ) == 'true': do_remove(row, uri, self.update_graph, self.verbose) continue if (uri, None, None) not in self.update_graph: # If the entity uri can not be found in the update graph, make a new URI ignoring the one in the # spreadsheet, if any, and add the URI to the update graph. Remaining processing is unchanged. # Since the new uri does not have triples for the columns in the spreadsheet, each will be added uri_string = new_uri(self.uri_prefix) if self.verbose: print "Adding an entity for row", row, ". Will be added at", uri_string uri = URIRef(uri_string) self.update_graph.add( (uri, RDF.type, self.update_def['entity_def']['type'])) entity_uri = uri for column_name, column_def in self.update_def[ 'column_defs'].items(): if column_name not in data_update: continue # extra column names are allowed in the spreadsheet for annotation uri = entity_uri if data_update[column_name] == '': continue if len(column_def) > 3: raise PathLengthException( "Path lengths > 3 not supported. 
Path length for " + column_name + " is " + str(len(column_def))) elif len(column_def) == 3: do_three_step_update(row, column_name, uri, self.uri_prefix, column_def, data_update, self.intra, self.enum, self.update_graph, debug=False) elif len(column_def) == 2: do_two_step_update(row, column_name, uri, self.uri_prefix, column_def, data_update, self.intra, self.enum, self.update_graph, debug=False) elif len(column_def) == 1: step_def = column_def[0] vivo_objs = {} for s, p, o in self.update_graph.triples( (uri, step_def['predicate']['ref'], None)): vivo_objs[unicode(o)] = o column_values = prepare_column_values( data_update[column_name], self.intra, step_def, self.enum, row, column_name) if self.verbose: print row, column_name, column_values, uri, vivo_objs do_the_update(row, column_name, uri, step_def, column_values, vivo_objs, self.update_graph, debug=self.verbose) # Return the add and sub graphs representing the changes that need to be made to the original add = self.update_graph - self.original_graph # Triples in update that are not in original if self.verbose: print "Triples to add" print add.serialize(format='nt') sub = self.original_graph - self.update_graph # Triples in original that are not in update if self.verbose: print "Triples to sub" print sub.serialize(format='nt') return [add, sub]
def do_two_step_update(row, column_name, uri, uri_prefix, column_def, data_update, intra, enum, update_graph, debug=False): """ In a two step update, identify intermediate entity that might need to be created, and end path objects that might not yet exist or might need to be created. Cases are: Predicate Single Predicate Multiple VIVO has 0 values Add, do_the Add intermediate, do_the VIVO has 1 value do_the Set compare through intermediate VIVO has >1 value WARNING, do_the Set compare through intermediate :return: alterations in update graph """ from rdflib import RDF, RDFS, Literal, URIRef from vivopump import new_uri step_def = column_def[0] # Find all the intermediate entities in VIVO and then process cases related to count and defs step_uris = [ o for s, p, o in get_step_triples(update_graph, uri, step_def, debug) ] if len(step_uris) == 0: # VIVO has no values for intermediate, so add a new intermediate and do_the_update on the leaf step_uri = URIRef(new_uri(uri_prefix)) update_graph.add((uri, step_def['predicate']['ref'], step_uri)) update_graph.add((step_uri, RDF.type, step_def['object']['type'])) if 'label' in step_def['object']: update_graph.add( (step_uri, RDFS.label, Literal(step_def['object']['label'], datatype=step_def['object'].get('datatype', None), lang=step_def['object'].get('lang', None)))) uri = step_uri step_def = column_def[1] vivo_objs = { unicode(o): o for s, p, o in get_step_triples(update_graph, uri, step_def) } column_values = prepare_column_values(data_update[column_name], intra, step_def, enum, row, column_name) do_the_update(row, column_name, uri, step_def, column_values, vivo_objs, update_graph, debug=debug) elif step_def['predicate']['single']: # VIVO has 1 or more values, so we need to see if the predicate is expected to be single step_uri = step_uris[0] if len(step_uris) > 1: print "WARNING: Single predicate", column_name, "has", len(step_uris), "values: ", \ step_uris, "using", step_uri uri = step_uri step_def = column_def[1] 
vivo_objs = { unicode(o): o for s, p, o in get_step_triples(update_graph, uri, step_def) } column_values = prepare_column_values(data_update[column_name], intra, step_def, enum, row, column_name) do_the_update(row, column_name, uri, step_def, column_values, vivo_objs, update_graph, debug=debug) else: # TODO: Implement set compare through multiple intermediate case -- medium print "WARNING: Updating multi-valued multi-step predicates such as ", column_name, " not yet implemented" return None
def __do_update(self):
    """
    For each row, process each column. Compare to data in VIVO. Generate add and sub rdf as
    necessary to process requested add, change, delete

    Rows with an empty uri and no values at all are skipped. Rows with an empty uri and some
    values get a new uri. An action of 'remove' removes the entity. Any other non-empty action
    is collected as merge information and processed after all rows have been handled.

    :return: a two element list of graphs: triples to add and triples to sub
    :raises PathLengthException: when a column definition has more than three steps
    """
    from rdflib import URIRef, RDF
    from vivopump import new_uri, prepare_column_values, get_step_triples, PathLengthException

    merges = {}
    for row, data_update in self.update_data.items():
        logger.debug("data_update[uri] = {}".format(data_update['uri']))
        source_uri = data_update['uri'].strip()

        if source_uri == '':
            # skip blank lines in the input file
            value_lengths = [len(item) for item in data_update.values()]
            if not any(value_lengths):
                continue

            # If the source uri is empty, create one. Remaining processing is unchanged.
            # Since the new uri does not have triples for the columns in the spreadsheet, each will be added
            uri_string = new_uri(self.query_parms)
            logger.debug(u"Adding an entity for row {}. Will be added at {}".format(row, uri_string))
            uri = URIRef(uri_string)
            self.update_graph.add((uri, RDF.type, self.update_def['entity_def']['type']))
        else:
            # Create a URI entity if not found
            uri = URIRef(source_uri)
            if (uri, None, None) not in self.update_graph:
                logger.debug(u"Adding an entity for row {}. Will be added at {}".format(row, str(uri)))
                self.update_graph.add((uri, RDF.type, self.update_def['entity_def']['type']))
        entity_uri = uri

        action = data_update.get('action', '').lower()

        # Process remove action if any
        if action == 'remove':
            self.__do_remove(row, uri)
            continue

        # Collect merge info if any. An action containing '1' adds the row to the secondary list
        # of the merge keyed by the text before the '1'. Any other action sets the primary.
        if action != '':
            k = action.find('1')
            if k > -1:
                key = action[0:k]
                if key in merges:
                    merges[key]['secondary'].append(uri)
                else:
                    merges[key] = {'primary': None, 'secondary': [uri]}
            else:
                merge_entry = merges.setdefault(action, {})
                merge_entry['primary'] = uri
                merge_entry.setdefault('secondary', [])

        # For this row, process all the column_defs and then process closure defs if any. Closures allow
        # columns to be "reused" providing additional paths from the row entity to entities in the paths.
        all_defs = self.update_def['column_defs'].items() + self.update_def.get('closure_defs', {}).items()
        for column_name, column_def in all_defs:
            if column_name not in data_update:
                continue  # extra column names are allowed in the spreadsheet for annotation
            if data_update[column_name] == '':
                logger.debug(u"Skipping blank value. row {} column {}".format(row, column_name))
                continue

            column_def_len = len(column_def)
            logger.debug("column_def length is: {}".format(column_def_len))

            if column_def_len > 3:
                raise PathLengthException(
                    "ERROR: Path lengths > 3 not supported. Path length for {} is {}".format(
                        column_name, column_def_len))
            if column_def_len == 3:
                self.__do_three_step_update(row, column_name, entity_uri, column_def, data_update)
            elif column_def_len == 2:
                self.__do_two_step_update(row, column_name, entity_uri, column_def, data_update)
            elif column_def_len == 1:
                step_def = column_def[0]
                step_triples = get_step_triples(self.update_graph, entity_uri, step_def, self.query_parms)
                vivo_objs = {unicode(o): o for s, p, o in step_triples}
                column_values = prepare_column_values(data_update[column_name], self.intra, step_def,
                                                      self.enum, row, column_name)
                logger.debug(u"{} {} {} {} {}".format(row, column_name, column_values, entity_uri, vivo_objs))
                self.__do_the_update(row, column_name, entity_uri, step_def, column_values, vivo_objs)

    if any(merges):
        self.__do_merges(merges)

    # Return the add and sub graphs representing the changes that need to be made to the original
    add = self.update_graph - self.original_graph  # Triples in update that are not in original
    logger.info(u"Triples to add\n{}".format(add.serialize(format='nt')))
    sub = self.original_graph - self.update_graph  # Triples in original that are not in update
    logger.info(u"Triples to sub\n{}".format(sub.serialize(format='nt')))
    return [add, sub]
def do_update(self): """ read updates from a spreadsheet filename. Compare to data in VIVO. Generate add and sub rdf as necessary to process requested changes """ from rdflib import URIRef, RDF from vivopump import new_uri for row, data_update in self.update_data.items(): uri = URIRef(data_update['uri']) if 'remove' in data_update.keys() and data_update['remove'].lower() == 'true': do_remove(row, uri, self.update_graph, self.verbose) continue if (uri, None, None) not in self.update_graph: # If the entity uri can not be found in the update graph, make a new URI ignoring the one in the # spreadsheet, if any, and add the URI to the update graph. Remaining processing is unchanged. # Since the new uri does not have triples for the columns in the spreadsheet, each will be added uri_string = new_uri(self.uri_prefix) if self.verbose: print "Adding an entity for row", row, ". Will be added at", uri_string uri = URIRef(uri_string) self.update_graph.add((uri, RDF.type, self.update_def['entity_def']['type'])) entity_uri = uri for column_name, column_def in self.update_def['column_defs'].items(): if column_name not in data_update: continue # extra column names are allowed in the spreadsheet for annotation uri = entity_uri if data_update[column_name] == '': continue if len(column_def) > 3: raise PathLengthException( "Path lengths > 3 not supported. 
Path length for " + column_name + " is " + str( len(column_def))) elif len(column_def) == 3: do_three_step_update(row, column_name, uri, self.uri_prefix, column_def, data_update, self.intra, self.enum, self.update_graph, debug=False) elif len(column_def) == 2: do_two_step_update(row, column_name, uri, self.uri_prefix, column_def, data_update, self.intra, self.enum, self.update_graph, debug=False) elif len(column_def) == 1: step_def = column_def[0] vivo_objs = {} for s, p, o in self.update_graph.triples((uri, step_def['predicate']['ref'], None)): vivo_objs[unicode(o)] = o column_values = prepare_column_values(data_update[column_name], self.intra, step_def, self.enum, row, column_name) if self.verbose: print row, column_name, column_values, uri, vivo_objs do_the_update(row, column_name, uri, step_def, column_values, vivo_objs, self.update_graph, debug=self.verbose) # Return the add and sub graphs representing the changes that need to be made to the original add = self.update_graph - self.original_graph # Triples in update that are not in original if self.verbose: print "Triples to add" print add.serialize(format='nt') sub = self.original_graph - self.update_graph # Triples in original that are not in update if self.verbose: print "Triples to sub" print sub.serialize(format='nt') return [add, sub]
def test_new_uri_prefix(self): uri = new_uri(uri_prefix='http://my.vivo.edu/individual/') print uri self.assertTrue(uri.startswith('http://my.vivo.edu'))
def __do_update(self):
    """
    For each row, process each column. Compare to data in VIVO. Generate add and sub rdf as
    necessary to process requested add, change, delete

    The uri of the row being processed is stored in self.entity_uri. An action of 'remove'
    removes the entity. Any other non-empty action is collected as merge information and
    processed after all rows have been handled.

    :return: a two element list of graphs: triples to add and triples to sub
    :raises PathLengthException: when a column definition has more than three steps
    """
    from rdflib import URIRef, RDF
    from vivopump import new_uri, prepare_column_values, PathLengthException

    merges = {}
    for row, data_update in self.update_data.items():
        given_uri = data_update['uri'].strip()

        if given_uri == '':
            # If the source uri is empty, create one. Remaining processing is unchanged.
            # Since the new uri does not have triples for the columns in the spreadsheet, each will be added
            uri_string = new_uri(self.query_parms)
            logger.debug(u"Adding an entity for row {}. Will be added at {}".format(row, uri_string))
            uri = URIRef(uri_string)
            self.update_graph.add((uri, RDF.type, self.update_def['entity_def']['type']))
        else:
            # Create a URI entity if not found
            uri = URIRef(given_uri)
            if (uri, None, None) not in self.update_graph:
                logger.debug(u"Adding an entity for row {}. Will be added at {}".format(row, str(uri)))
                self.update_graph.add((uri, RDF.type, self.update_def['entity_def']['type']))
        self.entity_uri = uri

        action = data_update.get('action', '').lower()

        # Process remove action if any
        if action == 'remove':
            self.__do_remove(row, uri)
            continue

        # Collect merge info if any. An action containing '1' adds the row to the secondary list
        # of the merge keyed by the text before the '1'. Any other action sets the primary.
        if action != '':
            k = action.find('1')
            if k > -1:
                key = action[0:k]
                if key in merges:
                    merges[key]['secondary'].append(uri)
                else:
                    merges[key] = {'primary': None, 'secondary': [uri]}
            else:
                merge_entry = merges.setdefault(action, {})
                merge_entry['primary'] = uri
                merge_entry.setdefault('secondary', [])

        # For this row, process all the column_defs and then process closure defs if any. Closures allow
        # columns to be "reused" providing additional paths from the row entity to entities in the paths.
        all_defs = self.update_def['column_defs'].items() + self.update_def.get('closure_defs', {}).items()
        for column_name, column_def in all_defs:
            # Skip any columns in the data that are not in the update_def
            if column_name not in data_update:
                continue

            # Skip the column if it is empty
            if data_update[column_name] == '':
                logger.debug(u"Skipping blank value. row {} column {}".format(row, column_name))
                continue

            # Process the column values, returning a list of RDF elements
            last_def = column_def[-1]
            column_values = prepare_column_values(data_update[column_name], self.intra, last_def, self.enum,
                                                  row, column_name)

            # Process the path depending on its length. Some day we will refactor this to a recursion
            path_length = len(column_def)
            if path_length > 3:
                raise PathLengthException(
                    "ERROR: Path lengths > 3 not supported. Path length for " + column_name + " is " +
                    str(path_length))
            if path_length == 3:
                self.__do_three_step_update(row, column_name, self.entity_uri, column_def, data_update)
            elif path_length == 2:
                self.__do_two_step_update(row, column_name, self.entity_uri, column_def, data_update)
            elif path_length == 1:
                step_triples = self._get_step_triples(self.entity_uri, last_def)
                vivo_objs = {unicode(o): o for s, p, o in step_triples}
                logger.debug(u"{} {} {} {} {}".format(row, column_name, column_values, self.entity_uri,
                                                      vivo_objs))
                self.__do_the_update(row, column_name, self.entity_uri, last_def, column_values, vivo_objs)

    if any(merges):
        self.__do_merges(merges)

    # Return the add and sub graphs representing the changes that need to be made to the original
    add = self.update_graph - self.original_graph  # Triples in update that are not in original
    logger.info(u"Triples to add\n{}".format(add.serialize(format='nt')))
    sub = self.original_graph - self.update_graph  # Triples in original that are not in update
    logger.info(u"Triples to sub\n{}".format(sub.serialize(format='nt')))
    return [add, sub]
def __do_two_step_update(self, row, column_name, uri, column_def, data_update):
    """
    In a two step update, identify intermediate entity that might need to be created, and end path objects
    that might not yet exist or might need to be created.  Cases are:

                      Predicate Single   Predicate Multiple
    VIVO has 0 values     Add, do_the        Add intermediate, do_the
    VIVO has 1 value         do_the          Set compare through intermediate
    VIVO has >1 value     WARNING, do_the    Set compare through intermediate

    The leaf values found in VIVO through the intermediates are set compared with the column values.
    Values only in the column are added, values only in VIVO are removed.  A column value of 'None'
    (any case) removes every leaf value and adds nothing.

    :param: row: current row in spreadsheet
    :param: column_name: name of current column in spreadsheet
    :param: uri: uri in VIVO of the current entity
    :param: column_def: the column def for the current column.  Element 0 describes the step to the
        intermediate, element 1 the step to the leaf
    :param: data_update: the row data, indexed by column name
    :return: None.  The alterations are made in self.update_graph
    """
    from rdflib import RDF, RDFS, Literal, URIRef
    from vivopump import new_uri, get_step_triples, prepare_column_values

    step_def = column_def[0]
    leaf_def = column_def[1]

    def add_intermediate():
        """ Create a new intermediate entity, link it from uri, assert its type and label if defined """
        new_step_uri = URIRef(new_uri(self.query_parms))
        self.update_graph.add((uri, step_def['predicate']['ref'], new_step_uri))
        step_object = step_def['object']
        if 'type' in step_object:
            self.update_graph.add((new_step_uri, RDF.type, step_object['type']))
        if 'label' in step_object:
            self.update_graph.add((new_step_uri, RDFS.label,
                                   Literal(step_object['label'],
                                           datatype=step_object.get('datatype', None),
                                           lang=step_object.get('lang', None))))
        return new_step_uri

    # Find the intermediates in VIVO, and for each leaf value remember the intermediate it hangs from

    step_uris = [o for s, p, o in get_step_triples(self.update_graph, uri, step_def, self.query_parms)]
    vivo_objs = {}
    for step_uri in step_uris:
        for s, p, o in get_step_triples(self.update_graph, step_uri, leaf_def, self.query_parms):
            vivo_objs[unicode(o)] = [o, step_uri]

    # Nasty hack below.  The predicate property "single" appears to have two meanings.  One has to do
    # with the semantic graph and one has to do with the cardinality of the data column.  These are not
    # the same.  When the first step is multiple and the second single, the "second single" is not the
    # cardinality of the data column.  The cardinality of the data column is multiple if any of the
    # predicates in the path are multiple.  Here we set the cardinality of the leaf to be used by
    # prepare_column_values and then set it back.  The finally clause puts the shared column_def back
    # even if prepare_column_values raises.

    predicate2_cardinality = leaf_def['predicate']['single']
    first_step_multiple = step_def['predicate']['single'] == False
    if first_step_multiple:
        leaf_def['predicate']['single'] = False
    try:
        column_values = prepare_column_values(data_update[column_name], self.intra, leaf_def, self.enum,
                                              row, column_name)
    finally:
        leaf_def['predicate']['single'] = predicate2_cardinality

    vivo_values = [entry[0] for entry in vivo_objs.values()]
    if unicode(column_values[0]).lower() == 'none':
        add_values = set()
        sub_values = set(vivo_values)
    else:
        add_values = set(column_values) - set(vivo_values)
        sub_values = set(vivo_values) - set(column_values)

    # The two literals are joined before format is applied.  Written as literal + literal.format(...),
    # format binds to the second literal only and the message is filled with the wrong values.

    logger.debug((u"Two step SET COMPARE\n\tRow {}\n\tColumn {}\n\tSource values {}\n\tVIVO values {}"
                  u"\n\tAdd values {}\n\tSub values {}\n\tStep_uris {}").
                 format(row, column_name, column_values, vivo_values, add_values, sub_values, step_uris))

    # Process the adds

    if len(add_values) > 0:
        if first_step_multiple:

            # Multiple intermediaries, single valued-leaves.  Each new value gets its own intermediate

            for leaf_value in add_values:
                step_uri = add_intermediate()
                self.__do_the_update(row, column_name, step_uri, leaf_def, [leaf_value], {})
        else:

            # Multiple values on the single leaf.  Reuse the first intermediate if VIVO has one

            if len(step_uris) == 0:
                step_uri = add_intermediate()
            else:
                step_uri = step_uris[0]
            self.__do_the_update(row, column_name, step_uri, leaf_def, column_values, {})

    # Process the subs

    if len(sub_values) > 0:
        if first_step_multiple:

            # Handle multiple intermediaries, single leaves, by removing each intermediary and all its
            # assertions

            for leaf_value in sub_values:
                step_uri = vivo_objs[unicode(leaf_value)][1]
                self.update_graph.remove((uri, step_def['predicate']['ref'], step_uri))
                self.update_graph.remove((step_uri, None, None))
        else:

            # Handle single intermediary, possibly multiple leaves, by removing each leaf from the
            # intermediary.  Then check to see if the intermediary has any remaining leaf assertions and
            # remove if empty.  The check is a pattern membership test on the graph.  Graph.triples
            # returns a generator, which never compares equal to set().

            step_uri = vivo_objs[unicode(next(iter(sub_values)))][1]
            for leaf_value in sub_values:
                self.update_graph.remove((step_uri, None, leaf_value))
            if (step_uri, leaf_def['predicate']['ref'], None) not in self.update_graph:
                self.update_graph.remove((uri, step_def['predicate']['ref'], step_uri))
                self.update_graph.remove((step_uri, None, None))
    return None
def __do_two_step_update(self, row, column_name, uri, column_def, data_update):
    """
    In a two step update, identify intermediate entity that might need to be created, and end path objects
    that might not yet exist or might need to be created.  Cases are:

                      Predicate Single   Predicate Multiple
    VIVO has 0 values     Add, do_the        Add intermediate, do_the
    VIVO has 1 value         do_the          Set compare through intermediate
    VIVO has >1 value     WARNING, do_the    Set compare through intermediate

    The leaf values found in VIVO through the intermediates are set compared with the column values.
    Values only in the column are added, values only in VIVO are removed.  A column value of 'None'
    (any case) removes every leaf value and adds nothing.

    :param: row: current row in spreadsheet
    :param: column_name: name of current column in spreadsheet
    :param: uri: uri in VIVO of the current entity
    :param: column_def: the column def for the current column.  Element 0 describes the step to the
        intermediate, element 1 the step to the leaf
    :param: data_update: the row data, indexed by column name
    :return: None.  The alterations are made in self.update_graph
    """
    from rdflib import RDF, RDFS, Literal, URIRef
    from vivopump import new_uri, prepare_column_values

    step_def = column_def[0]
    leaf_def = column_def[1]

    def add_intermediate():
        """ Create a new intermediate entity, link it from uri, assert its type and label if defined """
        new_step_uri = URIRef(new_uri(self.query_parms))
        self.update_graph.add((uri, step_def['predicate']['ref'], new_step_uri))
        step_object = step_def['object']
        if 'type' in step_object:
            self.update_graph.add((new_step_uri, RDF.type, step_object['type']))
        if 'label' in step_object:
            self.update_graph.add((new_step_uri, RDFS.label,
                                   Literal(step_object['label'],
                                           datatype=step_object.get('datatype', None),
                                           lang=step_object.get('lang', None))))
        return new_step_uri

    # Find the intermediates in VIVO, and for each leaf value remember the intermediate it hangs from

    step_uris = [o for s, p, o in self._get_step_triples(uri, step_def)]
    vivo_objs = {}
    for step_uri in step_uris:
        for s, p, o in self._get_step_triples(step_uri, leaf_def):
            vivo_objs[unicode(o)] = [o, step_uri]

    # Nasty hack below.  The predicate property "single" appears to have two meanings.  One has to do
    # with the semantic graph and one has to do with the cardinality of the data column.  These are not
    # the same.  When the first step is multiple and the second single, the "second single" is not the
    # cardinality of the data column.  The cardinality of the data column is multiple if any of the
    # predicates in the path are multiple.  Here we set the cardinality of the leaf to be used by
    # prepare_column_values and then set it back.  The finally clause puts the shared column_def back
    # even if prepare_column_values raises.

    predicate2_cardinality = leaf_def['predicate']['single']
    first_step_multiple = step_def['predicate']['single'] == False
    if first_step_multiple:
        leaf_def['predicate']['single'] = False
    try:
        column_values = prepare_column_values(data_update[column_name], self.intra, leaf_def, self.enum,
                                              row, column_name)
    finally:
        leaf_def['predicate']['single'] = predicate2_cardinality

    vivo_values = [entry[0] for entry in vivo_objs.values()]
    if unicode(column_values[0]).lower() == 'none':
        add_values = set()
        sub_values = set(vivo_values)
    else:
        add_values = set(column_values) - set(vivo_values)
        sub_values = set(vivo_values) - set(column_values)

    # The two literals are joined before format is applied.  Written as literal + literal.format(...),
    # format binds to the second literal only and the message is filled with the wrong values.

    logger.debug((u"Two step SET COMPARE\n\tRow {}\n\tColumn {}\n\tSource values {}\n\tVIVO values {}"
                  u"\n\tAdd values {}\n\tSub values {}\n\tStep_uris {}").
                 format(row, column_name, column_values, vivo_values, add_values, sub_values, step_uris))

    # Process the adds

    if len(add_values) > 0:
        if first_step_multiple:

            # Multiple intermediaries, single valued-leaves.  Each new value gets its own intermediate

            for leaf_value in add_values:
                step_uri = add_intermediate()
                self.__do_the_update(row, column_name, step_uri, leaf_def, [leaf_value], {})
        else:

            # Multiple values on the single leaf.  Reuse the first intermediate if VIVO has one

            if len(step_uris) == 0:
                step_uri = add_intermediate()
            else:
                step_uri = step_uris[0]
            self.__do_the_update(row, column_name, step_uri, leaf_def, column_values, {})

    # Process the subs

    if len(sub_values) > 0:
        if first_step_multiple:

            # Handle multiple intermediaries, single leaves, by removing each intermediary and all its
            # assertions

            for leaf_value in sub_values:
                step_uri = vivo_objs[unicode(leaf_value)][1]
                self.update_graph.remove((uri, step_def['predicate']['ref'], step_uri))
                self.update_graph.remove((step_uri, None, None))
        else:

            # Handle single intermediary, possibly multiple leaves, by removing each leaf from the
            # intermediary.  Then check to see if the intermediary has any remaining leaf assertions and
            # remove if empty.  The check is a pattern membership test on the graph.  Graph.triples
            # returns a generator, which never compares equal to set().

            step_uri = vivo_objs[unicode(next(iter(sub_values)))][1]
            for leaf_value in sub_values:
                self.update_graph.remove((step_uri, None, leaf_value))
            if (step_uri, leaf_def['predicate']['ref'], None) not in self.update_graph:
                self.update_graph.remove((uri, step_def['predicate']['ref'], step_uri))
                self.update_graph.remove((step_uri, None, None))
    return None
def test_new_uri_default(self):
    """ new_uri called with no arguments returns a value of non-zero length """
    generated = new_uri()
    print(generated)
    self.assertTrue(len(generated) > 0)