def get_step_triples(update_graph, uri, step_def, debug=True): """ Return the triples matching the criteria defined in the current step of an update :param update_graph: the update graph :param uri: uri of the entity currently the subject of an update :param step_def: step definition from update_def :return: Graph containing one or more triples that match the criteria for the step """ from rdflib import Graph from vivopump import vivo_query, add_qualifiers, make_rdf_term if 'qualifier' not in step_def['object']: g = update_graph.triples((uri, step_def['predicate']['ref'], None)) else: q = 'select (?' + step_def['object']['name'] +' as ?o) where { <' + str(uri) + '> <' + \ str(step_def['predicate']['ref']) + '> ?' + step_def['object']['name'] + ' .\n' + \ add_qualifiers([step_def]) + ' }\n' if debug: print "\nStep Triples Query\n", q result_set = vivo_query(q) g = Graph() for binding in result_set['results']['bindings']: o = make_rdf_term(binding['o']) g.add((uri, step_def['predicate']['ref'], o)) if debug: print "Step Triples", len(g) return g
def test_vivo_query(self): result = vivo_query(""" SELECT ?label WHERE { <http://vivo.school.edu/individual/n1133> rdfs:label ?label } """, debug=True) print result self.assertTrue(len(result) > 0)
def test_bad_request(self): from SPARQLWrapper import SPARQLExceptions with self.assertRaises(SPARQLExceptions.QueryBadFormed): result = vivo_query(""" SEWECT ?label WHERE { <http://vivo.ufl.edu/individual/n25562> rdfs:label ?label } """, debug=True) print result
def find_author(author):
    """
    Given an author object with name parts, return the smallest set of uris that
    match the author in VIVO.  Could be an empty set, could be a singleton, could
    be a set requiring further disambiguation.

    :param author: author object with name parts
    :return: set of uri strings matching the author
    """
    from vivopump import vivo_query
    case = author_case(author)
    queries = author_queries(case, author)
    author_uri_set = set()
    for query in queries:
        result = vivo_query(query.encode('utf-8'))
        bindings = result['results']['bindings']
        count = len(bindings)
        if count == 1:
            # Unique match -- no further disambiguation needed
            author_uri_set = set([bindings[0]['uri']['value']])
            break
        elif count > 1 and (len(author_uri_set) == 0 or count < len(author_uri_set)):
            # BUG FIX: the original condition was `1 < count < len(author_uri_set)`,
            # but author_uri_set starts empty (len 0), so the branch could never
            # fire and ambiguous candidate sets were always thrown away.  Keep the
            # smallest non-empty candidate set seen so far.
            author_uri_set = set()
            for row in bindings:
                author_uri_set.add(row['uri']['value'])
    return author_uri_set
def get_vivo_academic_articles(parms):
    """
    Query VIVO and return a list of all the academic articles.

    @see uf_examples/publications/filters/pub_match_filter.py
    @see https://wiki.duraspace.org/display/VIVO/VIVO-ISF+1.6+relationship+diagrams%3A+Authorship

    :param parms: vivo_query params
    :return: dictionary of uri keyed by DOI
    """
    query = """
    SELECT ?uri ?doi
    WHERE {
        ?uri a vivo:InformationResource .
        ?uri bibo:doi ?doi .
    }
    """
    results = vivo_query(query, parms)
    bindings = results['results']['bindings']
    # One pass instead of two parallel lists zipped together.  Note: as in the
    # original, a DOI appearing on multiple uris keeps only the last uri seen.
    return dict((b['doi']['value'], b['uri']['value']) for b in bindings)
def get_person_vivo_pmids(uri, query_parms):
    """
    Given the uri of a person, query VIVO to get a list of the person's
    publications with pmids.

    :param uri: uri of the person in VIVO
    :param query_parms: vivo_query params
    :return: a dictionary keyed by pmid with uris of the pubs for each pmid
    """
    from pump.vivopump import vivo_query
    query = """SELECT (MAX(?paper_uri) AS ?puri) ?pmid
    WHERE {
        <{}> vivo:relatedBy ?a .
        ?a a vivo:Authorship .
        ?a vivo:relates ?paper_uri .
        ?paper_uri a bibo:AcademicArticle .
        ?paper_uri bibo:pmid ?pmid .
    }
    GROUP BY ?pmid
    """
    # str.replace, NOT str.format: the SPARQL text contains literal braces
    # that format() would try to parse as replacement fields
    query = query.replace('{}', uri)
    a = vivo_query(query, query_parms)
    bindings = a['results']['bindings']
    return dict((x['pmid']['value'], x['puri']['value']) for x in bindings)
def get_person_catalyst_pmids(uri, query_parms):
    """
    Given a person uri, collect the attributes needed to call get_pmids and
    return two lists: a list of pubs for the person found in VIVO, and a list
    of pubs for the person found by the catalyst service.

    :param uri: the uri of a person in VIVO
    :param query_parms: vivo_query params
    :return: A dictionary of two lists, the vivo_pmids and the catalyst_pmids
    """
    from vivopump import vivo_query
    # NOTE(review): this query looks like an unfinished stub -- the WHERE
    # clause has no triple patterns binding ?first/?middle/?last/?email/
    # ?affiliation.  TODO: complete the query.
    query = """
    SELECT ?first ?middle ?last ?email ?affiliation
    WHERE {
        <{}>
    }
    """
    # BUG FIX: the original used query.format(uri), which raises ValueError --
    # str.format() cannot parse the literal SPARQL braces in the template.
    # Substitute the placeholder with replace(), as get_person_vivo_pmids does.
    query = query.replace('{}', uri)
    a = vivo_query(query, query_parms)
    first = a['results']['bindings'][0]['first']['value']
    # TODO(review): remaining name parts are hard-coded to None -- presumably
    # they should be pulled from the query bindings once the query is completed
    middle = None
    last = None
    emails = None
    affiliations = None
    return get_catalyst_pmids(first, middle, last, emails, affiliations)
def _get_step_triples(self, uri, step_def): """ Return the triples matching the criteria defined in the current step of an update :param uri: uri of the entity currently the subject of an update :param step_def: step definition from update_def :return: Graph containing zero or more triples that match the criteria for the step """ from rdflib import Graph, RDF from vivopump import add_qualifiers, vivo_query, make_rdf_term def step_graph(uris, pred, otype=None, graph=self.update_graph): """ Given a list of uri, a pred and a type, return a graph of the update_graph triples satisfying uri pred any <- these are the returned triples any a type :param uris: list of uris. :param pred: the predicate to use in selecting triples for the step_graph :param otype: the object type to use. default in None, and no type selection will be done. :param graph: default is update_graph. Closure sieve requires original_graph :return: graph """ sg = Graph() for suri in uris: for obj in graph.objects(suri, pred): if otype is None: sg.add((suri, pred, obj)) elif (obj, RDF.type, otype) in self.update_graph: sg.add((suri, pred, obj)) return sg def sieve_triples(sgc, column_name): """ Given a step graph of triples from a closure (sgc), and the current column_name, select the triples from the closure graph that have a path from the entity_uri to one or more objects in the closure. If there is no path, return an empty graph. 
:param sgc: the step closure graph to be "sieved" :param column_name: the name of the column to use :return: the sieved closure graph """ print "\nBeginning Closure Graph for", column_name for (s, p, o) in sgc.triples((None, None, None)): print s, p, o if len(sgc) == 0: return sgc # Nothing to sieve else: pred = self.update_def['column_defs'][column_name][0]['predicate']['ref'] otype = self.update_def['column_defs'][column_name][0]['object'].get('type', None) sg = step_graph([self.entity_uri], pred, otype, graph=self.original_graph) if len(sg) == 0 or len(self.update_def['column_defs'][column_name]) == 1: return sg print "step 0 graph" for (s, p, o) in sg.triples((None, None, None)): print s, p, o for step in self.update_def['column_defs'][column_name][1:]: sg = step_graph([y for y in sg.objects(None, None)], step['predicate']['ref'], step['object'].get('type', None), graph=self.original_graph) print "next step graph" for (s, p, o) in sg.triples((None, None, None)): print s, p, o if len(sg) == 0: return sg # column path is empty, so nothing in the closure can match # Wait for it .... Here's the sieve. Return triples in the closure graph that have # objects on the column graph sgr = Graph() for (sgcs, sgcp, sgco) in sgc.triples((None, None, None)): if sgco in sg.objects(None, None): sgr.add((sgcs, sgcp, sgco)) print "reduced step graph" for (s, p, o) in sgr.triples((None, None, None)): print s, p, o return sgr if 'qualifier' not in step_def['object']: g = step_graph([uri], step_def['predicate']['ref'], step_def['object'].get('type', None)) # print "\nStep_triples for", step_def['column_name'], [uri], # step_def['predicate']['ref'], step_def['object'].get('type', None) for (s, p, o) in g.triples((None, None, None)): print unicode(s), unicode(p), unicode(o) # If the step_def is in a closure, and its the last step in the closure, then the # closure triples must be sieved against the objects defined by the column. 
if step_def['closure'] and step_def['last']: g = sieve_triples(g, step_def['column_name']) else: # Handle non-specific predicates qualified by SPARQL (a rare case for VIVO-ISF) q = 'select (?' + step_def['object']['name'] + ' as ?o) where { <' + str(uri) + '> <' + \ str(step_def['predicate']['ref']) + '> ?' + step_def['object']['name'] + ' . \n' + \ add_qualifiers([step_def]) + ' }\n' logger.debug(u"Qualified Step Triples Query {}".format(q)) result_set = vivo_query(q, self.query_parms) # SLOW g = Graph() for binding in result_set['results']['bindings']: o = make_rdf_term(binding['o']) g.add((uri, step_def['predicate']['ref'], o)) logger.debug(u"Step Triples {}".format(g.serialize(format='nt'))) return g
def __do_get(self):
    """
    Data is queried from VIVO and returned as a tab delimited text file suitable
    for editing using an editor or spreadsheet, and suitable for use by do_update.

    :return: Number of rows of data
    """
    from vivopump import vivo_query, make_get_data, unique_path, make_get_query, read_csv, write_csv
    from improve.improve import improve
    import codecs

    # Generate the get query, execute the query, shape the query results into the return object
    query = make_get_query(self.update_def)
    logger.debug(u"do_get query_parms\n{}".format(self.query_parms))
    logger.debug(u"do_get query\n{}".format(query))
    result_set = vivo_query(query, self.query_parms)
    data = make_get_data(self.update_def, result_set)

    # Write out the file
    outfile = codecs.open(self.out_filename, mode='w', encoding='ascii', errors='xmlcharrefreplace')
    columns = ['uri'] + self.update_def['entity_def']['order']
    outfile.write(self.inter.join(columns))  # write a header using the inter field separator between column names
    outfile.write('\n')
    for uri in sorted(data.keys()):
        for name in columns:
            if name in data[uri]:
                # Translate VIVO values via enumeration if any
                if name in self.update_def['column_defs']:
                    path = self.update_def['column_defs'][name]
                    # Warn/correct if path is unique and VIVO is not
                    if unique_path(path) and len(data[uri][name]) > 1:
                        logger.warning(u"VIVO has non-unique values for unique path {} at {} values {}".
                                       format(name, uri, data[uri][name]))
                        data[uri][name] = {next(iter(data[uri][name]))}  # Pick one element from multi-valued set
                        # BUG FIX: was logger.warning(u"Using {}", data[uri][name]) --
                        # logging interpolates %-style lazy args, so the {} was never
                        # filled in.  Use .format() like every other call here.
                        logger.warning(u"Using {}".format(data[uri][name]))
                    # Handle filters
                    if self.filter and 'filter' in path[len(path) - 1]['object']:
                        a = set()
                        for x in data[uri][name]:
                            was_string = x
                            new_string = improve(path[len(path) - 1]['object']['filter'], x)
                            if was_string != new_string:
                                logger.debug(u"{} {} {} FILTER IMPROVED {} to {}".
                                             format(uri, name, path[len(path) - 1]['object']['filter'],
                                                    was_string, new_string))
                            a.add(new_string)
                        data[uri][name] = a
                    # Handle enumerations
                    if 'enum' in path[len(path) - 1]['object']:
                        enum_name = path[len(path) - 1]['object']['enum']
                        a = set()
                        for x in data[uri][name]:
                            val = self.enum[enum_name]['get'].get(x, '')
                            if val != '':
                                a.add(val)
                            else:
                                logger.warning(u"WARNING: Unable to find {} in {}. Blank substituted in {}".
                                               format(x, enum_name, self.out_filename))
                        data[uri][name] = a
                # Gather values into a delimited string
                val = self.intra.join(data[uri][name])
                outfile.write(val.replace('\r', ' ').replace('\n', ' ').replace('\t', ' '))
            if name != columns[len(columns) - 1]:
                outfile.write(self.inter)
        outfile.write('\n')
    outfile.close()

    # Rewrite the file based on the order_by or uri if none
    sort_column_name = self.update_def['entity_def'].get('order_by', 'uri')
    data = read_csv(self.out_filename, delimiter=self.inter)
    sdata = {}
    try:
        order = sorted(data, key=lambda rown: data[rown][sort_column_name])
    except KeyError:
        logger.error(u"{} in order_by not found. No such column name. Sorting by uri.".
                     format(sort_column_name))
        order = sorted(data, key=lambda rown: data[rown]['uri'])
    row = 1
    for o in order:
        sdata[row] = data[o]
        row += 1
    write_csv(self.out_filename, sdata, delimiter=self.inter)
    return len(data)
def do_get(update_def, enum, filename, inter='\t', intra=';', do_filter=True, debug=True): """ Data is queried from VIVO and returned as a tab delimited text file suitable for editing using an editor or spreadsheet, and suitable for use by do_update. :param filename: Tab delimited file of data from VIVO :param: do_filter: boolean if True do the filters, otherwise do not apply filters :return: Number of rows of data """ from vivopump import vivo_query import codecs from vivopump import improve_title, improve_email, improve_phone_number, improve_date, \ improve_dollar_amount, improve_sponsor_award_id, improve_deptid, improve_display_name query = make_get_query(update_def) if debug: print query result_set = vivo_query(query, debug=debug) data = make_get_data(update_def, result_set) # Write out the file outfile = codecs.open(filename, mode='w', encoding='ascii', errors='xmlcharrefreplace') columns = ['uri'] + update_def['entity_def']['order'] outfile.write(inter.join(columns)) outfile.write('\n') for uri in sorted(data.keys()): for name in columns: if name in data[uri]: # Translate VIVO values via enumeration if any if name in update_def['column_defs']: path = update_def['column_defs'][name] # Warn/correct if path is unique and VIVO is not if unique_path(path) and len(data[uri][name]) > 1: print "WARNING. 
VIVO has non-unique values for unique path:", name, "at", uri, data[uri][name] data[uri][name] = {next(iter(data[uri][name]))} # Pick one element from the multi-valued set print data[uri][name] # Handle filters if do_filter and 'filter' in path[len(path) - 1]['object']: a = set() for x in data[uri][name]: was_string = x new_string = eval(path[len(path) - 1]['object']['filter'])(x) if debug and was_string != new_string: print uri, name, path[len(path) - 1]['object'][ 'filter'], "FILTER IMPROVED", was_string, 'to', \ new_string a.add(new_string) data[uri][name] = a # Handle enumerations if 'enum' in path[len(path) - 1]['object']: enum_name = path[len(path) - 1]['object']['enum'] a = set() for x in data[uri][name]: a.add(enum[enum_name]['get'].get(x, x)) # if we can't find the value in the # enumeration, just return the value data[uri][name] = a # Gather values into a delimited string val = intra.join(data[uri][name]) outfile.write(val.replace('\r', ' ').replace('\n', ' ').replace('\t', ' ')) if name != columns[len(columns) - 1]: outfile.write(inter) outfile.write('\n') outfile.close() return len(data)
def __do_get(self): """ Data is queried from VIVO and returned as a tab delimited text file suitable for editing using an editor or spreadsheet, and suitable for use by do_update. :return: Number of rows of data """ from vivopump import vivo_query, make_get_data, unique_path, make_get_query, read_csv, write_csv import codecs import sys from vivopump import improve_title, improve_email, improve_phone_number, improve_date, \ improve_dollar_amount, improve_sponsor_award_id, improve_deptid, improve_display_name, \ improve_org_name # Generate the get query, execute the query, shape the query results into the return object query = make_get_query(self.update_def) if self.verbose: print self.query_parms print query result_set = vivo_query(query, self.query_parms, self.verbose) data = make_get_data(self.update_def, result_set) # Write out the file outfile = codecs.open(self.out_filename, mode='w', encoding='ascii', errors='xmlcharrefreplace') columns = ['uri'] + self.update_def['entity_def']['order'] outfile.write( self.inter.join(columns) ) # write a header using the inter field separator between column names outfile.write('\n') for uri in sorted(data.keys()): for name in columns: if name in data[uri]: # Translate VIVO values via enumeration if any if name in self.update_def['column_defs']: path = self.update_def['column_defs'][name] # Warn/correct if path is unique and VIVO is not if unique_path(path) and len(data[uri][name]) > 1: print "WARNING. 
VIVO has non-unique values for unique path:", name, "at", uri, \ data[uri][name] data[uri][name] = { next(iter(data[uri][name])) } # Pick one element from multi-valued set print data[uri][name] # Handle filters if self.filter and 'filter' in path[len(path) - 1]['object']: a = set() for x in data[uri][name]: was_string = x new_string = eval( path[len(path) - 1]['object']['filter'])(x) if self.verbose and was_string != new_string: print uri, name, path[len(path) - 1]['object'][ 'filter'], "FILTER IMPROVED", was_string, 'to', \ new_string a.add(new_string) data[uri][name] = a # Handle enumerations if 'enum' in path[len(path) - 1]['object']: enum_name = path[len(path) - 1]['object']['enum'] a = set() for x in data[uri][name]: val = self.enum[enum_name]['get'].get(x, '') if val != '': a.add(val) else: print "WARNING: Unable to find ", x, "in", enum_name, \ ". Blank substituted in", self.out_filename data[uri][name] = a # Gather values into a delimited string val = self.intra.join(data[uri][name]) outfile.write( val.replace('\r', ' ').replace('\n', ' ').replace('\t', ' ')) if name != columns[len(columns) - 1]: outfile.write(self.inter) outfile.write('\n') outfile.close() # Rewrite the file based on the order_by or uri if none sort_column_name = self.update_def['entity_def'].get('order_by', 'uri') data = read_csv(self.out_filename, delimiter=self.inter) sdata = {} try: order = sorted(data, key=lambda rown: data[rown][sort_column_name]) except KeyError: print >>sys.stderr, "ERROR: ", sort_column_name, \ "in order_by not found. No such column name. Sorting by uri." order = sorted(data, key=lambda rown: data[rown]['uri']) row = 1 for o in order: sdata[row] = data[o] row += 1 write_csv(self.out_filename, sdata, delimiter=self.inter) return len(data)
def do_get(update_def, enum, filename, inter='\t', intra=';', do_filter=True, debug=True): """ Data is queried from VIVO and returned as a tab delimited text file suitable for editing using an editor or spreadsheet, and suitable for use by do_update. :param filename: Tab delimited file of data from VIVO :param: do_filter: boolean if True do the filters, otherwise do not apply filters :return: Number of rows of data """ from vivopump import vivo_query import codecs from vivopump import improve_title, improve_email, improve_phone_number, improve_date, \ improve_dollar_amount, improve_sponsor_award_id, improve_deptid, improve_display_name query = make_get_query(update_def) if debug: print query result_set = vivo_query(query, debug=debug) data = make_get_data(update_def, result_set) # Write out the file outfile = codecs.open(filename, mode='w', encoding='ascii', errors='xmlcharrefreplace') columns = ['uri'] + update_def['entity_def']['order'] outfile.write(inter.join(columns)) outfile.write('\n') for uri in sorted(data.keys()): for name in columns: if name in data[uri]: # Translate VIVO values via enumeration if any if name in update_def['column_defs']: path = update_def['column_defs'][name] # Warn/correct if path is unique and VIVO is not if unique_path(path) and len(data[uri][name]) > 1: print "WARNING. 
VIVO has non-unique values for unique path:", name, "at", uri, data[ uri][name] data[uri][name] = { next(iter(data[uri][name])) } # Pick one element from the multi-valued set print data[uri][name] # Handle filters if do_filter and 'filter' in path[len(path) - 1]['object']: a = set() for x in data[uri][name]: was_string = x new_string = eval(path[len(path) - 1]['object']['filter'])(x) if debug and was_string != new_string: print uri, name, path[len(path) - 1]['object'][ 'filter'], "FILTER IMPROVED", was_string, 'to', \ new_string a.add(new_string) data[uri][name] = a # Handle enumerations if 'enum' in path[len(path) - 1]['object']: enum_name = path[len(path) - 1]['object']['enum'] a = set() for x in data[uri][name]: a.add(enum[enum_name]['get'].get( x, x)) # if we can't find the value in the # enumeration, just return the value data[uri][name] = a # Gather values into a delimited string val = intra.join(data[uri][name]) outfile.write( val.replace('\r', ' ').replace('\n', ' ').replace('\t', ' ')) if name != columns[len(columns) - 1]: outfile.write(inter) outfile.write('\n') outfile.close() return len(data)
def __do_get(self): """ Data is queried from VIVO and returned as a tab delimited text file suitable for editing using an editor or spreadsheet, and suitable for use by do_update. :return: Number of rows of data """ from vivopump import vivo_query, make_get_data, unique_path, make_get_query, read_csv, write_csv import codecs import sys from vivopump import improve_title, improve_email, improve_phone_number, improve_date, \ improve_dollar_amount, improve_sponsor_award_id, improve_deptid, improve_display_name, \ improve_org_name # Generate the get query, execute the query, shape the query results into the return object query = make_get_query(self.update_def) if self.verbose: print self.query_parms print query result_set = vivo_query(query, self.query_parms, self.verbose) data = make_get_data(self.update_def, result_set) # Write out the file outfile = codecs.open(self.out_filename, mode='w', encoding='ascii', errors='xmlcharrefreplace') columns = ['uri'] + self.update_def['entity_def']['order'] outfile.write(self.inter.join(columns)) # write a header using the inter field separator between column names outfile.write('\n') for uri in sorted(data.keys()): for name in columns: if name in data[uri]: # Translate VIVO values via enumeration if any if name in self.update_def['column_defs']: path = self.update_def['column_defs'][name] # Warn/correct if path is unique and VIVO is not if unique_path(path) and len(data[uri][name]) > 1: print "WARNING. 
VIVO has non-unique values for unique path:", name, "at", uri, \ data[uri][name] data[uri][name] = {next(iter(data[uri][name]))} # Pick one element from multi-valued set print data[uri][name] # Handle filters if self.filter and 'filter' in path[len(path) - 1]['object']: a = set() for x in data[uri][name]: was_string = x new_string = eval(path[len(path) - 1]['object']['filter'])(x) if self.verbose and was_string != new_string: print uri, name, path[len(path) - 1]['object'][ 'filter'], "FILTER IMPROVED", was_string, 'to', \ new_string a.add(new_string) data[uri][name] = a # Handle enumerations if 'enum' in path[len(path) - 1]['object']: enum_name = path[len(path) - 1]['object']['enum'] a = set() for x in data[uri][name]: val = self.enum[enum_name]['get'].get(x, '') if val != '': a.add(val) else: print "WARNING: Unable to find ", x, "in", enum_name, \ ". Blank substituted in", self.out_filename data[uri][name] = a # Gather values into a delimited string val = self.intra.join(data[uri][name]) outfile.write(val.replace('\r', ' ').replace('\n', ' ').replace('\t', ' ')) if name != columns[len(columns) - 1]: outfile.write(self.inter) outfile.write('\n') outfile.close() # Rewrite the file based on the order_by or uri if none sort_column_name = self.update_def['entity_def'].get('order_by', 'uri') data = read_csv(self.out_filename, delimiter=self.inter) sdata = {} try: order = sorted(data, key=lambda rown: data[rown][sort_column_name]) except KeyError: print >>sys.stderr, "ERROR: ", sort_column_name, \ "in order_by not found. No such column name. Sorting by uri." order = sorted(data, key=lambda rown: data[rown]['uri']) row = 1 for o in order: sdata[row] = data[o] row += 1 write_csv(self.out_filename, sdata, delimiter=self.inter) return len(data)