def summarize(self):
    """
    Build a text report describing this pump.

    The report has four sections, each introduced by a line that starts with the
    current timestamp: the definition file name, the enumerations (JSON, indent 4),
    the update definitions (JSON, indent 4) and the generated get query.

    :return: the string summary report
    :rtype: basestring
    """
    from vivopump import make_get_query

    # Each section takes its own timestamp, in report order.
    title_section = str(datetime.now()) + " Pump Summary for " + self.json_def_filename + "\n"
    enum_section = str(datetime.now()) + " Enumerations\n" + dumps(self.enum, indent=4) + "\n"
    update_def_section = str(datetime.now()) + " Update Definitions\n" + \
        dumps(self.update_def, indent=4) + "\n"
    query_section = str(datetime.now()) + " Get Query\n" + make_get_query(self.update_def)

    return title_section + enum_section + update_def_section + query_section
def __do_get(self):
    """
    Query VIVO and write the result to ``self.out_filename`` as a delimited text file
    suitable for editing using an editor or spreadsheet, and suitable for use by do_update.

    The get query is generated from ``self.update_def`` and executed with
    ``self.query_parms``.  One line is written per uri: the ``uri`` column followed by the
    columns listed in the entity_def ``order``, separated by ``self.inter``.  Multiple
    values for one column are joined with ``self.intra``.  Before a column is written its
    values are:

    1. reduced to a single value when the column's path is unique but VIVO returned several,
    2. passed through the column's filter when ``self.filter`` is set and a filter is defined,
    3. translated through the column's enumeration when one is defined (values missing
       from the enumeration are dropped with a warning).

    The file is then read back, sorted by the entity_def ``order_by`` column (``uri`` when
    no ``order_by`` is given or the named column does not exist) and rewritten.

    :return: Number of rows of data read back from the written file
    """
    from vivopump import vivo_query, make_get_data, unique_path, make_get_query, read_csv, write_csv
    from improve.improve import improve
    import codecs

    # Generate the get query, execute the query, shape the query results into the return object

    query = make_get_query(self.update_def)
    logger.debug(u"do_get query_parms\n{}".format(self.query_parms))
    logger.debug(u"do_get query\n{}".format(query))
    result_set = vivo_query(query, self.query_parms)
    data = make_get_data(self.update_def, result_set)

    # Write out the file.  The with block guarantees the file is closed even when a row
    # fails part way through.  Characters that are not ascii are written as XML character
    # references.

    with codecs.open(self.out_filename, mode='w', encoding='ascii',
                     errors='xmlcharrefreplace') as outfile:
        columns = ['uri'] + self.update_def['entity_def']['order']
        outfile.write(self.inter.join(columns))  # header: column names separated by inter
        outfile.write('\n')

        for uri in sorted(data.keys()):
            for name in columns:
                if name in data[uri]:

                    # Columns with a definition may need their values adjusted before writing

                    if name in self.update_def['column_defs']:
                        path = self.update_def['column_defs'][name]

                        # Warn/correct if path is unique and VIVO is not

                        if unique_path(path) and len(data[uri][name]) > 1:
                            logger.warning(u"VIVO has non-unique values for unique path {} at {} values {}".
                                           format(name, uri, data[uri][name]))
                            data[uri][name] = {next(iter(data[uri][name]))}  # Pick one element
                            # The value must be formatted into the message here; logging only
                            # interpolates %-style arguments.
                            logger.warning(u"Using {}".format(data[uri][name]))

                        # The last step of the path carries the filter and enum settings
                        last_object = path[-1]['object']

                        # Handle filters

                        if self.filter and 'filter' in last_object:
                            improved = set()
                            for was_string in data[uri][name]:
                                new_string = improve(last_object['filter'], was_string)
                                if was_string != new_string:
                                    logger.debug(u"{} {} {} FILTER IMPROVED {} to {}".
                                                 format(uri, name, last_object['filter'], was_string,
                                                        new_string))
                                improved.add(new_string)
                            data[uri][name] = improved

                        # Handle enumerations

                        if 'enum' in last_object:
                            enum_name = last_object['enum']
                            translated = set()
                            for x in data[uri][name]:
                                val = self.enum[enum_name]['get'].get(x, '')
                                if val != '':
                                    translated.add(val)
                                else:
                                    logger.warning(u"WARNING: Unable to find {} in {}. Blank substituted in {}".
                                                   format(x, enum_name, self.out_filename))
                            data[uri][name] = translated

                    # Gather values into a delimited string.  Line breaks and tabs inside a
                    # value are replaced by spaces so they cannot break the row layout.

                    val = self.intra.join(data[uri][name])
                    outfile.write(val.replace('\r', ' ').replace('\n', ' ').replace('\t', ' '))

                # A separator follows every column except the last, whether or not it had a value
                if name != columns[-1]:
                    outfile.write(self.inter)
            outfile.write('\n')

    # Rewrite the file based on the order_by or uri if none

    sort_column_name = self.update_def['entity_def'].get('order_by', 'uri')
    data = read_csv(self.out_filename, delimiter=self.inter)
    try:
        order = sorted(data, key=lambda rown: data[rown][sort_column_name])
    except KeyError:
        logger.error(u"{} in order_by not found.  No such column name. Sorting by uri.".
                     format(sort_column_name))
        order = sorted(data, key=lambda rown: data[rown]['uri'])

    # Renumber the rows from 1 in sorted order
    sdata = {}
    for row, o in enumerate(order, 1):
        sdata[row] = data[o]
    write_csv(self.out_filename, sdata, delimiter=self.inter)
    return len(data)
def __do_get(self): """ Data is queried from VIVO and returned as a tab delimited text file suitable for editing using an editor or spreadsheet, and suitable for use by do_update. :return: Number of rows of data """ from vivopump import vivo_query, make_get_data, unique_path, make_get_query, read_csv, write_csv import codecs import sys from vivopump import improve_title, improve_email, improve_phone_number, improve_date, \ improve_dollar_amount, improve_sponsor_award_id, improve_deptid, improve_display_name, \ improve_org_name # Generate the get query, execute the query, shape the query results into the return object query = make_get_query(self.update_def) if self.verbose: print self.query_parms print query result_set = vivo_query(query, self.query_parms, self.verbose) data = make_get_data(self.update_def, result_set) # Write out the file outfile = codecs.open(self.out_filename, mode='w', encoding='ascii', errors='xmlcharrefreplace') columns = ['uri'] + self.update_def['entity_def']['order'] outfile.write( self.inter.join(columns) ) # write a header using the inter field separator between column names outfile.write('\n') for uri in sorted(data.keys()): for name in columns: if name in data[uri]: # Translate VIVO values via enumeration if any if name in self.update_def['column_defs']: path = self.update_def['column_defs'][name] # Warn/correct if path is unique and VIVO is not if unique_path(path) and len(data[uri][name]) > 1: print "WARNING. 
VIVO has non-unique values for unique path:", name, "at", uri, \ data[uri][name] data[uri][name] = { next(iter(data[uri][name])) } # Pick one element from multi-valued set print data[uri][name] # Handle filters if self.filter and 'filter' in path[len(path) - 1]['object']: a = set() for x in data[uri][name]: was_string = x new_string = eval( path[len(path) - 1]['object']['filter'])(x) if self.verbose and was_string != new_string: print uri, name, path[len(path) - 1]['object'][ 'filter'], "FILTER IMPROVED", was_string, 'to', \ new_string a.add(new_string) data[uri][name] = a # Handle enumerations if 'enum' in path[len(path) - 1]['object']: enum_name = path[len(path) - 1]['object']['enum'] a = set() for x in data[uri][name]: val = self.enum[enum_name]['get'].get(x, '') if val != '': a.add(val) else: print "WARNING: Unable to find ", x, "in", enum_name, \ ". Blank substituted in", self.out_filename data[uri][name] = a # Gather values into a delimited string val = self.intra.join(data[uri][name]) outfile.write( val.replace('\r', ' ').replace('\n', ' ').replace('\t', ' ')) if name != columns[len(columns) - 1]: outfile.write(self.inter) outfile.write('\n') outfile.close() # Rewrite the file based on the order_by or uri if none sort_column_name = self.update_def['entity_def'].get('order_by', 'uri') data = read_csv(self.out_filename, delimiter=self.inter) sdata = {} try: order = sorted(data, key=lambda rown: data[rown][sort_column_name]) except KeyError: print >>sys.stderr, "ERROR: ", sort_column_name, \ "in order_by not found. No such column name. Sorting by uri." order = sorted(data, key=lambda rown: data[rown]['uri']) row = 1 for o in order: sdata[row] = data[o] row += 1 write_csv(self.out_filename, sdata, delimiter=self.inter) return len(data)
def __do_get(self): """ Data is queried from VIVO and returned as a tab delimited text file suitable for editing using an editor or spreadsheet, and suitable for use by do_update. :return: Number of rows of data """ from vivopump import vivo_query, make_get_data, unique_path, make_get_query, read_csv, write_csv import codecs import sys from vivopump import improve_title, improve_email, improve_phone_number, improve_date, \ improve_dollar_amount, improve_sponsor_award_id, improve_deptid, improve_display_name, \ improve_org_name # Generate the get query, execute the query, shape the query results into the return object query = make_get_query(self.update_def) if self.verbose: print self.query_parms print query result_set = vivo_query(query, self.query_parms, self.verbose) data = make_get_data(self.update_def, result_set) # Write out the file outfile = codecs.open(self.out_filename, mode='w', encoding='ascii', errors='xmlcharrefreplace') columns = ['uri'] + self.update_def['entity_def']['order'] outfile.write(self.inter.join(columns)) # write a header using the inter field separator between column names outfile.write('\n') for uri in sorted(data.keys()): for name in columns: if name in data[uri]: # Translate VIVO values via enumeration if any if name in self.update_def['column_defs']: path = self.update_def['column_defs'][name] # Warn/correct if path is unique and VIVO is not if unique_path(path) and len(data[uri][name]) > 1: print "WARNING. 
VIVO has non-unique values for unique path:", name, "at", uri, \ data[uri][name] data[uri][name] = {next(iter(data[uri][name]))} # Pick one element from multi-valued set print data[uri][name] # Handle filters if self.filter and 'filter' in path[len(path) - 1]['object']: a = set() for x in data[uri][name]: was_string = x new_string = eval(path[len(path) - 1]['object']['filter'])(x) if self.verbose and was_string != new_string: print uri, name, path[len(path) - 1]['object'][ 'filter'], "FILTER IMPROVED", was_string, 'to', \ new_string a.add(new_string) data[uri][name] = a # Handle enumerations if 'enum' in path[len(path) - 1]['object']: enum_name = path[len(path) - 1]['object']['enum'] a = set() for x in data[uri][name]: val = self.enum[enum_name]['get'].get(x, '') if val != '': a.add(val) else: print "WARNING: Unable to find ", x, "in", enum_name, \ ". Blank substituted in", self.out_filename data[uri][name] = a # Gather values into a delimited string val = self.intra.join(data[uri][name]) outfile.write(val.replace('\r', ' ').replace('\n', ' ').replace('\t', ' ')) if name != columns[len(columns) - 1]: outfile.write(self.inter) outfile.write('\n') outfile.close() # Rewrite the file based on the order_by or uri if none sort_column_name = self.update_def['entity_def'].get('order_by', 'uri') data = read_csv(self.out_filename, delimiter=self.inter) sdata = {} try: order = sorted(data, key=lambda rown: data[rown][sort_column_name]) except KeyError: print >>sys.stderr, "ERROR: ", sort_column_name, \ "in order_by not found. No such column name. Sorting by uri." order = sorted(data, key=lambda rown: data[rown]['uri']) row = 1 for o in order: sdata[row] = data[o] row += 1 write_csv(self.out_filename, sdata, delimiter=self.inter) return len(data)