def test_records(): """Test the records function.""" table = (('foo', 'bar'), ('a', 1), ('b', 2)) actual = records(table) expect = ({'foo': 'a', 'bar': 1}, {'foo': 'b', 'bar': 2}) ieq(expect, actual)
def test_records_shortrows():
    """Test the records function on a table with short rows."""
    table = (('foo', 'bar'), ('a', 1), ('b',))
    actual = records(table)
    expect = ({'foo': 'a', 'bar': 1}, {'foo': 'b', 'bar': None})
    ieq(expect, actual)

def generate_data_types(self, table):
    # Generate column data types
    longest, type_list = [], []
    cont = petl.records(table.table)
    # Populate empty values for the columns
    for col in table.columns:
        longest.append(0)
        type_list.append('')
    for row in cont:
        for i in range(len(row)):
            # NA is the csv null value
            if type_list[i] == 'varchar' or row[i] == 'NA':
                pass
            else:
                var_type = self.data_type(row[i], type_list[i])
                type_list[i] = var_type
            # Calculate width
            width = len(str(row[i]).encode('utf-8'))
            if width > longest[i]:
                longest[i] = width
    return {
        'longest': longest,
        'headers': table.columns,
        'type_list': type_list
    }

def generate_data_types(self, table):
    # Generate column data types
    longest, type_list = [], []
    cont = petl.records(table.table)
    # Populate empty values for the columns
    for col in table.columns:
        longest.append(0)
        type_list.append('')
    for row in cont:
        for i in range(len(row)):
            # NA is the csv null value
            if type_list[i] == 'varchar' or row[i] in ['NA', '']:
                pass
            else:
                var_type = self.data_type(row[i], type_list[i])
                type_list[i] = var_type
            # Calculate width
            if len(str(row[i]).encode('utf-8')) > longest[i]:
                longest[i] = len(str(row[i]).encode('utf-8'))
    # In the type check above, 'NA' and '' are skipped.
    # If the entire column is either one of those (or a mix of the two)
    # the type will be empty.
    # Fill with a default varchar
    type_list = [typ or 'varchar' for typ in type_list]
    return {'longest': longest, 'headers': table.columns, 'type_list': type_list}

def replace_split_transactions(table, dict_split_transaction_details):
    global g
    list_rows_to_add = []
    list_rows_to_remove = []
    for row in petl.records(table):
        if row['Account'] == 'Split Transaction':
            sk_indiv_id = row['SK Individual ID']
            batch_date = row['Batch Date']
            amount_str = row['SK Amount']
            string_key = sk_indiv_id + ',' + batch_date
            contrib_amount = Decimal(re.sub(r'[^\d.]', '', amount_str))
            if string_key in dict_split_transaction_details:
                splits_total = Decimal(0)
                for split_entry in dict_split_transaction_details[string_key]:
                    splits_total += Decimal(re.sub(r'[^\d.]', '', split_entry['Amount']))
                if contrib_amount != splits_total:
                    print "*** ERROR! For Individual ID, Batch Date " + string_key + ", the main 'Split " \
                        "Transaction' entry amount was " + str(contrib_amount) + " but sum of split detail " \
                        "transactions was " + str(splits_total)
                else:
                    list_rows_to_remove.append({
                        'SK Individual ID': sk_indiv_id,
                        'Batch Date': batch_date,
                        'SK Amount': amount_str,
                        'Account': 'Split Transaction'
                    })
                    for split_entry in dict_split_transaction_details[string_key]:
                        list_rows_to_add.append({
                            'Env #': row['Env #'],
                            'Batch Date': batch_date,
                            'SK Amount': split_entry['Amount'],
                            'Type': row['Type'],
                            'Account': split_entry['Account'],
                            'Tax': split_entry['Tax'],
                            'Check #': row['Check #'],
                            'Notes': "Inserted from 'Split Transaction'. " + row['Notes'],
                            # 'Family ID': row['Family ID'],
                            'SK Individual ID': row['SK Individual ID']
                            # 'To Date': row['To Date'],
                            # 'Contribution Link': row['Contribution Link']
                        })
            else:
                print "*** ERROR! Cannot find any 'Split Transaction' details for record with 'Batch Date' " + \
                    row['Batch Date'] + ", contributed by 'Individual ID' " + row['SK Individual ID'] + " for the " \
                    "amount " + row['SK Amount']
    print '*** Count before remove_rows(): ' + str(petl.nrows(table))
    table = remove_rows(table, list_rows_to_remove)
    print '*** Count after remove_rows(): ' + str(petl.nrows(table))
    print '*** Count before add_rows(): ' + str(petl.nrows(table))
    table = add_rows(table, list_rows_to_add)
    print '*** Count after add_rows(): ' + str(petl.nrows(table))
    return table

def main(argv):
    global full_name2sk_indiv_id

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--attendance-filename",
        required=True,
        nargs="+",
        action="append",
        help="Attendance filename (input Servant Keeper attendance report file(s)...can be wildcard)",
    )
    parser.add_argument(
        "--mapping-filename",
        required=True,
        help="'Mapping' filename (CSV mapping file with "
        "'Last Name', 'Preferred Name' and 'Individual ID' Servant Keeper data columns)",
    )
    args = parser.parse_args()

    # Load up mapping matrix to map from Servant Keeper full_name's to Servant Keeper individual_id's
    full_name2sk_indiv_id = {}
    dup_names = {}
    ids_in_dups = {}
    with open(args.mapping_filename, "rb") as csvfile:
        csvreader = csv.reader(csvfile)
        for row in csvreader:
            key = row[0] + ", " + row[1]
            if key in full_name2sk_indiv_id:
                if key not in dup_names:
                    dup_names[key] = [full_name2sk_indiv_id[key]]
                    ids_in_dups[full_name2sk_indiv_id[key]] = key
                dup_names[key].append(row[2])
                ids_in_dups[row[2]] = key
            else:
                full_name2sk_indiv_id[key] = row[2]

    attendance_table = join_tables(args.attendance_filename[0], None, False)

    # print dup_names
    # print attendance_table

    reported_dups = set()
    for row in petl.records(attendance_table):
        if row["Individual ID"] in ids_in_dups and ids_in_dups[row["Individual ID"]] not in reported_dups:
            print "Duplicate name: " + ids_in_dups[row["Individual ID"]]
            reported_dups.add(ids_in_dups[row["Individual ID"]])

    sys.stdout.flush()
    sys.stderr.flush()

def import_file(self, filename, sheet, columns):
    """
    Used to import a file and return a dictionary of data.

    :param filename:
    :param sheet:
    :param columns:
    :return data: dictionary
    """
    table = xlsx.fromxlsx(filename, sheet)
    data = []
    for el in petl.records(table):
        data_child = {}
        for col in columns:
            data_child[col] = el[col]
        data.append(data_child)
    return data

def gather_semi_sep_by_indiv_id(table, dict_semi_column_fields):
    dict_id2groups = {}
    for row in petl.records(table):
        for col in dict_semi_column_fields:
            field_list = dict_semi_column_fields[col]
            value = row[col].strip()
            if value == '':
                continue
            else:
                sk_indiv_id = row['Individual ID']
                for group_name in value.split(';'):
                    group_name_str = group_name.strip()
                    if group_name_str in field_list:
                        if sk_indiv_id not in dict_id2groups:
                            dict_id2groups[sk_indiv_id] = []
                        dict_id2groups[sk_indiv_id].append(group_name_str)
    return dict_id2groups

def load_split_transaction_details(list_split_details_files):
    dict_split_transaction_details = {}
    if list_split_details_files is not None:
        for split_details_file in list_split_details_files:
            assert os.path.isfile(split_details_file), "Error: cannot open file '" + split_details_file + "'"
            table = petl.fromcsv(split_details_file)
            account_name = get_account_name(petl.values(table, 'Account'))
            for row in petl.records(table):
                if row['Account'] == 'Split Transaction':
                    string_key = row['Individual ID'] + ',' + row['Batch Date']
                    if string_key not in dict_split_transaction_details:
                        dict_split_transaction_details[string_key] = []
                    dict_split_transaction_details[string_key].append({
                        'Account': account_name,
                        'Amount': row['Amount'],
                        'Tax': row['Tax']
                    })
    return dict_split_transaction_details

def valuecounts(table, col_name):
    return_dict = {}
    reported_count = 0
    unreported_count = 0
    column = petl.values(table, col_name)
    nrows = petl.nrows(table)
    non_blanks = petl.select(table, '{' + quote_single_quote(col_name) + "} != ''")
    num_blanks = nrows - petl.nrows(non_blanks)
    counts_table = petl.valuecounts(non_blanks, col_name)
    for row in petl.records(counts_table):
        if row['frequency'] > 0.01:
            return_dict[row[col_name]] = row['count']
            reported_count += row['count']
        else:
            unreported_count += row['count']
    return_dict['<other>'] = unreported_count
    return_dict['<blank>'] = num_blanks
    return return_dict

def get_context_data(self, **kwargs):
    context = super().get_context_data(**kwargs)
    context["filename"] = self.object.downloaded_file.name.split(os.path.sep)[-1]
    context["columns_query_kwarg"] = self.columns_query_kwarg
    table = petl.fromcsv(self.object.downloaded_file)
    full_table_header = list(petl.header(table))
    context["column_options"] = full_table_header
    selected_columns = [c for c in self.request.GET.getlist(self.columns_query_kwarg) if c in full_table_header]
    context["selected_columns"] = selected_columns
    if selected_columns:
        context["header"] = selected_columns + ["Count"]
        context["rows"] = petl.records(
            petl.aggregate(
                table,
                selected_columns[0] if len(selected_columns) == 1 else selected_columns,
                len,
            )
        )
    return context

def test_records():
    table = (("foo", "bar"), ("a", 1), ("b", 2))
    actual = records(table)

    # access items
    it = iter(actual)
    o = it.next()
    eq_("a", o["foo"])
    eq_(1, o["bar"])
    o = it.next()
    eq_("b", o["foo"])
    eq_(2, o["bar"])

    # access attributes
    it = iter(actual)
    o = it.next()
    eq_("a", o.foo)
    eq_(1, o.bar)
    o = it.next()
    eq_("b", o.foo)
    eq_(2, o.bar)

def test_records_unevenrows():
    table = (("foo", "bar"), ("a", 1, True), ("b",))
    actual = records(table)

    # access items
    it = iter(actual)
    o = it.next()
    eq_("a", o["foo"])
    eq_(1, o["bar"])
    o = it.next()
    eq_("b", o["foo"])
    eq_(None, o["bar"])

    # access attributes
    it = iter(actual)
    o = it.next()
    eq_("a", o.foo)
    eq_(1, o.bar)
    o = it.next()
    eq_("b", o.foo)
    eq_(None, o.bar)

def load_to_warehouse(self, db_info):
    connection = pymysql.connect(
        host=db_info['host'],
        user=db_info['user'],
        password=db_info['passwd'],
        db=db_info['db'],
    )
    connection.cursor().execute('SET SQL_MODE=ANSI_QUOTES')
    for table in self.UPDATE_ORDER:
        data = self.etl_table.TABLES[table]()
        print(f'Loading {table}...\n{data}')
        columns = ','.join(etl.header(data))
        values = ','.join(['%s'] * len(etl.header(data)))
        duplicate_updates = ','.join([
            f'{column} = VALUES({column})' for column in etl.header(data)
        ])
        query = f"INSERT {table} ({columns}) VALUES ({values}) ON DUPLICATE KEY UPDATE {duplicate_updates};"
        print(query)
        connection.cursor().executemany(query, etl.records(data))
    connection.close()

def get_context_data(self, **kwargs):
    context = super().get_context_data(**kwargs)
    context["filename"] = self.object.downloaded_file.name.split(os.path.sep)[-1]
    context["count_query_kwarg"] = self.count_query_kwarg
    table = petl.fromcsv(self.object.downloaded_file)
    context["header"] = petl.header(table)
    try:
        record_count_to_show = int(self.request.GET.get(self.count_query_kwarg))
    except (TypeError, ValueError):
        record_count_to_show = self.count_increment
    # Potentially expensive, cache / save in database for dataset
    if petl.nrows(table) > record_count_to_show:
        context["load_more_url"] = (
            f"{self.request.path}?{self.count_query_kwarg}={record_count_to_show + self.count_increment}"
        )
    context["rows"] = petl.records(petl.head(table, record_count_to_show))
    return context

def build_cn_raster(landcover_raster, lookup_csv, soils_polygon,
                    soils_hydrogroup_field="SOIL_HYDRO", reference_raster=None,
                    out_cn_raster=None):
    """Build a curve number raster from a landcover raster, a soils polygon, and a
    crosswalk between landcover classes, soil hydro groups, and curve numbers.

    :param lookup_csv: [description]
    :type lookup_csv: [type]
    :param landcover_raster: [description]
    :type landcover_raster: [type]
    :param soils_polygon: polygon containing soils with a hydro classification.
    :type soils_polygon: [type]
    :param soils_hydrogroup_field: [description], defaults to "SOIL_HYDRO" (from the NCRS soils dataset)
    :type soils_hydrogroup_field: str, optional
    :param out_cn_raster: [description]
    :type out_cn_raster: [type]
    """

    # GP Environment ----------------------------
    msg("Setting up GP Environment...")
    # if reference_raster is provided, we use it to set the GP environment for
    # subsequent raster operations
    if reference_raster:
        if not isinstance(reference_raster, Raster):
            # read in the reference raster as a Raster object.
            reference_raster = Raster(reference_raster)
    else:
        reference_raster = Raster(landcover_raster)

    # set the snap raster, cell size, extent, and coordinate system for subsequent operations
    env.snapRaster = reference_raster
    env.cellSize = reference_raster.meanCellWidth
    env.extent = reference_raster
    env.outputCoordinateSystem = reference_raster

    cs = env.outputCoordinateSystem.exportToString()

    # SOILS -------------------------------------
    msg("Processing Soils...")
    # read the soils polygon into a raster, get list(set()) of all cell values from the landcover raster
    soils_raster_path = so("soils_raster")
    PolygonToRaster_conversion(soils_polygon, soils_hydrogroup_field, soils_raster_path, "CELL_CENTER")
    soils_raster = Raster(soils_raster_path)

    # use the raster attribute table to build a lookup of raster values to soil hydro codes
    # from the polygon (that were stored in the raster attribute table after conversion)
    if not soils_raster.hasRAT:
        msg("Soils raster does not have an attribute table. Building...", "warning")
        BuildRasterAttributeTable_management(soils_raster, "Overwrite")

    # build a 2D array from the RAT
    fields = ["Value", soils_hydrogroup_field]
    rows = [fields]
    # soils_raster_table = MakeTableView_management(soils_raster_path)
    with SearchCursor(soils_raster_path, fields) as sc:
        for row in sc:
            rows.append([row[0], row[1]])

    # turn that into a dictionary, where the key==soil hydro text and value==the raster cell value
    lookup_from_soils = {v: k for k, v in etl.records(rows)}
    # also capture a list of just the values, used to iterate conditionals later
    soil_values = [v['Value'] for v in etl.records(rows)]

    # LANDCOVER ---------------------------------
    msg("Processing Landcover...")
    if not isinstance(landcover_raster, Raster):
        # read in the landcover raster as a Raster object.
        landcover_raster_obj = Raster(landcover_raster)
    else:
        # already a Raster object; use it as-is
        landcover_raster_obj = landcover_raster
    landcover_values = []
    with SearchCursor(landcover_raster, ["Value"]) as sc:
        for row in sc:
            landcover_values.append(row[0])

    # LOOKUP TABLE ------------------------------
    msg("Processing Lookup Table...")
    # read the lookup csv, clean it up, and use the lookups from above to limit it to just
    # those values in the rasters
    t = etl\
        .fromcsv(lookup_csv)\
        .convert('utc', int)\
        .convert('cn', int)\
        .select('soil', lambda v: v in lookup_from_soils.keys())\
        .convert('soil', lookup_from_soils)\
        .select('utc', lambda v: v in landcover_values)

    # This gets us a table where the landcover class (as a number) corresponds to the
    # correct value in the converted soil raster, with the corresponding curve number.

    # DETERMINE CURVE NUMBERS -------------------
    msg("Assigning Curve Numbers...")
    # Use that to reassign cell values using conditional map algebra operations
    cn_rasters = []
    for rec in etl.records(t):
        cn_raster_component = Con(
            (landcover_raster_obj == rec.utc) & (soils_raster == rec.soil),
            rec.cn, 0)
        cn_rasters.append(cn_raster_component)

    cn_raster = CellStatistics(cn_rasters, "MAXIMUM")

    # REPROJECT THE RESULTS ---------------------
    msg("Reprojecting and saving the results....")
    if not out_cn_raster:
        out_cn_raster = so("cn_raster", "random", "in_memory")

    ProjectRaster_management(
        in_raster=cn_raster,
        out_raster=out_cn_raster,
        out_coor_system=cs,
        resampling_type="NEAREST",
        cell_size=env.cellSize)

    # cn_raster.save(out_cn_raster)
    return out_cn_raster

start_time = time.time()

# connect to mysql
mysql_engine = create_engine('mysql+mysqldb://' + mysql_user + ':' + mysql_pw + '@' + mysql_host + '/' + mysql_db)
mysql_engine.connect().execute('SET SQL_MODE=ANSI_QUOTES')

# connect to sqlite
sqlite_engine = create_engine('sqlite:///' + sqlite_path)

# migrate tables
for table in tables:
    print(table)
    data = etl.fromdb(sqlite_engine, 'select * from ' + table)
    if table == 'employees':
        recs = etl.records(data)
        emails = []
        for rec in recs:
            emails.append(rec['first_name'] + '.' + rec['last_name'] + '@mycompany.com')
        data2 = etl.addcolumn(data, 'email', emails)
    else:
        data2 = data
    etl.todb(data2, mysql_engine, table, create=True)

# load CSV file
data = etl.fromcsv(source=socialmedia_csv)
recs = etl.records(data)

# determine employee numbers
empnos = []
for rec in recs:

def write(self, rows, from_srid=None, buffer_size=DEFAULT_WRITE_BUFFER_SIZE):
    """
    Inserts dictionary row objects in the database.
    Args: list of row dicts, table name, ordered field names

    This doesn't currently use petl.todb for a few reasons:
        - petl uses executemany which isn't intended for speed (basically
          the equivalent of running many insert statements)
        - calls to DB functions like ST_GeomFromText end up getting quoted;
          not sure how to disable this.
    """
    # Get fields from the row because some fields from self.fields may be
    # optional, such as autoincrementing integers.
    # raise
    fields = rows.header()
    geom_field = self.geom_field

    # convert rows to records (hybrid objects that can behave like dicts)
    rows = etl.records(rows)

    # Get geom metadata
    if geom_field:
        srid = from_srid or self.get_srid()
    row_geom_type = re.match('[A-Z]+', rows[0][geom_field]).group() \
        if geom_field else None
    table_geom_type = self.geom_type if geom_field else None

    # Do we need to cast the geometry to a MULTI type? (Assuming all rows
    # have the same geom type.)
    if geom_field:
        if self.geom_type.startswith('MULTI') and \
                not row_geom_type.startswith('MULTI'):
            multi_geom = True
        else:
            multi_geom = False

    # Make a map of non geom field name => type
    type_map = OrderedDict()
    for field in fields:
        try:
            type_map[field] = [x['type'] for x in self.metadata if x['name'] == field][0]
        except IndexError:
            raise ValueError('Field `{}` does not exist'.format(field))
    type_map_items = type_map.items()

    fields_joined = ', '.join(fields)
    stmt = "INSERT INTO {} ({}) VALUES ".format(self.name, fields_joined)

    len_rows = len(rows)
    if buffer_size is None or len_rows < buffer_size:
        iterations = 1
    else:
        iterations = int(len_rows / buffer_size)
        iterations += (len_rows % buffer_size > 0)  # round up

    execute = self.db.cursor.execute
    commit = self.db.dbo.commit

    # Make list of value lists
    val_rows = []
    cur_stmt = stmt

    # DEBUG
    import psycopg2

    for i, row in enumerate(rows):
        val_row = []
        for field, type_ in type_map_items:
            if type_ == 'geometry':
                geom = row[geom_field]
                val = self._prepare_geom(geom, srid, multi_geom=multi_geom)
                val_row.append(val)
            else:
                val = self.prepare_val(row[field], type_)
                val_row.append(val)
        val_rows.append(val_row)

        # check if it's time to ship a chunk
        if i % buffer_size == 0:
            # Execute
            vals_joined = ['({})'.format(', '.join(vals)) for vals in val_rows]
            rows_joined = ', '.join(vals_joined)
            cur_stmt += rows_joined
            try:
                execute(cur_stmt)
            except psycopg2.ProgrammingError:
                print(self.db.cursor.query)
                raise
            commit()
            val_rows = []
            cur_stmt = stmt

    # Execute remaining rows (TODO clean this up)
    vals_joined = ['({})'.format(', '.join(vals)) for vals in val_rows]
    rows_joined = ', '.join(vals_joined)
    cur_stmt += rows_joined
    execute(cur_stmt)
    commit()

# namedtuples()
###############

import petl as etl
table = [['foo', 'bar'], ['a', 1], ['b', 2]]
d = etl.namedtuples(table)
d
list(d)


# records()
###############

import petl as etl
table = [['foo', 'bar'], ['a', 1], ['b', 2]]
d = etl.records(table)
d
list(d)


# rowgroupby()
##############

import petl as etl
table1 = [['foo', 'bar', 'baz'],
          ['a', 1, True],
          ['b', 3, True],
          ['b', 2]]
# group entire rows
for key, group in etl.rowgroupby(table1, 'foo'):
    print(key, list(group))
# group specific values
for key, group in etl.rowgroupby(table1, 'foo', 'bar'):
    print(key, list(group))

def gather_name_by_indiv_id(table):
    sk_indiv_id2name = {}
    for row in petl.records(table):
        sk_indiv_id2name[row['Individual ID']] = row['Preferred Name'] + ' ' + row['Last Name']
    return sk_indiv_id2name
