def synctable(self, sourceDb, targetDb, sourceTable, targetTable):
    sourceCursor = sourceDb.cursor()
    targetCursor = targetDb.cursor()
    affected_total = 0
    init_rowCount = targetTable.rowCount if targetTable.rowCount < sourceTable.rowCount else sourceTable.rowCount
    pbar = tqdm(total=sourceTable.rowCount, unit='records')
    pbar.update(init_rowCount)
    # keep pulling batches until the target has caught up with the source timestamp
    while sourceTable.lastUpdatetime > targetTable.lastUpdatetime:
        affected_rows = 0
        batchSize = 100000
        # fetch the next batch of rows at or after the target's last-seen timestamp
        sql = "SELECT * FROM (SELECT * FROM {schema}.{tablename} WHERE {timestamp}>=to_timestamp('{last_updatetime}','yyyy-mm-dd hh24:mi:ss.ff6') ORDER BY {timestamp}) WHERE ROWNUM<={batch_size}".format(
            timestamp=sourceTable.timestampField,
            schema=sourceTable.schema,
            tablename=sourceTable.tablename,
            last_updatetime=targetTable.lastUpdatetime,
            batch_size=batchSize)
        sourceRecord = etl.fromdb(lambda: CursorProxy(sourceDb.cursor()), sql)
        targetRecord = etl.fromdb(
            lambda: CursorProxy(targetDb.cursor()),
            "SELECT * FROM {schema}.{tablename} WHERE 1=0".format(
                schema=targetTable.schema, tablename=targetTable.tablename))
        sourceTable.columns = etl.header(sourceRecord)
        targetTable.columns = etl.header(targetRecord)
        # drop source columns that do not exist in the target
        for column in list(set(sourceTable.columns) - set(targetTable.columns)):
            sourceRecord = etl.cutout(sourceRecord, column)
        max_updatetime = sourceRecord.cut(sourceTable.timestampField).skip(1).max()[0]
        sourceRecord = sourceRecord.sort(sourceTable.timestampField)
        etl.appenddb(sourceRecord, CursorProxy(targetCursor), targetTable.tablename,
                     schema=targetTable.schema, commit=True)
        affected_rows += targetCursor.rowcount
        targetTable.lastUpdatetime = max_updatetime.strftime('%Y-%m-%d %H:%M:%S.%f')
        targetTable.rowCount += affected_rows
        pbar.update(affected_rows
                    if init_rowCount + affected_total + affected_rows < sourceTable.rowCount
                    else sourceTable.rowCount - init_rowCount - affected_total)
        affected_total += affected_rows
        pbar.set_description("%s |%d records updated." % (targetTable.tablename, affected_total))
    if targetTable.lastUpdatetime > sourceTable.lastUpdatetime:
        pbar.set_description("%s |timestamp >, skip." % (targetTable.tablename))
    elif targetTable.lastUpdatetime == sourceTable.lastUpdatetime and targetTable.rowCount == sourceTable.rowCount:
        pbar.set_description("%s |no data change." % (targetTable.tablename))
    elif targetTable.lastUpdatetime == sourceTable.lastUpdatetime and targetTable.rowCount > sourceTable.rowCount:
        pbar.set_description("%s |RowCount > but timestamp ==, skip." % (targetTable.tablename))
    elif targetTable.lastUpdatetime == sourceTable.lastUpdatetime and targetTable.rowCount < sourceTable.rowCount:
        pbar.set_description("%s |RowCount < but timestamp ==, skip." % (targetTable.tablename))
    pbar.close()
def get_delta(source_table, target_table, key='id'):
    source_table_headers = etl.header(source_table)
    target_table_headers = etl.header(target_table)
    if source_table_headers != target_table_headers:
        raise Exception(
            'Source table columns do not match target table columns')
    source_ids = etl.cut(source_table, key)
    target_ids = etl.cut(target_table, key)
    added_ids_table, _ = etl.diff(source_ids, target_ids)
    merged_table = etl.merge(source_table, target_table, key=key)
    load_frame = etl.todataframe(
        etl.selectin(target_table, key, etl.values(added_ids_table, key)))
    print(load_frame)
    for row in etl.data(merged_table):
        for i, col in enumerate(row):
            if isinstance(col, etl.transform.reductions.Conflict):
                changes = tuple(col)
                print('For car {}, {} changed from {} to {}'.format(
                    row[0], source_table_headers[i], changes[1], changes[0]))
                row_dict = dict(zip(source_table_headers, list(row)))
                row_dict[source_table_headers[i]] = changes[0]
                row_dict = {key: [val] for (key, val) in row_dict.items()}
                print(row_dict)
                df = pd.DataFrame(row_dict)
                load_frame = load_frame.append(df, ignore_index=True)
                break
    return etl.fromdataframe(load_frame)
def test_no_header_mutation():
    data = [
        ['dur', 'description', 'start', 'alpha'],
        [.4, 'test 1', datetime(2000, 1, 1, 15, 15), 0],
        [.7, 'test 1', datetime(2000, 1, 1, 20, 15), 0],
        [1.6, 'test 1', datetime(2000, 1, 20, 15, 15), 0],
        [8.4, 'test 1', datetime(2000, 1, 20, 20, 15), 0],
    ]
    result = group_entries_by_day(data)
    assert set(petl.header(data)) == set(petl.header(result))
def load(tables_by_id, output_folder, devices):
    for device_id in tables_by_id:
        name = valid_name(devices[device_id]['name'])
        tbl_device_file = path.join(output_folder, f"{name}.csv")
        if path.isfile(tbl_device_file):
            tbl_old = petl.fromcsv(tbl_device_file, delimiter=';')
            old_header = petl.header(tbl_old)
            new_header = petl.header(tables_by_id[device_id])
            if old_header == new_header:
                petl.appendcsv(tables_by_id[device_id], source=tbl_device_file, delimiter=';')
            else:
                # TODO: write to the new file
                raise ValueError(f"Incompatible headers:\n old={old_header}\n new={new_header}")
        else:
            petl.tocsv(tables_by_id[device_id], tbl_device_file, delimiter=';')
def toxlswithheader(table, filename, sheet, **kwargs):
    """
    Use `petl.toxls` to write the data in `table` to the XLS file `filename`,
    prepending a key-value metadata header if one is passed in as the keyword
    argument `metadata`. The first row in `table` is assumed to contain the
    header columns.
    """
    metadata = kwargs.pop("metadata", {})
    # prepare header
    header = petl.header(table)
    # prepare metadata rows using #-prefix, and :-suffix for keys
    metadata_rows = []
    for key, value in metadata.items():
        metadata_row = [''] * len(header)
        metadata_row[0] = '#' + str(key) + ':'
        metadata_row[1] = str(value)
        metadata_rows.append(metadata_row)
    # prepare data (stripped of header)
    data = petl.data(table)
    # combine metadata + header + data, then write out
    combined = metadata_rows + [header] + list(data)
    petl.toxls(combined, filename, sheet, **kwargs)
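A minimal usage sketch of the helper above, assuming the xlwt backend that petl.toxls relies on is installed; the file name, sheet name, and metadata keys are made up for illustration:

table = [('id', 'name'), (1, 'alpha'), (2, 'beta')]
# produces two '#key:' metadata rows above the 'id'/'name' header row
toxlswithheader(table, 'export.xls', 'Sheet1',
                metadata={'source': 'demo', 'version': '1.0'})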
def aggregate_collection(
    request: HttpRequest,
    collection_id: int,
) -> HttpResponse:
    """
    Value count computation could also be moved into a celery task that
    prepares the answer and delivers it to the user later (via email or on a
    results page).
    """
    collection = get_object_or_404(StarWarsCollection, id=collection_id)
    table = etl.fromcsv(collection.filepath)
    aggregate_keys, parameters_settings = parse_parameters(
        request.GET.get(
            'current_parameters',
            '0000001001',
        ),
    )
    if len(aggregate_keys) == 1:
        # aggregate does not work correctly if a list with 1 element is passed
        aggregate_keys = aggregate_keys[0]
    if len(aggregate_keys) == 0:
        # show no table if every option is disabled
        table = etl.empty()
    else:
        table = table.aggregate(key=aggregate_keys, aggregation=len)
    return render(
        request,
        'main/collection_aggregate.html',
        {
            'collection': collection,
            'parameters_settings': parameters_settings,
            'headers': etl.header(table),
            'data': etl.data(table),
        },
    )
def get_context_data(self, **kwargs):

    def url(name, value):
        'return present url query with updated name-value pair'
        qd = self.request.GET.copy()
        if value:
            if isinstance(value, type([])):
                qd.setlist(name, value)
            else:
                qd[name] = value
        else:
            del qd[name]
        return qd.urlencode()

    def xor(list1, list2):
        return list(set(list1) ^ set(list2))

    context = super().get_context_data(**kwargs)
    csv = str(settings.MEDIA_ROOT / str(self.object.name)) + '.csv'
    table = etl.fromcsv(csv)
    group = self.request.GET.getlist('group', None)
    if group:
        context['buttons'] = {field: [url('group', xor(group, [field])), field in group]
                              for field in etl.header(table)}
        context['header'] = {field: '' for field in group + ['Count']}
        context['load'] = url('group', None)
        context['rows'] = table.aggregate(key=group[0] if len(group) == 1 else group,
                                          aggregation=len).records()
    else:
        load = int(self.request.GET.get('load', 10))
        context['header'] = {field: url('group', field) for field in etl.header(table)}
        table = table.head(load + 1)
        if table.len() > load + 1:  # Notice: table header is counted as a row too
            context['load_more'] = url('load', load + 10)
        context['rows'] = table.head(load).records()
    return context
def test_header():
    """Test the header function."""
    table = (('foo', 'bar'), ('a', 1), ('b', 2))
    actual = header(table)
    expect = ('foo', 'bar')
    eq_(expect, actual)
def load_table_data(csv_file, limit) -> TableData:
    table = etl.fromcsv(csv_file)
    return TableData(
        header=etl.header(table),
        data=etl.data(table, limit),
        next_limit=limit + 10 if limit < table.len() else None,
    )
def tryExtractors(filename, encoding, delimiter_list=[',', '\t', ';', '|'],
                  quality_metric=lambda x: len(x)):
    result = []
    for d in delimiter_list:
        csvView = loadCSV(filename, encoding, delimiter=d)
        result.append((quality_metric(etl.header(csvView)), csvView))
    result.sort()
    return cleanFormatTable(result[-1][1])
def columns(self):
    """
    `Returns:`
        list
            List of the table's column names
    """
    return list(petl.header(self.table))
def load_more(filepath, page, total_items):
    length = 10
    offset = (page - 1) * length
    table = etl.fromcsv(filepath)
    rows = etl.rowslice(table, offset, offset + length)
    next_page = page + 1 if offset + length < total_items else None
    return etl.header(table), list(rows.dicts()), next_page
def validate(cursor, table, constraints, task_name):
    header = etl.header(table)
    problems = etl.validate(table, constraints=constraints, header=header)
    problems = etl.addfield(problems, 'task_name', task_name)
    problems = etl.addfield(problems, 'create_date', datetime.now())
    # etl.todb(problems, cursor, 'etl_logs')
    etl.appenddb(problems, cursor, 'tetl_logs')
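For reference, a small self-contained sketch of the constraints format that petl.validate expects, which is what the function above forwards; the field names and rules here are made up:

import petl as etl

constraints = [
    dict(name='id_is_int', field='id', test=int),                       # value must parse as int
    dict(name='name_not_empty', field='name', assertion=lambda v: v != ''),
]
table = [('id', 'name'), ('1', 'alpha'), ('x', '')]
problems = etl.validate(table, constraints=constraints, header=('id', 'name'))
print(etl.look(problems))  # one problem row per failed constraint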
def precip_table_etl_noaa(
    precip_table,
    rainfall_adjustment=1,
    frequency_min=1,
    frequency_max=1000,
    conversion_factor=2.54,
    desc_field="by duration for ARI (years):",
    duration_val="24-hr:"
):
    """
    Extract, Transform, and Load data from a NOAA PRECIPITATION FREQUENCY
    ESTIMATES matrix (in a csv) into an array used by the runoff calculator.

    Required Inputs:
        - precip_table: NOAA PRECIPITATION FREQUENCY ESTIMATES csv, in inches.
    Optional Inputs:
        - rainfall_adjustment: multiplier to adjust for future rainfall
            conditions. Defaults to 1.
        - frequency_min: the min. annual frequency to be returned. Default: 1
        - frequency_max: the max. annual frequency to be returned. Default: 1000
        - conversion_factor: apply to rainfall values. Default: 2.54
            (convert inches to centimeters).
        - desc_field: exact field name from NOAA table in first column.
            Defaults to "by duration for ARI (years):". Used for selecting data.
        - duration_val: exact row value in the desc_field from NOAA table that
            contains the duration of interest. Defaults to "24-hr:". Used for
            selecting data.
    Outputs:
        - precip_array: 1D array containing the 24-hour duration estimate for
            the 1, 2, 5, 10, 25, 50, 100, 200, 500, and 1000 year storm events
    """
    # load the csv table, skip the file header information, extract rows we need
    t1 = etl\
        .fromcsv(precip_table)\
        .skip(13)\
        .rowslice(0, 19)
    # grab raw data from the row containing the x-hour duration event info
    t2 = etl\
        .select(t1, desc_field, lambda v: v == duration_val)\
        .cutout(desc_field)
    # generate a new header with only columns within frequency min/max
    h = tuple(
        i for i in list(etl.header(t2))
        if (int(i) >= frequency_min and int(i) <= frequency_max)
    )
    # for events within freq range, convert to cm, adjust for future rainfall
    t3 = etl\
        .cut(t2, h)\
        .convertall(lambda v: round(float(v) * conversion_factor * rainfall_adjustment, 2))
    # convert to a 1D array (values cast to floats)
    precips = list(etl.data(t3)[0])
    # also convert to a dictionary, for lookup by event
    precips_lookup = list(etl.dicts(t3))[0]
    # return 1D array and dictionary
    return precips, precips_lookup
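A hedged usage sketch of the function above; 'pf_estimates.csv' is a hypothetical NOAA Atlas 14 precipitation-frequency CSV export laid out as the docstring describes, and the keyword values are illustrative only:

precips, precips_lookup = precip_table_etl_noaa(
    'pf_estimates.csv',          # hypothetical NOAA export
    rainfall_adjustment=1.1,     # e.g. +10% for projected future rainfall
    frequency_min=1,
    frequency_max=100,
)
print(precips_lookup['100'])     # 100-year, 24-hour depth in centimeters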
def load_grouped_data(csv_file, fields) -> TableData:
    table = etl.fromcsv(csv_file)
    if len(fields) == 1:
        fields = fields[0]
    return TableData(
        header=etl.header(table),
        data=etl.aggregate(table, key=fields, aggregation=len),
        next_limit=None,
    )
def create_config(csvfile, config_name):
    '''
    Creates a configuration file from a CSV file
    '''
    print csvfile
    var = ''
    try:
        open(config_name + ".ini")
        var = raw_input("This file already exists. Do you wish to continue? (Yes/No) ")
    except:
        pass
    if var == 'Yes':
        cfgfile = open(config_name + ".ini", "w")
        examplefile = open(config_name + ".example", "w")
    else:
        print "goodbye"
        sys.exit()
    c = fromcsv(csvfile)
    columns = header(c)
    it = iterdata(c)
    print it.next()
    examplefile.write(str(see(rowslice(c, 2, 3))))
    examplefile.close()
    # add the settings to the structure of the file, and let's write it out...
    Config = ConfigParser.ConfigParser()
    # don't change names to lower case
    Config.optionxform = str
    Config.add_section('FieldTypes')
    Config.add_section('FieldMap')
    for name in columns:
        # Config.set('FieldTypes', c)
        # print name
        new = name
        new = new.split("(", 1)[0].strip()
        # Connect words with underscore
        new = new.replace("/", "_")
        new = new.replace(" ", "_")
        new = new.replace("-", "_")
        new = new.lower()
        # try to guess some of the types from the names
        if "amount" in name:
            print name
            Config.set('FieldMap', name, new + " FLOAT")
        else:
            print name
            Config.set('FieldMap', name, new + " VARCHAR(10)")
    Config.write(cfgfile)
    cfgfile.close()
def load_to_warehouse(self, db_info):
    connection = pymysql.connect(
        host=db_info['host'],
        user=db_info['user'],
        password=db_info['passwd'],
        db=db_info['db'],
    )
    connection.cursor().execute('SET SQL_MODE=ANSI_QUOTES')
    for table in self.UPDATE_ORDER:
        data = self.etl_table.TABLES[table]()
        print(f'Loading {table}...\n{data}')
        columns = ','.join(etl.header(data))
        values = ','.join(['%s'] * len(etl.header(data)))
        duplicate_updates = ','.join([
            f'{column} = VALUES({column})'
            for column in etl.header(data)
        ])
        query = f"INSERT {table} ({columns}) VALUES ({values}) ON DUPLICATE KEY UPDATE {duplicate_updates};"
        print(query)
        connection.cursor().executemany(query, etl.records(data))
    connection.close()
def store_to_db(self, conn, tablename, data):
    try:
        if etl.nrows(data) == 0:
            return None
    except TypeError:
        return None

    cursor = conn.cursor()
    sql = "INSERT INTO %s (%s) " % (tablename, ','.join(etl.header(data))) + "VALUES %s"
    execute_values(cursor, sql, etl.data(data))
    conn.commit()
    conn.close()
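For context, a minimal self-contained sketch of how psycopg2's execute_values fills the single "VALUES %s" placeholder used above; the DSN and table name are placeholders:

import psycopg2
from psycopg2.extras import execute_values

conn = psycopg2.connect("dbname=example")          # placeholder DSN
cur = conn.cursor()
rows = [('alpha', 1), ('beta', 2)]
# execute_values expands the single %s into one VALUES tuple per row
execute_values(cur, "INSERT INTO demo (name, value) VALUES %s", rows)
conn.commit()
conn.close()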
def produce_weather_data(count):
    result = []
    output_data = petl.fromcsv(config.weather_data_output_path, delimiter='|')
    header = petl.header(output_data)
    result.insert(0, header)
    for key, group in petl.rowgroupby(output_data, key='Position'):
        random_index = random.randint(0, config.available_records - 1)
        group_list = list(group)
        result.append(tuple(group_list[random_index]))
    result = result[0:count]
    merged_output = ['|'.join(data) for data in result]
    merged_output = '\n'.join(merged_output)
    return merged_output
def collection(request: HttpRequest, pk: int) -> HttpResponse:
    limit = int(request.GET.get('limit', DEFAULT_LIMIT))
    columns: List[str] = request.GET.getlist('columns')
    collection: CollectionModel = get_object_or_404(CollectionModel, pk=pk)
    table: CSVView = etl.fromcsv(f'{settings.DATA_PATH}/{collection.filename}')
    if columns:
        table: CutOutView = table.valuecounts(*columns).cutout('frequency')
        headers: tuple = etl.header(table)
        rows: DataView = table.data()
        limit = None
    else:
        rows: DataView = table.data(0, limit)
        headers: tuple = etl.header(table)
        limit = limit + DEFAULT_LIMIT
    return render(
        request, 'collection.html', {
            'collection': collection,
            'headers': headers,
            'rows': rows,
            'limit': limit,
        })
def group_entries_by_day(inp):
    hdr = petl.header(inp)
    agg = OrderedDict()
    for field in hdr:
        # using first found value
        agg[field] = field, next
    agg['dur'] = 'dur', lambda durs: sum(durs, timedelta())
    agg['start'] = 'start', min
    with_day = petl.addfield(inp, 'start_date',
                             lambda row: row.get('start').date())
    index_keys = ('start_date', 'description')
    result = petl.aggregate(with_day, index_keys, agg)
    return petl.cutout(result, 'start_date')
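A self-contained sketch of the same OrderedDict aggregation-spec pattern, using only petl's documented aggregate API; the sample rows are made up:

from collections import OrderedDict
import petl

table = [('day', 'task', 'dur'),
         ('2000-01-01', 'standup', 0.4),
         ('2000-01-01', 'review', 0.7),
         ('2000-01-02', 'standup', 1.6)]
agg = OrderedDict()
agg['total_dur'] = 'dur', sum   # sum the 'dur' values within each group
agg['entries'] = len            # number of rows in each group
print(petl.look(petl.aggregate(table, 'day', agg)))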
def test_basics():
    t1 = (('foo', 'bar'),
          ('A', 1),
          ('B', 2))
    w1 = etl.wrap(t1)

    eq_(('foo', 'bar'), w1.header())
    eq_(etl.header(w1), w1.header())
    ieq((('A', 1), ('B', 2)), w1.data())
    ieq(etl.data(w1), w1.data())

    w2 = w1.cut('bar', 'foo')
    expect2 = (('bar', 'foo'),
               (1, 'A'),
               (2, 'B'))
    ieq(expect2, w2)
    ieq(etl.cut(w1, 'bar', 'foo'), w2)

    w3 = w1.cut('bar', 'foo').cut('foo', 'bar')
    ieq(t1, w3)
def test_basics():
    t1 = (('foo', 'bar'),
          ('A', 1),
          ('B', 2))
    w1 = FluentWrapper(t1)

    eq_(('foo', 'bar'), w1.header())
    eq_(petl.header(w1), w1.header())
    iassertequal((('A', 1), ('B', 2)), w1.data())
    iassertequal(petl.data(w1), w1.data())

    w2 = w1.cut('bar', 'foo')
    expect2 = (('bar', 'foo'),
               (1, 'A'),
               (2, 'B'))
    iassertequal(expect2, w2)
    iassertequal(petl.cut(w1, 'bar', 'foo'), w2)

    w3 = w1.cut('bar', 'foo').cut('foo', 'bar')
    iassertequal(t1, w3)
def get_context_data(self, **kwargs):
    context = super().get_context_data(**kwargs)
    context["filename"] = self.object.downloaded_file.name.split(os.path.sep)[-1]
    context["columns_query_kwarg"] = self.columns_query_kwarg
    table = petl.fromcsv(self.object.downloaded_file)
    full_table_header = list(petl.header(table))
    context["column_options"] = full_table_header
    selected_columns = [c for c in self.request.GET.getlist(self.columns_query_kwarg)
                        if c in full_table_header]
    context["selected_columns"] = selected_columns
    if selected_columns:
        context["header"] = selected_columns + ["Count"]
        context["rows"] = petl.records(
            petl.aggregate(
                table,
                selected_columns[0] if len(selected_columns) == 1 else selected_columns,
                len,
            )
        )
    return context
def load(request):
    filename = request.GET.get('name', '')
    fullpath = settings.DATA_DIR + filename
    """
    detector = UniversalDetector()
    file_open = open(fullpath)
    for line in file_open.readlines():
        detector.feed(line)
        if detector.done:
            break
    detector.close()
    file_open.close()
    """
    encoding = 'ascii'  # detector.result['encoding']
    response_data = {}
    a = tryExtractors(fullpath, encoding)
    response_data['result'] = [row for row in etl.head(a)]
    response_data['headers'] = etl.header(a)
    typeInference(a)
    return HttpResponse(json.dumps(response_data), content_type="application/json")
def validate_input(self, input):
    input_header = petl.header(input)
    assert self.input_fields.org_name in input_header, (
        f'Column "{self.input_fields.org_name}" not in input {input_header}')
    assert self.input_fields.settlement in set(input_header) | {None}, (
        f'Column "{self.input_fields.settlement}" not in input {input_header}')
    assert self.input_fields.date in set(input_header) | {None}, (
        f'Column "{self.input_fields.date}" not in input {input_header}')
    # output fields must not exist
    new_fields = {
        field_name(f, i)
        for f in self.output_fields.as_set
        for i in range(self.extramatches + 1)
    }
    assert set(input_header).isdisjoint(new_fields), (
        'Column[s] {} are already in input'.format(
            set(input_header).intersection(new_fields)))
def etl(self, *args, **kw):
    table = petl.fromxlsx(self._src_path)

    model = DEPTH_TO_WATER
    self._update_model(model, self._vocab)

    # group table by sys_loc_code
    header = petl.header(table)
    for name, records in petl.rowgroupby(petl.sort(table, 'sys_loc_code'),
                                         'sys_loc_code'):
        records = [dict(zip(header, record)) for record in records]
        record = records[0]
        location_id = self._post_location(record, model)
        thing_id = self._post_thing(record, model, location_id)

        print('---------------')
        print(f'len records {len(records)}')

        # self.add_package(record)
        self.observation.set_records(records)
        self.observation.etl(tids=self._make_tids(thing_id, record),
                             models=(model, ))
def test_basics():
    t1 = (('foo', 'bar'),
          ('A', 1),
          ('B', 2))
    w1 = etl(t1)

    eq_(('foo', 'bar'), w1.header())
    eq_(petl.header(w1), w1.header())
    ieq((('A', 1), ('B', 2)), w1.data())
    ieq(petl.data(w1), w1.data())

    w2 = w1.cut('bar', 'foo')
    expect2 = (('bar', 'foo'),
               (1, 'A'),
               (2, 'B'))
    ieq(expect2, w2)
    ieq(petl.cut(w1, 'bar', 'foo'), w2)

    w3 = w1.cut('bar', 'foo').cut('foo', 'bar')
    ieq(t1, w3)
class CollectionDetailsView(DetailView):
    template_name = 'collection_details.html'
    queryset = SWPeopleCollection.objects.all()
    page_size = 10

    def get_context_data(self, **kwargs) -> dict:
        c = super().get_context_data(**kwargs)
        table = self.object.get_table()
        buttons = etl.header(table)
        offset = int(self.request.GET.get('offset', 1))
        offset_to = offset * self.page_size
        if aggregation_keys := tuple(
                set(buttons).intersection(set(self.request.GET.keys()))):
            table = self.object.get_aggregate_data(aggregation_keys)

        # Building the row markup here is essentially to speed up rendering,
        # which would be slow in a Django template. Putting this in a template
        # tag would be more elegant; extending petl to render HTML directly
        # into a template would also be nice.
        data = ''
        for row in etl.data(table, 0, offset_to):
            data += '<tr><td>' + '</td><td>'.join(row) + '</td></tr>'
        c.update({
            'headers': etl.header(table),
            'buttons': buttons,
            'data': data,
            'offset': offset + 1,
            'offset_extra_params': '&'.join(
                ['{}=on'.format(i) for i in aggregation_keys]),
            'offset_reached': table.len() < offset_to,
            'aggregation_keys': aggregation_keys,
        })
        return c
def records_for_update(source, existing, update_keys, key='id',
                       source_key=None, existing_key=None):
    '''
    Return a petl compatible list of data which represents any source rows
    whose keys appear in update_keys and whose data is different from the
    corresponding row in existing.

    :param source: A petl table of source data
    :param existing: A petl table of existing data
    :param update_keys: A list of keys prefiltered to include only those keys
        that _could_ be source_update_candidates
    :param key: The name of the primary key field
    :param source_key: Optional key field name in source (defaults to ``key``)
    :param existing_key: Optional key field name in existing (defaults to ``key``)
    '''
    if source_key is None and existing_key is None:
        source_key = existing_key = key

    source_update_candidates = petl.transform.select(
        source, lambda rec: rec[source_key] in update_keys).lookup(source_key)
    existing_update_candidates = petl.transform.select(
        existing, lambda rec: rec[existing_key] in update_keys).lookup(existing_key)

    to_update = [petl.header(source)]
    for k, source_rec in source_update_candidates.items():
        existing_rec = existing_update_candidates[k]
        if will_change(source_rec, existing_rec):
            to_update.append(source_rec)

    return to_update
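A usage sketch of the function above with two tiny in-memory tables; will_change is not shown in this snippet, so a trivial stand-in is defined here purely for illustration:

import petl

def will_change(source_rec, existing_rec):
    # hypothetical stand-in for the project's own comparison helper
    return [tuple(r) for r in source_rec] != [tuple(r) for r in existing_rec]

source = [('id', 'colour'), (1, 'red'), (2, 'blue')]
existing = [('id', 'colour'), (1, 'red'), (2, 'black')]
# only id=2 differs, so the result is the header plus that source row group
print(records_for_update(source, existing, update_keys=[1, 2]))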
def get_context_data(self, **kwargs):
    context = super().get_context_data(**kwargs)
    context["filename"] = self.object.downloaded_file.name.split(os.path.sep)[-1]
    context["count_query_kwarg"] = self.count_query_kwarg
    table = petl.fromcsv(self.object.downloaded_file)
    context["header"] = petl.header(table)
    try:
        record_count_to_show = int(self.request.GET.get(self.count_query_kwarg))
    except (TypeError, ValueError):
        record_count_to_show = self.count_increment
    # Potentially expensive, cache / save in database for dataset
    if petl.nrows(table) > record_count_to_show:
        context[
            "load_more_url"
        ] = f"{self.request.path}?{self.count_query_kwarg}={record_count_to_show+self.count_increment}"
    context["rows"] = petl.records(petl.head(table, record_count_to_show))
    return context
def make_sqlalchemy_table(table, tablename, schema=None, constraints=True,
                          metadata=None):
    """
    Create an SQLAlchemy table based on a :mod:`petl` table.

    Parameters
    ----------

    table : sequence of sequences (petl table)
        Table data to use to infer types etc.
    tablename : string
        Name of the table
    schema : string
        Name of the database schema to create the table in
    constraints : bool
        If True use length and nullable constraints
    metadata : sqlalchemy.MetaData
        Custom table metadata

    """

    try:
        import sqlalchemy
    except ImportError as e:
        raise UnsatisfiedDependency(e, dep_message)

    if not metadata:
        metadata = sqlalchemy.MetaData()

    sql_table = sqlalchemy.Table(tablename, metadata, schema=schema)

    fields = header(table)
    cols = columns(table)

    for f in fields:
        sql_column = make_sqlalchemy_column(cols[f], f,
                                            constraints=constraints)
        sql_table.append_column(sql_column)

    return sql_table
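For context, a hand-built SQLAlchemy analogue of the kind of object the function returns, useful for checking generated DDL without a database; the table and column definitions are illustrative only, not the inferred ones:

import sqlalchemy
from sqlalchemy.schema import CreateTable

metadata = sqlalchemy.MetaData()
sql_table = sqlalchemy.Table(
    'example', metadata,
    sqlalchemy.Column('id', sqlalchemy.Integer, nullable=False),
    sqlalchemy.Column('name', sqlalchemy.String(10)),
)
print(CreateTable(sql_table))   # emit the CREATE TABLE statement as a string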
def attendance_file2table(filename, output_csv_filebase, add_extra_fields):
    global full_name2sk_indiv_id

    print "*** Parsing file: " + filename
    print

    attendance_dicts = []

    # CCB's Worship Service event IDs...
    event_ids = {}
    event_ids["8"] = 6
    event_ids["9"] = 7
    event_ids["10"] = 8
    event_ids["11:15"] = 9
    event_ids["Christmas"] = 13

    # The following are used to create CSV output filenames and to emit human-readable
    # event name if add_extra_fields flag is on
    event_names = {}
    event_names[6] = "08am"
    event_names[7] = "09am"
    event_names[8] = "10am"
    event_names[9] = "11_15am"
    event_names[13] = "Christmas Eve"

    # Time of event in Excel-parseable format
    event_times = {}
    event_times[6] = "08:00 AM"
    event_times[7] = "09:00 AM"
    event_times[8] = "10:00 AM"
    event_times[9] = "11:15 AM"
    event_times[13] = "04:00 PM"

    # Starting state...
    prior_line = None
    matched_month_year = None
    matched_service_time = None
    month = None
    year = None
    service_time = None
    line_number = 1
    total_row_dict = None
    event_id = None
    accumulated_row_totals_dict = {"week1": 0, "week2": 0, "week3": 0, "week4": 0,
                                   "week5": 0, "week6": 0, "total": 0}
    full_name = None
    phone = None
    num_processed_lines = 0

    for line in open(filename):

        # First pick off line at front of file indicating month and year that this
        # attendance file is for...
        if not matched_month_year:
            matched_month_year = re.search("For the month of ([A-Z][a-z]+), ([0-9]{4})", line)
            if matched_month_year:
                month = string2monthnum(matched_month_year.group(1))
                year = string2yearnum(matched_month_year.group(2))
                if not (month and year):
                    print >> sys.stderr, "*** Filename: " + filename + ", line number: " + str(line_number)
                    print >> sys.stderr, "*** ERROR! Invalid month or year found"
                    print >> sys.stderr, line
                    print >> sys.stderr
                    sys.exit(1)
                first_day_in_month, num_days_in_month = calendar.monthrange(year, month)

                # Create list of 6 date objects, month_sundays, representing week1, week2,
                # ... week6 Sunday dates. If a week has no Sunday, it is None
                day_countup = 1
                day_countup += 6 - first_day_in_month
                month_sundays = []
                if first_day_in_month != 6:
                    month_sundays.append(None)
                while day_countup <= num_days_in_month:
                    month_sundays.append(datetime.date(year, month, day_countup))
                    day_countup += 7
                while len(month_sundays) < 6:
                    month_sundays.append(None)
                christmas_eve_date = datetime.date(year, month, 24)

        # Second pick off line at front of file indicating worship service time that this
        # attendance file is for...
        elif not matched_service_time:
            matched_service_time = re.search("Worship Service - (Sunday |Summer )?([^ ]*)", line)
            if matched_service_time:
                service_time = matched_service_time.group(2)
                if service_time in event_ids:
                    event_id = event_ids[service_time]
                    event_name = event_names[event_id]
                else:
                    print >> sys.stderr, "*** Filename: " + filename + ", line number: " + str(line_number)
                    print >> sys.stderr, '*** ERROR! Unrecognized service_time: "' + service_time + '"'
                    print >> sys.stderr
                    sys.exit(1)

        # ...then match attendance (row per person with weeks they attended) and total
        # (summary at bottom) rows
        else:
            # Once we found row with totals...we're done, that's last line in attendance
            # file we need to parse
            matched_total_line = re.search("^ {18}Total: {13}(?P<attendance>( +[0-9]+)+)\r?$", line)
            if matched_total_line:
                totals_attendance_dict = attendance_str2dict(
                    matched_total_line.group("attendance"),
                    [-3, -9, -15, -20, -24, -29, -35], 3)
                break

            matched_attendance_line = re.search(
                "^ {6}"
                + "(?P<full_name>(?P<last_name>[A-Za-z]+([ \-'][A-Za-z]+)*), "
                + "(?P<first_name>([A-Za-z]+\.?)+([\-' ][A-Za-z]+)*)( \((?P<nick_name>[A-Za-z]+)\))?\.?)?\r?"
                + "(?P<phone>( +)?([0-9]{3}-[0-9]{3}-[0-9]{4}|Unlisted))?"
                + "(?P<attendance> +(1 +)+[1-6])?\r?$",
                line,
            )
            if matched_attendance_line:
                if matched_attendance_line.group("full_name"):
                    full_name = matched_attendance_line.group("full_name").strip()
                if matched_attendance_line.group("phone"):
                    phone = matched_attendance_line.group("phone").strip()
                if matched_attendance_line.group("attendance"):
                    if full_name:
                        attendance = matched_attendance_line.group("attendance").strip()
                        row_dict = attendance_str2dict(attendance,
                                                       [-1, -7, -13, -18, -22, -27, -33], 1)
                        row_dict["full_name"] = full_name
                        if phone:
                            row_dict["phone"] = phone
                        else:
                            row_dict["phone"] = ""
                        num_processed_lines += 1
                        full_name = None
                        phone = None
                        if row_dict["total"] != (row_dict["week1"] + row_dict["week2"]
                                                 + row_dict["week3"] + row_dict["week4"]
                                                 + row_dict["week5"] + row_dict["week6"]):
                            print >> sys.stderr, "*** Filename: " + filename + ", line number: " + str(line_number)
                            print >> sys.stderr, "*** ERROR! Bad row total, doesn't match sum of weeks 1-6"
                            print >> sys.stderr, row_dict
                            print >> sys.stderr
                            break
                        for key in accumulated_row_totals_dict:
                            accumulated_row_totals_dict[key] += row_dict[key]
                        attendance_dicts.append(row_dict)

        # Buffer the current line for line folding if needed (see 'line folding' above)
        prior_line = line
        line_number += 1

    print "*** Number of attendance lines processed: " + str(num_processed_lines)
    print "*** Number of attendees: " + str(accumulated_row_totals_dict["total"])
    print

    if output_csv_filebase and event_id:
        output_csv_filename = (output_csv_filebase + "/" + str(year) + format(month, "02d")
                               + "_" + str(event_names[event_id]) + ".csv")
        all_columns_table = petl.fromdicts(attendance_dicts)
        petl.tocsv(all_columns_table, output_csv_filename)

    # Build 2nd list of dicts, where each list item is dict of individual date/event
    # attendance. I.e. a row per worship service date vs original attendance dicts format
    # of a row per attendee across all weeks in month. This is the actual one returned
    # and eventually emitted into output file
    attendance_dicts2 = []
    for attendance_dict in attendance_dicts:
        for key in attendance_dict:
            if key[:4] == "week" and attendance_dict[key] != 0:
                week_index = int(key[4:5]) - 1
                if month_sundays[week_index] is not None:
                    attendance_dict2 = {}
                    full_name = attendance_dict["full_name"]
                    if full_name in full_name2sk_indiv_id:
                        attendance_dict2["Individual ID"] = full_name2sk_indiv_id[full_name]
                        if event_name == "Christmas Eve":
                            attendance_dict2["Date"] = christmas_eve_date
                        else:
                            attendance_dict2["Date"] = month_sundays[week_index]
                        attendance_dict2["Event ID"] = event_id
                        if add_extra_fields:
                            attendance_dict2["Time"] = event_times[event_id]
                            attendance_dict2["Full Name"] = full_name
                            attendance_dict2["Event Name"] = event_name
                            attendance_dict2["Week Num"] = week_index + 1
                        attendance_dicts2.append(attendance_dict2)
                    else:
                        print >> sys.stderr, '*** WARNING! Cannot find "' + full_name + '" in map'
                        print >> sys.stderr
                else:
                    print >> sys.stderr, '*** WARNING! Cannot find Sunday date for week index "' \
                        + str(week_index) + '"'
                    print >> sys.stderr

    # Check if numbers on Servant Keeper's reported Total: line match the totals we've been
    # accumulating per attendance row entry. If they don't match, show WARNING (not ERROR,
    # since via manual checks, it appears that Servant Keeper totals are buggy)
    if totals_attendance_dict:
        for key in accumulated_row_totals_dict:
            if accumulated_row_totals_dict[key] != totals_attendance_dict[key]:
                pp = pprint.PrettyPrinter(stream=sys.stderr)
                print >> sys.stderr, "*** WARNING! Servant Keeper reported totals do not match data totals"
                print >> sys.stderr, "Servant Keeper Totals:"
                pp.pprint(totals_attendance_dict)
                print >> sys.stderr, "Data Totals:"
                pp.pprint(accumulated_row_totals_dict)
                print >> sys.stderr
                break

    return_table = petl.fromdicts(attendance_dicts2)
    header = petl.header(return_table)
    if "Event Name" in header:
        return_table = petl.cut(return_table, "Full Name", "Event Name", "Time", "Week Num",
                                "Date", "Event ID", "Individual ID")
    else:
        return_table = petl.cut(return_table, "Date", "Event ID", "Individual ID")

    return return_table
# csv = comma delimited, tsv = tab delimited
pre_etl_time = time.time()
a = etl.fromtsv('snpdata.csv')
post_etl_time = time.time()
b = etl.fromtsv('popdata.csv')

pre_df_time = time.time()
df_a = pd.read_csv('snpdata.csv', sep='\t', header=0)
post_df_time = time.time()

# NOTE: petl's fromtsv is lazy, so the "ETL" figure only times creating the table
# view; no parsing happens until the table is iterated, unlike pandas.read_csv
print("ETL time to load A file: {} Pandas time to load A file: {}".format(
    post_etl_time - pre_etl_time, post_df_time - pre_df_time))

df_b = pd.read_csv('popdata.csv', sep='\t', header=0)

header_a = etl.header(a)
header_b = etl.header(b)
if _DEBUG:
    print(header_a)
    print(header_b)

b_renamed = b.rename({
    'Chromosome': 'Chr',
    'Coordinates': 'Pos',
    'Ref. Allele': 'Ref',
    'Non-Ref. Allele': 'Nref',
    'Derived Allele': 'Der',
    'Mutation type': 'Mut',
    'Gene': 'GeneId',
    'Gene Aliases': 'GeneAlias',
    'Gene Description': 'GeneDescr'
def main(argv):
    parser = argparse.ArgumentParser()
    parser.add_argument("--input-csv-filename", required=True,
                        help="Input UTF8 CSV to summarize")
    parser.add_argument("--sep-columns", required=False, nargs='*', default=argparse.SUPPRESS,
                        help="Column names of columns containing comma- or semi-colon-separated values")
    parser.add_argument("--sep-character", required=False,
                        help="Character used to separate values in multi-value fields. "
                             "Defaults to ';' if not specified.")
    parser.add_argument("--skip-columns", required=False, nargs='*', default=argparse.SUPPRESS,
                        help="Column names to NOT generate stats for")
    parser.add_argument("--skip-num-rows", required=False, type=int,
                        help="Skip specified number of header rows")
    parser.add_argument("--first-ccb-column", required=False,
                        help="String name of first CCB column. If specified, all preceding "
                             "columns will be labeled 'Servant Keeper' and this column and "
                             "all subsequent will be labeled 'CCB'")
    args = parser.parse_args()

    if args.first_ccb_column is not None:
        column_prefix = 'Servant Keeper '
    else:
        column_prefix = ''

    assert os.path.isfile(args.input_csv_filename), \
        "Error: cannot open file '" + args.input_csv_filename + "'"

    table = petl.fromcsv(args.input_csv_filename)

    # Skip header rows
    if args.skip_num_rows:
        skip_num = args.skip_num_rows
        assert skip_num > 0, "--skip-num-rows value '" + str(skip_num) + "' is invalid. Must be positive."
        it = iter(table)
        while skip_num >= 0:
            row = next(it)
            skip_num -= 1
        table = petl.setheader(table, row)
        table = petl.tail(table, petl.nrows(table) - args.skip_num_rows)

    # Print nicely formatted stats for each column
    sep = ''
    args_dict = vars(args)
    skip_columns_specified = 'skip_columns' in args_dict
    sep_char_specified = 'sep_character' in args_dict
    for column in petl.header(table):
        if args.first_ccb_column is not None and column == args.first_ccb_column:
            column_prefix = 'CCB '
        if not skip_columns_specified or column not in args.skip_columns:
            output_str = column_prefix + "Column '" + column + "'"
            print sep + output_str
            print >> sys.stderr, output_str
            if args.sep_columns is not None and column in args.sep_columns:
                if sep_char_specified:
                    sep_character = args.sep_character
                else:
                    sep_character = ';'
                output_str = num_dict2str(dict_dump(sep_valuecounter(table, column, sep_character)))
                print output_str
            else:
                output_str = num_dict2str(dict_dump(valuecounts(table, column)))
                print output_str
            sep = '\n'

    # Flush to ensure all output is written
    sys.stdout.flush()
    sys.stderr.flush()
def test_header():
    table = (('foo', 'bar'), ('a', 1), ('b', 2))
    actual = header(table)
    expect = ('foo', 'bar')
    eq_(expect, actual)
list(bar)

# values from multiple fields
table2 = [["foo", "bar", "baz"],
          [1, "a", True],
          [2, "bb", True],
          [3, "d", False]]
foobaz = etl.values(table2, "foo", "baz")
foobaz
list(foobaz)


# header()
##########

import petl as etl

table = [["foo", "bar"], ["a", 1], ["b", 2]]
etl.header(table)


# fieldnames()
##############

import petl as etl

table = [["foo", "bar"], ["a", 1], ["b", 2]]
etl.fieldnames(table)
etl.header(table)


# data()
########
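The excerpt cuts off at the data() heading; a minimal sketch of the corresponding call, consistent with the header()/fieldnames() examples above:

import petl as etl

table = [["foo", "bar"], ["a", 1], ["b", 2]]
d = etl.data(table)
list(d)   # -> [('a', 1), ('b', 2)]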
def cleanFormatTable(table):
    newtable = table
    for h in etl.header(table):
        # convert the running result, not the original table, so every column gets sanitized
        newtable = etl.convert(newtable, h, sanitize)
    return newtable
def typeInference(table):
    for h in etl.header(table):
        col = etl.cut(table, h)
        print etl.nrows(col)
def test_header():
    table = (("foo", "bar"), ("a", 1), ("b", 2))
    actual = header(table)
    expect = ("foo", "bar")
    eq_(expect, actual)