def etl_from_dir(self, data_dir="data"):
    """Extract, translate, load reinstatements (and not exclusions)
    from directory DATA_DIR.
    """
    # Get YYYYMM date of most recent reinstatement action
    most_recent = self.conn.get_latest_reinstatement_date().replace(
        '-', '')[:6] or "000000000"

    # Get the data from REIN CSV files. Gather reinstatement actions
    # since most_recent
    total_indiv = []
    total_bus = []
    for fname in sorted(glob.glob(os.path.join(data_dir, "*REIN.csv"))):
        if int(os.path.basename(fname)[:4]) <= int(most_recent[2:]):
            continue
        debug("Processing " + fname)
        reinstated = etl.fromcsv(fname)
        individual, business = clean_and_separate(reinstated)
        total_indiv.append(individual)
        total_bus.append(business)

    # Save to db, APPENDING TO existing data tables. Assumes tables
    # exist.
    if total_indiv:
        etl.appenddb(etl.cat(*total_indiv), self.conn.conn,
                     'individual_reinstatement')
    if total_bus:
        etl.appenddb(etl.cat(*total_bus), self.conn.conn,
                     'business_reinstatement')

    # It is possible to end up with duplicate rows if, say, an ETL
    # process is interrupted midway through. So we should find and
    # remove dupes.
    self.conn.dedupe_reinstatements()

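# A minimal standalone sketch of the same gather-then-append pattern: read every
# matching CSV, concatenate lazily with cat, and append to an existing table.
# The directory, file pattern, database file, and table name are illustrative
# only (a SQLite connection stands in for self.conn.conn).
import glob
import os
import sqlite3

import petl as etl

conn = sqlite3.connect('example.db')  # hypothetical database; table assumed to exist
tables = [etl.fromcsv(f) for f in sorted(glob.glob(os.path.join('data', '*REIN.csv')))]
if tables:
    # cat pads fields missing from any individual file with None
    etl.appenddb(etl.cat(*tables), conn, 'individual_reinstatement')
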
def cat(data, source, destination, header, missing):
    """Concatenate the named source tables and store the result in the
    destination data table."""
    sources = [data.get(s) for s in source]
    if len(header) == 0:
        n = petl.cat(*sources, missing=missing)
    else:
        n = petl.cat(*sources, header=header, missing=missing)
    data.set(destination, n)

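# For reference, petl.cat supports both keyword paths taken above. Without
# header= the output uses the union of all input fields, padding gaps with the
# missing= value; with header= the output is squared up to exactly those fields.
# Toy tables, unrelated to the wrapper's data container:
import petl

t1 = [['foo', 'bar'], [1, 'A']]
t2 = [['bar', 'baz'], ['B', True]]

print(petl.look(petl.cat(t1, t2, missing='n/a')))
print(petl.look(petl.cat(t1, t2, header=['foo', 'bar', 'baz'], missing='n/a')))
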
def transform_resource(self, source, target):
    if isinstance(self.__resource, str):
        self.__resource = source.package.get_resource(self.__resource)
    self.__resource.infer(only_sample=True)
    view1 = source.to_petl()
    view2 = self.__resource.to_petl()

    # Ignore fields
    if self.__ignore_fields:
        target.data = petl.stack(view1, view2)
        for field in self.__resource.schema.fields[len(target.schema.fields):]:
            target.schema.add_field(field)

    # Default
    else:
        if self.__sort:
            target.data = petl.mergesort(
                view1, view2, key=self.__sort, header=self.__field_names)
        else:
            target.data = petl.cat(view1, view2, header=self.__field_names)
        for field in self.__resource.schema.fields:
            if field.name not in target.schema.field_names:
                target.schema.add_field(field)
        if self.__field_names:
            for field in list(target.schema.fields):
                if field.name not in self.__field_names:
                    target.schema.remove_field(field.name)

def transform_resource(self, resource):
    target = resource
    source = self.get("resource")
    field_names = self.get("fieldNames")
    ignore_fields = self.get("ignoreFields")
    sort_by_field = self.get("sortByField")
    if isinstance(source, str):
        source = target.package.get_resource(source)
    source.infer()
    view1 = target.to_petl()
    view2 = source.to_petl()

    # Ignore fields
    if ignore_fields:
        for field in source.schema.fields[len(target.schema.fields):]:
            target.schema.add_field(field)
        resource.data = petl.stack(view1, view2)

    # Default
    else:
        for field in source.schema.fields:
            if field.name not in target.schema.field_names:
                target.schema.add_field(field)
        if field_names:
            for field in list(target.schema.fields):
                if field.name not in field_names:
                    target.schema.remove_field(field.name)
        if sort_by_field:
            key = sort_by_field
            resource.data = petl.mergesort(view1, view2, key=key, header=field_names)
        else:
            resource.data = petl.cat(view1, view2, header=field_names)

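# The transform steps above pick between three related petl functions. A rough
# sketch of how they differ, using toy tables rather than frictionless views:
import petl

a = [['id', 'name'], [1, 'x'], [3, 'z']]
b = [['id', 'name'], [2, 'y']]

petl.cat(a, b)                   # concatenate by field name, padding missing fields
petl.stack(a, b)                 # concatenate positionally, ignoring the second header
petl.mergesort(a, b, key='id')   # combine into a single table sorted on the key
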
def _get_actors(self):
    student_actors = (
        etl
        .fromcsv(f'{self._dirc}/student.csv', delimiter=';')
        .cut('id')
        .addcolumn('role', [], missing='Student')
    )
    teacher_actors = (
        etl
        .fromcsv(f'{self._dirc}/teacher.csv', delimiter=';')
        .cut('id')
        .addcolumn('role', [], missing='Teacher')
    )
    # team_user_actors = (
    #     etl
    #     .fromcsv(f'{self._dirc}/team_allocation.csv', delimiter=';')
    #     .cut('team_id')
    #     .rename('team_id', 'id')
    #     .convert('id', str)
    #     .addcolumn('role', [], missing='Team')
    #     .convert('role', str)
    # )
    return etl.cat(student_actors, teacher_actors)

def _get_participants(self):
    student_actors = (etl.fromcsv(f'{self._dirc}/student.csv', delimiter=';')
                      .cut('id'))
    teacher_actors = (etl.fromcsv(f'{self._dirc}/teacher.csv', delimiter=';')
                      .cut('id'))
    return etl.cat(student_actors, teacher_actors)

def _get_actor_pariticipants(self):
    student_actors = (
        etl
        .fromcsv(f'{self._dirc}/student.csv', delimiter=';')
        .cut('id', 'id')
        .rename(0, 'actor_id')
        .rename(1, 'participant_id')
    )
    teacher_actors = (
        etl
        .fromcsv(f'{self._dirc}/teacher.csv', delimiter=';')
        .cut('id', 'id')
        .rename(0, 'actor_id')
        .rename(1, 'participant_id')
    )
    # team_user_actors = (
    #     etl
    #     .fromcsv(f'{self._dirc}/team_allocation.csv', delimiter=';')
    #     .cut('team_id', 'student_id')
    #     .rename('team_id', 'actor_id')
    #     .convert('actor_id', str)
    #     .rename('student_id', 'participant_id')
    #     .convert('participant_id', str)
    # )
    return etl.cat(student_actors, teacher_actors)

def test_concat(self):
    tbl1 = self.tbl
    tbl2 = Table([{'first': 'Mary', 'last': 'Nichols'}])
    tbl3 = Table([{'first': 'Lucy', 'last': 'Peterson'}])

    tbl1.concat(tbl2, tbl3)

    expected_tbl = Table(petl.cat(tbl1.table, tbl2.table, tbl3.table))
    assert_matching_tables(expected_tbl, tbl1)

def _get_actor_pariticipants(self):
    # wait to clarify issue in get_actors
    student_actors = (etl.fromcsv(f'{self._dirc}/student.csv', delimiter=';')
                      .cut('id', 'id')
                      .rename(0, 'actor_id')
                      .rename(1, 'participant_id'))
    teacher_actors = (etl.fromcsv(f'{self._dirc}/teacher.csv', delimiter=';')
                      .cut('id', 'id')
                      .rename(0, 'actor_id')
                      .rename(1, 'participant_id'))
    return etl.cat(student_actors, teacher_actors)

def _get_actors(self):
    # TODO: Deal with both
    student_actors = (etl.fromcsv(f'{self._dirc}/all_users.csv', delimiter=';')
                      .select(lambda row: row.role == 'STUDENT')
                      .cut('id')
                      .addcolumn('role', [], missing='Student'))
    teacher_actors = (etl.fromcsv(f'{self._dirc}/all_users.csv', delimiter=';')
                      .select(lambda row: row.role == 'TEACHER')
                      .cut('id')
                      .addcolumn('role', [], missing='Teacher'))
    return etl.cat(student_actors, teacher_actors)

def from_s3_csv(cls, bucket, key, from_manifest=False, aws_access_key_id=None,
                aws_secret_access_key=None, **csvargs):
    """
    Create a ``parsons table`` from a key in an S3 bucket.

    `Args:`
        bucket: str
            The S3 bucket.
        key: str
            The S3 key
        from_manifest: bool
            If True, treats `key` as a manifest file and loads all urls into a
            `parsons.Table`. Defaults to False.
        aws_access_key_id: str
            Required if not included as environmental variable.
        aws_secret_access_key: str
            Required if not included as environmental variable.
        \**csvargs: kwargs
            ``csv_reader`` optional arguments
    `Returns:`
        `parsons.Table` object
    """  # noqa: W605

    from parsons.aws import S3
    s3 = S3(aws_access_key_id, aws_secret_access_key)

    if from_manifest:
        with open(s3.get_file(bucket, key)) as fd:
            manifest = json.load(fd)
        s3_keys = [x["url"] for x in manifest["entries"]]
    else:
        s3_keys = [f"s3://{bucket}/{key}"]

    tbls = []
    for key in s3_keys:
        # TODO handle urls that end with '/', i.e. urls that point to "folders"
        _, _, bucket_, key_ = key.split("/", 3)
        file_ = s3.get_file(bucket_, key_)
        if files.compression_type_for_path(key_) == 'zip':
            file_ = files.zip_archive.unzip_archive(file_)
        tbls.append(petl.fromcsv(file_, **csvargs))

    return cls(petl.cat(*tbls))

def _get_actors(self):
    student_actors = (etl.fromcsv(f'{self._dirc}/all_users.csv', delimiter=',')
                      .select(lambda row: row.role != 'TEACHER')
                      .rename('user_id', 'id')
                      .cut('id')
                      .addcolumn('role', [], missing='student'))
    teacher_actors = (etl.fromcsv(f'{self._dirc}/all_users.csv', delimiter=',')
                      .select(lambda row: row.role == 'TEACHER')
                      .rename('user_id', 'id')
                      .cut('id')
                      .addcolumn('role', [], missing='instructor'))
    return etl.cat(student_actors, teacher_actors)

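# The .addcolumn('role', [], missing=...) idiom above works because petl pads a
# column that is shorter than the table with the missing= value, which in effect
# adds a constant column. A minimal illustration with made-up ids:
import petl as etl

ids = [['id'], [1], [2], [3]]
with_role = etl.addcolumn(ids, 'role', [], missing='student')
print(etl.look(with_role))  # every row gets role='student'
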
def test_toxlsx_appendxlsx(xlsx_test_table):

    # setup
    f = NamedTemporaryFile(delete=True, suffix='.xlsx')
    f.close()

    # test toxlsx
    toxlsx(xlsx_test_table, f.name, 'Sheet1')
    actual = fromxlsx(f.name, 'Sheet1')
    ieq(xlsx_test_table, actual)

    # test appendxlsx
    appendxlsx(xlsx_test_table, f.name, 'Sheet1')
    expect = etl.cat(xlsx_test_table, xlsx_test_table)
    ieq(expect, actual)

def test_appendxlsx_with_non_str_header(xlsx_table_with_non_str_header, xlsx_test_table):

    f = NamedTemporaryFile(delete=True, suffix='.xlsx')
    f.close()

    # write first table
    toxlsx(xlsx_test_table, f.name, 'Sheet1')
    actual = fromxlsx(f.name, 'Sheet1')
    ieq(xlsx_test_table, actual)

    # test appendxlsx
    appendxlsx(xlsx_table_with_non_str_header, f.name, 'Sheet1')
    expect = etl.cat(xlsx_test_table, xlsx_table_with_non_str_header)
    ieq(expect, actual)

def __init__(self, fasta_path, gff3_path, seqid=None):
    """
    An annotated reference genome.

    Parameters
    ----------
    fasta_path : string
        Path to reference genome FASTA file.
    gff3_path : string
        Path to genome annotations GFF3 file.

    """

    # store initialisation parameters
    self._fasta_path = fasta_path
    self._gff3_path = gff3_path
    self._seqid = seqid

    # setup access to reference sequence
    self._fasta = pyfasta.Fasta(fasta_path)

    # setup access to GFF3 as a table
    if isinstance(gff3_path, (list, tuple)):
        tbl_features = etl.cat(*[etl.fromgff3(p) for p in gff3_path])
    else:
        tbl_features = etl.fromgff3(gff3_path)
    tbl_features = (
        tbl_features
        .unpackdict('attributes', ['ID', 'Parent'])
        .rename({'ID': 'feature_id', 'Parent': 'parent_id', 'end': 'stop'})
        .select(lambda row: (row.stop - row.start) > 0)
    )

    # limit data to a single chromosome
    if seqid is not None:
        tbl_features = tbl_features.eq('seqid', seqid)
    self._tbl_features = tbl_features.cache()

    # index features by ID
    self._idx_feature_id = self._tbl_features.recordlookupone('feature_id')

    # index features by parent ID
    self._idx_parent_id = self._tbl_features.recordlookup('parent_id')

    # index features by genomic location
    self._idx_location = self._tbl_features.facetintervalrecordlookup(
        'seqid', 'start', 'stop', include_stop=True)

def sync(ctx: typer.Context,
         project: str = typer.Argument(
             ..., help='The name for the project, specified in config file'),
         since: datetime = typer.Option(..., formats=['%Y-%m-%d']),
         until: datetime = typer.Option(..., formats=['%Y-%m-%d']),
         dry: bool = typer.Option(
             False, help='Use log entries instead of uploading them to redmine'),
         drain: bool = typer.Option(
             False, help='Use drain issues for entries without specified dest')):
    config = setup_config(ctx, ctx.meta['config_path'])
    setup_http(ctx)
    ctx.meta['rdm_user'] = extract.get_redmine_user(config["redmine"]["url"])

    time_entries = get_toggl_enteries(config, project, since, until)
    issues = get_redmine_issues(config, project, since)
    issue_ids = petl.columns(issues)['id']
    entries_to_load, unset_entries = petl.biselect(
        time_entries, lambda row: row['issue_id'] in issue_ids)

    if drain and petl.nrows(unset_entries):
        log.info('Using drain')
        drained, unset_entries = drained_entries(ctx, issues, unset_entries, project)
        log.info(f'Drained {petl.nrows(drained)} issues')
        entries_to_load = petl.cat(entries_to_load, drained)

    if petl.nrows(unset_entries):
        log.warning(f'There\'re {petl.nrows(unset_entries)} unset entries')

    if get_proj_attr(config, project, 'group_entries'):
        log.info('Using group by day and description')
        entries_to_load = transform.group_entries_by_day(entries_to_load)

    load.to_redmine_time(config["redmine"]["url"],
                         entries_to_load,
                         activity_id=get_proj_attr(config, project, 'rdm_activity_id'),
                         user_id=ctx.meta['rdm_user'].get('id'),
                         dry=dry)

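# A compact sketch of the biselect-then-cat flow used in sync(), with toy data
# standing in for the Toggl entries and Redmine issue ids:
import petl

entries = [['issue_id', 'hours'], [1, 2.0], [99, 1.5]]
known_issue_ids = {1, 2, 3}

matched, unmatched = petl.biselect(entries, lambda row: row['issue_id'] in known_issue_ids)
# after draining, recovered entries would be concatenated back on, as above
combined = petl.cat(matched, unmatched)
print(petl.nrows(combined))
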
def test_toxlsx_appendxlsx():

    # setup
    tbl = (('foo', 'bar'),
           ('A', 1),
           ('B', 2),
           ('C', 2),
           (u'é', datetime(2012, 1, 1)))
    f = NamedTemporaryFile(delete=True, suffix='.xlsx')
    f.close()

    # test toxlsx
    toxlsx(tbl, f.name, 'Sheet1')
    actual = fromxlsx(f.name, 'Sheet1')
    ieq(tbl, actual)

    # test appendxlsx
    appendxlsx(tbl, f.name, 'Sheet1')
    expect = etl.cat(tbl, tbl)
    ieq(expect, actual)

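# A condensed version of the round trip these tests assert: toxlsx writes the
# sheet, appendxlsx adds the same rows again, and the result matches cat of the
# table with itself. Requires openpyxl; the temp-file handling mirrors the tests.
from tempfile import NamedTemporaryFile

import petl as etl

tbl = (('foo', 'bar'), ('A', 1), ('B', 2))
f = NamedTemporaryFile(delete=False, suffix='.xlsx')
f.close()

etl.toxlsx(tbl, f.name, 'Sheet1')
etl.appendxlsx(tbl, f.name, 'Sheet1')
assert etl.nrows(etl.fromxlsx(f.name, 'Sheet1')) == etl.nrows(etl.cat(tbl, tbl))
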
def join_tables(filename_pattern_list, output_csv_filebase, add_extra_fields):
    curr_table = None

    filenames_list = []
    for filename_pattern in filename_pattern_list:
        for filename in glob.glob(filename_pattern):
            filenames_list.append(filename)

    for filename in sorted(set(filenames_list)):
        if not os.path.isfile(filename):
            print("*** Error! Cannot open file '" + filename + "'", file=sys.stderr)
            print(file=sys.stderr)
        else:
            next_table = attendance_file2table(filename, output_csv_filebase,
                                               add_extra_fields)
            if curr_table is not None:
                curr_table = petl.cat(curr_table, next_table)
            else:
                curr_table = next_table

    return curr_table

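# Because petl.cat is lazy, re-wrapping the running table on every iteration
# builds a deeply nested pipeline. One alternative (a sketch, with petl.fromcsv
# standing in for the original attendance_file2table helper) is to collect the
# per-file tables and concatenate once:
import glob

import petl


def join_tables_once(filename_pattern_list, file2table=petl.fromcsv):
    filenames = sorted({f for pattern in filename_pattern_list
                        for f in glob.glob(pattern)})
    tables = [file2table(f) for f in filenames]
    return petl.cat(*tables) if tables else None
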
def concat(self, *tables, missing=None):
    """
    Concatenates one or more tables onto this one. Note that the tables
    do not need to share exactly the same fields. Any missing fields
    will be padded with None, or whatever is provided via the
    ``missing`` keyword argument.

    `Args:`
        tables: Parsons Table or list
            A single table, or a list of tables
        missing: bool
            The value to use when padding missing values
    `Returns:`
        ``None``
    """

    if type(tables) not in [list, tuple]:
        tables = [tables]

    petl_tables = [tbl.table for tbl in tables]

    self.table = petl.cat(self.table, *petl_tables, missing=missing)

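# Usage sketch for the method above, assuming the Parsons Table constructor
# accepts a list of dicts as in the test_concat example earlier:
from parsons import Table

base = Table([{'first': 'Bob', 'last': 'Smith'}])
extra = Table([{'first': 'Mary', 'phone': '555-0100'}])
base.concat(extra, missing='')  # unmatched fields ('last', 'phone') are padded with ''
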
print("B only rows: {}".format(b_only))

# Export missing locations to csv
if a_only > 0:
    locs_only_in_a.tocsv('missing_locations_a.csv')
else:
    locs_only_in_b.tocsv('missing_locations_b.csv')

# find conflicts between A/B on Chr and Pos columns
ab_merge = etl.merge(a_conv, b_conv, key=('Chr', 'Pos'))
# magic command for IPython display
# ab_merge.display(caption='ab_merge',
#                  td_styles=lambda v: highlight if isinstance(v, etl.Conflict) else '')

# Create a new list of all conflicting values
ab = etl.cat(a_conv.addfield('source', 'a', index=0),
             b_conv.addfield('source', 'b', index=0))
ab_conflicts = ab.conflicts(key=('Chr', 'Pos'), exclude='source')
# magic command for IPython display
# ab_conflicts.display(10)

# Highlight specific conflicts
ab_conflicts_mut = ab.conflicts(key=('Chr', 'Pos'), include='Mut')
# magic command for IPython display
# ab_conflicts_mut.display(10, caption='Mut conflicts',
#                          td_styles={'Mut': highlight})

ab_conflict_num = ab_conflicts_mut.nrows()
if _DEBUG:
    print("Total number of A/B conflicts: {}".format(ab_conflict_num))

########## Json extraction and mapping
tableJ = etl.fromjson('cust_data.json', header=['id', 'gender', 'first_name',
                                                'last_name', 'email', 'ville'])
tableJ = etl.movefield(tableJ, 'gender', 4)

########## CSV extraction and conversion
tableCSV = etl.fromcsv('week_cust.csv')
tableCSV = etl.convert(tableCSV, 'id', int)

########### Sqlserver connection and extraction
connectionSqlServer = pyodbc.connect(
    "Driver={SQL Server Native Client 11.0};"
    "Server=81_64_msdn;"
    "Database=BD4client;"
    "Trusted_Connection=yes;"
    "convert_unicode=True;")
cursor = connectionSqlServer.cursor()
cursor.execute('SELECT id, first_name, last_name, email, gender, ville FROM client_DATA')
tableSqlServer = cursor.fetchall()
tableSqlServer = [('id', 'first_name', 'last_name', 'email', 'gender', 'ville')] + tableSqlServer
cursor.close()
connectionSqlServer.close()

######### Staging area: transformation and concatenation
StagingArea = etl.cat(tableCSV, tableJ, tableSqlServer)
StagingArea = etl.convert(StagingArea, 'gender',
                          {'Male': 'M', 'Female': 'F', 'male': 'M',
                           'female': 'F', None: 'N'})
StagingArea = etl.rename(StagingArea, 'ville', 'city')

######## MySQL load
connection = mysql.connect(host="localhost", user="******", passwd="",
                           db="customerdatabase")
curseur = connection.cursor()
curseur.execute('SET SQL_MODE=ANSI_QUOTES')
#### load data, assuming table "CustomerData" already exists in the database
etl.appenddb(StagingArea, connection, 'customerdata',
             schema='customerdatabase', commit='commit')
curseur.close()
connection.close()

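# petl.cat accepts any row container whose first row is the header, which is why
# the fetched SQL Server rows can be concatenated directly once the header tuple
# is prepended. A toy version of that mix of in-memory and prepared tables:
import petl as etl

csv_like = [('id', 'first_name'), (1, 'Amal')]
db_rows = [(2, 'Bree'), (3, 'Chen')]
db_like = [('id', 'first_name')] + db_rows  # prepend the header, as in the script

print(etl.look(etl.cat(csv_like, db_like)))
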
          ['D', 'xyz', 9.0],
          ['E', None]]
table5 = [['bar', 'foo'],
          ['A', 1],
          ['B', 2]]
table7 = [['bar', 'foo'],
          ['A', 1],
          ['B', 2]]
table8 = [['bar', 'baz'],
          ['C', True],
          ['D', False]]

from petl import look, cat
look(table1)
look(table2)
table3 = cat(table1, table2)
look(table3)

# can also be used to square up a single table with uneven rows
look(table4)
look(cat(table4))

# use the header keyword argument to specify a fixed set of fields
look(table5)
table6 = cat(table5, header=['A', 'foo', 'B', 'bar', 'C'])
look(table6)

# using the header keyword argument with two input tables
look(table7)
look(table8)
table9 = cat(table7, table8, header=['A', 'foo', 'B', 'bar', 'C'])
look(table9)

print("unjoined rowcount: ", etl.nrows(unjoined))

dor_condos_unjoined_unmatched = etl.antijoin(
    unjoined, non_unique_parcel_id_rows,
    key='parcel_id').addfield('reason', 'non-active/remainder mapreg')
print("non-active/remainder mapreg error rowcount: ",
      etl.nrows(dor_condos_unjoined_unmatched))
if DEV:
    print(etl.look(dor_condos_unjoined_unmatched))

dor_condos_unjoined_duplicates = etl.antijoin(
    unjoined, dor_condos_unjoined_unmatched,
    key='source_object_id').addfield('reason', 'non-unique active/remainder mapreg')
print("non-unique active/remainder mapreg error rowcount: ",
      etl.nrows(dor_condos_unjoined_duplicates))
if DEV:
    print(etl.look(dor_condos_unjoined_duplicates))

error_table = etl.cat(dor_condos_unjoined_unmatched, dor_condos_unjoined_duplicates)
if DEV:
    print(etl.look(error_table))

# Write to engine db
if not DEV:
    print('Writing condos...')
    joined.todb(pg_db, 'dor_condominium')
    print('Writing errors...')
    error_table.todb(pg_db, 'dor_condominium_error')

print("Completed in ", datetime.now() - start, " minutes.")

]

table1 = etl.addfield(
    etl.convertnumbers(
        etl.setheader(etl.fromcsv('winequality-red.csv'), table_header)),
    "Type", "Red")
table2 = etl.addfield(
    etl.convertnumbers(
        etl.setheader(etl.fromcsv('winequality-white.csv'), table_header)),
    "Type", "White")

#print(etl.head(table1))
#print(etl.head(table2))

table1_filtered = etl.select(table1, "Quality", lambda v: v > 6)
table2_filtered = etl.select(table2, "Quality", lambda v: v > 4)

good_wines = etl.cat(table1_filtered, table2_filtered)

good_wines_enhanced = etl.addfields(
    good_wines,
    [("Max Acidity",
      lambda rec: rec["Fixed Acidity"] + rec["Volatile Acidity"]),
     ("Locked SO2", lambda rec: rec["Total SO2"] - rec["Free SO2"])])
#print(etl.head(good_wines_enhanced))
#print(etl.tail(good_wines_enhanced))

gwe_sorted = etl.sort(good_wines_enhanced, key=["Quality", "Sugar"])
#print(etl.head(gwe_sorted))

print(etl.lookall(etl.tail(gwe_sorted, 500)))

def download(self, urls=[]):
    # timeout setting for requests
    # timeout = urllib3.Timeout(connect=2.0, read=7.0)
    # http = urllib3.PoolManager(timeout=timeout)
    http = urllib3.PoolManager()

    report_data = []
    for url in urls:
        # print(url)
        report_filename = self.get_report_filename(hashlib.md5(url).hexdigest())
        if cache:
            # print('use cache')
            cache_key = url
            cache_timeout = CONNECTOR_INFO.get('report_cache_timeout', 60 * 60)
            z_report = cache.get(cache_key)
            if z_report is not None:
                new_report_data = petl.io.fromcsv(
                    petl.MemorySource(zlib.decompress(z_report)))
                # print(len(new_report_data))
                if not report_data:
                    # print('NEw cat')
                    report_data = new_report_data
                else:
                    report_data = petl.cat(report_data, new_report_data)
                continue

            logging.info('Download Report from {}'.format(url))
            r = http.request('GET', url,
                             retries=urllib3.Retry(redirect=2, backoff_factor=2))
            if r.status == 200:
                report = r.data
                r.release_conn()
                z_report = zlib.compress(report)
                cache.set(cache_key, z_report, timeout=cache_timeout)
                # return petl.io.fromcsv(petl.MemorySource(report))
                new_report_data = petl.io.fromcsv(petl.MemorySource(report))
                # print(len(new_report_data))
                if not report_data:
                    report_data = new_report_data
                else:
                    report_data = petl.cat(report_data, new_report_data)
            elif r.status == 403:
                raise Exception(r.data)
            else:
                logging.info(r.data)
                logging.info(r.status)
                logging.info(r.headers)
        else:
            # move to init
            # print('Not cache')
            if not os.path.exists(self.report_folder):
                os.makedirs(self.report_folder)

            if not os.path.exists(report_filename):
                logging.info('Download Report from {}'.format(url))
                r = http.request('GET', url,
                                 retries=urllib3.Retry(redirect=2, backoff_factor=2))
                if r.status == 200:
                    with open(report_filename, 'wb') as f:
                        f.write(r.data)
                    r.release_conn()

            logging.info('Read from {}'.format(report_filename))
            new_report_data = petl.io.fromcsv(report_filename)
            if not report_data:
                report_data = new_report_data
            else:
                report_data = petl.cat(report_data, new_report_data)

    return report_data

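# petl.MemorySource, used above for the cached/decompressed report bytes, wraps
# an in-memory buffer so fromcsv can treat it like a file. A minimal example:
import petl

payload = b'id,name\n1,alpha\n2,beta\n'
table = petl.fromcsv(petl.MemorySource(payload))
print(petl.nrows(table))  # 2
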
def add_rows(table, list_rows_to_add):
    add_table = petl.fromdicts(list_rows_to_add)
    return petl.cat(table, add_table)

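# Usage sketch for add_rows (column names are illustrative); rows missing a key
# are padded with None by cat:
import petl

base = petl.fromdicts([{'id': 1, 'name': 'a'}])
extended = add_rows(base, [{'id': 2, 'name': 'b'}, {'id': 3}])
print(petl.look(extended))
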
def prepare_calls(calls_file_path, output_dir, food_needs_user, complex_needs_user,
                  simple_needs_user, call_log_review_user):
    """Prepares call log records for import"""

    # Expected file is in 'windows-1252' file encoding
    spreadsheet = (
        etl.fromcsv(calls_file_path, encoding='windows-1252')
        .rename(rename_map)
        .select(lambda row: row['latest_attempt_date'])
        .addfield('import_data', partial(serialize_row, keys=header_map.keys()))
        .convert('latest_attempt_date', parse_date)
        .addfield('created_at', lambda row: row['latest_attempt_date'])
        .addfield('updated_at', lambda row: row['latest_attempt_date'])
    )

    needs_fields = ['nhs_number', 'category', 'name', 'created_at', 'updated_at']
    notes_fields = ['nhs_number', 'category', 'body', 'created_at', 'updated_at']

    original_triage_needs = (
        spreadsheet
        .addfield('category', 'phone triage')
        .addfield('name', MSG_ORIGINAL_TRIAGE_NEED)
        .addfield('completed_on', determine_triage_completion)
        .cut(*needs_fields, 'completed_on')
    )

    generated_header = ['nhs_number', 'created_at', 'updated_at', 'category']
    original_triage_call_notes = (
        spreadsheet
        .selectnotnone('was_contact_made')
        .rowmapmany(generate_call_notes, header=generated_header)
        .addfield('body', MSG_CALL_LOG_NOTE)
        .cut(*notes_fields)
    )

    original_triage_import_notes = (
        spreadsheet
        .addfield('category', 'phone_import')
        .addfield('body', partial(compose_body, fields=header_map))
        .cut(*notes_fields, 'import_data')
    )

    food_needs = (
        spreadsheet
        .select(needs_food)
        .addfield('category', 'groceries and cooked meals')
        .convert('food_priority', parse_food_priority)
        .addfield('supplemental_data', construct_supplemental_data)
        .addfield('completed_on', determine_food_completion)
        .addfield('user_id', food_needs_user)
        .addfield('name', partial(compose_food_need_desc, fields=header_map))
        .cut(*needs_fields, 'completed_on', 'supplemental_data', 'user_id')
    )

    callback_needs = (
        spreadsheet
        .convert('callback_date', parse_callback_date)
        .select(needs_callback)
        .addfield('category', 'phone triage')
        .addfield('name', partial(compose_callback_need_desc, fields=header_map))
        .addfield('start_on', determine_callback_start_date)
        .cut(*needs_fields, 'start_on')
    )

    prescription_needs = (
        spreadsheet
        .select(lambda row: row['addl_medication_prescriptions'])
        .addfield('category', 'prescription pickups')
        .addfield('name', partial(compose_other_need_desc, fields=header_map))
        .addfield('user_id', simple_needs_user)
        .cut(*needs_fields, 'user_id')
    )

    mental_wellbeing_needs = (
        spreadsheet
        .select(lambda row: row['addl_mental_wellbeing'])
        .addfield('category', 'physical and mental wellbeing')
        .addfield('name', partial(compose_other_need_desc, fields=header_map))
        .addfield('user_id', complex_needs_user)
        .cut(*needs_fields, 'user_id')
    )

    financial_needs = (
        spreadsheet
        .select(lambda row: row['addl_financial'])
        .addfield('category', 'financial support')
        .addfield('name', partial(compose_other_need_desc, fields=header_map))
        .addfield('user_id', complex_needs_user)
        .cut(*needs_fields, 'user_id')
    )

    other_needs = (
        spreadsheet
        .select(needs_other_support)
        .addfield('category', 'other')
        .addfield('name', partial(compose_other_need_desc, fields=header_map))
        .addfield('user_id', partial(determine_other_need_user,
                                     complex_needs_user=complex_needs_user,
                                     simple_needs_user=simple_needs_user,
                                     call_log_review_user=call_log_review_user))
        .cut(*needs_fields, 'user_id')
    )

    # TODO: prefix with [Import]
    contact_profile_updates = (
        spreadsheet
        .addfield('additional_info', partial(compose_additional_info, fields=header_map))
        .addfield('delivery_details', partial(compose_delivery_details, fields=header_map))
        .addfield('dietary_details', compose_dietary_details)
        .convert('has_covid_symptoms', parse_covid_symptoms)
        .cut('nhs_number', 'additional_info', 'delivery_details', 'dietary_details',
             'has_covid_symptoms')
    )

    # TODO: Improve implementation and readability of QA bits
    #       (currently loads all tables into memory)
    # TODO: Consider adding stats to output somewhere for additional QA
    lookups = {
        'original_triage_needs': original_triage_needs.dictlookupone('nhs_number'),
        'original_triage_call_notes': original_triage_call_notes.dictlookup('nhs_number'),  # returns list
        'food_needs': food_needs.dictlookupone('nhs_number'),
        'callback_needs': callback_needs.dictlookupone('nhs_number'),
        'remaining_needs': etl.cat(prescription_needs, mental_wellbeing_needs,
                                   financial_needs,
                                   other_needs).dictlookup('nhs_number')  # returns list
    }

    quality_assurance = (
        spreadsheet
        .addfield('call_log', partial(compose_body, fields=header_map))
        .addfield('original_triage_status',
                  partial(qa_original_triage_status, lookups['original_triage_needs']))
        .addfield('original_triage_call_notes',
                  partial(qa_original_triage_call_notes, lookups['original_triage_call_notes']))
        .addfield('food_need', partial(qa_food_need, lookups['food_needs']))
        .addfield('callback_need', partial(qa_callback_need, lookups['callback_needs']))
        .addfield('remaining_needs', partial(qa_remaining_needs, lookups['remaining_needs']))
        .cut('nhs_number', 'latest_attempt_date', 'original_triage_status',
             'original_triage_call_notes', 'food_need', 'callback_need',
             'remaining_needs', 'call_log')
    )

    # Write files
    quality_assurance.tocsv(join(output_dir, 'quality_assurance.csv'))
    contact_profile_updates.tocsv(join(output_dir, 'contact_profile_updates.csv'))
    original_triage_needs.tocsv(join(output_dir, 'original_triage_needs.csv'))
    etl.cat(original_triage_import_notes, original_triage_call_notes) \
        .tocsv(join(output_dir, 'original_triage_notes.csv'))

    # psql copy meta command hangs when importing fully combined needs file
    food_needs.tocsv(join(output_dir, 'food_needs.csv'))
    callback_needs.tocsv(join(output_dir, 'callback_needs.csv'))
    etl.cat(prescription_needs, mental_wellbeing_needs, financial_needs, other_needs) \
        .tocsv(join(output_dir, 'remaining_needs.csv'))

# etl script for marvel character data
# input is a csv file
# output is a json file (to be loaded into a db)
# by ian macfarlane, 2017

import petl as etl
from dateutil.parser import parse
from Character import Character

# load csv data into petl tables
hero_table = etl.fromcsv('marvel_heroes.csv')
villain_table = etl.fromcsv('marvel_villains.csv')

# combine hero and villain tables and join with stats
character_table = etl.cat(hero_table, villain_table)

new_header_names = {
    'IDENTITY': 'identity',
    'ALIGN': 'alignment',
    'EYE': 'eye',
    'HAIR': 'hair',
    'SEX': 'sex',
    'ALIVE': 'is-alive',
    'FIRST APPEARANCE': 'first-appearance',
    'AT LARGE': 'is-at-large',
    'Year': 'year'
}


def first_word(s):
    return s.split(' ', 1)[0].lower()


# standard format is "%Y-%m-%dT%H:%M:%S.%fZ"

          ['E', 12]]
table2 = etl.cutout(table1, 'bar')
table2


# cat()
#######

import petl as etl
table1 = [['foo', 'bar'],
          [1, 'A'],
          [2, 'B']]
table2 = [['bar', 'baz'],
          ['C', True],
          ['D', False]]
table3 = etl.cat(table1, table2)
table3
# can also be used to square up a single table with uneven rows
table4 = [['foo', 'bar', 'baz'],
          ['A', 1, 2],
          ['B', '2', '3.4'],
          [u'B', u'3', u'7.8', True],
          ['D', 'xyz', 9.0],
          ['E', None]]
table5 = etl.cat(table4)
table5
# use the header keyword argument to specify a fixed set of fields
table6 = [['bar', 'foo'],
          ['A', 1],
          ['B', 2]]
table7 = etl.cat(table6, header=['A', 'foo', 'B', 'bar', 'C'])

print("Relating condos to parcels...")
joined = etl.join(source_dor_condo_rows, unique_parcel_id_rows, key='parcel_id') \
    .convert('street_address',
             lambda a, row: row.street_address + ' # ' + row.unit_num,
             pass_row=True)
print("joined rowcount: ", etl.nrows(joined))
if DEV:
    print(etl.look(joined))

# Calculate errors
print("Calculating errors...")
unjoined = etl.antijoin(source_dor_condo_rows, joined, key='source_object_id')
print("unjoined rowcount: ", etl.nrows(unjoined))

dor_condos_unjoined_unmatched = etl.antijoin(
    unjoined, non_unique_parcel_id_rows,
    key='parcel_id').addfield('reason', 'non-active/remainder mapreg')
print("non-active/remainder mapreg error rowcount: ",
      etl.nrows(dor_condos_unjoined_unmatched))
if DEV:
    print(etl.look(dor_condos_unjoined_unmatched))

dor_condos_unjoined_duplicates = etl.antijoin(
    unjoined, dor_condos_unjoined_unmatched,
    key='source_object_id').addfield('reason', 'non-unique active/remainder mapreg')
print("non-unique active/remainder mapreg error rowcount: ",
      etl.nrows(dor_condos_unjoined_duplicates))
if DEV:
    print(etl.look(dor_condos_unjoined_duplicates))

error_table = etl.cat(dor_condos_unjoined_unmatched, dor_condos_unjoined_duplicates)
if DEV:
    print(etl.look(error_table))

# Write to engine db
if not DEV:
    print('Writing condos...')
    joined.todb(pg_db, 'dor_condominium')
    print('Writing errors...')
    error_table.todb(pg_db, 'dor_condominium_error')

print("Completed in ", datetime.now() - start, " minutes.")