def createDimMedium(events):
    try:
        dim_medium_cut = etl.cut(events, 'utm_medium')
        dim_medium_rename = etl.rename(dim_medium_cut, {'utm_medium': 'medium'})
        dim_medium = etl.distinct(dim_medium_rename)
        # Export as csv to load folder
        etl.tocsv(dim_medium, 'load/dim_medium.csv')
    except Exception as e:
        print("Something went wrong. Error {0}".format(e))
def createDimCampaignType(events):
    try:
        dim_campaigntype_cut = etl.cut(events, 'utm_campaign')
        dim_campaigntype_rename = etl.rename(dim_campaigntype_cut, {'utm_campaign': 'campaign_type'})
        dim_campaigntype = etl.distinct(dim_campaigntype_rename)
        # Export as csv to load folder
        etl.tocsv(dim_campaigntype, 'load/dim_campaigntype.csv')
    except Exception as e:
        print("Something went wrong. Error {0}".format(e))
def createDimSubscriptions(events):
    try:
        dim_subscriptions_cut = etl.cut(events, 'type')
        dim_subscriptions_rename = etl.rename(dim_subscriptions_cut, {'type': 'subscription_name'})
        dim_subscriptions = etl.distinct(dim_subscriptions_rename)
        # Export as csv to load folder
        etl.tocsv(dim_subscriptions, 'load/dim_subscriptions.csv')
    except Exception as e:
        print("Something went wrong. Error {0}".format(e))
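# The three helpers above share the same events input and write CSVs to the
# same load/ folder. A minimal driver sketch, assuming the 'data/events2.json'
# extract used by the script further down; the makedirs call is an addition
# here to guarantee the output folder exists:
import os

import petl as etl

os.makedirs('load', exist_ok=True)
events = etl.fromjson('data/events2.json')

createDimMedium(events)
createDimCampaignType(events)
createDimSubscriptions(events)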
def get_relationships(self):
    """Parses a list of `Relationship` objects."""
    core_file = _find_loinc_table_core_file(self.uri.path)
    core = etl.fromcsv(core_file, delimiter=',')
    core = etl.cut(core, ['LOINC_NUM', 'LONG_COMMON_NAME'])

    hierarchy_file = _find_multi_axial_hierarchy_file(self.uri.path)
    hierarchy = etl.fromcsv(hierarchy_file, delimiter=',')
    hierarchy = etl.leftjoin(hierarchy, core, lkey='CODE', rkey='LOINC_NUM')
    hierarchy = etl.cut(hierarchy, ['IMMEDIATE_PARENT', 'CODE', 'CODE_TEXT', 'LONG_COMMON_NAME'])
    hierarchy = etl.fillright(hierarchy)
    hierarchy = etl.cut(hierarchy, ['IMMEDIATE_PARENT', 'CODE', 'LONG_COMMON_NAME'])
    hierarchy = etl.rename(hierarchy, 'LONG_COMMON_NAME', 'CODE_TEXT')

    parents = etl.cut(hierarchy, ['CODE', 'CODE_TEXT'])
    hierarchy = etl.selectne(hierarchy, 'IMMEDIATE_PARENT', '')
    hierarchy = etl.leftjoin(hierarchy, parents, lkey='IMMEDIATE_PARENT',
                             rkey='CODE', lprefix='source.', rprefix='target.')
    hierarchy = etl.distinct(hierarchy)

    if self.versioned:
        version = _parse_version(hierarchy_file)
        hierarchy = etl.addfield(hierarchy, 'version', version)

    hierarchy = etl.rowmapmany(hierarchy, _to_json, ['relationship'])
    return hierarchy
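# The leftjoin/fillright/cut/rename sequence above effectively means "use the
# core table's LONG_COMMON_NAME where the join found one, else fall back to
# CODE_TEXT". A small sketch of that step with toy rows (the values are
# illustrative, not real LOINC data):
import petl as etl

table = [['IMMEDIATE_PARENT', 'CODE', 'CODE_TEXT', 'LONG_COMMON_NAME'],
         ['LP-ROOT', 'LP-1', 'Short text, no core match', None],
         ['LP-1', 'LP-2', 'Short text', 'Long common name from core']]

# fillright copies the nearest non-missing value to the left into each gap,
# so CODE_TEXT stands in wherever LONG_COMMON_NAME is missing
filled = etl.fillright(table)
print(etl.cut(filled, ['CODE', 'LONG_COMMON_NAME']))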
# print(directorsAndGenresAndMovies)
# print(actorsAndRoles)
# print(moviesAndGenresAndDirectorsAndRoles)

# Delete unnecessary columns from all tables
ranks = etl.cut(moviesAndDirectorsAndRoles, 'movie_id', 'rank', 'director_id', 'actor_id')
movies = etl.cut(moviesAndDirectorsAndRoles, 'movie_id', 'name')
directors = etl.cut(directorsAndMovies, 'id', 'full_name')
actors = etl.cut(actors, 'id', 'full_name')

# Rename id to include table name
directors = etl.rename(directors, 'id', 'director_id')
actors = etl.rename(actors, 'id', 'actor_id')

# Remove duplicates and rows with NULL ranks
ranks = etl.distinct(ranks)
ranks = etl.selectnotnone(ranks, 'rank')

# Remove duplicates after cutting columns
movies = etl.distinct(movies)
directors = etl.distinct(directors)
actors = etl.distinct(actors)

# Insert final tables into data warehouse
etl.todb(ranks, imdbWarehouse, 'ranks')
etl.todb(movies, imdbWarehouse, 'movies')
etl.todb(actors, imdbWarehouse, 'actors')
etl.todb(directors, imdbWarehouse, 'directors')
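# etl.todb loads into tables that must already exist, truncating them first
# (unlike etl.appenddb, which appends). The snippet above doesn't show how
# imdbWarehouse is created; any DB-API 2.0 connection works. A hypothetical
# setup using sqlite3, with one of the target tables created up front:
import sqlite3

imdbWarehouse = sqlite3.connect('imdb_warehouse.db')
imdbWarehouse.execute(
    'CREATE TABLE IF NOT EXISTS movies (movie_id INTEGER, name TEXT)')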
def append_tailings_reports_to_code_required_reports(connection, commit=False):
    src_table = etl.fromdb(
        connection,
        'SELECT exp_doc.mine_guid, exp_doc.exp_document_guid, req_doc.req_document_name, '
        'exp_doc.due_date, exp_doc.exp_document_status_code, exp_doc.received_date, '
        'exp_doc.active_ind, exp_doc_x.mine_document_guid, exp_doc.create_user, '
        'exp_doc.create_timestamp, exp_doc.update_user, exp_doc.update_timestamp '
        'from mine_expected_document exp_doc '
        'inner join mine_expected_document_xref exp_doc_x '
        'on exp_doc.exp_document_guid = exp_doc_x.exp_document_guid '
        'inner join mds_required_document req_doc '
        'on req_doc.req_document_guid = exp_doc.req_document_guid')

    req_document_crr_definition_map = [
        ['req_document_name', 'mine_report_definition_id'],
        ['Summary of TSF and Dam Safety Recommendations', 28],
        ['ITRB Activities Report', 27],
        ['Register of Tailings Storage Facilities and Dams', 47],
        ['Dam Safety Inspection (DSI) Report', 26],
        ['Dam Safety Review (DSR) Report', 31],
        ['“As-built” Reports', 32],
        ['Annual Reclamation', 25],
        ['MERP Record of Testing', 3],
        # ['Annual Manager\'s Report', __________________], no mapping or data, ignore.
        ['OMS Manual', 33],
        ['Annual reconciliation of water balance and water management plans', 44],
        ['TSF risk assessment', 46],
        ['Mine Emergency Preparedness and Response Plan (MERP)', 24],
        ['Performance of high risk dumps', 29]
    ]

    table1 = etl.join(src_table, req_document_crr_definition_map, key='req_document_name')
    mine_report = etl.cutout(table1, 'req_document_name')  # to be inserted into db

    mine_report = etl.addfield(mine_report, 'submission_year', 2019)
    mine_report = etl.rename(mine_report, 'exp_document_status_code',
                             'mine_report_submission_status_code')
    mine_report = etl.addfield(mine_report, 'deleted_ind', lambda x: not x.active_ind)
    mine_report = etl.cutout(mine_report, 'active_ind')

    # To determine what the FKs will be, so rows can be inserted into related tables
    max_report_id = etl.fromdb(
        connection, 'select last_value from public.mine_report_mine_report_id_seq')[1][0]
    max_report_submission_id = etl.fromdb(
        connection,
        'select last_value from public.mine_report_submission_mine_report_submission_id_seq')[1][0]

    # If a sequence hasn't been used yet, fix the off-by-one
    if max_report_id == 1:
        max_report_id = 0
    if max_report_submission_id == 1:
        max_report_submission_id = 0

    # Get one-to-many
    mine_report, mine_report_submission_documents = etl.unjoin(
        mine_report, 'mine_document_guid', key='exp_document_guid')

    # Add PKs for mappings
    mine_report_with_ids = etl.addrownumbers(
        mine_report, start=max_report_id + 1, step=1, field='mine_report_id')
    mine_report_with_ids = etl.addrownumbers(
        mine_report_with_ids, start=max_report_submission_id + 1, step=1,
        field='mine_report_submission_id')
    print(f'max_report_id={max_report_id}, max_report_submission_id={max_report_submission_id}')

    # Copy out fields for submission tables
    mine_report_submissions = etl.cut(mine_report_with_ids, [
        'mine_report_id', 'exp_document_guid', 'mine_report_submission_status_code',
        'create_user', 'create_timestamp', 'update_user', 'update_timestamp'
    ])
    mine_report_submissions = etl.addfield(
        mine_report_submissions, 'submission_date', lambda x: x.create_timestamp)

    # Remove fields not in mine_report
    mine_report = etl.cutout(mine_report, 'mine_report_submission_status_code')

    # Replace the exp_document_guid FK with the mine_report_submission FK
    submission_id_lookup = etl.cut(mine_report_with_ids,
                                   ['mine_report_submission_id', 'exp_document_guid'])
    mine_report_submission_documents = etl.join(
        submission_id_lookup, mine_report_submission_documents, key='exp_document_guid')
    mine_report_submission_documents = etl.cutout(
        mine_report_submission_documents, 'exp_document_guid')

    # Remove the original PK
    mine_report = etl.cutout(mine_report, 'exp_document_guid')
    mine_report_submissions = etl.cutout(mine_report_submissions, 'exp_document_guid')

    print(etl.valuecounter(etl.distinct(table1, key='exp_document_guid'), 'req_document_name'))
    print(etl.valuecounter(mine_report, 'mine_report_definition_id'))
    print(table1)
    print(mine_report)
    print(mine_report_submissions)
    print(mine_report_submission_documents)

    etl.appenddb(mine_report, connection, 'mine_report', commit=False)
    print('INSERT mine_report staged')
    etl.appenddb(mine_report_submissions, connection, 'mine_report_submission', commit=False)
    print('INSERT mine_report_submission staged')
    etl.appenddb(mine_report_submission_documents, connection,
                 'mine_report_document_xref', commit=False)
    print('INSERT mine_report_document_xref staged')

    if commit:
        connection.commit()
        print('DATA CREATION COMPLETE')
    else:
        connection.rollback()
        print('NO DATA CREATED: add --commit=true to insert report rows')
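# The unjoin call above is what splits the one-to-many report/document
# relationship into two tables: the left result drops the value column, and
# the right result holds the distinct (key, value) pairs. A minimal sketch
# with toy rows:
import petl as etl

reports = [['exp_document_guid', 'due_date', 'mine_document_guid'],
           ['guid-a', '2019-01-01', 'doc-1'],
           ['guid-a', '2019-01-01', 'doc-2'],
           ['guid-b', '2019-02-01', 'doc-3']]

left, right = etl.unjoin(reports, 'mine_document_guid', key='exp_document_guid')
print(left)   # reports with the mine_document_guid column removed
print(right)  # distinct (exp_document_guid, mine_document_guid) pairs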
def distinct_execute(c, **kwargs):
    # kwargs are accepted for interface compatibility but unused here
    return etl.distinct(c())
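# distinct_execute expects a zero-argument callable that produces a petl
# table. A minimal usage sketch; the producer and its data are illustrative:
import petl as etl

def make_table():
    return etl.fromcolumns([[1, 1, 2], ['a', 'a', 'b']], header=['id', 'val'])

deduped = distinct_execute(make_table)
print(deduped)  # rows (1, 'a') and (2, 'b')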
users = etl.fromcsv('data/users.csv')
events = etl.fromjson('data/events2.json')

# Transform

# Dim Customers
# Filter necessary data only
dim_customers = etl.cut(users, 'user_id', 'email')
# Export as csv to load folder
etl.tocsv(dim_customers, 'load/dim_customers.csv')

# Dim Subscriptions
# Use the distinct values present in the type column to load into the dim subscriptions table
dim_subscriptions_cut = etl.cut(events, 'type')
dim_subscriptions_rename = etl.rename(dim_subscriptions_cut, {'type': 'subscription_name'})
dim_subscriptions = etl.distinct(dim_subscriptions_rename)
# Export as csv to load folder
etl.tocsv(dim_subscriptions, 'load/dim_subscriptions.csv')

# Dim Medium
# Use the distinct values present in the utm_medium column to load into the dim medium table
dim_medium_cut = etl.cut(events, 'utm_medium')
dim_medium_rename = etl.rename(dim_medium_cut, {'utm_medium': 'medium'})
dim_medium = etl.distinct(dim_medium_rename)
# Export as csv to load folder
etl.tocsv(dim_medium, 'load/dim_medium.csv')

# Dim Campaign Type
# Use the distinct values present in the utm_campaign column to load into the dim campaign table
# Note: if this is the only data available right now, in the future this will probably be
# the source and not the campaign
# EXTRACT
n_table = etl.fromdb(data_in, 'SELECT * FROM Names')
m_table = etl.fromdb(data_in, 'SELECT * FROM Movies')
r_table = etl.fromdb(data_in, 'SELECT * FROM Ratings')
tp_table = etl.fromdb(data_in, 'SELECT * FROM Title_principals')
print("Extract DONE")

# TRANSFORM
# movie personnel
d_movie_personnel = etl.cut(n_table, 'imdb_name_id', 'name', 'birth_name')

# title
d_title = etl.cut(m_table, 'imdb_title_id', 'title', 'original_title')

# genre
d_genre = etl.distinct(etl.cut(m_table, 'genre'))
rows = etl.nrows(d_genre)
generated = []
# print(rows)
for i in range(rows):
    out_cursor.execute('SELECT UUID();')
    uuid = out_cursor.fetchone()[0]
    generated.append(uuid)
d_genre = etl.addcolumn(d_genre, 'genre_id', generated)

# date
d_date = etl.distinct(etl.cut(m_table, 'year', 'date_published'))
rows = etl.nrows(d_date)
generated = []
for i in range(rows):
    out_cursor.execute('SELECT UUID();')
    uuid = out_cursor.fetchone()[0]
    generated.append(uuid)
d_date = etl.addcolumn(d_date, 'date_id', generated)  # assumed field name, mirroring the genre block
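# An alternative sketch for the surrogate-key loops above: generating the
# UUIDs client-side with Python's uuid module avoids one database round trip
# per row. uuid4 values differ from MySQL's UUID() (which is version 1), but
# both serve as unique keys.
import uuid

import petl as etl

d_genre = etl.distinct(etl.cut(m_table, 'genre'))
generated = [str(uuid.uuid4()) for _ in range(etl.nrows(d_genre))]
d_genre = etl.addcolumn(d_genre, 'genre_id', generated)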