def test_create_read_delete_table(self):
    """Round-trip a table: create it, verify its columns exist, delete it,
    then verify the columns are gone."""
    col_map = {
        'a': 'NVARCHAR(100)',
        'b': 'DATETIME',
        'c': 'BIT',
        'd': 'INT',
    }
    table_nm = 'zzz_test'
    schema = 'dbo'
    # Identical metadata query is issued before and after the delete.
    columns_query = """
        SELECT *
        FROM INFORMATION_SCHEMA.COLUMNS c
        WHERE c.TABLE_SCHEMA = '{0}'
            AND c.TABLE_NAME = '{1}'
    """.format(schema, table_nm)
    with lad.DbHandler('LA_DB') as db:
        db.create_table(table_nm, schema, col_map)
        cols_after_create = db.sql_to_df(query=columns_query)
        self.assertEqual(len(col_map), len(cols_after_create))
        db.delete_table(table_nm, schema)
        cols_after_delete = db.sql_to_df(query=columns_query)
        self.assertEqual(0, len(cols_after_delete))
def test_sql_to_df_with_index(self):
    """sql_to_df should make the column named by index_col the frame index."""
    table_nm = 'zzz_test'
    schema = 'dbo'
    col_map = {
        'a': 'NVARCHAR(100)',
        'b': 'NVARCHAR(100)',
        'c': 'BIT',
        'd': 'INT',
    }
    seed_df = pd.DataFrame(data={
        'a': ['a', 'b', 'c', 'd'],
        'b': ['w', 'x', 'y', 'z'],
        'c': [True, True, False, False],
        'd': [10, 11, 12, 13],
    })
    self._setup_table_with_data(table_nm, schema, col_map, seed_df)
    with lad.DbHandler('LA_DB') as db:
        result_df = db.sql_to_df(
            query="""
                SELECT *
                FROM {0}.{1}
            """.format(schema, table_nm),
            index_col='a')
        self.assertEqual(result_df.index.name, 'a')
    self._teardown_table(table_nm, schema)
def test_delete_all_rows(self):
    """delete_rows with no filter should leave the table empty."""
    table_nm = 'zzz_test'
    schema = 'dbo'
    col_map = {
        'a': 'NVARCHAR(100)',
        'b': 'NVARCHAR(100)',
        'c': 'BIT',
        'd': 'INT',
    }
    seed_df = pd.DataFrame(data={
        # Includes a NULL in 'a' so the delete covers NULL rows too.
        'a': ['a', 'b', 'c', 'd', None],
        'b': ['w', 'x', 'y', 'z', 'p'],
        'c': [True, True, False, False, True],
        'd': [10, 11, 12, 13, 14],
    })
    self._setup_table_with_data(table_nm, schema, col_map, seed_df)
    with lad.DbHandler('LA_DB') as db:
        db.delete_rows(table_nm, schema)
        remaining = db.sql_to_df(query="""
            SELECT COUNT(*)
            FROM {0}.{1}
        """.format(schema, table_nm)).iloc[0, 0]
        self.assertEqual(0, remaining)
    self._teardown_table(table_nm, schema)
def test_df_to_db_no_append(self):
    """df_to_table with append=False should replace existing rows, so the
    table ends up with exactly the new frame's row count."""
    table_nm = 'zzz_test'
    schema = 'dbo'
    col_map = {
        'a': 'NVARCHAR(100)',
        'b': 'NVARCHAR(100)',
        'c': 'BIT',
        'd': 'INT',
    }
    seed_df = pd.DataFrame(data={
        'a': ['a', 'b', 'c', 'd'],
        'b': ['w', 'x', 'y', 'z'],
        'c': [True, True, False, False],
        'd': [10, 11, 12, 13],
    })
    self._setup_table_with_data(table_nm, schema, col_map, seed_df)
    replacement_df = pd.DataFrame(data={'a': ['zzz', 'xxx', 'yyy']})
    with lad.DbHandler('LA_DB') as db:
        db.df_to_table(replacement_df, table_nm, schema, append=False)
        row_count = db.sql_to_df(query="""
            SELECT COUNT(*)
            FROM {0}.{1}
        """.format(schema, table_nm)).iloc[0, 0]
        self.assertEqual(len(replacement_df), row_count)
    self._teardown_table(table_nm, schema)
def test_update(self):
    """update_table should overwrite column 'd' for the rows matched on the
    join column 'a' and leave everything else untouched."""
    table_nm = 'zzz_test'
    schema = 'dbo'
    col_map = {
        'a': 'NVARCHAR(100)',
        'b': 'NVARCHAR(100)',
        'c': 'BIT',
        'd': 'INT',
    }
    seed_df = pd.DataFrame(data={
        'a': ['a', 'b', 'c', 'd'],
        'b': ['w', 'x', 'y', 'z'],
        'c': [True, True, False, False],
        'd': [10, 11, 12, 13],
    })
    self._setup_table_with_data(table_nm, schema, col_map, seed_df)
    update_df = pd.DataFrame(data={'a': ['a', 'b'], 'd': [999, 999]})
    # Rows 'a' and 'b' get d=999; 'c' and 'd' keep their original values.
    exp_df = seed_df.assign(d=[999, 999, 12, 13])
    with lad.DbHandler('LA_DB') as db:
        db.update_table(table_nm, schema, update_df, 'a')
        got_df = db.sql_to_df(query="""
            SELECT a, b, c, d
            FROM {0}.{1}
            ORDER BY a
        """.format(schema, table_nm))
        pd.testing.assert_frame_equal(exp_df, got_df)
    self._teardown_table(table_nm, schema)
def get_since_id(twitter, tweet_table, search_string):
    """Return the tweet ID to use as ``since_id`` for the next search.

    If ``tweet_table`` already holds tweets, the smallest stored ID is
    returned so the search picks up updates to tweets we have already
    stored. Otherwise fall back to the youngest tweet older than 2 hours.

    :param twitter: API handle passed through to
        ``find_youngest_tweet_older_than``
    :param tweet_table: fully qualified table name interpolated into the query
    :param search_string: search query passed through to
        ``find_youngest_tweet_older_than``
    :return: tweet ID to use as since_id
    """
    # Check whether we have any tweets in the DB already.
    with lad.DbHandler('LA_DB') as db:
        min_db_tweet_id = db.sql_to_df(
            query="""
                SELECT MIN(t.id)
                FROM {} t
            """.format(tweet_table)
        ).iloc[0, 0]
    # BUG FIX: MIN() over an empty table returns NULL, which pandas may
    # surface as float NaN depending on column dtype — and `not nan` is
    # False, so the old truthiness check (`if not min_db_tweet_id`) would
    # treat an empty table as populated and return NaN. pd.isna covers
    # both None and NaN.
    if min_db_tweet_id is None or pd.isna(min_db_tweet_id):
        print('No tweets in db, searching for since_id')
        # No tweets in our table: find the youngest tweet older than 2 hours.
        since_id = find_youngest_tweet_older_than(twitter, search_string, 2)
    else:
        # For this job we want updates to tweets we've already stored,
        # so since_id is the smallest tweet ID we have.
        print('Found tweets in db, using oldest as since_id')
        since_id = min_db_tweet_id
    return since_id
def _setup_table_with_data(self, table_nm, schema, create_table_col_map, df):
    """Create schema.table_nm with the given column map and load df into it,
    replacing any existing contents."""
    with lad.DbHandler('LA_DB') as db:
        db.create_table(table_nm, schema, create_table_col_map)
        db.df_to_table(df, table_nm, schema, append=False)
def _teardown_table(self, table_nm, schema):
    """Drop schema.table_nm created by a setup helper."""
    with lad.DbHandler('LA_DB') as db:
        db.delete_table(table_nm, schema)
def process_df(df, dest_table, update_if_diff, audit_table=None, max_id=None,
               update_with_older_data_col=None):
    """Upsert ``df`` into ``dest_table``, optionally auditing every change.

    Rows whose index values are absent from ``dest_table`` are appended to
    it (and to ``audit_table``, if given). When ``update_if_diff`` is true,
    incoming rows whose data differs from what is stored are pushed back
    via ``update_table`` while preserving the stored ``etl_created_date`` /
    ``etl_created_by`` columns; the pre-merge incoming versions are also
    appended to ``audit_table``.

    :param df: incoming data, indexed by 'id'
    :param dest_table: destination as a 'schema.table' string
    :param update_if_diff: when true, update changed rows in place
    :param audit_table: optional 'schema.table' receiving inserts/updates
    :param max_id: together with update_with_older_data_col, signals this is
        not the first search of the run — TODO confirm exact caller semantics
    :param update_with_older_data_col: boolean column of the existing data
        marking rows that may be updated with older data
    """
    if df.empty:
        return
    # Split the qualified names once instead of at every call site.
    dest_schema, dest_tbl = dest_table.split('.')
    if audit_table:
        audit_schema, audit_tbl = audit_table.split('.')
    with lad.DbHandler('LA_DB') as db:
        existing_data_df = db.sql_to_df(
            query="""
                SELECT *
                FROM {0}
            """.format(dest_table),
            index_col='id'
        )

        # Insert rows we have never seen before. BUG FIX: .copy() so the
        # in-place reset_index mutates a real frame, not a view of df
        # (pandas SettingWithCopyWarning / undefined chained assignment).
        new_rows_df = df[~df.index.isin(existing_data_df.index)].copy()
        new_rows_df.reset_index(inplace=True)
        db.df_to_table(
            new_rows_df,
            table=dest_tbl,
            schema=dest_schema,
            append=True
        )
        if audit_table:
            # Insert new rows to the audit table as well.
            db.df_to_table(
                new_rows_df,
                table=audit_tbl,
                schema=audit_schema,
                append=True
            )

        if not update_if_diff:
            return

        if max_id and update_with_older_data_col:
            # Not the first search on this run: don't update newer user
            # data with older user data.
            updateable_existing_data = existing_data_df[
                existing_data_df[update_with_older_data_col]]
        else:
            updateable_existing_data = existing_data_df

        out_col = 'is_diff'
        # Flag incoming rows whose payload differs from the stored row,
        # ignoring bookkeeping columns.
        diff_flag_df = lad.diff_rows(
            df,
            updateable_existing_data,
            out_col=out_col,
            exclude_col_patterns=['created', 'modified', 'update_with_older',
                                  'search_id']
        )
        # Only keep incoming rows that actually differ. BUG FIX: .copy()
        # so the in-place update/drop below act on a real frame, not a
        # slice of diff_flag_df.
        diff_df = diff_flag_df[diff_flag_df[out_col]].copy()
        if diff_df.empty:
            return
        print('Updating {} rows'.format(str(len(diff_df))))

        # Snapshot the incoming versions before merging in preserved
        # columns, for audit purposes.
        audit_df = diff_df.copy()
        # Keep the stored creation metadata rather than the incoming one.
        diff_df.update(updateable_existing_data[[
            'etl_created_date', 'etl_created_by'
        ]])
        diff_df.drop(out_col, axis=1, inplace=True)
        diff_df.reset_index(inplace=True)
        db.update_table(
            table=dest_tbl,
            schema=dest_schema,
            update_df=diff_df,
            join_col='id'
        )

        if audit_table:
            # Insert updated versions to the audit table.
            audit_df.drop(out_col, axis=1, inplace=True)
            audit_df.reset_index(inplace=True)
            db.df_to_table(
                audit_df,
                table=audit_tbl,
                schema=audit_schema,
                append=True
            )
def test_process_df_diff_ignore_cols(self):
    """process_df should update changed columns but preserve the stored
    etl_created_date / etl_created_by values (they match the diff's
    exclude patterns)."""
    table_nm = 'zzz_test'
    schema = 'dbo'
    col_map = {
        'id': 'NVARCHAR(100)',
        'b': 'NVARCHAR(100)',
        'c': 'BIT',
        'd': 'INT',
        'etl_created_date': 'NVARCHAR(100)',
        'etl_created_by': 'NVARCHAR(100)',
    }
    seed_df = pd.DataFrame(data={
        'id': ['a', 'b', 'c', 'd', 'e'],
        'b': ['w', 'x', 'y', 'z', 'p'],
        'c': [True, True, False, False, True],
        'd': [10, 11, 12, 13, 14],
        'etl_created_date': ['123', '345', '456', '67', '6756'],
        'etl_created_by': ['123', '345', '456', '67', '6756'],
    })
    self._setup_table_with_data(table_nm, schema, col_map, seed_df)
    new_data = pd.DataFrame(data={
        'id': ['a', 'x', 'y', 'z'],
        'b': ['different w', 'x', 'y', 'z'],
        'c': [True, True, False, False],
        'd': [10, 11, 12, 13],
        'etl_created_date': ['different 123', '345', '456', '67'],
        'etl_created_by': ['different 123', '345', '456', '67'],
    })
    new_data.set_index('id', inplace=True)
    process_df(new_data, schema + '.' + table_nm, update_if_diff=True)
    with lad.DbHandler('LA_DB') as db:
        resulting_df = db.sql_to_df(query="""
            SELECT *
            FROM {0}.{1}
        """.format(schema, table_nm))
    # 5 seed rows + 3 genuinely new ids (x, y, z).
    self.assertEqual(len(resulting_df), 8)
    self.assertEqual(len(resulting_df.columns), 6)
    # BUG FIX: select the matched row positionally with .iloc[0]. The old
    # `...[0]` indexed the filtered Series by LABEL 0, which raises
    # KeyError whenever the DB returns rows in an order where id 'a' is
    # not the first row of the result set.
    row_a = resulting_df.loc[resulting_df['id'] == 'a']
    self.assertEqual(row_a['b'].iloc[0], 'different w')
    # etl_* columns are excluded from the diff and preserved from the
    # stored row, so they keep their original values.
    self.assertEqual(row_a['etl_created_date'].iloc[0], '123')
    self.assertEqual(row_a['etl_created_by'].iloc[0], '123')
    self._teardown_table(table_nm, schema)
def test_process_df_audit(self):
    """All new and changed rows should land in the audit table; a row that
    is byte-identical to what is stored (id 'b') should not."""
    table_nm = 'zzz_test'
    audit_table_nm = 'zzz_test_audit'
    schema = 'dbo'
    col_map = {
        'id': 'NVARCHAR(100)',
        'b': 'NVARCHAR(100)',
        'c': 'BIT',
        'd': 'INT',
        'etl_created_date': 'NVARCHAR(100)',
        'etl_created_by': 'NVARCHAR(100)',
    }
    seed_df = pd.DataFrame(data={
        'id': ['a', 'b', 'c', 'd', 'e'],
        'b': ['w', 'x', 'y', 'z', 'p'],
        'c': [True, True, False, False, True],
        'd': [10, 11, 12, 13, 14],
        'etl_created_date': ['123', '345', '456', '67', '6756'],
        'etl_created_by': ['123', '345', '456', '67', '6756'],
    })
    self._setup_table_with_data(table_nm, schema, col_map, seed_df)
    self._setup_table_without_data(audit_table_nm, schema, col_map)
    new_data = pd.DataFrame(data={
        'id': ['a', 'x', 'y', 'z', 'b'],
        'b': ['different w', 'x', 'y', 'z', 'x'],
        'c': [True, True, False, False, True],
        'd': [10, 11, 12, 13, 11],
        'etl_created_date': ['different 123', '345', '456', '67', '345'],
        'etl_created_by': ['different 123', '345', '456', '67', '345'],
    })
    new_data.set_index('id', inplace=True)
    process_df(new_data.copy(), schema + '.' + table_nm,
               update_if_diff=True,
               audit_table=schema + '.' + audit_table_nm)
    with lad.DbHandler('LA_DB') as db:
        resulting_audit_df = db.sql_to_df(query="""
            SELECT *
            FROM {0}.{1}
        """.format(schema, audit_table_nm))

    def normalize(frame):
        # Put both frames in a canonical order so assert_frame_equal
        # ignores row/column ordering artifacts: sort rows by id, sort
        # columns alphabetically, renumber the index.
        # BUG FIX: reindex_axis was deprecated in pandas 0.21 and removed
        # in 1.0 — use reindex(..., axis=1) instead. reset_index(drop=True)
        # replaces the old reset_index() + drop('index', axis=1) pair.
        frame = frame.sort_values('id')
        frame = frame.reindex(sorted(frame.columns), axis=1)
        return frame.reset_index(drop=True)

    # All new and different rows should get inserted to the audit table;
    # id 'b' is exactly the same as stored, so it shouldn't get inserted.
    expected_audit_df = new_data.reset_index()
    expected_audit_df = expected_audit_df[expected_audit_df['id'] != 'b']
    pd.testing.assert_frame_equal(normalize(expected_audit_df),
                                  normalize(resulting_audit_df))
    self._teardown_table(table_nm, schema)
    self._teardown_table(audit_table_nm, schema)
def _setup_table_without_data(self, table_nm, schema, create_table_col_map):
    """Create an empty schema.table_nm with the given column map."""
    with lad.DbHandler('LA_DB') as db:
        db.create_table(table_nm, schema, create_table_col_map)