Beispiel #1
0
    def test_create_read_delete_table(self):
        create_table_col_map = {
            'a': 'NVARCHAR(100)',
            'b': 'DATETIME',
            'c': 'BIT',
            'd': 'INT'
        }

        table_nm = 'zzz_test'
        schema = 'dbo'

        with lad.DbHandler('LA_DB') as db:
            db.create_table(table_nm, schema, create_table_col_map)

            information_schema_df_1 = db.sql_to_df(query="""
                    SELECT *
                    FROM INFORMATION_SCHEMA.COLUMNS c
                    WHERE c.TABLE_SCHEMA = '{0}'
                        AND c.TABLE_NAME = '{1}'
                 """.format(schema, table_nm))

            self.assertEqual(len(create_table_col_map),
                             len(information_schema_df_1))

            db.delete_table(table_nm, schema)

            information_schema_df_2 = db.sql_to_df(query="""
                     SELECT *
                     FROM INFORMATION_SCHEMA.COLUMNS c
                     WHERE c.TABLE_SCHEMA = '{0}'
                         AND c.TABLE_NAME = '{1}'
                  """.format(schema, table_nm))

            self.assertEqual(0, len(information_schema_df_2))
Beispiel #2
0
    def test_sql_to_df_with_index(self):
        test_table = 'zzz_test'
        test_schema = 'dbo'
        create_table_col_map = {
            'a': 'NVARCHAR(100)',
            'b': 'NVARCHAR(100)',
            'c': 'BIT',
            'd': 'INT'
        }
        df = pd.DataFrame(
            data={
                'a': ['a', 'b', 'c', 'd'],
                'b': ['w', 'x', 'y', 'z'],
                'c': [True, True, False, False],
                'd': [10, 11, 12, 13]
            })
        self._setup_table_with_data(test_table, test_schema,
                                    create_table_col_map, df)
        with lad.DbHandler('LA_DB') as db:
            df = db.sql_to_df(query="""
                    SELECT *
                    FROM {0}.{1}
                """.format(test_schema, test_table),
                              index_col='a')

        self.assertEqual(df.index.name, 'a')

        self._teardown_table(test_table, test_schema)
Beispiel #3
0
    def test_delete_all_rows(self):
        test_table = 'zzz_test'
        test_schema = 'dbo'
        create_table_col_map = {
            'a': 'NVARCHAR(100)',
            'b': 'NVARCHAR(100)',
            'c': 'BIT',
            'd': 'INT'
        }
        df = pd.DataFrame(
            data={
                'a': ['a', 'b', 'c', 'd', None],
                'b': ['w', 'x', 'y', 'z', 'p'],
                'c': [True, True, False, False, True],
                'd': [10, 11, 12, 13, 14]
            })
        self._setup_table_with_data(test_table, test_schema,
                                    create_table_col_map, df)
        with lad.DbHandler('LA_DB') as db:
            db.delete_rows(test_table, test_schema)
            row_count = db.sql_to_df(query="""
                    SELECT COUNT(*)
                    FROM {0}.{1}
                """.format(test_schema, test_table)).iloc[0, 0]

        self.assertEqual(0, row_count)
        self._teardown_table(test_table, test_schema)
Beispiel #4
0
    def test_df_to_db_no_append(self):
        test_table = 'zzz_test'
        test_schema = 'dbo'
        create_table_col_map = {
            'a': 'NVARCHAR(100)',
            'b': 'NVARCHAR(100)',
            'c': 'BIT',
            'd': 'INT'
        }
        df = pd.DataFrame(
            data={
                'a': ['a', 'b', 'c', 'd'],
                'b': ['w', 'x', 'y', 'z'],
                'c': [True, True, False, False],
                'd': [10, 11, 12, 13]
            })
        self._setup_table_with_data(test_table, test_schema,
                                    create_table_col_map, df)

        new_data_df = pd.DataFrame(data={'a': ['zzz', 'xxx', 'yyy']})

        with lad.DbHandler('LA_DB') as db:
            db.df_to_table(new_data_df, test_table, test_schema, append=False)
            new_table_size = db.sql_to_df(query="""
                    SELECT COUNT(*)
                    FROM {0}.{1}
                """.format(test_schema, test_table)).iloc[0, 0]

            self.assertEqual(len(new_data_df), new_table_size)

        self._teardown_table(test_table, test_schema)
Beispiel #5
0
    def test_update(self):
        test_table = 'zzz_test'
        test_schema = 'dbo'
        create_table_col_map = {
            'a': 'NVARCHAR(100)',
            'b': 'NVARCHAR(100)',
            'c': 'BIT',
            'd': 'INT'
        }
        df = pd.DataFrame(
            data={
                'a': ['a', 'b', 'c', 'd'],
                'b': [
                    'w',
                    'x',
                    'y',
                    'z',
                ],
                'c': [True, True, False, False],
                'd': [
                    10,
                    11,
                    12,
                    13,
                ]
            })
        self._setup_table_with_data(test_table, test_schema,
                                    create_table_col_map, df)

        update_df = pd.DataFrame(data={'a': ['a', 'b'], 'd': [999, 999]})
        exp_df = df.assign(**{'d': [
            999,
            999,
            12,
            13,
        ]})

        with lad.DbHandler('LA_DB') as db:
            db.update_table(test_table, test_schema, update_df, 'a')
            got_df = db.sql_to_df(query="""
                    SELECT a, b, c, d
                    FROM {0}.{1}
                    ORDER BY a
                """.format(test_schema, test_table))

        pd.testing.assert_frame_equal(exp_df, got_df)

        self._teardown_table(test_table, test_schema)
def get_since_id(twitter, tweet_table, search_string):
    # check to see if we have any tweets in the DB already
    with lad.DbHandler('LA_DB') as db:
        min_db_tweet_id = db.sql_to_df(
            query="""
                   SELECT MIN(t.id)
                   FROM {} t
               """.format(tweet_table)
        ).iloc[0, 0]

    if not min_db_tweet_id:
        print('No tweets in db, searching for since_id')
        # if we don't have any tweets in our table, we need to find the
        # youngest tweet older than 2 hours
        since_id = find_youngest_tweet_older_than(twitter, search_string, 2)
    else:
        # for this job, I want to get updates to tweets we've already stored,
        #  so since_id is smallest tweet ID we have
        print('Found tweets in db, using oldest as since_id')
        since_id = min_db_tweet_id

    return since_id
Beispiel #7
0
    def _setup_table_with_data(self, table_nm, schema, create_table_col_map,
                               df):

        with lad.DbHandler('LA_DB') as db:
            db.create_table(table_nm, schema, create_table_col_map)
            db.df_to_table(df, table_nm, schema, append=False)
Beispiel #8
0
 def _teardown_table(self, table_nm, schema):
     with lad.DbHandler('LA_DB') as db:
         db.delete_table(table_nm, schema)
def process_df(df, dest_table, update_if_diff, audit_table=None, max_id=None, update_with_older_data_col=None):
    if df.empty:
        return
    with lad.DbHandler('LA_DB') as db:
        existing_data_df = db.sql_to_df(
            query="""
                SELECT *
                FROM {0}
            """.format(dest_table),
            index_col='id'
        )
        #insert new rows to table and audit table
        new_rows_df = df[~df.index.isin(existing_data_df.index)]
        new_rows_df.reset_index(inplace=True)
        db.df_to_table(
            new_rows_df, table=dest_table.split('.')[1],
            schema=dest_table.split('.')[0], append=True
        )
        if audit_table:
            #insert new rows to audit table
            db.df_to_table(
                new_rows_df, table=audit_table.split('.')[1],
                schema=audit_table.split('.')[0], append=True
            )

        if update_if_diff:
            if max_id and update_with_older_data_col:
                # if this isn't our first search on this run, don't update newer user data with older user data
                updateable_existing_data = existing_data_df[existing_data_df[update_with_older_data_col]]
            else:
                updateable_existing_data = existing_data_df
            out_col = 'is_diff'
            #compare new data to existing data
            diff_flag_df = lad.diff_rows(
                df, updateable_existing_data, out_col=out_col,
                exclude_col_patterns=['created', 'modified', 'update_with_older', 'search_id']
            )
            # only look at new data that is different from existing data
            diff_df = diff_flag_df[diff_flag_df[out_col]]
            if diff_df.empty:
                return
            print('Updating {} rows'.format(str(len(diff_df))))
            #make a copy before we update for audit purposes
            audit_df = diff_df.copy()
            # update new data with cols from old data we want to preserve
            diff_df.update(updateable_existing_data[[
                'etl_created_date','etl_created_by'
            ]])
            # update
            diff_df.drop(out_col, axis=1, inplace=True)
            diff_df.reset_index(inplace=True)
            db.update_table(
                table=dest_table.split('.')[1], schema=dest_table.split('.')[0], update_df=diff_df,join_col='id'
            )
            if audit_table:
                #inser updated versions to audit table
                audit_df.drop(out_col, axis=1, inplace=True)
                audit_df.reset_index(inplace=True)
                db.df_to_table(
                    audit_df, table=audit_table.split('.')[1],
                    schema=audit_table.split('.')[0], append=True
                )
Beispiel #10
0
    def test_process_df_diff_ignore_cols(self):
        test_table = 'zzz_test'
        test_schema = 'dbo'
        create_table_col_map = {
            'id': 'NVARCHAR(100)',
            'b': 'NVARCHAR(100)',
            'c': 'BIT',
            'd': 'INT',
            'etl_created_date': 'NVARCHAR(100)',
            'etl_created_by': 'NVARCHAR(100)',
        }
        df = pd.DataFrame(
            data={
                'id': ['a', 'b', 'c', 'd', 'e'],
                'b': ['w', 'x', 'y', 'z', 'p'],
                'c': [True, True, False, False, True],
                'd': [10, 11, 12, 13, 14],
                'etl_created_date': ['123', '345', '456', '67', '6756'],
                'etl_created_by': ['123', '345', '456', '67', '6756']
            })
        self._setup_table_with_data(test_table, test_schema,
                                    create_table_col_map, df)

        new_data = pd.DataFrame(
            data={
                'id': ['a', 'x', 'y', 'z'],
                'b': [
                    'different w',
                    'x',
                    'y',
                    'z',
                ],
                'c': [
                    True,
                    True,
                    False,
                    False,
                ],
                'd': [
                    10,
                    11,
                    12,
                    13,
                ],
                'etl_created_date': [
                    'different 123',
                    '345',
                    '456',
                    '67',
                ],
                'etl_created_by': [
                    'different 123',
                    '345',
                    '456',
                    '67',
                ]
            })
        new_data.set_index('id', inplace=True)

        process_df(new_data,
                   test_schema + '.' + test_table,
                   update_if_diff=True)

        with lad.DbHandler('LA_DB') as db:
            resulting_df = db.sql_to_df(query="""
                            SELECT *
                            FROM {0}.{1}
                        """.format(test_schema, test_table))

        self.assertEqual(len(resulting_df), 8)
        self.assertEqual(len(resulting_df.columns), 6)
        self.assertEqual(resulting_df.loc[resulting_df['id'] == 'a', 'b'][0],
                         'different w')
        self.assertEqual(
            resulting_df.loc[resulting_df['id'] == 'a', 'etl_created_date'][0],
            '123')
        self.assertEqual(
            resulting_df.loc[resulting_df['id'] == 'a', 'etl_created_by'][0],
            '123')
        self._teardown_table(test_table, test_schema)
Beispiel #11
0
    def test_process_df_audit(self):
        test_table = 'zzz_test'
        test_audit_table = 'zzz_test_audit'
        test_schema = 'dbo'
        create_table_col_map = {
            'id': 'NVARCHAR(100)',
            'b': 'NVARCHAR(100)',
            'c': 'BIT',
            'd': 'INT',
            'etl_created_date': 'NVARCHAR(100)',
            'etl_created_by': 'NVARCHAR(100)',
        }
        df = pd.DataFrame(
            data={
                'id': ['a', 'b', 'c', 'd', 'e'],
                'b': ['w', 'x', 'y', 'z', 'p'],
                'c': [True, True, False, False, True],
                'd': [10, 11, 12, 13, 14],
                'etl_created_date': ['123', '345', '456', '67', '6756'],
                'etl_created_by': ['123', '345', '456', '67', '6756']
            })
        self._setup_table_with_data(test_table, test_schema,
                                    create_table_col_map, df)
        self._setup_table_without_data(test_audit_table, test_schema,
                                       create_table_col_map)

        new_data = pd.DataFrame(
            data={
                'id': ['a', 'x', 'y', 'z', 'b'],
                'b': ['different w', 'x', 'y', 'z', 'x'],
                'c': [True, True, False, False, True],
                'd': [10, 11, 12, 13, 11],
                'etl_created_date':
                ['different 123', '345', '456', '67', '345'],
                'etl_created_by': ['different 123', '345', '456', '67', '345']
            })
        new_data.set_index('id', inplace=True)

        process_df(new_data.copy(),
                   test_schema + '.' + test_table,
                   update_if_diff=True,
                   audit_table=test_schema + '.' + test_audit_table)

        with lad.DbHandler('LA_DB') as db:
            resulting_audit_df = db.sql_to_df(query="""
                    SELECT *
                    FROM {0}.{1}
                """.format(test_schema, test_audit_table))

        #all new and different rows should get inserted to audit table
        # id 'b' is exactly the same so shouldn't get inserted
        expected_audit_df = new_data.reset_index()

        # have to do a bunch of stuff to get the two dfs in the same order
        expected_audit_df = expected_audit_df[expected_audit_df['id'] != 'b']
        expected_audit_df = expected_audit_df.sort_values('id')
        expected_audit_df = expected_audit_df.reindex_axis(sorted(
            expected_audit_df.columns),
                                                           axis=1)
        expected_audit_df = expected_audit_df.reset_index()
        expected_audit_df = expected_audit_df.drop('index', axis=1)

        resulting_audit_df = resulting_audit_df.sort_values('id')
        resulting_audit_df = resulting_audit_df.reindex_axis(sorted(
            resulting_audit_df.columns),
                                                             axis=1)
        resulting_audit_df = resulting_audit_df.reset_index()
        resulting_audit_df = resulting_audit_df.drop('index', axis=1)

        pd.testing.assert_frame_equal(expected_audit_df, resulting_audit_df)
        self._teardown_table(test_table, test_schema)
        self._teardown_table(test_audit_table, test_schema)
Beispiel #12
0
 def _setup_table_without_data(self, table_nm, schema,
                               create_table_col_map):
     with lad.DbHandler('LA_DB') as db:
         db.create_table(table_nm, schema, create_table_col_map)