    def update_stat_log(self, source_file_name):

        # OLD: get references to stage database and catalog schema
        # data_stage_database = udp.udp_stage_database
        # data_catalog_schema = udp.udp_catalog_schema

        db = database.MSSQL(self.config(self.project.database))

        conn = db.conn
        # cursor = conn.cursor()

        db_conn = database.Database('mssql', conn)
        db_conn.use_database('udp_stage')

        # TODO: Will json_pickle restore datetime values without explicit conversion ???
        # TODO: Wrapper for stat insert that does intersection of json and target record's schema
        #       and explicitly inserts specific column/value pairs they have in common; this will
        #       require that our json send includes column names, not just rows of column values !!!!

        # extract job.log/last_job.log from capture zip and merge these into stat_log table
        job_log_data = read_archived_file(source_file_name,
                                          'job.log',
                                          default=None)
        if job_log_data:
            job_log_json = json.loads(job_log_data)
            for row in job_log_json:
                row['start_time'] = iso_to_datetime(row['start_time']).datetime
                row['end_time'] = iso_to_datetime(row['end_time']).datetime

                # skip capture stats, which only have intermediate end_time and run_time values;
                # the next capture file will include an accurate version of this stat in its last_job.log file
                if row['stat_name'] != 'capture':
                    db_conn.insert_into_table('udp_catalog', 'stat_log', **row)

        # if 'last_job.log' in archive.namelist():
        job_log_data = read_archived_file(source_file_name,
                                          'last_job.log',
                                          default=None)
        if job_log_data:
            last_job_log_json = json.loads(job_log_data)
            for row in last_job_log_json:
                row['start_time'] = iso_to_datetime(row['start_time']).datetime
                row['end_time'] = iso_to_datetime(row['end_time']).datetime
                if row['stat_name'] in ('capture', 'compress', 'upload'):
                    db_conn.insert_into_table('udp_catalog', 'stat_log', **row)
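

# Added sketch (not part of the original snippet): a minimal read_archived_file() consistent
# with how update_stat_log() calls it above. It assumes the capture file is a zip archive and
# that the named member holds UTF-8 text; the project's real helper may differ.
import zipfile


def read_archived_file(archive_file_name, member_name, default=None):
    """Return the text of member_name from a zip archive, or default if the member is missing."""
    with zipfile.ZipFile(archive_file_name) as archive:
        if member_name not in archive.namelist():
            return default
        return archive.read(member_name).decode('utf-8')
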
def test_iso_to_datetime():
    iso_date = '2018-12-01'
    iso_datetime = '2018-11-21T19:53:12+00:00'
    iso_datetime_utc = '2018-11-21T19:53:12Z'
    converted_iso_date = iso_to_datetime(iso_date)
    converted_iso_datetime = iso_to_datetime(iso_datetime)
    converted_iso_datetime_utc = iso_to_datetime(iso_datetime_utc)

    # converted_iso_date assertions
    assert converted_iso_date.year == 2018
    assert converted_iso_date.month == 12
    assert converted_iso_date.day == 1

    # converted_iso_datetime assertions
    assert converted_iso_datetime.year == 2018
    assert converted_iso_datetime.month == 11
    assert converted_iso_datetime.day == 21

    # converted_iso_datetime_utc assertions
    assert converted_iso_datetime_utc.year == 2018
    assert converted_iso_datetime_utc.month == 11
    assert converted_iso_datetime_utc.day == 21
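

# Added sketch (not part of the original snippet): an iso_to_datetime() that satisfies the test
# above. It assumes the arrow library, whose objects expose .year/.month/.day and the .datetime
# property used elsewhere in these snippets; the project's actual implementation may differ.
import arrow


def iso_to_datetime(iso_text):
    """Parse an ISO-8601 date or datetime string (including a 'Z' suffix) into an Arrow object."""
    return arrow.get(iso_text)
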
    def process_table(self, db, db_engine, schema_name, table_name,
                      table_object, table_history, current_timestamp):
        """Process a specific table."""

        # skip default table and ignored tables
        if table_name == 'default':
            return
        elif table_object.ignore_table:
            logger.info(f'Skipping table: {table_name} (ignore_table=1)')
            return
        elif table_object.drop_table:
            logger.info(f'Skipping table: {table_name} (drop_table=1)')
            return

        # initialize table history's last timestamp to the first timestamp if not set yet
        if not table_history.last_timestamp:
            # default first timestamp to 1900-01-01 if project has no first timestamp
            if not table_object.first_timestamp:
                table_object.first_timestamp = '1900-01-01'
            table_history.last_timestamp = iso_to_datetime(
                table_object.first_timestamp)

        # skip table if last timestamp > current timestamp, e.g. tables pre-configured for the future
        if table_history.last_timestamp > current_timestamp:
            explanation = f'first/last timestamp {table_history.last_timestamp} > current timestamp {current_timestamp}'
            logger.info(f'Skipping table: {table_name} ({explanation})')
            return

        # if we're here then we have a legit last timestamp value to use for CDC
        last_timestamp = table_history.last_timestamp

        self.stats.start(table_name, 'table')
        # logger.info(f'Processing {table_name} ...')

        # create a fresh cursor for each table
        cursor = db.conn.cursor()

        # save table object for stage
        output_stream = open(f'{self.work_folder_name}/{table_name}.table',
                             'wb')
        pickle.dump(table_object, output_stream)
        output_stream.close()

        # discover table schema
        table_schema = db_engine.select_table_schema(schema_name, table_name)

        # remove ignored columns from table schema
        if table_object.ignore_columns:
            # find columns to ignore (remove) based on ignore column names/glob-style patterns
            ignore_columns = []
            for column_name in table_schema.columns:
                for pattern in split(table_object.ignore_columns):
                    # use fnmatch() to provide glob style matching
                    if fnmatch.fnmatch(column_name.lower(), pattern.lower()):
                        ignore_columns.append(column_name)

            # delete ignored columns from our table schema
            for column_name in ignore_columns:
                logger.info(f'Ignore_column: {table_name}.{column_name}')
                table_schema.columns.pop(column_name)

        # save table schema for stage to use
        output_stream = open(f'{self.work_folder_name}/{table_name}.schema',
                             'wb')
        pickle.dump(table_schema, output_stream)
        output_stream.close()

        # save table pk for stage to use
        pk_columns = db_engine.select_table_pk(schema_name, table_name)
        if not pk_columns and table_object.primary_key:
            pk_columns = table_object.primary_key
        output_stream = open(f'{self.work_folder_name}/{table_name}.pk', 'w')
        output_stream.write(pk_columns)
        output_stream.close()

        # clear cdc if it doesn't match timestamp/rowversion
        table_object.cdc = table_object.cdc.lower()
        if not table_object.cdc or table_object.cdc not in ('timestamp',
                                                            'rowversion'):
            table_object.cdc = ''

        # if no pk_columns, then clear table cdc
        if not pk_columns:
            if table_object.cdc and table_object.cdc != 'none':
                logger.info(
                    f'Warning: {table_name} cdc={table_object.cdc} but table has no pk column(s)'
                )
                table_object.cdc = 'none'

            # we still keep timestamp because it's required for filtering first_timestamp - current_timestamp
            # if table_object.timestamp:
            # 	logger.info(f'Warning: {table_name} timestamp={table_object.timestamp} but table has no pk column(s)')
            # 	table_object.timestamp = ''

        # update table object properties for cdc select build
        column_names = list(table_schema.columns.keys())
        table_object.schema_name = schema_name
        table_object.table_name = table_name
        table_object.column_names = column_names
        select_cdc = cdc_select.SelectCDC(table_object)
        sql = select_cdc.select(self.job_id, current_timestamp, last_timestamp)

        # logger.info(f'Capture SQL:\n{sql}\n')

        # run sql here vs via db_engine.capture_select
        # cursor = db_engine.capture_select(schema_name, table_name, column_names, last_timestamp, current_timestamp)
        cursor.execute(sql)

        # capture rows in fixed-size batches to support unlimited record counts
        # Note: Batching on the capture side allows stage to insert multiple batches in parallel.

        if self.project.batch_size:
            batch_size = int(self.project.batch_size)
            # logger.info(f'Using project specific batch size: {self.project.batch_size}')
        else:
            batch_size = 1_000_000

        batch_number = 0
        row_count = 0
        file_size = 0
        while True:
            batch_number += 1
            rows = cursor.fetchmany(batch_size)
            if not rows:
                break

            logger.info(
                f'Table({table_name}): batch={batch_number} using batch size {batch_size:,}'
            )

            # convert each row to a plain list of column values
            json_rows = [list(row) for row in rows]
            output_file = f'{self.work_folder_name}/{table_name}#{batch_number:04}.json'
            with open(output_file, 'w') as output_stream:
                # indent=2 for debugging
                json.dump(json_rows,
                          output_stream,
                          indent=2,
                          default=json_serializer)

            # track stats
            row_count += len(json_rows)
            file_size += pathlib.Path(output_file).stat().st_size

        # if no cdc but order is set, hash the output files to see if they're identical to the last file hash
        if (not table_object.cdc
                or table_object.cdc == 'none') and table_object.order:
            print(
                f'Checking {table_name} file hash based on cdc={table_object.cdc} and order={table_object.order}'
            )
            table_data_files = f'{self.work_folder_name}/{table_name}#*.json'
            current_filehash = hash_files(table_data_files)
            if table_history.last_filehash == current_filehash:
                # suppress this update
                print(
                    f'Table({table_name}): identical file hash, update suppressed'
                )
                logger.info(
                    f'Table({table_name}): identical file hash, update suppressed'
                )
                row_count = 0
                file_size = 0

                # delete exported json files
                delete_files(table_data_files)
            else:
                print(
                    f'Table({table_name}): {table_history.last_filehash} != {current_filehash}'
                )
                table_history.last_filehash = current_filehash

        # update table history with new last timestamp value
        table_history.last_timestamp = current_timestamp

        # track total row count and file size across all of a table's batched json files
        self.stats.stop(table_name, row_count, file_size)

        # save interim state of stats for diagnostics
        self.stats.save()

        self.job_row_count += row_count
        self.job_file_size += file_size

        # explicitly close cursor when finished
        # cursor.close()
        return
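
# Added sketch (not part of the original snippet): minimal versions of the hash_files(),
# delete_files(), and json_serializer() helpers used by process_table() above (hash_files and
# delete_files take a glob pattern; json_serializer is the json.dump fallback). The project's
# real utility functions may differ.
import datetime
import glob
import hashlib
import os


def hash_files(file_pattern):
    """Return one hex digest covering all files matching file_pattern, read in sorted order."""
    digest = hashlib.md5()
    for file_name in sorted(glob.glob(file_pattern)):
        with open(file_name, 'rb') as input_stream:
            digest.update(input_stream.read())
    return digest.hexdigest()


def delete_files(file_pattern):
    """Delete all files matching file_pattern."""
    for file_name in glob.glob(file_pattern):
        os.remove(file_name)


def json_serializer(value):
    """json.dump() default= fallback; render dates/datetimes as ISO text, everything else as str."""
    if isinstance(value, (datetime.date, datetime.datetime)):
        return value.isoformat()
    return str(value)
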
    def process_table(self,
                      db,
                      db_engine,
                      schema_name,
                      table_name,
                      table_object,
                      table_history,
                      current_timestamp,
                      current_sequence=0):
        """Process a specific table."""

        # skip default table and ignored tables
        if table_name == 'default':
            return

        # TODO: Allow ignore and drop table conditions to be passed to archive (log table state) and stage (to drop table and table references)
        elif table_object.ignore_table:
            logger.info(f'Skipping table: {table_name} (ignore_table=1)')
            return
        elif table_object.drop_table:
            logger.info(f'Skipping table: {table_name} (drop_table=1)')
            return

        # initialize table history's last timestamp to the first timestamp if not set yet
        if not table_history.last_timestamp:
            # default first timestamp to 1900-01-01 if project has no first timestamp
            if not table_object.first_timestamp:
                table_object.first_timestamp = '1900-01-01'
            table_history.last_timestamp = iso_to_datetime(
                table_object.first_timestamp)

        # skip table if last timestamp > current timestamp, e.g. tables pre-configured for the future
        if table_history.last_timestamp > current_timestamp:
            explanation = f'first/last timestamp {table_history.last_timestamp} > current timestamp {current_timestamp}'
            logger.info(f'Skipping table: {table_name} ({explanation})')
            return

        # if we're here then we have a legit last timestamp value to use for CDC
        last_timestamp = table_history.last_timestamp

        # initialize table's last_sequence to first_sequence if not set yet
        if not table_history.last_sequence:
            if not table_object.first_sequence:
                table_object.first_sequence = 0
            table_history.last_sequence = table_object.first_sequence

        self.events.start(table_name, 'table')
        # logger.info(f'Processing {table_name} ...')

        # create a fresh cursor for each table
        cursor = db.conn.cursor()

        # save table object for stage
        table_file_name = f'{self.work_folder}/{table_name}.table'
        save_jsonpickle(table_file_name, table_object)

        # discover table schema
        table_schema = db_engine.select_table_schema(schema_name, table_name)

        # handle non-existent tables
        if table_schema is None:
            if table_object.optional_table:
                logger.info(
                    f'Optional table not found; skipped ({table_name})')
            else:
                logger.warning(f'Table not found; skipped ({table_name})')
            return

        # remove ignored columns from table schema
        if table_object.ignore_columns:
            # find columns to ignore (remove) based on ignore column names/glob-style patterns
            ignore_columns = []
            for column_name in table_schema.columns:
                for pattern in split(table_object.ignore_columns):
                    if is_glob_match(column_name, pattern):
                        ignore_columns.append(column_name)

            # delete ignored columns from our table schema
            for column_name in ignore_columns:
                logger.info(f'Ignore_column: {table_name}.{column_name}')
                table_schema.columns.pop(column_name)

        # save table schema for stage to use
        schema_table_name = f'{self.work_folder}/{table_name}.schema'
        save_jsonpickle(schema_table_name, table_schema)

        # save table pk for stage to use
        pk_columns = db_engine.select_table_pk(schema_name, table_name)
        if not pk_columns and table_object.primary_key:
            pk_columns = table_object.primary_key
        save_text(f'{self.work_folder}/{table_name}.pk', pk_columns)

        # normalize cdc setting
        table_object.cdc = table_object.cdc.lower()
        if table_object.cdc == 'none':
            table_object.cdc = ''

        # clear unknown cdc settings
        if table_object.cdc and table_object.cdc not in (
                'filehash', 'rowhash', 'rowversion', 'sequence', 'timestamp'):
            logger.warning(
                f'Warning: Unknown CDC setting; CDC setting cleared ({table_name}.cdc={table_object.cdc})'
            )
            table_object.cdc = ''

        # clear cdc setting when no pk_columns are present
        # NOTE: filehash cdc does not require pk_columns.
        if table_object.cdc and table_object.cdc != 'filehash' and not pk_columns:
            logger.warning(
                f'Warning: CDC enabled but no PK; CDC setting cleared ({table_name}.cdc={table_object.cdc})'
            )
            table_object.cdc = ''

        # if no cdc, then clear cdc related attributes
        if not table_object.cdc:
            table_object.filehash = ''
            table_object.rowhash = ''
            table_object.rowversion = ''
            table_object.sequence = ''
            table_object.timestamp = ''

        # update table object properties for cdc select build
        column_names = list(table_schema.columns.keys())
        table_object.schema_name = schema_name
        table_object.table_name = table_name
        table_object.column_names = column_names
        select_cdc = cdc_select.SelectCDC(db_engine, table_object)
        sql = select_cdc.select(self.job_id, current_timestamp, last_timestamp)

        # save generated SQL to work folder for documentation purposes
        sql_file_name = f'{self.work_folder}/{table_name}.sql'
        save_text(sql_file_name, sql)

        # run sql here vs via db_engine.capture_select
        # cursor = db_engine.capture_select(schema_name, table_name, column_names, last_timestamp, current_timestamp)
        cursor.execute(sql)

        # capture rows in fixed-size batches to support unlimited record counts
        # Note: Batching on the capture side allows stage to insert multiple batches in parallel.

        if self.project.batch_size:
            batch_size = int(self.project.batch_size)
            # logger.info(f'Using project specific batch size: {self.project.batch_size}')
        else:
            batch_size = 250_000

        batch_number = 0
        row_count = 0
        data_size = 0
        while True:
            batch_number += 1
            rows = cursor.fetchmany(batch_size)
            if not rows:
                break

            logger.info(
                f'Table({table_name}): batch={batch_number} using batch size {batch_size:,}'
            )
            self.progress_message(
                f'extracting({table_name}.{batch_number:04}) ...')

            # convert each row to a plain list of column values
            json_rows = [list(row) for row in rows]
            output_file = f'{self.work_folder}/{table_name}#{batch_number:04}.json'
            save_jsonpickle(output_file, json_rows)

            # track metrics
            row_count += len(json_rows)
            data_size += file_size(output_file)

        # update table history with new last timestamp and sequence values
        table_history.last_timestamp = current_timestamp
        table_history.last_sequence = current_sequence

        # track total row count and file size across all of a table's batched json files
        self.events.stop(table_name, row_count, data_size)

        # save interim metrics for diagnostics
        self.events.save()

        self.job_row_count += row_count
        self.job_data_size += data_size

        # explicitly close cursor when finished
        # cursor.close()
        return
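

# Added sketch (not part of the original snippet): minimal versions of the save_jsonpickle(),
# save_text(), file_size(), split(), and is_glob_match() helpers used by this second
# process_table(); the project's real utility module may differ.
import fnmatch
import pathlib

import jsonpickle


def save_jsonpickle(file_name, obj):
    """Serialize obj with jsonpickle and write it to file_name."""
    pathlib.Path(file_name).write_text(jsonpickle.encode(obj))


def save_text(file_name, text):
    """Write text to file_name."""
    pathlib.Path(file_name).write_text(text)


def file_size(file_name):
    """Return the size of file_name in bytes."""
    return pathlib.Path(file_name).stat().st_size


def split(delimited_text):
    """Split a comma-delimited option string into a list of trimmed, non-empty items."""
    return [item.strip() for item in delimited_text.split(',') if item.strip()]


def is_glob_match(column_name, pattern):
    """Case-insensitive glob-style match (same behavior as the fnmatch test in the first version)."""
    return fnmatch.fnmatch(column_name.lower(), pattern.lower())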