Example #1
def stage_archive_file(archive_object_store, notification):

    logger.info(f'Archive object store update notification: {notification}')

    # TODO: Add activity_log (vs job_log/stat_log) references.

    # get file key and name
    work_folder = 'stage_work'
    object_key = notification.object_key
    archive_file_name = f'{work_folder}/' + just_file_name(object_key)

    logger.info(
        f'Getting {archive_file_name} from archive_object_store::{object_key}')
    archive_object_store.get(archive_file_name, object_key)

    # TODO: Insert stage file code block here (see the illustrative sketch after this example)
    # unzip archive_file_name
    # load schema info so we can build target tables if missing
    # loop through all <table>.json files
    # conditional: cdc = timestamp vs none vs ...
    # note: if no nk/pk then cdc must be none - override and log warning
    # load json data
    # create target table if missing with capture (udp_jobid, udp_jobpk, udp_timestamp) and stage (udp_pk, udp_nk)
    # create #source table
    # insert into #source via insertmany()
    # update #source with udp_nk using pk from capture
    # merge #source to target on udp_nk
    # update job_log, table_log with stats

    # return name of processed file
    return archive_file_name
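
A minimal sketch of the staging steps outlined in the TODO block above, modeled on Examples #4 and #5 below. The db_conn parameter and the helpers it uses (FileList, extract_archive, load_json, load_text, convert_to_mssql, convert_data_types, logger) are assumed to be available exactly as in those examples; the function name and signature are illustrative, not part of the original code.

def stage_archive_file_sketch(db_conn, archive_file_name, namespace, work_folder='stage_work'):
    # unzip the downloaded archive into the work folder
    file_names = FileList(archive_file_name)
    file_names.include('*')
    extract_archive(archive_file_name, work_folder, file_names)

    # loop through all <table>.table/.schema/.pk file sets
    for file_name in sorted(glob.glob(f'{work_folder}/*.table')):
        table_name = pathlib.Path(file_name).stem
        table_object = load_json(f'{work_folder}/{table_name}.table')
        table_schema = load_json(f'{work_folder}/{table_name}.schema')
        table_pk = load_text(f'{work_folder}/{table_name}.pk').strip()

        # per the TODO note: if no nk/pk then cdc must be none - override and log warning
        if not table_pk and table_object.cdc and table_object.cdc.lower() != 'none':
            logger.warning(f'{table_name}: no pk defined; overriding cdc={table_object.cdc} to none')
            table_object.cdc = 'none'

        # create target table if missing, with the extended capture columns
        extended_definitions = 'udp_jobid int, udp_timestamp datetime2'.split(',')
        convert_to_mssql(table_schema, extended_definitions)
        if not db_conn.does_table_exist(namespace, table_name):
            db_conn.create_table_from_table_schema(namespace, table_name,
                                                   table_schema, extended_definitions)

        # load json data and insert; see Examples #4 and #5 for the full cdc merge path
        for json_file in sorted(pathlib.Path(work_folder).glob(f'{table_name}#*.json')):
            rows = load_json(json_file)
            if rows:
                convert_data_types(rows, table_schema)
                db_conn.bulk_insert_into_table(namespace, table_name, table_schema, rows)
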
Example #2
    def archive_capture_file(self, notification):

        # get name of file (key) that triggered this call
        source_object_key = notification.object_key

        # extract out the file name
        source_file_name = just_file_name(source_object_key)

        # if source_file_name is empty, ignore notification
        if not source_file_name:
            logger.debug(
                f'Ignoring notification without object key (file name): {notification}'
            )
            return

        # if source_file_name is capture_state.zip, ignore it
        # Note: This keeps the latest capture_state.zip file in each capture folder for recovery purposes.
        if source_file_name == 'capture_state.zip':
            logger.debug(f'Ignoring capture_state.zip file notification')
            return

        # TODO: Add activity_log (vs job_log/stat_log) references.

        # make sure work folder exists and is empty
        work_folder = 'sessions/archive_work'
        clear_folder(work_folder)

        # get file, copy to archive then delete from capture
        source_objectstore_name = notification.objectstore_name
        source_file_name = f'{work_folder}/' + source_file_name

        # get the posted file
        logger.info(
            f'Getting {source_file_name} from {source_objectstore_name}::{source_object_key}'
        )
        source_objectstore = Objectstore(source_objectstore_name, self.cloud)
        source_objectstore.get(source_file_name, source_object_key)

        # move (copy) the posted file to the archive object_store
        logger.info(
            f'Moving {source_file_name} to archive_object_store::{source_object_key}'
        )
        archive_objectstore_name = self.cloud.archive_objectstore
        archive_objectstore = Objectstore(archive_objectstore_name, self.cloud)
        archive_objectstore.put(source_file_name, source_object_key)

        # then delete file from source object_store and local work folder
        logger.info(
            f'Deleting {source_object_key} from {source_objectstore_name}')
        source_objectstore.delete(source_object_key)
        delete_file(source_file_name)

        # TODO: update stat_log
        # TODO: update stage queue

        return
Example #3
    def upload_to_blobstore(self):
        """Upload publish_folder's <dataset_name>-<job_id>.zip to landing blobstore."""

        # don't upload captured data if we're in --notransfer mode
        if self.option('notransfer'):
            logger.warning(
                'Not uploading data to landing per --notransfer option')
            return

        # upload capture file to landing blobstore
        self.events.start('upload', 'step')
        resource = self.config(self.project.blobstore_landing)
        bs_landing = BlobStore()
        bs_landing.connect(resource)
        bs_landing.put(self.zip_file_name, just_file_name(self.zip_file_name))
        bs_landing.disconnect()

        # finish
        self.events.stop('upload', 0, file_size(self.zip_file_name))
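
For context on the docstring's <dataset_name>-<job_id>.zip convention, the zip file name is presumably assembled along these lines before upload; the publish folder path and sample values below are illustrative assumptions, not taken from this code.

# illustrative only: composing the <dataset_name>-<job_id>.zip name from the docstring;
# the publish folder path and sample values are assumptions
publish_folder = 'sessions/publish'
dataset_name = 'sales'
job_id = '1234'
zip_file_name = f'{publish_folder}/{dataset_name}-{job_id}.zip'
# upload_to_blobstore() would then post just_file_name(zip_file_name), i.e. 'sales-1234.zip',
# to the landing blobstore
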
Example #4
def stage_file(db_conn, archive_objectstore, object_key):

    # make sure work folder exists and is empty
    work_folder = 'stage_work'
    clear_folder(work_folder)
    if not os.path.exists(work_folder):
        os.mkdir(work_folder)

    # get the posted file
    source_file_name = f'{work_folder}/' + just_file_name(object_key)
    logger.info(f'Getting {source_file_name} from archive::{object_key}')
    archive_objectstore.get(source_file_name, object_key)

    # create the file's namespace schema if missing
    namespace = object_key.split('/')[0]
    job_id = object_key
    db_conn.create_schema(namespace)

    # unzip the file
    # shutil.unpack_archive(source_file_name, extract_dir=work_folder)
    file_names = FileList(source_file_name)
    file_names.include('*')
    extract_archive(source_file_name, work_folder, file_names)

    # process all table files in our work folder
    for file_name in sorted(glob.glob(f'{work_folder}/*.table')):
        table_name = pathlib.Path(file_name).stem
        logger.info(f'Processing {table_name} ...')

        # TODO: rename files to use a _table, _schema suffix and .json file extension

        # always load table objects
        # input_stream = open(f'{work_folder}/{table_name}.table', 'rb')
        # table_object = pickle.load(input_stream)
        # input_stream.close()
        table_object = load_json(f'{work_folder}/{table_name}.table')

        # always load table schema
        # input_stream = open(f'{work_folder}/{table_name}.schema', 'rb')
        # table_schema = pickle.load(input_stream)
        # input_stream.close()
        table_schema = load_json(f'{work_folder}/{table_name}.schema')

        # always load table pk
        # input_stream = open(f'{work_folder}/{table_name}.pk')
        # table_pk = input_stream.read().strip()
        # input_stream.close()
        table_pk = load_text(f'{work_folder}/{table_name}.pk').strip()

        # extend table object with table name and column names from the table_schema object
        table_object.table_name = table_name
        table_object.column_names = [
            column_name for column_name in table_schema.columns
        ]

        # if drop_table, drop table and exit
        if table_object.drop_table:
            logger.info(f'Table drop request; table_drop=1')
            db_conn.drop_table(namespace, table_name)
            return

        # convert table schema to our target database and add extended column definitions
        extended_definitions = 'udp_jobid int, udp_timestamp datetime2'.split(
            ',')
        convert_to_mssql(table_schema, extended_definitions)

        # 2018-09-12 support custom staging table type overrides
        # [table].table_type = < blank > | standard, columnar, memory, columnar - memory

        # create target table if it doesn't exist
        if not db_conn.does_table_exist(namespace, table_name):
            # FUTURE: Add udp_pk, udp_nk, udp_nstk and other extended columns
            logger.info(f'Creating table: {namespace}.{table_name}')
            db_conn.create_table_from_table_schema(namespace, table_name,
                                                   table_schema,
                                                   extended_definitions)

        # handle cdc vs non-cdc table workflows differently
        logger.debug(
            f'{table_name}.cdc={table_object.cdc}, timestamp={table_object.timestamp}'
        )
        if (not table_object.cdc or table_object.cdc.lower() == 'none'
                or not table_pk):
            # if table cdc=none, drop the target table
            logger.info(f'Table cdc=[{table_object.cdc}]; rebuilding table')
            db_conn.drop_table(namespace, table_name)

            # no cdc in effect for this table - insert directly to target table
            work_folder_obj = pathlib.Path(work_folder)
            batch_number = 0
            for json_file in sorted(
                    work_folder_obj.glob(f'{table_name}#*.json')):
                # load rows from json file
                # input_stream = open(json_file)
                # rows = json.load(input_stream)
                # input_stream.close()
                rows = load_json(json_file)

                # insert/upsert/merge *.json into target tables
                if not rows:
                    logger.info(f'Table {table_name} has 0 rows; no updates')
                else:
                    batch_number += 1
                    logger.info(
                        f'Job {job_id}, batch {batch_number}, table {table_name}'
                    )

                    # convert date/datetime columns to date/datetime values
                    convert_data_types(rows, table_schema)

                    # db_conn.insert_many( namespace, table_name, rows )
                    db_conn.bulk_insert_into_table(namespace, table_name,
                                                   table_schema, rows)

        else:
            # table has cdc updates

            # create temp table to receive captured changes
            # FUTURE: Create a database wrapper function for creating 'portable' temp table names vs hard-coding '#'.
            temp_table_name = f'_{table_name}'
            db_conn.drop_table(namespace, temp_table_name)

            # print(f'namespace = {namespace}')
            # print(f'temp_table_name = {temp_table_name}')
            # print(f'table_object = {dir(table_object)}')
            # print(f'extended definitions = {extended_definitions}')

            db_conn.create_table_from_table_schema(namespace, temp_table_name,
                                                   table_schema,
                                                   extended_definitions)

            # insert captured updates into temp table
            work_folder_obj = pathlib.Path(work_folder)
            batch_number = 0
            for json_file in sorted(
                    work_folder_obj.glob(f'{table_name}#*.json')):
                # load rows from json file
                # input_stream = open(json_file)
                # rows = json.load(input_stream)
                # input_stream.close()
                rows = load_json(json_file)

                # insert/upsert/merge *.json into target tables
                if not rows:
                    logger.info(f'Table {table_name} has 0 rows; no updates')
                    break
                else:
                    batch_number += 1
                    logger.info(
                        f'Job {job_id}, batch {batch_number}, table {table_name}'
                    )

                    # convert date/datetime columns to date/datetime values
                    convert_data_types(rows, table_schema)

                    # db_conn.insert_many( namespace, table_name, rows )
                    db_conn.bulk_insert_into_table(namespace, temp_table_name,
                                                   table_schema, rows)
            else:
                # merge (upsert) temp table to target table
                merge_cdc = cdc_merge.MergeCDC(table_object,
                                               extended_definitions)
                sql_command = merge_cdc.merge(namespace, table_pk)

                # TODO: Capture SQL commands in a sql specific log.
                logger.debug(sql_command)
                db_conn.cursor.execute(sql_command)

            # drop temp table after merge
            db_conn.drop_table(namespace, temp_table_name)
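
The dated note above about custom staging table type overrides ([table].table_type = <blank> | standard, columnar, memory, columnar-memory) is not implemented in this example. A minimal sketch of how such an override could translate into SQL Server DDL extras is shown below; the helper name and the option strings are assumptions, not part of the original code.

def staging_table_ddl_extras(table_type, namespace, table_name):
    # hypothetical helper: translate the proposed [table].table_type override into
    # extra SQL Server DDL; the option strings below are assumptions
    table_type = (table_type or 'standard').strip().lower()

    # memory / columnar-memory: memory-optimized table option on CREATE TABLE
    create_suffix = ''
    if table_type in ('memory', 'columnar-memory'):
        create_suffix = ' WITH (MEMORY_OPTIMIZED = ON, DURABILITY = SCHEMA_AND_DATA)'

    # columnar: clustered columnstore index added after the table is created
    # (columnar-memory would declare the columnstore index inline on the memory-optimized table)
    post_create_sql = ''
    if table_type == 'columnar':
        post_create_sql = (f'CREATE CLUSTERED COLUMNSTORE INDEX cci_{table_name} '
                           f'ON {namespace}.{table_name}')

    return create_suffix, post_create_sql
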
Example #5
    def stage_file(self, archive_capture_file_name):
        logger.info(
            f"Getting {archive_capture_file_name} from archive blob store")

        # make sure work folder exists and is empty
        clear_folder(self.work_folder)

        # connect to the archive blobstore
        resource = self.config(self.project.blobstore_archive)
        bs_archive = BlobStore()
        bs_archive.connect(resource)

        # extract dataset name and job id from archive capture file name
        dataset_name, _, job_id = just_file_stem(
            archive_capture_file_name).partition("#")

        # copy archive_capture_file_name to our local working folder
        capture_file_name = just_file_name(archive_capture_file_name)
        local_work_file_name = f"{self.work_folder}/{capture_file_name}"
        archive_capture_file_blob_name = archive_capture_file_name
        bs_archive.get(local_work_file_name, archive_capture_file_blob_name)
        bs_archive.disconnect()

        # unzip the capture file we retrieved from archive
        with zipfile.ZipFile(local_work_file_name) as zf:
            zf.extractall(self.work_folder)

        # create the file's dataset_name schema if missing
        self.target_db_conn.create_schema(dataset_name)

        # process all table files in our work folder
        for file_name in sorted(glob.glob(f"{self.work_folder}/*.table")):
            table_name = just_file_stem(file_name)
            logger.info(f"Processing {table_name} ...")

            # always load table objects
            table_object = load_jsonpickle(
                f"{self.work_folder}/{table_name}.table")

            # skip table if no schema file exists
            schema_file_name = f"{self.work_folder}/{table_name}.schema"
            if not is_file(schema_file_name):
                logger.warning(
                    f"Table skipped ({table_name}); schema file not found")
                continue

            # always load table schema
            table_schema = load_jsonpickle(schema_file_name)

            # always load table pk
            table_pk = load_text(f"{self.work_folder}/{table_name}.pk").strip()

            # extend table object with table name and column names from the table_schema object
            table_object.table_name = table_name
            table_object.column_names = [
                column_name for column_name in table_schema.columns
            ]

            # if drop_table, drop table and exit
            if table_object.drop_table:
                logger.info(f"Table drop request; table_drop=1")
                self.target_db_conn.drop_table(dataset_name, table_name)
                return

            # convert table schema to our target database and add extended column definitions
            extended_definitions = "udp_jobid int, udp_timestamp datetime2".split(
                ",")
            convert_to_mssql(table_schema, extended_definitions)

            # Future: support custom staging table type overrides
            # [table].table_type = < blank > | standard, columnar, memory, columnar - memory

            # handle cdc vs non-cdc table workflows differently
            logger.debug(
                f"{table_name}.cdc={table_object.cdc}, timestamp={table_object.timestamp}"
            )
            if (not table_object.cdc or table_object.cdc.lower() == "none"
                    or not table_pk):
                # if table cdc=none, drop the target table
                logger.info(
                    f"Table cdc=[{table_object.cdc}]; rebuilding table")
                self.target_db_conn.drop_table(dataset_name, table_name)

                # then re-create target table with latest schema
                # FUTURE: Add udp_pk, udp_nk, udp_nstk and other extended columns
                logger.info(
                    f"Re-creating non-CDC table: {dataset_name}.{table_name}")
                self.target_db_conn.create_table_from_table_schema(
                    dataset_name, table_name, table_schema,
                    extended_definitions)

                # no cdc in effect for this table - insert directly to target table
                work_folder_obj = pathlib.Path(self.work_folder)
                batch_number = 0
                for json_file in sorted(
                        work_folder_obj.glob(f"{table_name}#*.json")):
                    # load rows from json file
                    # input_stream = open(json_file)
                    # rows = json.load(input_stream)
                    # input_stream.close()
                    rows = load_jsonpickle(json_file)

                    # insert/upsert/merge *.json into target tables
                    if not rows:
                        logger.info(
                            f"Table {table_name} has 0 rows; no updates")
                    else:
                        batch_number += 1
                        logger.info(
                            f"Job {job_id}, batch {batch_number}, table {table_name}"
                        )
                        self.progress_message(
                            f"loading {just_file_stem(capture_file_name)}({table_name}.{batch_number:04}) ..."
                        )

                        # convert date/datetime columns to date/datetime values
                        convert_data_types(rows, table_schema)
                        self.target_db_conn.bulk_insert_into_table(
                            dataset_name, table_name, table_schema, rows)

            else:
                # table has cdc updates

                # create target table if it doesn't exist
                if not self.target_db_conn.does_table_exist(
                        dataset_name, table_name):
                    # FUTURE: Add udp_pk, udp_nk, udp_nstk and other extended columns
                    logger.info(f"Creating table: {dataset_name}.{table_name}")
                    self.target_db_conn.create_table_from_table_schema(
                        dataset_name, table_name, table_schema,
                        extended_definitions)

                # create temp table to receive captured changes
                # FUTURE: Create a database wrapper function for creating 'portable' temp table names vs hard-coding '#'.
                temp_table_name = f"_{table_name}"
                self.target_db_conn.drop_table(dataset_name, temp_table_name)
                self.target_db_conn.create_table_from_table_schema(
                    dataset_name, temp_table_name, table_schema,
                    extended_definitions)

                # insert captured updates into temp table
                work_folder_obj = pathlib.Path(self.work_folder)
                batch_number = 0
                for json_file in sorted(
                        work_folder_obj.glob(f"{table_name}#*.json")):
                    # load rows from json file
                    # input_stream = open(json_file)
                    # rows = json.load(input_stream)
                    # input_stream.close()
                    rows = load_jsonpickle(json_file)

                    # insert/upsert/merge *.json into target tables
                    if not rows:
                        logger.info(
                            f"Table {table_name} has 0 rows; no updates")
                        break
                    else:
                        batch_number += 1
                        logger.info(
                            f"Job {job_id}, batch {batch_number}, table {table_name}"
                        )
                        self.progress_message(
                            f"loading {just_file_stem(capture_file_name)}({table_name}.{batch_number:04}) ..."
                        )

                        # convert date/datetime columns to date/datetime values
                        convert_data_types(rows, table_schema)
                        self.target_db_conn.bulk_insert_into_table(
                            dataset_name, temp_table_name, table_schema, rows)
                else:
                    # merge (upsert) temp table to target table
                    merge_cdc = cdc_merge.MergeCDC(table_object,
                                                   extended_definitions)
                    sql_command = merge_cdc.merge(dataset_name, table_pk)

                    # TODO: Capture SQL commands in a sql specific log.
                    logger.debug(sql_command)
                    self.target_db_conn.cursor.execute(sql_command)

                # drop temp table after merge
                self.target_db_conn.drop_table(dataset_name, temp_table_name)
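
Both staging examples carry a FUTURE note about a database wrapper for 'portable' temp table names instead of hard-coding '#' (or the '_' prefix used here). A minimal sketch of such a wrapper is shown below; the platform attribute on the connection object is an assumption, not part of the original API.

def make_temp_table_name(db_conn, table_name):
    # hypothetical wrapper (per the FUTURE notes): choose a temp/work table naming
    # convention per database platform instead of hard-coding '#';
    # db_conn.platform is an assumed attribute, not part of the original API
    platform = getattr(db_conn, 'platform', 'mssql')
    if platform == 'mssql':
        return f'#{table_name}'      # true local temp table on SQL Server
    if platform == 'postgresql':
        return f'tmp_{table_name}'   # paired with CREATE TEMP TABLE
    return f'_{table_name}'          # fallback: prefixed work table, as these examples do

The call sites above would then use temp_table_name = make_temp_table_name(self.target_db_conn, table_name) rather than the hard-coded f'_{table_name}' prefix.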