Example #1
	def _listen(self):
		# return the endpoint file's contents when an endpoint is configured and exists; otherwise ''
		if not self.endpoint:
			return ''
		if is_file(self.endpoint):
			return load_text(self.endpoint, '')
		return ''
Example #2
def encrypt_text_file(key, source_file_name, target_file_name=None):
    """Encrypt text file to target file name."""
    if not target_file_name:
        # default target file name is source file name with '_e' added to file extension
        target_file_name = source_file_name + '_e'

    decrypted_data = load_text(source_file_name)
    encrypted_data = encrypt_data(key, decrypted_data)
    save_text(encrypted_data, target_file_name)
Example #3
def decrypt_text_file(key, source_file_name, target_file_name=None):
    """Decrypt text file to target file name."""
    if not target_file_name:
        # if source file extension ends with '_e', decrypt to source file name minus this suffix
        if source_file_name.lower().endswith('_e'):
            target_file_name = source_file_name[:-2]
        else:
            raise Exception('target_file_name not specified')

    encrypted_data = load_text(source_file_name)
    decrypted_data = decrypt_data(key, encrypted_data)
    save_text(decrypted_data, target_file_name)
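
The default-naming paths above never appear in the test below, which always passes explicit target names. A minimal usage sketch of those defaults (the key value and file names are hypothetical; the helpers are the same ones used above):

key = 'my-secret-key'                              # hypothetical key
save_text('This is text file data.', 'notes.txt')
encrypt_text_file(key, 'notes.txt')                # no target given -> writes 'notes.txt_e'
decrypt_text_file(key, 'notes.txt_e')              # '_e' suffix stripped -> writes 'notes.txt'
assert load_text('notes.txt') == 'This is text file data.'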
Example #4
def test_encrypt_decrypt_text_file():
    """Test text file encryption/decryption."""

    # setup
    key = 'abc'
    original = 'This is text file data.'
    original_file_name = 'test_encryption_file.txt'
    encrypted_file_name = original_file_name + '_encrypted'
    decrypted_file_name = original_file_name + '_decrypted'
    save_text(original, original_file_name)

    # test
    encrypt_text_file(key, original_file_name, encrypted_file_name)
    encrypted = load_text(encrypted_file_name)
    decrypt_text_file(key, encrypted_file_name, decrypted_file_name)
    decrypted = load_text(decrypted_file_name)
    _output_encrypt_decrypt_results(original, encrypted, decrypted)

    # cleanup
    delete_file(original_file_name)
    delete_file(encrypted_file_name)
    delete_file(decrypted_file_name)
Example #5
    def load_file(self, file_name, default_section_key=''):
        """Load a configuration file. Parse into raw text sections indexed by section key."""
        # reset parse status variables
        self.current_section = None
        self.current_section_key = None
        self.is_stopped = False
        self.file_name = file_name
        self.line_number = 0

        # provide file name context for debug output
        logger.info(f'ConfigSection.load_file({file_name})')

        # load default section if passed in as default_section_key, e.g. for ini files without sections
        if default_section_key:
            logger.debug(f'Using default section ({default_section_key})')
            self.current_section_key = self.section_key(default_section_key)
            self.sections[self.current_section_key] = ''

        lines = load_text(file_name, '').splitlines()
        for self.line_number, line in enumerate(lines, 1):
            # exit if we entered a stop condition
            if self.is_stopped:
                break

            # prep line for parsing
            line = compress_whitespace(line, preserve_indent=True)
            line = strip_c_style_comments(line)

            # skip comment lines, but pass blank lines through as data
            if self.is_comment(line) and line:
                continue

            # start a new section
            elif self.is_section(line):
                self.current_section_key = self.section_key(line)
                self.sections[self.current_section_key] = ''

            # add line to section's value
            elif self.current_section_key:
                # TODO: Expand {%expressions%}
                self.sections[self.current_section_key] += '\n' + line

            # non-blank lines outside of any section are flagged with a warning and ignored
            elif line:
                self.warning(
                    'Unexpected line outside of defined section; line ignored')

        # strip leading and trailing whitespace from values
        # Note: This does not affect indented values.
        for key in self.sections:
            self.sections[key] = self.sections[key].strip()
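
A usage sketch of the parser above (the ConfigSection() constructor signature, the file name, and the ini-style section syntax recognized by is_section() are assumptions):

config = ConfigSection()
config.load_file('settings.ini', default_section_key='default')

# each parsed section's body is stored as raw text in the sections dict
for section_key, body in config.sections.items():
    print(section_key, '->', repr(body))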
Example #6
    def load_file(self, file_name, default_section_key=''):
        """Load a configuration file. Parse into sections indexed by section_type[:section_name]."""

        # reset parse status variables
        self.current_section = None
        self.current_section_key = None
        self.is_stopped = False
        self.file_name = file_name
        self.line_number = 0

        # provide file name context for debug output
        logger.info(f'ConfigSectionKey.load_file({file_name})')

        # load default section if passed in as default_section_key, e.g. for ini files without sections
        if default_section_key:
            logger.info(f'Using default section ({default_section_key})')
            self.load_section(default_section_key)

        lines = load_text(file_name, '').splitlines()
        for self.line_number, line in enumerate(lines, 1):
            # exit if we entered a stop condition
            if self.is_stopped:
                break

            # prep line for parsing
            indentation = get_indentation(line)
            is_indented = len(indentation)
            line = compress_whitespace(line)
            line = strip_c_style_comments(line)

            if is_indented and self.current_section:
                if not self.current_key_name:
                    self.warning(
                        'No current key to append indented line to; line ignored'
                    )
                else:
                    current_value = getattr(self.current_section,
                                            self.current_key_name)
                    new_value = f'{current_value}\n{indentation}{line}'
                    setattr(self.current_section, self.current_key_name,
                            new_value)

            # process section definitions
            elif self.is_section(line):
                self.load_section(line)
                if not self.current_section:
                    self.warning(
                        f'Undefined section ({self.current_section_key})')

            # process comments and blank lines
            elif self.is_comment(line):
                # ignore comment lines and blank lines
                pass

            # process @commands
            elif self.is_command(line):
                self.do_command(line)

            # process key=value assignments
            else:
                key, value = key_value(line)

                # remember current key name
                self.current_key_name = key

                # split keys at '|' to handle keys acting as dicts or lists
                # key|key_id = value (treat key as a dict indexed by key_id)
                # key| = value (treat key as a list)
                key, delimiter, key_id = key.partition('|')

                if not self.current_section:
                    self.warning(
                        f'In undefined section ({self.current_section_key}); line ignored'
                    )
                elif not key:
                    self.warning(
                        f'Syntax error; no key-value assignment ({line})')

                # if section is validated and the key isn't present, then warn
                elif self.current_section.is_validated() and not hasattr(
                        self.current_section, key):
                    self.warning(
                        f'Unknown property ({key}) in section {self.current_section_key}'
                    )
                else:
                    # update the value
                    self.set_key_value(key, key_id, value)
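
The '|' key-splitting rule in the key=value branch above is easy to isolate; a standalone sketch of just that rule (the sample keys are hypothetical):

for raw_key in ('option', 'table|orders', 'include|'):
    key, delimiter, key_id = raw_key.partition('|')
    if not delimiter:
        kind = 'plain key'
    elif key_id:
        kind = f'dict entry indexed by {key_id!r}'
    else:
        kind = 'list append'
    print(f'{raw_key!r}: {kind}')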
Example #7
def stage_file(db_conn, archive_objectstore, object_key):
    """Stage a posted capture file from the archive object store into the target database."""

    # make sure work folder exists and is empty
    work_folder = 'stage_work'
    clear_folder(work_folder)
    if not os.path.exists(work_folder):
        os.mkdir(work_folder)

    # get the posted file
    source_file_name = f'{work_folder}/{just_file_name(object_key)}'
    logger.info(f'Getting {source_file_name} from archive::{object_key}')
    archive_objectstore.get(source_file_name, object_key)

    # create the file's namespace schema if missing
    namespace = object_key.split('/')[0]
    job_id = object_key
    db_conn.create_schema(namespace)

    # unzip the file
    # shutil.unpack_archive(source_file_name, extract_dir=work_folder)
    file_names = FileList(source_file_name)
    file_names.include('*')
    extract_archive(source_file_name, work_folder, file_names)

    # process all table files in our work folder
    for file_name in sorted(glob.glob(f'{work_folder}/*.table')):
        table_name = pathlib.Path(file_name).stem
        logger.info(f'Processing {table_name} ...')

        # TODO: rename files to use a _table, _schema suffix and .json file extension

        # always load table objects
        # input_stream = open(f'{work_folder}/{table_name}.table', 'rb')
        # table_object = pickle.load(input_stream)
        # input_stream.close()
        table_object = load_json(f'{work_folder}/{table_name}.table')

        # always load table schema
        # input_stream = open(f'{work_folder}/{table_name}.schema', 'rb')
        # table_schema = pickle.load(input_stream)
        # input_stream.close()
        table_schema = load_json(f'{work_folder}/{table_name}.schema')

        # always load table pk
        # input_stream = open(f'{work_folder}/{table_name}.pk')
        # table_pk = input_stream.read().strip()
        # input_stream.close()
        table_pk = load_text(f'{work_folder}/{table_name}.pk').strip()

        # extend table object with table name and column names from the table_schema object
        table_object.table_name = table_name
        table_object.column_names = [
            column_name for column_name in table_schema.columns
        ]

        # if drop_table, drop table and exit
        if table_object.drop_table:
            logger.info(f'Table drop request; table_drop=1')
            db_conn.drop_table(namespace, table_name)
            return

        # convert table schema to our target database and add extended column definitions
        extended_definitions = 'udp_jobid int, udp_timestamp datetime2'.split(
            ',')
        convert_to_mssql(table_schema, extended_definitions)

        # 2018-09-12 support custom staging table type overrides
        # [table].table_type = <blank> | standard, columnar, memory, columnar-memory

        # create target table if it doesn't exist
        if not db_conn.does_table_exist(namespace, table_name):
            # FUTURE: Add udp_pk, udp_nk, udp_nstk and other extended columns
            logger.info(f'Creating table: {namespace}.{table_name}')
            db_conn.create_table_from_table_schema(namespace, table_name,
                                                   table_schema,
                                                   extended_definitions)

        # handle cdc vs non-cdc table workflows differently
        logger.debug(
            f'{table_name}.cdc={table_object.cdc}, timestamp={table_object.timestamp}'
        )
        if not table_object.cdc or table_object.cdc.lower() == 'none' or not table_pk:
            # if table cdc=none, drop the target table
            logger.info(f'Table cdc=[{table_object.cdc}]; rebuilding table')
            db_conn.drop_table(namespace, table_name)

            # no cdc in effect for this table - insert directly to target table
            work_folder_obj = pathlib.Path(work_folder)
            batch_number = 0
            for json_file in sorted(
                    work_folder_obj.glob(f'{table_name}#*.json')):
                # load rows from json file
                # input_stream = open(json_file)
                # rows = json.load(input_stream)
                # input_stream.close()
                rows = load_json(json_file)

                # insert/upsert/merge *.json into target tables
                if not rows:
                    logger.info(f'Table {table_name} has 0 rows; no updates')
                else:
                    batch_number += 1
                    logger.info(
                        f'Job {job_id}, batch {batch_number}, table {table_name}'
                    )

                    # convert date/datetime columns to date/datetime values
                    convert_data_types(rows, table_schema)

                    # db_conn.insert_many( namespace, table_name, rows )
                    db_conn.bulk_insert_into_table(namespace, table_name,
                                                   table_schema, rows)

        else:
            # table has cdc updates

            # create temp table to receive captured changes
            # FUTURE: Create a database wrapper function for creating 'portable' temp table names vs hard-coding '#'.
            temp_table_name = f'_{table_name}'
            db_conn.drop_table(namespace, temp_table_name)

            # print(f'namespace = {namespace}')
            # print(f'temp_table_name = {temp_table_name}')
            # print(f'table_object = {dir(table_object)}')
            # print(f'extended definitions = {extended_definitions}')

            db_conn.create_table_from_table_schema(namespace, temp_table_name,
                                                   table_schema,
                                                   extended_definitions)

            # insert captured updates into temp table
            work_folder_obj = pathlib.Path(work_folder)
            batch_number = 0
            for json_file in sorted(
                    work_folder_obj.glob(f'{table_name}#*.json')):
                # load rows from json file
                # input_stream = open(json_file)
                # rows = json.load(input_stream)
                # input_stream.close()
                rows = load_json(json_file)

                # insert/upsert/merge *.json into target tables
                if not rows:
                    logger.info(f'Table {table_name} has 0 rows; no updates')
                    break
                else:
                    batch_number += 1
                    logger.info(
                        f'Job {job_id}, batch {batch_number}, table {table_name}'
                    )

                    # convert date/datetime columns to date/datetime values
                    convert_data_types(rows, table_schema)

                    # db_conn.insert_many( namespace, table_name, rows )
                    db_conn.bulk_insert_into_table(namespace, temp_table_name,
                                                   table_schema, rows)
            else:
                # merge (upsert) temp table to target table
                merge_cdc = cdc_merge.MergeCDC(table_object,
                                               extended_definitions)
                sql_command = merge_cdc.merge(namespace, table_pk)

                # TODO: Capture SQL commands in a sql specific log.
                logger.debug(sql_command)
                db_conn.cursor.execute(sql_command)

            # drop temp table after merge
            db_conn.drop_table(namespace, temp_table_name)
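
The merge in the CDC branch above relies on Python's for/else: the else block runs only when the batch loop finishes without hitting break, i.e. no empty JSON file cut the load short. A standalone sketch of that control flow (the function and data here are illustrative only):

def process_batches(batches):
    for rows in batches:
        if not rows:
            print('empty batch; merge skipped')
            break
        print(f'loaded {len(rows)} rows into temp table')
    else:
        # runs only if the loop above never hit break
        print('all batches loaded; merging temp table into target')

process_batches([[1, 2], [3]])   # merge runs
process_batches([[1, 2], []])    # merge skipped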
Example #8
    def stage_file(self, archive_capture_file_name):
        """Stage an archived capture file from the blob store into the target database."""
        logger.info(
            f"Getting {archive_capture_file_name} from archive blob store")

        # make sure work folder exists and is empty
        clear_folder(self.work_folder)

        # connect to the archive blobstore
        resource = self.config(self.project.blobstore_archive)
        bs_archive = BlobStore()
        bs_archive.connect(resource)

        # extract dataset name and job id from archive capture file name
        dataset_name, _, job_id = just_file_stem(
            archive_capture_file_name).partition("#")

        # copy archive_capture_file_name to our local working folder
        capture_file_name = just_file_name(archive_capture_file_name)
        local_work_file_name = f"{self.work_folder}/{capture_file_name}"
        archive_capture_file_blob_name = f"{archive_capture_file_name}"
        bs_archive.get(local_work_file_name, archive_capture_file_blob_name)
        bs_archive.disconnect()

        # unzip the capture file we retrieved from archive
        with zipfile.ZipFile(local_work_file_name) as zf:
            zf.extractall(self.work_folder)

        # create the file's dataset_name schema if missing
        self.target_db_conn.create_schema(dataset_name)

        # process all table files in our work folder
        for file_name in sorted(glob.glob(f"{self.work_folder}/*.table")):
            table_name = just_file_stem(file_name)
            logger.info(f"Processing {table_name} ...")

            # always load table objects
            table_object = load_jsonpickle(
                f"{self.work_folder}/{table_name}.table")

            # skip table if no schema file exists
            schema_file_name = f"{self.work_folder}/{table_name}.schema"
            if not is_file(schema_file_name):
                logger.warning(
                    f"Table skipped ({table_name}); schema file not found")
                continue

            # always load table schema
            table_schema = load_jsonpickle(schema_file_name)

            # always load table pk
            table_pk = load_text(f"{self.work_folder}/{table_name}.pk").strip()

            # extend table object with table name and column names from the table_schema object
            table_object.table_name = table_name
            table_object.column_names = [
                column_name for column_name in table_schema.columns
            ]

            # if drop_table, drop table and exit
            if table_object.drop_table:
                logger.info(f"Table drop request; table_drop=1")
                self.target_db_conn.drop_table(dataset_name, table_name)
                return

            # convert table schema to our target database and add extended column definitions
            extended_definitions = "udp_jobid int, udp_timestamp datetime2".split(
                ",")
            convert_to_mssql(table_schema, extended_definitions)

            # Future: support custom staging table type overrides
            # [table].table_type = <blank> | standard, columnar, memory, columnar-memory

            # handle cdc vs non-cdc table workflows differently
            logger.debug(
                f"{table_name}.cdc={table_object.cdc}, timestamp={table_object.timestamp}"
            )
            if (not table_object.cdc or table_object.cdc.lower() == "none"
                    or not table_pk):
                # if table cdc=none, drop the target table
                logger.info(
                    f"Table cdc=[{table_object.cdc}]; rebuilding table")
                self.target_db_conn.drop_table(dataset_name, table_name)

                # then re-create target table with latest schema
                # FUTURE: Add udp_pk, udp_nk, udp_nstk and other extended columns
                logger.info(
                    f"Re-creating non-CDC table: {dataset_name}.{table_name}")
                self.target_db_conn.create_table_from_table_schema(
                    dataset_name, table_name, table_schema,
                    extended_definitions)

                # no cdc in effect for this table - insert directly to target table
                work_folder_obj = pathlib.Path(self.work_folder)
                batch_number = 0
                for json_file in sorted(
                        work_folder_obj.glob(f"{table_name}#*.json")):
                    # load rows from json file
                    # input_stream = open(json_file)
                    # rows = json.load(input_stream)
                    # input_stream.close()
                    rows = load_jsonpickle(json_file)

                    # insert/upsert/merge *.json into target tables
                    if not rows:
                        logger.info(
                            f"Table {table_name} has 0 rows; no updates")
                    else:
                        batch_number += 1
                        logger.info(
                            f"Job {job_id}, batch {batch_number}, table {table_name}"
                        )
                        self.progress_message(
                            f"loading {just_file_stem(capture_file_name)}({table_name}.{batch_number:04}) ..."
                        )

                        # convert date/datetime columns to date/datetime values
                        convert_data_types(rows, table_schema)
                        self.target_db_conn.bulk_insert_into_table(
                            dataset_name, table_name, table_schema, rows)

            else:
                # table has cdc updates

                # create target table if it doesn't exist
                if not self.target_db_conn.does_table_exist(
                        dataset_name, table_name):
                    # FUTURE: Add udp_pk, udp_nk, udp_nstk and other extended columns
                    logger.info(f"Creating table: {dataset_name}.{table_name}")
                    self.target_db_conn.create_table_from_table_schema(
                        dataset_name, table_name, table_schema,
                        extended_definitions)

                # create temp table to receive captured changes
                # FUTURE: Create a database wrapper function for creating 'portable' temp table names vs hard-coding '#'.
                temp_table_name = f"_{table_name}"
                self.target_db_conn.drop_table(dataset_name, temp_table_name)
                self.target_db_conn.create_table_from_table_schema(
                    dataset_name, temp_table_name, table_schema,
                    extended_definitions)

                # insert captured updates into temp table
                work_folder_obj = pathlib.Path(self.work_folder)
                batch_number = 0
                for json_file in sorted(
                        work_folder_obj.glob(f"{table_name}#*.json")):
                    # load rows from json file
                    # input_stream = open(json_file)
                    # rows = json.load(input_stream)
                    # input_stream.close()
                    rows = load_jsonpickle(json_file)

                    # insert/upsert/merge *.json into target tables
                    if not rows:
                        logger.info(
                            f"Table {table_name} has 0 rows; no updates")
                        break
                    else:
                        batch_number += 1
                        logger.info(
                            f"Job {job_id}, batch {batch_number}, table {table_name}"
                        )
                        self.progress_message(
                            f"loading {just_file_stem(capture_file_name)}({table_name}.{batch_number:04}) ..."
                        )

                        # convert date/datetime columns to date/datetime values
                        convert_data_types(rows, table_schema)
                        self.target_db_conn.bulk_insert_into_table(
                            dataset_name, temp_table_name, table_schema, rows)
                else:
                    # merge (upsert) temp table to target table
                    merge_cdc = cdc_merge.MergeCDC(table_object,
                                                   extended_definitions)
                    sql_command = merge_cdc.merge(dataset_name, table_pk)

                    # TODO: Capture SQL commands in a sql specific log.
                    logger.debug(sql_command)
                    self.target_db_conn.cursor.execute(sql_command)

                # drop temp table after merge
                self.target_db_conn.drop_table(dataset_name, temp_table_name)