Exemple #1
0
    def run(self, config) -> dict:
        """Apply an XSLT stylesheet to XML input and return the serialised result.

        Config attributes read:

        * ``input_param`` (default ``'data'``): name of the config item holding the source XML text.
        * ``xslt_param`` (default ``'xslt'``): name of the config item holding the stylesheet text.
        * ``output_param`` (default ``'data'``): key under which the result is returned.
        * ``pretty_output`` (default ``False``): passed as ``pretty_print`` when the result is
          serialised as markup.

        :param config: configuration object of the task.
        :return: single-item dict mapping the output parameter name to the transformation result.
        """
        xsl_namespaces = {'xsl': 'http://www.w3.org/1999/XSL/Transform'}

        # Resolve parameter names and options
        source_key = config['input_param',  'data']
        stylesheet_key = config['xslt_param',   'xslt']
        target_key = config['output_param', 'data']
        pretty = config['pretty_output', False]

        # Parse both documents from their UTF-8 encoded text
        source_tree = lxml.etree.fromstring(config[source_key].encode())
        stylesheet_tree = lxml.etree.fromstring(config[stylesheet_key].encode())

        # The stylesheet's <xsl:output method="..."/> decides how the result is serialised
        output_method = stylesheet_tree.xpath(
            'string(/xsl:stylesheet/xsl:output/@method)', namespaces=xsl_namespaces)
        text_mode = 'text' == output_method

        # Build the transformer and run it against the source document
        transformed = lxml.etree.XSLT(stylesheet_tree)(source_tree)

        # Serialise: plain text goes through str(), markup through tostring()
        if text_mode:
            serialised = str(transformed)
        else:
            serialised = lxml.etree.tostring(transformed, encoding=str, pretty_print=pretty)
            # An empty result tree may serialise to None; normalise to an empty string
            serialised = '' if serialised is None else serialised
        logger.log("Done, XSLT transformation result is {} chars".format(len(serialised)))

        return {target_key: serialised}
Exemple #2
0
    def run(self, config) -> dict:
        """Apply a sequence of search-and-replace rules to the input text.

        Config attributes read:

        * ``input_param`` (default ``'data'``): name of the config item holding the input text.
        * ``output_param`` (default ``'data'``): key under which the result is returned.
        * ``rules``: iterable of rule objects, applied in order. Each rule provides ``search``,
          ``replace``, an optional ``count`` (default ``0``) and an optional ``is_regex``
          (default ``False``).

        A ``count`` of zero or less means "replace every occurrence" for both plain and regex rules.

        :param config: configuration object of the task.
        :return: single-item dict mapping the output parameter name to the processed text.
        """
        # Get attributes from config
        input_param = config['input_param', 'data']
        output_param = config['output_param', 'data']
        data = config[input_param]
        rules = config['rules']
        size_in = len(data)

        # Apply the rules one after another; each one works on the previous rule's output
        for rule in rules:
            search = rule['search']
            replace = rule['replace']
            count = int(rule['count', 0])
            is_regex = bool(rule['is_regex', False])
            if is_regex:
                # re.sub() treats 0 as "no limit", and a negative value as "replace nothing", so
                # clamp to 0 to agree with the plain-text branch. count is passed by keyword
                # because the positional form is deprecated.
                data = re.sub(search, replace, data, count=max(count, 0))
            else:
                # str.replace() uses -1 for "no limit"
                data = data.replace(search, replace, count if count > 0 else -1)

        logger.log('Done. Input: {} bytes, output: {} bytes.'.format(
            size_in, len(data)))

        # Return the result
        return {output_param: data}
Exemple #3
0
    def run(self, config) -> dict:
        """Extract one capture group from every regex match in the input text.

        Config attributes read:

        * ``input_param`` (default ``'data'``): name of the config item holding the input text.
        * ``output_param`` (default ``'data'``): key under which the result is returned.
        * ``regex``: pattern to search for.
        * ``group_num``: number of the group to extract from each match.
        * ``unique`` (default ``False``): whether to drop repeated values, keeping the first
          occurrence of each and the original order.

        :param config: configuration object of the task.
        :return: single-item dict mapping the output parameter name to the extracted values, one
            per line, each terminated with a linebreak.
        """
        # Get attributes from config
        input_param = config['input_param', 'data']
        output_param = config['output_param', 'data']
        data = config[input_param]
        regex = config['regex']
        group = int(config['group_num'])
        unique = bool(config['unique', False])

        # Collect all matches
        matches = [match.group(group) for match in re.finditer(regex, data)]

        # Deduplicate matches if needed. dict preserves insertion order, which gives an
        # order-keeping dedup in linear time instead of a quadratic list scan
        if unique:
            matches = list(dict.fromkeys(matches))

        logger.log("Found {}{} match(es)".format(len(matches),
                                                 ' unique' if unique else ''))

        # Return the result
        return {output_param: '\n'.join(matches) + '\n'}
Exemple #4
0
    def run(self, config):
        """Invoke the configured handler once for every line of the input.

        Config attributes read:

        * ``input_param`` (default ``'data'``): name of the config item whose lines are iterated.
        * ``output_param`` (default ``'data'``): name of the handler parameter receiving the line.
        * ``passthrough_params`` (default ``None``): names of config items copied to the handler
          parameters as is.
        * ``chomp`` (default ``True``): whether to strip trailing ``\\r``/``\\n`` from each line.
        * ``skip_blank_lines`` (default ``True``): whether to skip empty lines.
        * ``handler``: handler passed to ``context.invoke_handler`` for each processed line.

        :param config: configuration object of the task.
        """
        # Fetch the config parameters
        source_key = config['input_param', 'data']
        line_key = config['output_param', 'data']
        passthrough = config['passthrough_params', None]
        strip_eol = bool(config['chomp', True])
        ignore_blank = bool(config['skip_blank_lines', True])

        # Handler parameters start off with the passed-through config items, if any
        handler_params = {} if passthrough is None else {name: config[name] for name in passthrough}

        processed = 0
        for text in config.lines(source_key):
            if strip_eol:
                text = text.rstrip('\r\n')

            # Blank lines are neither counted nor handed over when skipping is on
            if ignore_blank and not text:
                continue

            processed += 1
            logger.log('Processing line #{}'.format(processed))
            handler_params[line_key] = text
            context.invoke_handler(config['handler'], handler_params)
Exemple #5
0
def _check_logger_file(severity):
    """Log TEST_MSG with the given severity into a temporary file and verify it got there.

    :param severity: severity value passed through to ``logger.log``.
    """
    # mkstemp() returns an open descriptor; only the name is needed here
    fd, log_path = tempfile.mkstemp()
    print('Temporary file used: ' + log_path)
    try:
        os.close(fd)

        # Point the logger at the file, emit the message, then always detach the logger again
        logger.set_log_file(log_path)
        try:
            logger.log(TEST_MSG, severity)
        finally:
            logger.set_log_file(None)

        # The message must appear somewhere in what was written
        with open(log_path) as log_file:
            contents = log_file.read()
        print('Logger output: "' + contents + '"')
        assert TEST_MSG in contents

    finally:
        # Clean up the temporary file regardless of the outcome
        os.remove(log_path)
Exemple #6
0
    def run(self, config) -> dict:
        """Merge every N consecutive input lines into a single output line.

        Config attributes read:

        * ``input_param`` (default ``'data'``): name of the config item whose lines are read.
        * ``output_param`` (default ``'data'``): key under which the result is returned.
        * ``start_line`` (default ``1``): 1-based number of the first line to process.
        * ``num_lines_to_merge``: number of lines combined into one output line.
        * ``trim_lines`` (default ``False``): strip all surrounding whitespace instead of only the
          terminating linebreak.
        * ``skip_blank_lines`` (default ``False``): ignore lines that are empty after stripping.
        * ``delimiter`` (default ``''``): text inserted between merged lines.

        :param config: configuration object of the task.
        :return: single-item dict mapping the output parameter name to the merged text.
        """
        # Get attributes from config
        source_key = config['input_param', 'data']
        target_key = config['output_param', 'data']
        source_lines = config.lines(source_key)
        first_line = int(config['start_line', 1])
        group_size = int(config['num_lines_to_merge'])
        trim = bool(config['trim_lines', False])
        ignore_blank = bool(config['skip_blank_lines', False])
        glue = config['delimiter', '']

        buffer = io.StringIO()
        lines_in = 0
        lines_out = 0
        pending = 0       # Number of lines accumulated in the current group
        combined = ''     # Text of the current group so far
        for raw in source_lines:
            # Every source line is counted, including the ones skipped below
            lines_in += 1
            if lines_in < first_line:
                continue

            # Either strip all surrounding whitespace or just the linebreak
            if trim:
                text = raw.strip()
            else:
                text = raw.rstrip('\r\n')

            if ignore_blank and text == '':
                continue

            # Append to the current group, separating from the previous line if there is one
            combined = text if pending == 0 else combined + glue + text
            pending += 1

            # A complete group becomes one output line
            if pending == group_size:
                buffer.write(combined + '\n')
                lines_out += 1
                pending = 0
                combined = ''

        # An incomplete trailing group is still written out
        if pending > 0:
            buffer.write(combined + '\n')
            lines_out += 1

        logger.log('Done. Input: {} lines, output: {} lines.'.format(
            lines_in, lines_out))

        return {target_key: buffer.getvalue()}
Exemple #7
0
    def run(self, config) -> dict:
        """Sort the input lines and return them joined with linebreaks.

        Config attributes read:

        * ``input_param`` (default ``'data'``): name of the config item whose lines are sorted.
        * ``output_param`` (default ``'data'``): key under which the result is returned.
        * ``reverse`` (default ``False``): whether to sort in descending order.

        :param config: configuration object of the task.
        :return: single-item dict mapping the output parameter name to the sorted text.
        """
        source_key = config['input_param',  'data']
        target_key = config['output_param', 'data']

        # Fetch the lines first, then the ordering flag, then sort
        unsorted_lines = config.lines_list(source_key)
        descending = bool(config['reverse', False])
        ordered = sorted(unsorted_lines, reverse=descending)
        logger.log("Sorted {} lines".format(len(ordered)))

        # Every line, including the last one, is terminated with a linebreak
        text = '\n'.join(ordered) + '\n'
        return {target_key: text}
Exemple #8
0
    def run(self, config) -> dict:
        """Run a query against a database and return the result set as delimited text.

        Config attributes read:

        * ``output_param`` (default ``'data'``): key under which the result is returned.
        * ``database``: database connection name; config values are substituted into it with
          ``str.format``.
        * ``field_delimiter`` (default ``''``): text placed between fields of a record.
        * ``record_delimiter`` (default ``'\\n'``): text placed after every record.
        * ``col_headers`` (default ``False``): whether to emit the column names as the first record.
        * ``quotechar`` (default ``None``): quote put around every value. It is looked up in
          ``self.QUOTE_OPEN`` / ``self.QUOTE_CLOSE`` to get the opening and closing strings,
          and used as is when it is not found there. ``None`` means no quoting.
        * ``sql``: the query text.
        * ``params`` (default ``None``): iterable of items with ``name`` and ``value``; values get
          config values substituted with ``str.format`` and are passed as query parameters.

        :param config: configuration object of the task.
        :return: single-item dict mapping the output parameter name to the query output.
        """
        # Get attributes from config
        output_param = config['output_param', 'data']
        db_name = config['database']
        fld_delimiter = config['field_delimiter', '']
        rec_delimiter = config['record_delimiter', '\n']
        col_headers = config['col_headers', False]
        quotechar = config['quotechar', None]
        sql = config['sql']
        params = config['params', None]

        # Substitute params in the DB connection and find it
        db_name = db_name.format(**config)
        db_conn = context.get_db_connection(db_name)

        # Prepare quote strings
        qopen = self.QUOTE_OPEN[
            quotechar] if quotechar in self.QUOTE_OPEN else quotechar
        qclose = self.QUOTE_CLOSE[
            quotechar] if quotechar in self.QUOTE_CLOSE else quotechar
        # No quote character configured: use empty strings so that the concatenation below works
        # (None + str would raise a TypeError)
        if qopen is None:
            qopen = ''
        if qclose is None:
            qclose = ''

        # Prepare parameters, if needed
        db_params = {}
        if params is not None:
            for param in params:
                p_name = param['name']
                p_value = param['value']
                db_params[p_name] = p_value.format(**config)

        # Execute the query
        cur = db_conn.cursor()
        records = []
        row_count = 0
        try:
            cur.execute(sql, db_params)

            # Output column headers, if needed
            if col_headers:
                records.append(
                    fld_delimiter.join(
                        qopen + d[0] + qclose for d in cur.description))

            # Read the returned data
            for record in cur:
                records.append(
                    fld_delimiter.join(
                        qopen + str(v) + qclose for v in record))
                row_count += 1
        finally:
            cur.close()

        # Report data rows only; the header line (if any) is not a row read from the database
        logger.log('Done. Read {} rows from {}'.format(row_count, db_name))
        return {output_param: rec_delimiter.join(records) + rec_delimiter}
Exemple #9
0
    def run(self, config):
        """Execute one or more SQL statements against a database.

        Config attributes read:

        * ``database``: database connection name; config values are substituted into it with
          ``str.format``.
        * ``sql``: a single statement or a list of statements.
        * ``params`` (default ``None``): iterable of items with ``name`` and ``value``; values get
          config values substituted with ``str.format`` and are passed to every statement.
        * ``commit_stmt`` (default ``'each'``): ``'none'`` (never commit), ``'each'`` (commit after
          every statement) or ``'all'`` (run all statements in one transaction, commit at the end).

        Nothing is executed on the database in dry-run mode.

        :param config: configuration object of the task.
        :raises errors.ConfigError: if ``commit_stmt`` has an unsupported value.
        """
        # Get attributes from config
        db_name     = config['database']
        sql         = config['sql']
        params      = config['params', None]
        commit_stmt = config['commit_stmt', 'each']

        if commit_stmt == 'none':
            commit_policy = self.COMMIT_NONE
        elif commit_stmt == 'each':
            commit_policy = self.COMMIT_EACH
        elif commit_stmt == 'all':
            commit_policy = self.COMMIT_ALL
        else:
            raise errors.ConfigError('Invalid value for commit_stmt: "{}"'.format(commit_stmt))

        # Substitute params in the DB connection and find it
        db_name = db_name.format(**config)
        db_conn = context.get_db_connection(db_name)

        # Prepare statement(s). If a single statement is given, transform it into a single-element list
        if not isinstance(sql, list):
            sql = [sql]

        # Prepare parameters, if needed. This is done before any transaction is started, so that a
        # failing substitution cannot leave a transaction open
        db_params = {}
        if params is not None:
            for param in params:
                p_name  = param['name']
                p_value = param['value']
                db_params[p_name] = p_value.format(**config)

        if not context.dry_run_mode:
            commit_all = commit_policy == self.COMMIT_ALL

            # Initiate a transaction, if all statements are committed at once
            if commit_all:
                db_conn.begin()

            # Execute the statement(s)
            for stmt in sql:
                db_conn.execute(stmt, db_params)
                # Commit if needed
                if commit_policy == self.COMMIT_EACH:
                    db_conn.commit()

            # Execute a commit, if all statements are committed at once
            if commit_all:
                db_conn.commit()

        logger.log(context.dry_run_prefix + 'Done. {} statement(s) executed on {}'.format(len(sql), db_name))
Exemple #10
0
    def fetch(url: str, required: bool, verify_cert: bool,
              detect_compressed: bool, username: str, pwd: str,
              encoding: str) -> str:
        """Fetch a file at the specified URL from the server, decompress if necessary and return its contents.
        :param url: URL to fetch.
        :param required: Whether to raise an exception on HTTP 404 error.
        :param verify_cert: Whether to enforce SSL certificate check.
        :param detect_compressed: Whether to detect and decompress compressed files.
        :param username: Username to use for authentication. If None, authentication is not used.
        :param pwd: Password to use for authentication. Ignored if username is None.
        :param encoding: Encoding to use when decoding the HTTP data.
        :return: File contents as text, or None if the download returned no data.
        """
        data = utils.http_fetch(url, required, verify_cert, username, pwd)
        if data is None:
            logger.log('File {} doesn\'t exist'.format(url))
            return None

        size_in = len(data)
        logger.log('Downloaded {} ({} bytes)'.format(url, size_in))

        # Check for the gzip magic bytes (0x1F 0x8B) if needed, and decompress if they are there
        if detect_compressed and size_in > 10 and data[0] == 0x1F and data[1] == 0x8B:
            with GzipFile(fileobj=BytesIO(data)) as decompressed_file:
                data = decompressed_file.read()
            logger.log(
                'Decompressed the file, raw size is {} bytes'.format(
                    len(data)))

        # Convert the binary data into text
        return data.decode(encoding)
Exemple #11
0
    def run(self, config) -> dict:
        """Filter input lines by a set of criteria, optionally collecting the rejected ones.

        Config attributes read:

        * ``input_param`` (default ``'data'``): name of the config item whose lines are filtered.
        * ``output_param`` (default ``'data'``): key under which the kept lines are returned.
        * ``rejected_param`` (default ``None``): key under which the rejected lines are returned;
          rejected lines are not collected when it is ``None``.
        * ``start_line`` (default ``1``): 1-based number of the first line to process.
        * ``skip_blank_lines`` (default ``False``): whether to drop empty lines altogether.
        * ``criteria`` (default ``None``): a single criterion or a list of them. Each provides
          ``search`` and the optional flags ``is_regex``, ``substitute_params`` and ``negate``
          (all ``False`` by default). A line is kept only if it satisfies every criterion.

        :param config: configuration object of the task.
        :return: dict with the kept lines under the output parameter name and, if requested, the
            rejected lines under the rejected parameter name.
        """
        # Get attributes from config
        input_param      = config['input_param',    'data']
        output_param     = config['output_param',   'data']
        rejected_param   = config['rejected_param', None]
        start_line       = int(config['start_line', 1])
        skip_blank_lines = bool(config['skip_blank_lines', False])
        criteria         = config['criteria', None]

        # Prepare the criteria once, as they don't depend on the line being tested. Each one
        # becomes a (needle, is_regex, negate) tuple, where needle is either a compiled pattern
        # or a plain substring
        matchers = []
        if criteria is not None:
            # Convert a single criterion to a one-item list
            if not isinstance(criteria, list):
                criteria = [criteria]
            for criterion in criteria:
                cr_search            = criterion['search']
                cr_is_regex          = bool(criterion['is_regex',          False])
                cr_substitute_params = bool(criterion['substitute_params', False])
                cr_negate            = bool(criterion['negate',            False])
                # Substitute params if needed
                if cr_substitute_params:
                    cr_search = cr_search.format(**config)
                needle = re.compile(cr_search) if cr_is_regex else cr_search
                matchers.append((needle, cr_is_regex, cr_negate))

        # Process the input lines
        count_src_lines = 0
        tgt_lines = []
        rej_lines = []
        for line in config.lines_list(input_param):
            # Skip up to start_line
            count_src_lines += 1
            if count_src_lines < start_line:
                continue

            # Skip blank lines
            if skip_blank_lines and not line:
                continue

            # Process criteria, if any; the first failing one rejects the line
            do_include = True
            for needle, is_regex, negate in matchers:
                match = needle.search(line) is not None if is_regex else needle in line
                # A criterion fails when the match result equals the negate flag
                if negate == match:
                    do_include = False
                    break

            # Add an output line, if needed
            if do_include:
                tgt_lines.append(line)

            # Otherwise accumulate rejected line, if needed
            elif rejected_param is not None:
                rej_lines.append(line)

        # Log the stats and prepare the result
        result = {output_param: '\n'.join(tgt_lines) + '\n'}
        if rejected_param is None:
            logger.log("Read {} input lines, kept {} lines".format(count_src_lines, len(tgt_lines)))
        else:
            logger.log(
                "Read {} input lines, kept {}, rejected {} lines".format(
                    count_src_lines, len(tgt_lines), len(rej_lines)))
            result[rejected_param] = '\n'.join(rej_lines) + '\n'

        # Return the result
        return result
Exemple #12
0
    def run(self, config):
        """Load fixed-width or delimited text data into a database table.

        Config attributes read:

        * ``input_param`` (default ``'data'``): name of the config item whose lines are loaded.
        * ``format``: either ``'fixed'`` or ``'delimited'``.
        * ``delimiter``, ``quotechar`` (delimited format only; ``quotechar`` defaults to ``None``):
          passed to ``csv.reader``.
        * ``start_line`` (default ``1``): 1-based number of the first input row to load.
        * ``target_database``: database connection name; config values are substituted into it
          with ``str.format``.
        * ``target_table``: name of the table to load.
        * ``truncate_target`` (default ``False``): whether to truncate the table first.
        * ``column_mappings``: iterable of column mappings. A mapping having ``source_pos``
          (fixed format, ``"left:right"``) or ``source_index`` (delimited format) takes its value
          from the input row and also provides ``name``, ``datatype`` and optionally
          ``source_trim`` (``'none'``, ``'left'``, ``'right'`` or ``'both'``); any other mapping
          must provide ``target_expr``.

        Source values are trimmed as configured, empty values become ``None``, and the rest is
        validated/converted according to ``datatype``: ``'string'`` (checked against ``length``,
        optionally truncated if ``truncate`` is set), ``'integer'`` (``int``) or ``'number'``
        (``float``). In dry-run mode the data is read and validated, but nothing is changed in
        the database.

        :param config: configuration object of the task.
        :raises errors.ConfigError: on an invalid ``format``, position specifier or trim value.
        :raises errors.DataError: on any other error while converting an input row; the message
            includes the input line number.
        """
        # Get attributes from config
        input_param = config['input_param', 'data']
        data = config.lines(input_param)
        delimiter = None
        quotechar = None

        # Get and validate data format
        data_format = config['format']
        fmt_fixed = data_format == 'fixed'
        fmt_delimited = data_format == 'delimited'
        if not fmt_fixed and not fmt_delimited:
            raise errors.ConfigError(
                'Invalid format value: "{}".'.format(data_format))
        if fmt_delimited:
            delimiter = config['delimiter']
            quotechar = config['quotechar', None]

        # Fetch other config attributes
        start_line = int(config['start_line', 1])
        target_database = config['target_database']
        target_table = config['target_table']
        truncate_target = bool(config['truncate_target', False])
        column_mappings = config['column_mappings']

        # Substitute params in the DB connection
        target_database = target_database.format(**config)
        logger.log(
            context.dry_run_prefix +
            'Loading data to {}@{}'.format(target_table, target_database))

        # Find the DB connection
        db_conn = context.get_db_connection(target_database)

        # Truncate the target table, if required
        # NOTE(review): the table name is formatted straight into the SQL text, so it must come
        # from a trusted configuration
        if truncate_target:
            if not context.dry_run_mode:
                db_conn.execute(
                    'truncate table {} drop storage'.format(target_table))
            logger.log(context.dry_run_prefix +
                       'Table {}@{} is truncated'.format(
                           target_table, target_database))

        # Create an inserter (the object rows are pushed to below)
        inserter = self.get_inserter(db_conn, target_table, column_mappings,
                                     fmt_fixed, fmt_delimited, config)

        # Fixed-width file: use the input data as is
        if fmt_fixed:
            input_data = data

        # Delimited file: open the input data as a CSV file
        else:
            # quotechar is only passed when configured, so csv.reader's own default applies otherwise
            args = {'delimiter': delimiter, 'strict': True}
            if quotechar is not None:
                args['quotechar'] = quotechar
            input_data = csv.reader(data, **args)

        # Iterate through the data
        count_src_lines = 0
        count_tgt_rows = 0
        for src_row in input_data:
            # Skip up to start_line
            count_src_lines += 1
            if count_src_lines < start_line:
                continue

            # Fetch data values
            try:
                target_row = []
                for cm in column_mappings:
                    # If a source value is used
                    if (fmt_fixed and 'source_pos'
                            in cm) or (fmt_delimited and 'source_index' in cm):
                        col_name = cm['name']
                        datatype = cm['datatype']
                        trim = cm['source_trim', 'none']

                        # Fixed-width
                        # NOTE(review): the position specifier is parsed and validated again for
                        # every input row, although it only depends on the configuration
                        if fmt_fixed:
                            # Split "left:right" into ('left', ':', 'right')
                            pos = cm['source_pos'].partition(':')
                            # Validate the position format
                            if pos[0] == '' or pos[2] == '':
                                raise errors.ConfigError(
                                    'Invalid position specifier "{}" for column "{}"'
                                    .format(cm['source_pos'], col_name))
                            # Validate left boundary (converted from 1-based to a 0-based slice start)
                            try:
                                pos_l = int(pos[0]) - 1
                            except ValueError as e:
                                raise errors.ConfigError(
                                    'Left boundary specification for column "{}": {}'
                                    .format(col_name, str(e)))
                            if pos_l < 0:
                                raise errors.ConfigError(
                                    'Left boundary must be positive (column "{}")'
                                    .format(col_name))
                            # Validate right boundary (used as an exclusive slice end, i.e. an
                            # inclusive 1-based position)
                            try:
                                pos_r = int(pos[2])
                            except ValueError as e:
                                raise errors.ConfigError(
                                    'Right boundary specification for column "{}": {}'
                                    .format(col_name, str(e)))
                            if pos_r <= pos_l:
                                raise errors.ConfigError(
                                    'Right boundary must be greater than or equal to the left one (column "{}")'
                                    .format(col_name))
                            # Chomp
                            # NOTE(review): repeated for every source column of the row; harmless,
                            # as stripping an already stripped line changes nothing
                            src_row = src_row.rstrip('\r\n')
                            # Extract column value
                            value = src_row[pos_l:pos_r]

                        # Delimited
                        else:
                            value = src_row[int(cm['source_index'])]

                        # Apply trimming
                        if trim == 'none':
                            pass
                        elif trim == 'left':
                            value = value.lstrip()
                        elif trim == 'right':
                            value = value.rstrip()
                        elif trim == 'both':
                            value = value.strip()
                        else:
                            raise errors.ConfigError(
                                'Invalid trim value for column "{}": "{}"'.
                                format(col_name, trim))

                        # All types: handle null values (an empty string after trimming is a null)
                        val_len = len(value)
                        if val_len == 0:
                            value = None
                        # String: validate value length
                        elif datatype == 'string':
                            max_len = int(cm['length'])
                            # If the length exceeds the allowed size
                            if val_len > max_len:
                                # No truncation - raise an error
                                if not bool(cm['truncate', False]):
                                    raise errors.DataError(
                                        'String column "{}": value length ({}) exceeds allowed maximum ({})'
                                        .format(col_name, val_len, max_len))
                                # Otherwise truncate the value
                                value = value[:max_len]
                        # Integer: convert to actual int
                        elif datatype == 'integer':
                            try:
                                value = int(value)
                            except ValueError as e:
                                raise errors.DataError(
                                    'Integer column "{}": {}'.format(
                                        col_name, str(e)))
                        # Number: convert to actual number
                        elif datatype == 'number':
                            try:
                                value = float(value)
                            except ValueError as e:
                                raise errors.DataError(
                                    'Float value error for column "{}": {}'.
                                    format(col_name, str(e)))
                        # Values of any other datatype are passed on as (trimmed) text

                        # Add the value to the target row
                        target_row.append(value)

                    # Otherwise we're using a target expression
                    else:
                        target_expr = cm['target_expr']

                        # If it uses row number value, add it to the row
                        # (expressions without {rownum} contribute no value to the row)
                        if '{rownum}' in target_expr:
                            target_row.append(count_tgt_rows + 1)

            # Reraise ConfigErrors as is
            except errors.ConfigError:
                raise

            # Assume all other errors are coming from the data
            # NOTE(review): this also catches the DataErrors raised above (presumably they derive
            # from Exception - confirm) and wraps them with the input line number
            except Exception as e:
                raise errors.DataError(
                    'Input data error at line {}: {}'.format(
                        count_src_lines, str(e)))

            # Push the row to the target table (unless we're in dry-run mode)
            if not context.dry_run_mode:
                inserter.push_row(target_row)
            count_tgt_rows += 1

        # Finalise
        if not context.dry_run_mode:
            inserter.flush()
            db_conn.commit()
        # NOTE(review): the other log calls in this method use logger.log(); confirm that the
        # logger module also provides info()
        logger.info(context.dry_run_prefix +
                    'Loading {}@{} finished, read {} rows, inserted {} rows.'.
                    format(target_table, target_database, count_src_lines,
                           count_tgt_rows))