def run(self, config) -> dict:
    """Apply an XSLT stylesheet to an XML document and return the result as text.

    Overrides the abstract method of the base class.

    Config attributes read:

    * ``input_param`` (default ``'data'``): name of the config item holding
      the source XML text.
    * ``xslt_param`` (default ``'xslt'``): name of the config item holding
      the stylesheet text.
    * ``output_param`` (default ``'data'``): key under which the result is
      returned.
    * ``pretty_output`` (default ``False``): passed as ``pretty_print`` to
      the serialiser when the output is not plain text.

    :param config: Configuration object supporting ``config[name, default]``
        lookups.
    :return: Single-item dict mapping the output parameter name to the
        transformation result.
    """
    # Get attributes from config
    input_param = config['input_param', 'data']
    xslt_param = config['xslt_param', 'xslt']
    output_param = config['output_param', 'data']
    pretty_output = config['pretty_output', False]

    # Read and parse the input and XSLT data
    data_dom = lxml.etree.fromstring(config[input_param].encode())
    xslt_dom = lxml.etree.fromstring(config[xslt_param].encode())

    # Are we producing plain text output? XSLT allows xsl:transform as a
    # synonym for the xsl:stylesheet root element, so accept either one.
    output_method = xslt_dom.xpath(
        'string(/*[self::xsl:stylesheet or self::xsl:transform]'
        '/xsl:output/@method)',
        namespaces={'xsl': 'http://www.w3.org/1999/XSL/Transform'})
    as_plain_text = output_method == 'text'

    # Construct and apply the transformation
    transform = lxml.etree.XSLT(xslt_dom)
    result_dom = transform(data_dom)

    # Convert the transformation result into string
    if as_plain_text:
        result = str(result_dom)
    else:
        result = lxml.etree.tostring(
            result_dom, encoding=str, pretty_print=pretty_output)

    # Guard against a serialiser that yields nothing
    if result is None:
        result = ''
    logger.log("Done, XSLT transformation result is {} chars".format(len(result)))

    # Return the result
    return {output_param: result}
def run(self, config) -> dict:
    """Apply a sequence of search/replace rules to the input text.

    Overrides the abstract method of the base class.

    Config attributes read:

    * ``input_param`` (default ``'data'``): name of the config item holding
      the text to process.
    * ``output_param`` (default ``'data'``): key under which the result is
      returned.
    * ``rules``: iterable of rules, applied in order. Each rule provides
      ``search``, ``replace``, optional ``count`` (default ``0``) and
      optional ``is_regex`` (default ``False``).

    A ``count`` of zero or less means "replace every occurrence" for both
    plain and regex rules.

    :param config: Configuration object supporting ``config[name, default]``
        lookups.
    :return: Single-item dict mapping the output parameter name to the
        processed text.
    """
    # Get attributes from config
    input_param = config['input_param', 'data']
    output_param = config['output_param', 'data']
    data = config[input_param]
    rules = config['rules']
    size_in = len(data)

    # Iterate through rules
    for rule in rules:
        search = rule['search']
        replace = rule['replace']
        count = int(rule['count', 0])
        is_regex = bool(rule['is_regex', False])
        if is_regex:
            # re.sub() uses 0 for "no limit" and requires a non-negative
            # count; pass it by keyword, as the positional form is deprecated.
            data = re.sub(search, replace, data, count=max(count, 0))
        else:
            # str.replace() uses -1 for "no limit"
            data = data.replace(search, replace, count if count > 0 else -1)

    logger.log('Done. Input: {} bytes, output: {} bytes.'.format(
        size_in, len(data)))

    # Return the result
    return {output_param: data}
def run(self, config) -> dict:
    """Extract one regex group from every match found in the input text.

    Overrides the abstract method of the base class.

    Config attributes read:

    * ``input_param`` (default ``'data'``): name of the config item holding
      the text to search.
    * ``output_param`` (default ``'data'``): key under which the result is
      returned.
    * ``regex``: pattern to search for.
    * ``group_num``: number of the group to extract from each match.
    * ``unique`` (default ``False``): drop repeated values, keeping the
      first occurrence of each.

    :param config: Configuration object supporting ``config[name, default]``
        lookups.
    :return: Single-item dict mapping the output parameter name to the
        extracted values, one per line, each terminated by a linebreak.
    """
    # Get attributes from config
    input_param = config['input_param', 'data']
    output_param = config['output_param', 'data']
    data = config[input_param]
    regex = config['regex']
    group = int(config['group_num'])
    unique = bool(config['unique', False])

    # Collect all matches
    matches = [match.group(group) for match in re.finditer(regex, data)]

    # Deduplicate matches if needed. A dict keeps insertion order, so this
    # preserves first occurrences without a quadratic membership scan.
    if unique:
        matches = list(dict.fromkeys(matches))

    logger.log("Found {}{} match(es)".format(len(matches), ' unique' if unique else ''))

    # Return the result
    return {output_param: '\n'.join(matches) + '\n'}
def run(self, config):
    """Invoke the configured handler once for every input line.

    Overrides the abstract method of the base class.

    Config attributes read:

    * ``input_param`` (default ``'data'``): name of the config item whose
      lines are iterated.
    * ``output_param`` (default ``'data'``): name under which each line is
      handed to the handler.
    * ``passthrough_params`` (default ``None``): names of config items that
      are copied into the handler's parameters.
    * ``chomp`` (default ``True``): strip the trailing linebreak of a line.
    * ``skip_blank_lines`` (default ``True``): do not invoke the handler for
      empty lines.
    * ``handler``: handler passed to ``context.invoke_handler``.

    :param config: Configuration object supporting ``config[name, default]``
        lookups and ``config.lines(name)``.
    """
    # Read the settings
    source_key = config['input_param', 'data']
    target_key = config['output_param', 'data']
    passthrough = config['passthrough_params', None]
    strip_eol = bool(config['chomp', True])
    drop_blank = bool(config['skip_blank_lines', True])

    # Parameters handed to the handler; the current line is added per call
    handler_args = (
        {} if passthrough is None
        else {name: config[name] for name in passthrough})

    # Walk the input, numbering only the lines actually processed
    processed = 0
    for raw in config.lines(source_key):
        text = raw.rstrip('\r\n') if strip_eol else raw
        if drop_blank and not text:
            continue
        processed += 1
        logger.log('Processing line #{}'.format(processed))
        handler_args[target_key] = text
        context.invoke_handler(config['handler'], handler_args)
def _check_logger_file(severity):
    """Log the test message to a temporary file and verify it landed there.

    :param severity: Severity passed to ``logger.log`` with the message.
    :raises AssertionError: If the file does not contain ``TEST_MSG``.
    """
    # Reserve a temporary file; only its name is needed
    fd, log_path = tempfile.mkstemp()
    print('Temporary file used: ' + log_path)
    try:
        # Release our own handle straight away
        os.close(fd)

        # Point the logger at the file, emit the message, then detach the file
        logger.set_log_file(log_path)
        try:
            logger.log(TEST_MSG, severity)
        finally:
            logger.set_log_file(None)

        # The message must appear somewhere in what was written
        with open(log_path) as log_file:
            logged_text = log_file.read()
        print('Logger output: "' + logged_text + '"')
        assert TEST_MSG in logged_text
    finally:
        # Clean up regardless of the outcome
        os.remove(log_path)
def run(self, config) -> dict:
    """Join every N consecutive input lines into a single output line.

    Overrides the abstract method of the base class.

    Config attributes read:

    * ``input_param`` (default ``'data'``): name of the config item whose
      lines are merged.
    * ``output_param`` (default ``'data'``): key under which the result is
      returned.
    * ``start_line`` (default ``1``): 1-based number of the first line to
      use; earlier lines are dropped.
    * ``num_lines_to_merge``: number of lines joined into one output line.
    * ``trim_lines`` (default ``False``): strip all surrounding whitespace
      instead of just the trailing linebreak.
    * ``skip_blank_lines`` (default ``False``): ignore lines that are empty
      after stripping.
    * ``delimiter`` (default ``''``): text placed between joined lines.

    :param config: Configuration object supporting ``config[name, default]``
        lookups and ``config.lines(name)``.
    :return: Single-item dict mapping the output parameter name to the
        merged text.
    """
    # Read the settings
    input_param = config['input_param', 'data']
    output_param = config['output_param', 'data']
    src_data = config.lines(input_param)
    start_line = int(config['start_line', 1])
    merge_cnt = int(config['num_lines_to_merge'])
    trim_lines = bool(config['trim_lines', False])
    skip_blank_lines = bool(config['skip_blank_lines', False])
    delimiter = config['delimiter', '']

    buffer = io.StringIO()
    total_in = 0      # input lines seen, including skipped ones
    total_out = 0     # output lines written
    in_group = 0      # lines collected into the current group
    group_text = ''   # text of the current group so far

    for total_in, raw in enumerate(src_data, 1):
        # Ignore everything before the first requested line
        if total_in < start_line:
            continue

        # Either trim all whitespace or just drop the linebreak
        text = raw.strip() if trim_lines else raw.rstrip('\r\n')
        if skip_blank_lines and text == '':
            continue

        # Append to the current group, delimiting all but the first item
        group_text = text if in_group == 0 else group_text + delimiter + text
        in_group += 1

        # Emit the group once it is complete
        if in_group == merge_cnt:
            buffer.write(group_text + '\n')
            total_out += 1
            in_group, group_text = 0, ''

    # Emit a trailing incomplete group, if any
    if in_group > 0:
        buffer.write(group_text + '\n')
        total_out += 1

    logger.log('Done. Input: {} lines, output: {} lines.'.format(
        total_in, total_out))

    return {output_param: buffer.getvalue()}
def run(self, config) -> dict:
    """Sort the input lines.

    Overrides the abstract method of the base class.

    Config attributes read:

    * ``input_param`` (default ``'data'``): name of the config item whose
      lines are sorted.
    * ``output_param`` (default ``'data'``): key under which the result is
      returned.
    * ``reverse`` (default ``False``): sort in descending order.

    :param config: Configuration object supporting ``config[name, default]``
        lookups and ``config.lines_list(name)``.
    :return: Single-item dict mapping the output parameter name to the
        sorted lines, each terminated by a linebreak.
    """
    source_key = config['input_param', 'data']
    target_key = config['output_param', 'data']

    # Copy the lines, then order the copy in place
    ordered = list(config.lines_list(source_key))
    ordered.sort(reverse=bool(config['reverse', False]))
    logger.log("Sorted {} lines".format(len(ordered)))

    return {target_key: '\n'.join(ordered) + '\n'}
def run(self, config):
    """Run a query and return its result set as delimited text.

    Overrides the abstract method of the base class.

    Config attributes read:

    * ``output_param`` (default ``'data'``): key under which the result is
      returned.
    * ``database``: connection name; ``str.format`` placeholders in it are
      filled from the config.
    * ``field_delimiter`` (default ``''``): text placed between fields.
    * ``record_delimiter`` (default ``'\\n'``): text placed after each
      record.
    * ``col_headers`` (default ``False``): emit the column names as the
      first record.
    * ``quotechar`` (default ``None``): quote put around every field. It is
      looked up in ``self.QUOTE_OPEN`` / ``self.QUOTE_CLOSE`` and used as is
      when not found there. No quoting is applied when it is ``None``.
    * ``sql``: statement to execute.
    * ``params`` (default ``None``): iterable of ``name``/``value`` items;
      each value has its ``str.format`` placeholders filled from the config
      and is passed to the cursor as a bind parameter.

    :param config: Configuration object supporting ``config[name, default]``
        lookups and ``**`` unpacking.
    :return: Single-item dict mapping the output parameter name to the
        rendered records.
    """
    # Get attributes from config
    output_param = config['output_param', 'data']
    db_name = config['database']
    fld_delimiter = config['field_delimiter', '']
    rec_delimiter = config['record_delimiter', '\n']
    col_headers = config['col_headers', False]
    quotechar = config['quotechar', None]
    sql = config['sql']
    params = config['params', None]

    # Substitute params in the DB connection and find it
    db_name = db_name.format(**config)
    db_conn = context.get_db_connection(db_name)

    # Prepare quote strings
    qopen = self.QUOTE_OPEN[
        quotechar] if quotechar in self.QUOTE_OPEN else quotechar
    qclose = self.QUOTE_CLOSE[
        quotechar] if quotechar in self.QUOTE_CLOSE else quotechar
    # An absent quotechar resolves to None, which cannot be concatenated
    # with the field text below; treat it as "no quoting".
    if qopen is None:
        qopen = ''
    if qclose is None:
        qclose = ''

    # Prepare parameters, if needed
    db_params = {}
    if params is not None:
        for param in params:
            p_name = param['name']
            p_value = param['value']
            db_params[p_name] = p_value.format(**config)

    # Execute the query
    cur = db_conn.cursor()
    records = []
    try:
        cur.execute(sql, db_params)

        # Output column headers, if needed
        if col_headers:
            records.append(
                fld_delimiter.join(
                    [qopen + d[0] + qclose for d in cur.description]))

        # Read the returned data
        for record in cur:
            records.append(
                fld_delimiter.join(
                    [qopen + str(v) + qclose for v in record]))
    finally:
        # Release the cursor even when the query or the rendering fails
        cur.close()

    logger.log('Done. Read {} rows from {}'.format(len(records), db_name))
    return {output_param: rec_delimiter.join(records) + rec_delimiter}
def run(self, config):
    """Execute one or more SQL statements on a database connection.

    Overrides the abstract method of the base class.

    Config attributes read:

    * ``database``: connection name; ``str.format`` placeholders in it are
      filled from the config.
    * ``sql``: a single statement or a list of statements.
    * ``params`` (default ``None``): iterable of ``name``/``value`` items;
      each value has its ``str.format`` placeholders filled from the config
      and is passed with every statement as a bind parameter.
    * ``commit_stmt`` (default ``'each'``): ``'none'`` to never commit,
      ``'each'`` to commit after every statement, ``'all'`` to run all
      statements in one transaction committed at the end.

    Nothing is executed or committed in dry-run mode.

    :param config: Configuration object supporting ``config[name, default]``
        lookups and ``**`` unpacking.
    :raises errors.ConfigError: If ``commit_stmt`` has an unknown value.
    """
    # Get attributes from config
    db_name = config['database']
    sql = config['sql']
    params = config['params', None]
    commit_stmt = config['commit_stmt', 'each']
    if commit_stmt == 'none':
        commit_policy = self.COMMIT_NONE
    elif commit_stmt == 'each':
        commit_policy = self.COMMIT_EACH
    elif commit_stmt == 'all':
        commit_policy = self.COMMIT_ALL
    else:
        raise errors.ConfigError('Invalid value for commit_stmt: "{}"'.format(commit_stmt))

    # Substitute params in the DB connection and find it
    db_name = db_name.format(**config)
    db_conn = context.get_db_connection(db_name)

    # Prepare statement(s). A single statement becomes a one-element list
    statements = sql if isinstance(sql, list) else [sql]

    # Prepare parameters, if needed. This happens before any transaction is
    # opened, so a bad parameter template cannot leave one dangling.
    db_params = {}
    if params is not None:
        for param in params:
            p_name = param['name']
            p_value = param['value']
            db_params[p_name] = p_value.format(**config)

    if not context.dry_run_mode:
        commit_all = commit_policy == self.COMMIT_ALL

        # Initiate a transaction, if all statements are committed at once
        if commit_all:
            db_conn.begin()

        # Execute the statement(s)
        for stmt in statements:
            db_conn.execute(stmt, db_params)

            # Commit if needed
            if commit_policy == self.COMMIT_EACH:
                db_conn.commit()

        # Execute a commit, if all statements are committed at once
        if commit_all:
            db_conn.commit()

    logger.log(context.dry_run_prefix + 'Done. {} statement(s) executed on {}'.format(len(statements), db_name))
def fetch(url: str, required: bool, verify_cert: bool, detect_compressed: bool, username: str, pwd: str,
          encoding: str) -> str:
    """Fetch a file at the specified URL from the server, decompress if necessary and return its contents.

    :param url: URL to fetch.
    :param required: Whether to raise an exception on HTTP 404 error.
    :param verify_cert: Whether to enforce SSL certificate check.
    :param detect_compressed: Whether to detect and decompress compressed files.
    :param username: Username to use for authentication. If None, authentication is not used.
    :param pwd: Password to use for authentication. Ignored if username is None.
    :param encoding: Encoding to use when decoding the HTTP data.
    :return: File contents as text, or None if the download yielded no data.
    """
    data = utils.http_fetch(url, required, verify_cert, username, pwd)
    if data is None:
        logger.log('File {} doesn\'t exist'.format(url))
        return None

    size_in = len(data)
    logger.log('Downloaded {} ({} bytes)'.format(url, size_in))

    # Check for the gzip magic number if needed, and decompress if it's there
    if detect_compressed and size_in > 10 and data[:2] == b'\x1f\x8b':
        with GzipFile(fileobj=BytesIO(data)) as decompressed_file:
            data = decompressed_file.read()
        logger.log(
            'Decompressed the file, raw size is {} bytes'.format(
                len(data)))

    # Convert the binary data into text
    return data.decode(encoding)
def run(self, config) -> dict:
    """Keep only the input lines that satisfy every configured criterion.

    Overrides the abstract method of the base class.

    Config attributes read:

    * ``input_param`` (default ``'data'``): name of the config item whose
      lines are filtered.
    * ``output_param`` (default ``'data'``): key under which the kept lines
      are returned.
    * ``rejected_param`` (default ``None``): if given, key under which the
      lines failing the criteria are returned.
    * ``start_line`` (default ``1``): 1-based number of the first line to
      use; earlier lines are dropped.
    * ``skip_blank_lines`` (default ``False``): drop empty lines.
    * ``criteria`` (default ``None``): a criterion or a list of them. Each
      provides ``search`` and optional ``is_regex``, ``substitute_params``
      and ``negate`` (all default ``False``). With ``substitute_params``,
      ``str.format`` placeholders in ``search`` are filled from the config.

    A line is kept when every criterion holds for it. Lines dropped because
    of ``start_line`` or ``skip_blank_lines`` are not counted as rejected.

    Criteria are prepared once, before any line is read, so a malformed
    criterion is reported even when the input is empty.

    :param config: Configuration object supporting ``config[name, default]``
        lookups, ``**`` unpacking and ``config.lines_list(name)``.
    :return: Dict holding the kept lines and, if requested, the rejected
        lines, each terminated by a linebreak.
    """
    # Get attributes from config
    input_param = config['input_param', 'data']
    output_param = config['output_param', 'data']
    rejected_param = config['rejected_param', None]
    start_line = int(config['start_line', 1])
    skip_blank_lines = bool(config['skip_blank_lines', False])
    criteria = config['criteria', None]

    # Normalise criteria to a list: none -> empty, single -> one-item list
    if criteria is None:
        criteria = []
    elif not isinstance(criteria, list):
        criteria = [criteria]

    # Resolve each criterion once, as (needle, is_regex, negate); the needle
    # is a compiled pattern for regex criteria and a plain string otherwise
    prepared = []
    for criterion in criteria:
        cr_search = criterion['search']
        cr_is_regex = bool(criterion['is_regex', False])
        cr_substitute_params = bool(criterion['substitute_params', False])
        cr_negate = bool(criterion['negate', False])

        # Substitute params if needed
        if cr_substitute_params:
            cr_search = cr_search.format(**config)

        needle = re.compile(cr_search) if cr_is_regex else cr_search
        prepared.append((needle, cr_is_regex, cr_negate))

    # Process the input lines
    count_src_lines = 0
    tgt_lines = []
    rej_lines = []
    for line in config.lines_list(input_param):
        # Skip up to start_line
        count_src_lines += 1
        if count_src_lines < start_line:
            continue

        # Skip blank lines
        if skip_blank_lines and len(line) == 0:
            continue

        # Process criteria, if any, stopping at the first one that fails
        do_include = True
        for needle, is_regex, negate in prepared:
            if is_regex:
                match = needle.search(line) is not None
            else:
                match = needle in line

            # A criterion fails when the match outcome equals its negation
            # flag (it's an XOR condition effectively)
            if negate == match:
                do_include = False
                break

        # Add an output line, if needed
        if do_include:
            tgt_lines.append(line)
        # Otherwise accumulate rejected line, if needed
        elif rejected_param is not None:
            rej_lines.append(line)

    # Log the stats and prepare the result
    result = {output_param: '\n'.join(tgt_lines) + '\n'}
    if rejected_param is None:
        logger.log("Read {} input lines, kept {} lines".format(count_src_lines, len(tgt_lines)))
    else:
        logger.log(
            "Read {} input lines, kept {}, rejected {} lines".format(
                count_src_lines, len(tgt_lines), len(rej_lines)))
        result[rejected_param] = '\n'.join(rej_lines) + '\n'

    # Return the result
    return result
def run(self, config):
    """Load fixed-width or delimited text data into a database table.

    Overrides the abstract method of the base class.

    Config attributes read:

    * ``input_param`` (default ``'data'``): name of the config item whose
      lines are loaded.
    * ``format``: ``'fixed'`` or ``'delimited'``.
    * ``delimiter``, ``quotechar`` (default ``None``): read for delimited
      data only and passed to ``csv.reader``.
    * ``start_line`` (default ``1``): 1-based number of the first line to
      load; earlier lines are skipped.
    * ``target_database``: connection name; ``str.format`` placeholders in
      it are filled from the config.
    * ``target_table``: name of the table to load.
    * ``truncate_target`` (default ``False``): truncate the table first.
    * ``column_mappings``: iterable of per-column mappings. Keys read from a
      mapping: ``name``, ``datatype``, ``source_trim`` (default ``'none'``),
      ``source_pos`` (fixed-width, ``"left:right"``, 1-based inclusive),
      ``source_index`` (delimited, 0-based field index), ``length`` and
      ``truncate`` (string columns), ``target_expr`` (columns without a
      source value).

    In dry-run mode nothing is truncated, pushed, flushed or committed, but
    the input is still parsed and validated.

    :param config: Configuration object supporting ``config[name, default]``
        lookups, ``**`` unpacking and ``config.lines(name)``.
    :raises errors.ConfigError: On an invalid format, position specifier or
        trim value.
    :raises errors.DataError: On any other failure while reading a row; the
        message carries the input line number.
    """
    # Get attributes from config
    input_param = config['input_param', 'data']
    data = config.lines(input_param)
    delimiter = None
    quotechar = None

    # Get and validate data format
    data_format = config['format']
    fmt_fixed = data_format == 'fixed'
    fmt_delimited = data_format == 'delimited'
    if not fmt_fixed and not fmt_delimited:
        raise errors.ConfigError(
            'Invalid format value: "{}".'.format(data_format))
    if fmt_delimited:
        delimiter = config['delimiter']
        quotechar = config['quotechar', None]

    # Fetch other config attributes
    start_line = int(config['start_line', 1])
    target_database = config['target_database']
    target_table = config['target_table']
    truncate_target = bool(config['truncate_target', False])
    column_mappings = config['column_mappings']

    # Substitute params in the DB connection
    target_database = target_database.format(**config)
    logger.log(
        context.dry_run_prefix +
        'Loading data to {}@{}'.format(target_table, target_database))

    # Find the DB connection
    db_conn = context.get_db_connection(target_database)

    # Truncate the target table, if required
    # NOTE(review): the table name is interpolated into the statement text;
    # this presumes target_table comes from trusted configuration - confirm.
    if truncate_target:
        if not context.dry_run_mode:
            db_conn.execute(
                'truncate table {} drop storage'.format(target_table))
        logger.log(context.dry_run_prefix +
                   'Table {}@{} is truncated'.format(
                       target_table, target_database))

    # Create an inserter
    inserter = self.get_inserter(db_conn, target_table, column_mappings,
                                 fmt_fixed, fmt_delimited, config)

    # Fixed-width file: use the input data as is
    if fmt_fixed:
        input_data = data
    # Delimited file: open the input data as a CSV file
    else:
        args = {'delimiter': delimiter, 'strict': True}
        if quotechar is not None:
            args['quotechar'] = quotechar
        input_data = csv.reader(data, **args)

    # Iterate through the data
    count_src_lines = 0
    count_tgt_rows = 0
    for src_row in input_data:
        # Skip up to start_line
        count_src_lines += 1
        if count_src_lines < start_line:
            continue

        # Fetch data values
        try:
            target_row = []
            for cm in column_mappings:
                # If a source value is used
                if (fmt_fixed and 'source_pos' in cm) or (fmt_delimited and 'source_index' in cm):
                    col_name = cm['name']
                    datatype = cm['datatype']
                    trim = cm['source_trim', 'none']

                    # Fixed-width
                    if fmt_fixed:
                        # Split "left:right" into ('left', ':', 'right')
                        pos = cm['source_pos'].partition(':')

                        # Validate the position format
                        if pos[0] == '' or pos[2] == '':
                            raise errors.ConfigError(
                                'Invalid position specifier "{}" for column "{}"'
                                .format(cm['source_pos'], col_name))

                        # Validate left boundary (converted to a 0-based
                        # slice start)
                        try:
                            pos_l = int(pos[0]) - 1
                        except ValueError as e:
                            raise errors.ConfigError(
                                'Left boundary specification for column "{}": {}'
                                .format(col_name, str(e)))
                        if pos_l < 0:
                            raise errors.ConfigError(
                                'Left boundary must be positive (column "{}")'
                                .format(col_name))

                        # Validate right boundary (used as the exclusive
                        # slice end, i.e. the 1-based inclusive position)
                        try:
                            pos_r = int(pos[2])
                        except ValueError as e:
                            raise errors.ConfigError(
                                'Right boundary specification for column "{}": {}'
                                .format(col_name, str(e)))
                        if pos_r <= pos_l:
                            raise errors.ConfigError(
                                'Right boundary must be greater than or equal to the left one (column "{}")'
                                .format(col_name))

                        # Chomp (repeated for every column of the row; a
                        # no-op after the first time)
                        src_row = src_row.rstrip('\r\n')

                        # Extract column value
                        value = src_row[pos_l:pos_r]

                    # Delimited
                    else:
                        value = src_row[int(cm['source_index'])]

                    # Apply trimming
                    if trim == 'none':
                        pass
                    elif trim == 'left':
                        value = value.lstrip()
                    elif trim == 'right':
                        value = value.rstrip()
                    elif trim == 'both':
                        value = value.strip()
                    else:
                        raise errors.ConfigError(
                            'Invalid trim value for column "{}": "{}"'.
                            format(col_name, trim))

                    # All types: handle null values (an empty value is
                    # loaded as None)
                    val_len = len(value)
                    if val_len == 0:
                        value = None

                    # String: validate value length
                    elif datatype == 'string':
                        max_len = int(cm['length'])
                        # If the length exceeds the allowed size
                        if val_len > max_len:
                            # No truncation - raise an error
                            if not bool(cm['truncate', False]):
                                raise errors.DataError(
                                    'String column "{}": value length ({}) exceeds allowed maximum ({})'
                                    .format(col_name, val_len, max_len))
                            # Otherwise truncate the value
                            value = value[:max_len]

                    # Integer: convert to actual int
                    elif datatype == 'integer':
                        try:
                            value = int(value)
                        except ValueError as e:
                            raise errors.DataError(
                                'Integer column "{}": {}'.format(
                                    col_name, str(e)))

                    # Number: convert to actual number
                    elif datatype == 'number':
                        try:
                            value = float(value)
                        except ValueError as e:
                            raise errors.DataError(
                                'Float value error for column "{}": {}'.
                                format(col_name, str(e)))

                    # Any other datatype leaves the value as text

                    # Add the value to the target row
                    target_row.append(value)

                # Otherwise we're using a target expression
                else:
                    target_expr = cm['target_expr']
                    # If it uses row number value, add it to the row.
                    # Nothing is appended for other expressions - presumably
                    # the inserter handles those itself; TODO confirm against
                    # get_inserter.
                    if '{rownum}' in target_expr:
                        target_row.append(count_tgt_rows + 1)

        # Reraise ConfigErrors as is
        except errors.ConfigError:
            raise
        # Assume all other errors are coming from the data. This also
        # re-wraps the DataErrors raised above, adding the line number.
        except Exception as e:
            raise errors.DataError(
                'Input data error at line {}: {}'.format(
                    count_src_lines, str(e)))

        # Push the row to the target table (unless we're in dry-run mode)
        if not context.dry_run_mode:
            inserter.push_row(target_row)
        count_tgt_rows += 1

    # Finalise
    if not context.dry_run_mode:
        inserter.flush()
        db_conn.commit()
    # NOTE(review): this is the only call here using logger.info(); the
    # earlier messages in this method go through logger.log() - confirm the
    # logger exposes info().
    logger.info(context.dry_run_prefix +
                'Loading {}@{} finished, read {} rows, inserted {} rows.'.
                format(target_table, target_database, count_src_lines,
                       count_tgt_rows))