def main(pattern_filename, input_filenames, pattern_format,
         output_filename, encoding, words, by_line, slow, verbose, quiet):
    '''
    Search and replace on INPUT_FILE(s) (or standard input), with
    matching on fixed strings.
    '''
    set_log_level(verbose, quiet)
    # Only line-based rewriting exists, so whatever the caller asked
    # for (including what `slow` would imply) is overridden here.
    # TODO: implement non-line-based rewriting
    by_line = True
    LOGGER.info('fsed {} input {} output {}'.format(
        pattern_filename, input_filenames, output_filename))
    # '-' is what open_file receives when no path was supplied
    # (presumably standard input / standard output -- confirm in
    # open_file).
    sources = input_filenames if input_filenames else ('-',)
    destination = output_filename if output_filename else '-'
    # load the patterns into the trie machine used for matching
    trie, boundaries = build_trie(pattern_filename, pattern_format,
                                  encoding, words)
    if not slow:
        warn_prefix_values(trie)
    LOGGER.info('writing to {}'.format(destination))
    with open_file(destination, 'wb') as output_file:
        for source in sources:
            # search and replace, one input file at a time
            with open_file(source) as input_file:
                LOGGER.info('reading {}'.format(source))
                if not by_line:
                    raise NotImplementedError
                written = 0
                for raw_line in input_file:
                    # lines arrive as bytes: decode, rewrite, and
                    # encode again with a single trailing newline
                    text = raw_line.decode(encoding).rstrip('\n')
                    text = rewrite_str_with_trie(text, trie,
                                                 boundaries, slow)
                    output_file.write((text + '\n').encode(encoding))
                    written += 1
                LOGGER.info('{} lines written'.format(written))
def detect_pattern_format(pattern_filename, encoding, on_word_boundaries):
    '''
    Automatically detects the pattern file format, and determines
    whether the Aho-Corasick string matching should pay attention to
    word boundaries or not.

    The file is taken to be tab-separated when every non-blank line
    contains exactly one tab character.  Word boundaries are switched
    on as soon as any line contains a literal backslash followed by
    the letter b.

    Arguments:
    - `pattern_filename`: name of the pattern file (given to open_file)
    - `encoding`: codec used to decode each line of the file
    - `on_word_boundaries`: initial value of the boundaries flag

    Returns a `(tsv, boundaries)` pair of booleans.
    '''
    tsv = True
    boundaries = on_word_boundaries
    with open_file(pattern_filename) as input_file:
        for line in input_file:
            line = line.decode(encoding)
            if not line.strip():
                # build_trie ignores blank lines when loading the
                # patterns, so they must not count as evidence
                # against the tab-separated format
                continue
            if line.count('\t') != 1:
                tsv = False
            if '\\b' in line:
                boundaries = True
            if boundaries and not tsv:
                # neither answer can change any more
                break
    return tsv, boundaries
def _parse_sed_substitution(line):
    '''
    Splits a sed-style substitution command (``s/before/after/``) into
    its two halves.

    Arguments:
    - `line`: the command text, with leading whitespace already removed

    Returns a `(before, after)` pair in which escaped delimiters have
    been unescaped, or `(None, None)` when `line` is not an ``s``
    command made of exactly four delimiter-separated fields.
    '''
    # need at least the 's' and one delimiter character
    if len(line) < 2 or line[0] != 's':
        return None, None
    delim = line[1]
    # the delimiter might be a regex special character
    delim_regex = re.escape(delim)
    fields = re.split(r'(?<!\\)' + delim_regex, line)
    if len(fields) != 4:
        return None, None
    escaped_delim = re.compile(r'(?<!\\)\\' + delim_regex)

    def unescape(text):
        # a callable replacement inserts the delimiter verbatim; a
        # replacement *string* would be parsed as a template and
        # would leave the backslash in place for delimiters such as
        # '|' or '.'
        return escaped_delim.sub(lambda _match: delim, text)

    return unescape(fields[1]), unescape(fields[2])


def build_trie(pattern_filename, pattern_format, encoding, on_word_boundaries):
    '''
    Constructs a finite state machine for performing string rewriting.

    Arguments:
    - `pattern_filename`: name of the pattern file (given to open_file)
    - `pattern_format`: 'tsv', 'sed', or 'auto' to detect the format
      from the contents of the pattern file
    - `encoding`: codec used to decode each line of the pattern file
    - `on_word_boundaries`: whether the --words option is in effect

    Returns a `(trie, boundaries)` pair: the trie mapping every
    "before" string to its replacement, and a flag saying whether the
    patterns were passed through the word-boundary transform.
    '''
    boundaries = on_word_boundaries
    if pattern_format == 'auto' or not on_word_boundaries:
        tsv, boundaries = detect_pattern_format(pattern_filename, encoding,
                                                on_word_boundaries)
    if pattern_format == 'auto':
        pattern_format = 'tsv' if tsv else 'sed'
    trie = fsed.ahocorasick.AhoCorasickTrie()
    num_candidates = 0
    with open_file(pattern_filename) as pattern_file:
        # count lines from 1 so that the warnings below agree with
        # the line numbers shown by a text editor
        for lineno, line in enumerate(pattern_file, 1):
            line = line.decode(encoding).rstrip('\n')
            if not line.strip():
                continue
            # decode the line
            if pattern_format == 'tsv':
                fields = line.split('\t')
                if len(fields) != 2:
                    LOGGER.warning(('skipping line {} of pattern file (not '
                                    'in tab-separated format): {}').format(
                                        lineno, line))
                    continue
                before, after = fields
            elif pattern_format == 'sed':
                line = line.lstrip()
                before, after = _parse_sed_substitution(line)
                if before is None or after is None:
                    LOGGER.warning(('skipping line {} of pattern file (not '
                                    'in sed format): {}').format(
                                        lineno, line))
                    continue
            num_candidates += 1
            if on_word_boundaries and before != before.strip():
                LOGGER.warning(('before pattern on line {} padded whitespace; '
                                'this may interact strangely with the --words '
                                'option: {}').format(lineno, line))
            before = sub_escapes(before)
            after = sub_escapes(after)
            if boundaries:
                before = fsed.ahocorasick.boundary_transform(
                    before, on_word_boundaries)
            trie[before] = after
    LOGGER.info('{} patterns loaded from {}'.format(num_candidates,
                                                    pattern_filename))
    return trie, boundaries