def test_single_redirect_at_start_of_line_is_ignored(self):
    ''' Text starting with a lone '>' yields one sed region beginning at offset 1. '''
    sed_extractor = CommandExtractor('sed')
    document = HtmlDocument("<code>>sed 's/patt/repl/' file.txt</code>")
    found = sed_extractor.extract(document)
    self.assertEqual(len(found), 1)
    # The region is expected to begin at offset 1 rather than 0
    # (presumably just past the leading '>').
    only_region = found[0]
    self.assertEqual(only_region.start_offset, 1)
def test_extract_command_with_variables(self):
    ''' Checks the offsets of the region found for a command preceded by VAR=val. '''
    wget_extractor = CommandExtractor('wget')
    document = HtmlDocument('<code>VAR=val wget http://google.com</code>')
    first_region = wget_extractor.extract(document)[0]
    # The text is 30 characters long, so offsets 0..29 cover all of it,
    # including the variable assignment in front of the command name.
    self.assertEqual(first_region.start_offset, 0)
    self.assertEqual(first_region.end_offset, 29)
def test_extract_from_crontab(self):
    ''' Checks the offsets of the wget region found inside a crontab-style line. '''
    wget_extractor = CommandExtractor('wget')
    document = HtmlDocument("<code>*/5 * * * * wget mysite.com</code>")
    first_region = wget_extractor.extract(document)[0]
    # 'wget' begins at offset 12 of the text, after the five schedule fields;
    # offset 26 is the last character of the line.
    self.assertEqual(first_region.start_offset, 12)
    self.assertEqual(first_region.end_offset, 26)
def test_extract_includes_redirect(self):
    ''' Checks that the region for a command with output redirection spans the redirects. '''
    wget_extractor = CommandExtractor('wget')
    document = HtmlDocument("<code>wget google.com > /dev/null 2>&1</code>")
    first_region = wget_extractor.extract(document)[0]
    # The text is 32 characters long; an end offset of 31 means the trailing
    # '> /dev/null 2>&1' is part of the region.
    self.assertEqual(first_region.start_offset, 0)
    self.assertEqual(first_region.end_offset, 31)
def test_extract_command_by_regex(self):
    ''' Checks that the command name given to CommandExtractor may be a regular expression. '''
    # Raw string: '\.' is not a valid escape in a plain string literal and
    # triggers a warning on newer Pythons. The pattern text is unchanged.
    extractor = CommandExtractor(r'wget(\.exe)?')
    node = HtmlDocument(
        '<code>my-shell$ wget.exe http://google.com</code>')
    regions = extractor.extract(node)
    r = regions[0]
    # 'wget.exe' begins at offset 10, right after the 'my-shell$ ' prompt;
    # offset 35 is the last character of the text.
    self.assertEqual(r.start_offset, 10)
    self.assertEqual(r.end_offset, 35)
def test_extract_multiple_commands(self):
    ''' Checks that two commands on separate lines produce two regions. '''
    # Raw string: '\.' is not a valid escape in a plain string literal and
    # triggers a warning on newer Pythons. The pattern text is unchanged.
    extractor = CommandExtractor(r'wget(\.exe)?')
    node = HtmlDocument('\n'.join([
        '<code>',
        ' wget http://google.com',
        ' wget http://gaggle.com',
        '</code>',
    ]))
    regions = extractor.extract(node)
    self.assertEqual(len(regions), 2)
def test_extract_command(self):
    ''' Checks every field of the single region found for a bare wget command. '''
    wget_extractor = CommandExtractor('wget')
    document = HtmlDocument('<code>wget http://google.com</code>')
    found = wget_extractor.extract(document)
    self.assertEqual(len(found), 1)
    region = found[0]
    # The region refers back to the document it came from and covers the
    # whole 22-character command (offsets 0..21).
    self.assertEqual(region.node, document)
    self.assertEqual(region.start_offset, 0)
    self.assertEqual(region.end_offset, 21)
    self.assertEqual(region.string, 'wget http://google.com')
def test_line_breaks(self):
    ''' Checks that commands separated by a <br> element are found as two regions. '''
    wget_extractor = CommandExtractor('wget')
    markup = ''.join([
        '<code>$ wget http://google.com<br> ',
        '$ wget http://gaggle.com</code>'
    ])
    document = HtmlDocument(markup)
    found = wget_extractor.extract(document)

    # Command before the <br>.
    first = found[0]
    self.assertEqual(first.start_offset, 3)
    self.assertEqual(first.end_offset, 24)

    # Command after the <br>.
    second = found[1]
    self.assertEqual(second.start_offset, 29)
    self.assertEqual(second.end_offset, 50)
class WgetExtractor(object):
    '''
    Finds wget command regions in a node and keeps only those that name a URL
    and are judged not to be prose.
    '''

    def __init__(self):
        # Candidate regions come from the generic extractor keyed on WGET_PATT.
        self.cmd_extractor = CommandExtractor(WGET_PATT)

    def extract(self, node):
        '''
        Return the candidate regions of `node` whose command text passes both
        `_includes_url` and `_is_not_prose`.

        If checking a candidate raises InvalidCommandException, the error is
        logged and an empty list is returned.
        '''
        regions = self.cmd_extractor.extract(node)
        valid_regions = []
        for r in regions:
            try:
                if self._includes_url(r.string) and self._is_not_prose(r.string):
                    valid_regions.append(r)
            except InvalidCommandException as e:
                logging.error("Invalid command found: %s: %s", e.cmd, e.exception)
                # NOTE(review): a single invalid command discards every region
                # accepted so far -- confirm this is intended rather than
                # skipping only the offending command.
                return []
        return valid_regions

    def _includes_url(self, cmd):
        '''
        Pass `cmd` to run_wget and report whether its output mentions a URL.

        The output counts as containing a URL when it has a line starting with
        'URL:' and none starting with 'URL: (null)', or when it has a line
        starting with 'LN: input-file'. Returns False when run_wget yields
        None. Always returns a bool.
        '''
        output = run_wget(cmd)
        if output is None:
            return False
        # Raw strings keep '\(' from being read as an (invalid) string escape.
        has_real_url = (
            re.search(r'^URL:', output, re.MULTILINE) and
            not re.search(r'^URL: \(null\)', output, re.MULTILINE)
        )
        # bool() so callers get True/False instead of a match object or None.
        return bool(
            has_real_url or
            re.search(r'^LN: input-file', output, re.MULTILINE)
        )

    def _is_not_prose(self, cmdtext):
        '''
        Heuristically decide whether `cmdtext` is a real invocation.

        The words that follow the command name (the first word matching
        WGET_PATT) are classified: words starting with '-' count as options,
        all others count as URLs, and words starting with '$' additionally mark
        the command as using a variable. The text is accepted when it uses a
        variable, has at least one option, or has exactly one non-option word.
        '''
        url_count = 0
        arg_count = 0
        has_var = False

        # Only the first parsed node is inspected.
        command = bashlex.parse(cmdtext)[0]
        after_cmdname = False
        for part in command.parts:
            if not hasattr(part, 'word'):
                continue
            word = part.word
            if after_cmdname:
                if word.startswith('-'):
                    arg_count += 1
                else:
                    url_count += 1
                    if word.startswith('$'):
                        has_var = True
            # Checked after counting so the command name itself is not counted.
            if re.match(WGET_PATT, word):
                after_cmdname = True

        return has_var or arg_count > 0 or url_count == 1
def test_extract_command_ignore_PS1_line(self):
    ''' Checks that a shell prompt in front of each command is left out of its region. '''
    wget_extractor = CommandExtractor('wget')
    lines = [
        '<code>',
        'my-shell$ wget http://google.com',
        'my-shell$ wget http://gaggle.com',
        '</code>',
    ]
    document = HtmlDocument('\n'.join(lines))
    found = wget_extractor.extract(document)

    # First line: the region starts at 'wget', after the 'my-shell$ ' prompt.
    first = found[0]
    self.assertEqual(first.start_offset, 11)
    self.assertEqual(first.end_offset, 32)

    # Second line: same layout, shifted by the length of the first line.
    second = found[1]
    self.assertEqual(second.start_offset, 44)
    self.assertEqual(second.end_offset, 65)
def test_allow_many_newlines_between_commands(self):
    '''
    Two commands separated by several blank lines should both be found.
    This case is covered because bashlex doesn't like more than one newline
    character between the lines of a script.
    '''
    wget_extractor = CommandExtractor('wget')
    lines = [
        '<code>',
        ' wget http://google.com',
        ' ',
        ' ',
        ' wget http://gaggle.com',
        '</code>',
    ]
    document = HtmlDocument('\n'.join(lines))
    found = wget_extractor.extract(document)
    self.assertEqual(len(found), 2)
class SedRegexExtractor(object):
    '''
    Extracts the regular expressions (addresses and substitution patterns)
    used in sed command lines.
    '''

    def __init__(self):
        self.sed_extractor = CommandExtractor(SED_COMMAND_PATTERN)

    def extract(self, node):
        '''
        Return a RegexRegion for each address and substitution pattern reported
        for the sed commands found in `node`.

        Each command's arguments are passed to the SED executable, and its
        combined stdout/stderr is scanned for 'Tutorons address: ...' and
        'Tutorons substitution (slash: X): ...' lines (presumably emitted by an
        instrumented sed build -- confirm against the SED definition). Each
        reported pattern is then located by its first occurrence in the
        command text.
        '''
        SED_ADDR_PATTERN = '(?<=Tutorons address: ).*(?=$)'
        SED_SUB_PATTERN = '^Tutorons substitution.*$'
        # Raw string: '\(' is not a valid escape in a plain string literal.
        SED_SUB_DETAIL_PATTERN = r'^Tutorons substitution \(slash: (.)\): (.*)$'

        regions = []

        def _region_from_substring(cr, substring, slash):
            # Offsets are the command region's start plus the position of the
            # first occurrence of `substring` within the command text.
            # NOTE(review): find() returns -1 when `substring` is absent, which
            # puts the region one character before the command -- confirm how
            # callers should treat that case.
            start_offset = cr.start_offset + cr.string.find(substring)
            end_offset = start_offset + len(substring) - 1
            # The stored pattern has the escaped delimiter unescaped.
            pattern = substring.replace('\\' + slash, slash)
            return RegexRegion(pattern, node, start_offset, end_offset, substring)

        command_regions = self.sed_extractor.extract(node)
        for cr in command_regions:
            command = cr.string
            args = [SED] + get_arguments(command, SED_COMMAND_PATTERN)
            try:
                output = subprocess.check_output(args, stderr=subprocess.STDOUT)
            except subprocess.CalledProcessError as cpe:
                # A non-zero exit status still carries the output parsed below.
                output = cpe.output
            # On Python 3 check_output returns bytes, which the str patterns
            # below cannot search. On Python 2 bytes is str, so this is a no-op.
            if isinstance(output, bytes) and not isinstance(output, str):
                output = output.decode('utf-8', 'replace')

            addrs = re.findall(SED_ADDR_PATTERN, output, flags=re.MULTILINE)
            for addr in addrs:
                regions.append(_region_from_substring(cr, addr, '\\'))

            subst_lines = re.findall(SED_SUB_PATTERN, output, flags=re.MULTILINE)
            for line in subst_lines:
                m = re.match(SED_SUB_DETAIL_PATTERN, line)
                if m is None:
                    # The line starts like a substitution report but is not in
                    # the expected format; skip it instead of raising on
                    # m.groups().
                    continue
                slash_char, patt = m.groups()
                # Re-escape the delimiter so the text can be matched against
                # the command.
                patt_escaped = patt.replace(slash_char, '\\' + slash_char)
                regions.append(_region_from_substring(cr, patt_escaped, slash_char))

        return regions
class GrepRegexExtractor(object):
    '''
    Extracts regular expressions from grep command lines.
    '''

    def __init__(self):
        self.command_extractor = CommandExtractor(GREP_COMMAND_PATTERN)

    def extract(self, node):
        '''
        Return a RegexRegion for each pattern reported for the grep commands
        found in `node`.

        Regex pattern locations are NOT always exact. Because no positioning
        is returned by grep's parser, which we reuse here, all we can get are
        the patterns that will be used by grep, and then match these up to the
        first position the pattern appears in in the input command.
        '''
        GREP_REGEX_PATTERN = '(?<=Tutorons string: ).*(?=$)'

        regions = []
        command_regions = self.command_extractor.extract(node)
        for cr in command_regions:
            command = cr.string
            args = [GREP] + get_arguments(command, GREP_COMMAND_PATTERN)
            try:
                output = subprocess.check_output(args, stderr=subprocess.STDOUT)
            except subprocess.CalledProcessError as cpe:
                # A non-zero exit status still carries the output parsed below.
                output = cpe.output
            # On Python 3 check_output returns bytes, which the str pattern and
            # command.find() below cannot handle. On Python 2 bytes is str, so
            # this is a no-op.
            if isinstance(output, bytes) and not isinstance(output, str):
                output = output.decode('utf-8', 'replace')

            regexes = re.findall(GREP_REGEX_PATTERN, output, flags=re.MULTILINE)
            for r in regexes:
                # NOTE(review): find() returns -1 when the reported pattern is
                # not literally present in the command text, which puts the
                # region one character before the command -- confirm how
                # callers should treat that case.
                start_offset = cr.start_offset + command.find(r)
                end_offset = start_offset + len(r) - 1
                regions.append(RegexRegion(r, node, start_offset, end_offset, r))

        return regions
def test_ignore_command_name_without_options(self):
    ''' Checks that a bare command name with nothing after it yields no regions. '''
    wget_extractor = CommandExtractor('wget')
    document = HtmlDocument('<code>wget</code>')
    found = wget_extractor.extract(document)
    self.assertEqual(len(found), 0)
def test_handles_parsing_error(self):
    '''
    Runs extraction on text that embeds the command name inside non-shell code.
    There are no assertions: the test passes as long as extract() does not raise.
    '''
    wget_extractor = CommandExtractor('wget')
    document = HtmlDocument('<code>os.system("wget google.com")</code>')
    wget_extractor.extract(document)