コード例 #1
0
 def test_single_redirect_at_start_of_line_is_ignored(self):
     '''A '>' directly in front of the command name is left out of the region.'''
     found = CommandExtractor('sed').extract(
         HtmlDocument("<code>>sed 's/patt/repl/' file.txt</code>"))
     self.assertEqual(len(found), 1)
     # The single region starts at offset 1, just past the leading '>'.
     first = found[0]
     self.assertEqual(first.start_offset, 1)
コード例 #2
0
 def test_extract_command_with_variables(self):
     '''A leading VAR=val assignment is kept inside the command region.'''
     cmd_extractor = CommandExtractor('wget')
     document = HtmlDocument('<code>VAR=val wget http://google.com</code>')
     region = cmd_extractor.extract(document)[0]
     # Offsets 0-29 span the whole text, assignment included.
     self.assertEqual(region.start_offset, 0)
     self.assertEqual(region.end_offset, 29)
コード例 #3
0
 def test_extract_from_crontab(self):
     '''Only the command part of a crontab entry is extracted.'''
     cmd_extractor = CommandExtractor('wget')
     crontab_doc = HtmlDocument("<code>*/5 * * * * wget mysite.com</code>")
     region = cmd_extractor.extract(crontab_doc)[0]
     # The five schedule fields occupy offsets 0-11; the command follows them.
     self.assertEqual(region.start_offset, 12)
     self.assertEqual(region.end_offset, 26)
コード例 #4
0
 def test_extract_includes_redirect(self):
     '''Trailing output redirections belong to the extracted region.'''
     cmd_extractor = CommandExtractor('wget')
     document = HtmlDocument("<code>wget google.com > /dev/null 2>&1</code>")
     region = cmd_extractor.extract(document)[0]
     # Offsets 0-31 span the command together with both redirections.
     self.assertEqual(region.start_offset, 0)
     self.assertEqual(region.end_offset, 31)
コード例 #5
0
 def test_extract_command_by_regex(self):
     '''The command name may be given as a regular expression.'''
     # Raw string: '\.' is a regex escape, not a Python string escape, so
     # a plain literal would trigger an invalid-escape-sequence warning.
     extractor = CommandExtractor(r'wget(\.exe)?')
     node = HtmlDocument(
         '<code>my-shell$ wget.exe http://google.com</code>')
     regions = extractor.extract(node)
     r = regions[0]
     # The region starts at the command name, after the 'my-shell$ ' prompt.
     self.assertEqual(r.start_offset, 10)
     self.assertEqual(r.end_offset, 35)
コード例 #6
0
 def test_extract_multiple_commands(self):
     '''Two command lines inside one <code> element give two regions.'''
     # Raw string: '\.' is a regex escape, not a Python string escape, so
     # a plain literal would trigger an invalid-escape-sequence warning.
     extractor = CommandExtractor(r'wget(\.exe)?')
     node = HtmlDocument('\n'.join([
         '<code>',
         '    wget http://google.com',
         '    wget http://gaggle.com',
         '</code>',
     ]))
     regions = extractor.extract(node)
     self.assertEqual(len(regions), 2)
コード例 #7
0
 def test_extract_command(self):
     '''A lone command yields one region carrying node, offsets and text.'''
     cmd_extractor = CommandExtractor('wget')
     document = HtmlDocument('<code>wget http://google.com</code>')
     found = cmd_extractor.extract(document)
     self.assertEqual(len(found), 1)
     region = found[0]
     # The region refers back to the document it was extracted from.
     self.assertEqual(region.node, document)
     # end_offset is the index of the last character (inclusive).
     self.assertEqual(region.start_offset, 0)
     self.assertEqual(region.end_offset, 21)
     self.assertEqual(region.string, 'wget http://google.com')
コード例 #8
0
 def test_line_breaks(self):
     '''Commands on either side of a <br> come out as separate regions.'''
     cmd_extractor = CommandExtractor('wget')
     markup = ''.join([
         '<code>$  wget http://google.com<br> ',
         '$  wget http://gaggle.com</code>'
     ])
     found = cmd_extractor.extract(HtmlDocument(markup))
     # Command before the line break.
     before_break = found[0]
     self.assertEqual(before_break.start_offset, 3)
     self.assertEqual(before_break.end_offset, 24)
     # Command after the line break.
     after_break = found[1]
     self.assertEqual(after_break.start_offset, 29)
     self.assertEqual(after_break.end_offset, 50)
コード例 #9
0
ファイル: explain.py プロジェクト: jimmytuc/tutorons-server
class WgetExtractor(object):
    ''' Extracts command regions that look like genuine wget invocations. '''

    def __init__(self):
        self.cmd_extractor = CommandExtractor(WGET_PATT)

    def extract(self, node):
        '''
        Return the candidate wget regions of `node` that pass validation.

        A region found by the command extractor is kept only when both
        `_includes_url` and `_is_not_prose` accept its text.  If validating
        any region raises InvalidCommandException, the error is logged and
        an empty list is returned.
        '''
        regions = self.cmd_extractor.extract(node)
        valid_regions = []
        for r in regions:
            try:
                if self._includes_url(r.string) and self._is_not_prose(r.string):
                    valid_regions.append(r)
            except InvalidCommandException as e:
                logging.error("Invalid command found: %s: %s", e.cmd, e.exception)
                # NOTE(review): a single invalid command discards every
                # region of this node, including ones already validated.
                # Behavior kept as-is; confirm this is intended.
                return []
        return valid_regions

    def _includes_url(self, cmd):
        '''
        Return True if the output of `run_wget(cmd)` shows a download target.

        The output qualifies when it has a line starting with 'URL:' and no
        line starting with 'URL: (null)', or when it has a line starting
        with 'LN: input-file'.  Returns False when `run_wget` gives None.
        '''
        output = run_wget(cmd)
        if output is None:
            return False
        # Raw strings keep '\(' a regex escape rather than an invalid
        # Python string escape.
        has_real_url = (
            re.search(r'^URL:', output, re.MULTILINE) and
            not re.search(r'^URL: \(null\)', output, re.MULTILINE)
        )
        has_input_file = re.search(r'^LN: input-file', output, re.MULTILINE)
        # Collapse Match objects / None into a real boolean.
        return bool(has_real_url or has_input_file)

    def _is_not_prose(self, cmdtext):
        '''
        Heuristically decide whether `cmdtext` is a command and not a sentence.

        Only the words that follow the first word matching WGET_PATT are
        inspected.  The text is accepted when one of those words starts
        with '$', when at least one starts with '-', or when exactly one
        of them does not start with '-'.
        '''
        url_count = 0
        arg_count = 0
        has_var = False

        # Only the first node of the bashlex parse result is examined.
        command = bashlex.parse(cmdtext)[0]
        after_cmdname = False

        for part in command.parts:
            # Parts without a `word` attribute are skipped entirely.
            if not hasattr(part, 'word'):
                continue
            word = part.word
            if after_cmdname:
                if word.startswith('-'):
                    arg_count += 1
                else:
                    # Any non-option word counts as a URL-like operand,
                    # including '$'-prefixed words.
                    url_count += 1
                if word.startswith('$'):
                    has_var = True
            # Checked after counting so that the command name itself is
            # not counted as an operand.
            if re.match(WGET_PATT, word):
                after_cmdname = True

        return has_var or arg_count > 0 or url_count == 1
コード例 #10
0
 def test_extract_command_ignore_PS1_line(self):
     '''A shell prompt in front of each command is excluded from its region.'''
     cmd_extractor = CommandExtractor('wget')
     markup = '\n'.join([
         '<code>',
         'my-shell$ wget http://google.com',
         'my-shell$ wget http://gaggle.com',
         '</code>',
     ])
     found = cmd_extractor.extract(HtmlDocument(markup))
     # First prompt line: the region begins at the command name.
     first_cmd = found[0]
     self.assertEqual(first_cmd.start_offset, 11)
     self.assertEqual(first_cmd.end_offset, 32)
     # Second prompt line.
     second_cmd = found[1]
     self.assertEqual(second_cmd.start_offset, 44)
     self.assertEqual(second_cmd.end_offset, 65)
コード例 #11
0
 def test_allow_many_newlines_between_commands(self):
     '''
     Whitespace-only lines between two commands must not break extraction.
     This test exists because bashlex doesn't like more than one newline
     character between multiple lines of a script.
     '''
     cmd_extractor = CommandExtractor('wget')
     markup = '\n'.join([
         '<code>',
         '    wget http://google.com',
         '                          ',
         '                          ',
         '    wget http://gaggle.com',
         '</code>',
     ])
     found = cmd_extractor.extract(HtmlDocument(markup))
     # Both commands are still found despite the lines in between.
     self.assertEqual(len(found), 2)
コード例 #12
0
class SedRegexExtractor(object):
    ''' Extracts address and substitution patterns from sed command lines. '''

    def __init__(self):
        self.sed_extractor = CommandExtractor(SED_COMMAND_PATTERN)

    def extract(self, node):
        '''
        Return a RegexRegion for each pattern reported for the sed commands
        found in `node`.

        Every sed command is run through the SED executable, and its output
        is scanned for 'Tutorons address: ' and 'Tutorons substitution'
        lines.  Each reported pattern is located by searching for its text
        in the original command, so offsets point at its first occurrence.
        '''

        SED_ADDR_PATTERN = '(?<=Tutorons address: ).*(?=$)'
        SED_SUB_PATTERN = '^Tutorons substitution.*$'
        # Raw string keeps '\(' a regex escape rather than an invalid
        # Python string escape.
        SED_SUB_DETAIL_PATTERN = r'^Tutorons substitution \(slash: (.)\): (.*)$'
        regions = []

        def _region_from_substring(cr, substring, slash):
            # Build a region for `substring` as it appears inside command
            # region `cr`; end_offset is inclusive.
            # NOTE(review): if `substring` is absent from cr.string, find()
            # returns -1 and the offsets are shifted one to the left.
            start_offset = cr.start_offset + cr.string.find(substring)
            end_offset = start_offset + len(substring) - 1
            # The stored pattern has the escaped delimiter unescaped.
            pattern = substring.replace('\\' + slash, slash)
            region = RegexRegion(pattern, node, start_offset, end_offset, substring)
            return region

        command_regions = self.sed_extractor.extract(node)
        for cr in command_regions:

            command = cr.string
            args = [SED] + get_arguments(command, SED_COMMAND_PATTERN)
            # NOTE(review): the regex scans below use str patterns, so this
            # assumes check_output returns str; on Python 3 it returns
            # bytes unless text mode is requested. Confirm the target
            # interpreter.
            try:
                output = subprocess.check_output(args, stderr=subprocess.STDOUT)
            except subprocess.CalledProcessError as cpe:
                # A failing exit status still leaves output to scan.
                output = cpe.output

            addrs = re.findall(SED_ADDR_PATTERN, output, flags=re.MULTILINE)
            for addr in addrs:
                regions.append(_region_from_substring(cr, addr, '\\'))

            subst_lines = re.findall(SED_SUB_PATTERN, output, flags=re.MULTILINE)
            for line in subst_lines:
                m = re.match(SED_SUB_DETAIL_PATTERN, line)
                if m is None:
                    # The line matched the loose prefix pattern but not the
                    # detailed format; skip it instead of failing on
                    # m.groups().
                    continue
                slash_char, patt = m.groups()
                # Re-escape the delimiter so the text can be found in the
                # original command.
                patt_escaped = patt.replace(slash_char, '\\' + slash_char)
                regions.append(_region_from_substring(cr, patt_escaped, slash_char))

        return regions
コード例 #13
0
class GrepRegexExtractor(object):
    ''' Finds the regular expressions given to grep on command lines. '''

    def __init__(self):
        self.command_extractor = CommandExtractor(GREP_COMMAND_PATTERN)

    def extract(self, node):
        '''
        Return a RegexRegion for each pattern reported for the grep commands
        found in `node`.

        Region positions are NOT always exact.  grep's parser, which is
        reused here, reports the patterns but not where they occur, so each
        pattern is matched to the first place its text appears in the
        input command.
        '''
        # Lines of the form 'Tutorons string: <pattern>' carry the patterns.
        GREP_REGEX_PATTERN = '(?<=Tutorons string: ).*(?=$)'
        found = []

        for cmd_region in self.command_extractor.extract(node):
            text = cmd_region.string
            argv = [GREP] + get_arguments(text, GREP_COMMAND_PATTERN)
            try:
                grep_output = subprocess.check_output(argv, stderr=subprocess.STDOUT)
            except subprocess.CalledProcessError as failure:
                # Fall back to whatever the failed process printed.
                grep_output = failure.output

            for regex in re.findall(GREP_REGEX_PATTERN, grep_output, flags=re.MULTILINE):
                # Offsets are relative to the node; the last one is inclusive.
                first = cmd_region.start_offset + text.find(regex)
                last = first + len(regex) - 1
                found.append(RegexRegion(regex, node, first, last, regex))

        return found
コード例 #14
0
 def test_ignore_command_name_without_options(self):
     '''A bare command name with no arguments produces no region.'''
     found = CommandExtractor('wget').extract(HtmlDocument('<code>wget</code>'))
     self.assertEqual(len(found), 0)
コード例 #15
0
 def test_handles_parsing_error(self):
     '''Text that is not shell syntax must not make extraction raise.'''
     cmd_extractor = CommandExtractor('wget')
     not_shell = HtmlDocument('<code>os.system("wget google.com")</code>')
     # No assertion: the test passes as long as extract() returns normally.
     cmd_extractor.extract(not_shell)