Esempio n. 1
0
    def extract_shebang(cls, data):
        """
        Internal: Extract normalized shebang command token.

        data - String whose first line may be a "#!" shebang.

        Examples

          extract_shebang("#!/usr/bin/ruby")
          # => "ruby"

          extract_shebang("#!/usr/bin/env node")
          # => "node"

        Returns the String token, or None if it couldn't be parsed.
        """
        s = StringScanner(data)
        path = s.scan(r'^#!\s*\S+')
        if not path:
            return None
        script = path.split('/')[-1]
        if script == 'env':
            # "#!/usr/bin/env interpreter": the real command follows "env".
            s.scan(r'\s+')
            script = s.scan(r'\S+')
        if script:
            # Strip trailing version digits, e.g. "python2.7" -> "python".
            # Guarded: the original .match(...).group(0) raised AttributeError
            # when the script name started with a digit.
            match = re.match(r'[^\d]+', script)
            script = match.group(0) if match else None
        return script
Esempio n. 2
0
class StringParser(object):
    """Iterate over a string, yielding tokens built from a lexicon of
    (pattern, token_class) pairs, driven by a StringScanner."""

    def __init__(self, lexicon, str_):
        self.lexicon = lexicon
        self.scanner = StringScanner(str_)

    def __iter__(self):
        return self

    def __next__(self):
        # Python 3 iterator protocol delegates to the Python 2 style next().
        return self.next()

    def next(self):
        """Return the next token, or raise StopIteration at end of input."""
        scanner = self.scanner
        while not scanner.eos():
            start = scanner.pos
            for pattern, token_cls in self.lexicon:
                if not scanner.check(pattern):
                    continue
                return token_cls(
                    scanner.scan(pattern),
                    start=scanner.prev_pos,
                    end=scanner.pos,
                )
            if scanner.pos == start:
                # No lexicon entry matched at this position.
                raise Exception("Cannot tokenize:\n{}".format(scanner.rest()))
        raise StopIteration()

    def tokenize(self):
        """Consume the entire input and return all tokens as a list."""
        return list(self)
 def test_skip(self):
     """skip() returns the number of characters skipped, or None on no match."""
     s = StringScanner('test string')
     self.assertEqual(s.skip(r'\w+'), 4)
     self.assertIsNone(s.skip(r'\w+'))
     self.assertEqual(s.skip(r'\s+'), 1)
     self.assertEqual(s.skip(r'\w+'), 6)
     self.assertIsNone(s.skip(r'.'))
 def test_concat(self):
     """concat() appends text to the scanned string; non-str input raises TypeError."""
     source = "Fri Dec 12 1975 14:39"
     s = StringScanner(source)
     s.scan(r'Fri ')
     self.assertRaises(TypeError, s.concat, 1)
     s.concat(' +1000 GMT')
     self.assertEqual(s.string, 'Fri Dec 12 1975 14:39 +1000 GMT')
    def test_construction(self):
        """StringScanner accepts only str sources; string may also be set later."""
        self.assertRaises(TypeError, StringScanner, 1)
        self.assertEqual(StringScanner('dont care').string, 'dont care')
        self.assertEqual(StringScanner().pos, 0)
        self.assertIsNone(StringScanner().string)

        scanner = StringScanner()
        # .string is None here, so calling it raises TypeError.
        self.assertRaises(TypeError, scanner.string, 1)
        scanner.string = 'dont care'
        self.assertEqual(scanner.string, 'dont care')
        self.assertEqual(scanner.pos, 0)
Esempio n. 6
0
 def __init__(self, source):
     """Initialize the extractor state over a source string.

     source - str to scan. Any other type raises TypeError.
     """
     if not isinstance(source, str):
         raise TypeError('Type %s is not supported'%type(source))
     self.__source = source
     self.__formats = None
     self.__scanner = StringScanner(source)
     self.__comments = []   # collected (start, end) comment spans
     self.__ignores = []    # collected (start, end) ignored spans
Esempio n. 7
0
    def extract_shebang(cls, data):
        """
        Internal: Extract normalized shebang command token.

        data - String whose first line may be a "#!" shebang.

        Examples

          extract_shebang("#!/usr/bin/ruby")
          # => "ruby"

          extract_shebang("#!/usr/bin/env node")
          # => "node"

        Returns the String token, or None if it couldn't be parsed.
        """
        s = StringScanner(data)
        path = s.scan(r'^#!\s*\S+')
        if not path:
            return None
        script = path.split('/')[-1]
        if script == 'env':
            # "#!/usr/bin/env interpreter": the real command follows "env".
            s.scan(r'\s+')
            script = s.scan(r'\S+')
        if script:
            # Strip trailing version digits, e.g. "python2.7" -> "python".
            # Guarded: the original .match(...).group(0) raised AttributeError
            # when the script name started with a digit.
            match = re.match(r'[^\d]+', script)
            script = match.group(0) if match else None
        return script
 def test_pre_match_post_match_property(self):
     """pre_match/post_match expose the text around the last match."""
     s = StringScanner('test string')
     self.assertEqual(s.scan(r'\w+'), 'test')
     self.assertEqual(s.scan(r'\s+'), ' ')
     self.assertEqual(s.pre_match, 'test')
     self.assertEqual(s.post_match, 'string')
 def test_scan_until(self):
     """scan_until() consumes up to and including the match, or returns None."""
     s = StringScanner("Fri Dec 12 1975 14:39")
     self.assertEqual(s.scan_until(r'1'), "Fri Dec 1")
     self.assertEqual(s.pre_match, 'Fri Dec ')
     self.assertIsNone(s.scan_until(r'XYZ'))
Esempio n. 10
0
    def test_scan(self):
        """scan() consumes matching text, returns it, and tracks pos/eos state."""
        source = 'This is an example string'
        s = StringScanner(source)
        self.assertFalse(s.eos())

        # Moving pos to the end makes the scanner report end-of-string.
        s.pos = len(source)
        self.assertTrue(s.eos())

        s.pos = 0
        self.assertEqual(s.scan(r'\w+'), 'This')
        self.assertIsNone(s.scan(r'\w+'))  # same pattern cannot match again in place
        self.assertEqual(s.scan(r'\s+'), ' ')
        self.assertIsNone(s.scan(r'\s+'))
        self.assertEqual(s.scan(r'\w+'), 'is')
        self.assertFalse(s.eos())
        self.assertEqual(s.pos, 7)
        self.assertEqual(s.scan(r'\s+'), ' ')
        self.assertEqual(s.scan(r'\w+'), 'an')
        self.assertEqual(s.scan(r'\s+'), ' ')
        self.assertEqual(s.scan(r'\w+'), 'example')
        self.assertEqual(s.scan(r'\s+'), ' ')
        self.assertEqual(s.scan(r'\w+'), 'string')
        self.assertTrue(s.eos())
        self.assertIsNone(s.scan(r'\s+'))
        self.assertIsNone(s.scan(r'\w+'))
Esempio n. 11
0
 def test_skip_until(self):
     """skip_until() returns the total number of characters consumed."""
     s = StringScanner("Fri Dec 12 1975 14:39")
     self.assertEqual(s.skip_until(r'12'), 10)
Esempio n. 12
0
    def extract_tokens(self, data):
        """
        Internal: Extract generic tokens from data.

        Walks the input with a StringScanner, emitting shebang markers,
        punctuation, identifiers and operators while skipping over comments,
        string literals and number literals.

        data - String to scan.

        Examples

          extract_tokens("printf('Hello')")
          # => ['printf', '(', ')']

        Returns Array (list) of token Strings.
        """
        s = StringScanner(data)
        tokens = []
        # NOTE(review): is_eos and getch are used without call parens below, so
        # they are presumably properties on this StringScanner port - confirm.
        while not s.is_eos:
            # Cap the amount of input inspected per document.
            if s.pos >= BYTE_LIMIT: break
            # Shebang line, e.g. "#!/usr/bin/env ruby" => "SHEBANG#!ruby".
            token = s.scan(r'^#!.+')
            if token:
                name = self.extract_shebang(token)
                if name:
                    tokens.append('SHEBANG#!%s' % name)
                    continue

            # Single line comment: discard through end of line.
            if s.is_beginning_of_line and s.scan(START_SINGLE_LINE_COMMENT):
                s.skip_until(r'\n|\Z')
                continue

            # Multiline comments
            token = s.scan(START_MULTI_LINE_COMMENT)
            if token:
                # Look up the matching close delimiter for the open token.
                close_token = dict(MULTI_LINE_COMMENTS).get(token)
                s.skip_until(re.compile(re.escape(close_token)))
                continue

            # Skip single or double quoted strings
            if s.scan(r'"'):
                if s.peek(1) == '"':
                    # Empty string literal: just consume the closing quote.
                    s.getch
                else:
                    # Skip to the first unescaped closing quote.
                    s.skip_until(r'[^\\]"')
            if s.scan(r"'"):
                if s.peek(1) == "'":
                    s.getch
                else:
                    s.skip_until(r"[^\\]'")

            # Skip number literals (decimal or 0x hex; no token emitted).
            if s.scan(r'(0x)?\d(\d|\.)*'):
                continue

            # SGML style brackets: delegate tag internals to the SGML tokenizer.
            token = s.scan(r'<[^\s<>][^<>]*>')
            if token:
                for t in self.extract_sgml_tokens(token):
                    tokens.append(t)
                continue

            # Common programming punctuation
            token = s.scan(r';|\{|\}|\(|\)|\[|\]')
            if token:
                tokens.append(token)
                continue

            # Regular token (identifiers, paths, annotations, globs)
            token = s.scan(r'[\w\.@#\/\*]+')
            if token:
                tokens.append(token)
                continue

            # Common operators
            token = s.scan(r'<<?|\+|\-|\*|\/|%|&&?|\|\|?')
            if token:
                tokens.append(token)
                continue

            # Nothing matched: advance one character and try again.
            s.getch
        return tokens
Esempio n. 13
0
    def extract_sgml_tokens(self, data):
        """
        Internal: Extract tokens from inside SGML tag.

        data - SGML tag String.

            Examples

              extract_sgml_tokens("<a href='' class=foo>")
              # => ["<a>", "href="]

        Returns Array (list) of token Strings.
        """
        s = StringScanner(data)
        tokens = []

        # NOTE(review): is_eos, terminate and getch are used without call
        # parens, so they are presumably properties on this port - confirm.
        while not s.is_eos:
            # Emit start token, e.g. "<a" or "</a", normalized with ">".
            token = s.scan(r'<\/?[^\s>]+')
            if token:
                tokens.append(token + '>')
                continue

            # Emit attributes with trailing =
            token = s.scan(r'\w+=')
            if token:
                tokens.append(token)

                # Then skip over attribute value
                if s.scan('"'):
                    # Double-quoted: skip to the first unescaped quote.
                    s.skip_until(r'[^\\]"')
                    continue
                if s.scan("'"):
                    s.skip_until(r"[^\\]'")
                    continue
                # Bare (unquoted) value: skip the word.
                s.skip_until(r'\w+')
                continue

            # Emit lone attributes
            token = s.scan(r'\w+')
            if token:
                tokens.append(token)

            # Stop at the end of the tag
            if s.scan('>'):
                s.terminate
                continue

            # Nothing matched: advance one character.
            s.getch

        return tokens
Esempio n. 14
0
 def __init__(self, lexicon, str_):
     """Bind the (pattern, token class) lexicon and wrap the input string in a scanner."""
     self.scanner = StringScanner(str_)
     self.lexicon = lexicon
Esempio n. 15
0
class AbstractExtractor(object):
    """Abstract base for comment extractors driven by a StringScanner.

    Subclasses implement get_definitions() to describe the comment
    delimiters of a concrete language; scan() records the (start, end)
    offsets of each comment found in the source.
    """
    __metaclass__ = ABCMeta  # Python 2 style ABC declaration
    __definitions = []

    def __init__(self, source):
        # Only plain strings are accepted as scanner input.
        if not isinstance(source, str):
            raise TypeError('Type %s is not supported'%type(source))
        self.__source = source
        self.__formats = None
        self.__scanner = StringScanner(source)
        self.__comments = []   # collected (start, end) comment spans
        self.__ignores = []    # collected (start, end) ignored spans

    @abstractmethod
    def get_definitions(cls):
        """Return the comment/ignore definitions for the concrete language."""
        return cls.__definitions

    def get_comment_definitions(self):
        """Return only the definitions whose type is 'comment'."""
        return filter(lambda x: x['type'] == 'comment', self.get_definitions())

    @property
    def comments(self):
        """Deduplicated comment spans ordered by start offset."""
        return sorted(set(self.__comments), key=lambda comment: comment[0])

    @property
    def ignores(self):
        """Deduplicated ignored spans ordered by start offset."""
        return sorted(set(self.__ignores), key=lambda comment: comment[0])

    def extract(self):
        pass

    def scan_block_comments(self):
        """Collect spans for every block-style comment definition."""
        for block_comment in [d for d in self.get_definitions() if d['block']]:
            self.__comments += self.scan(
                block_comment['startwith'],
                block_comment['endwith'],
                block_comment['block'],
            )

    def scan_line_comments(self):
        """Collect spans for every line-style comment definition."""
        for line_comment in [d for d in self.get_definitions() if d['block'] is False]:
            self.__comments += self.scan(
                line_comment['startwith'], None, line_comment['block'])

    def scan_comments(self):
        """Scan both block and line comments into the internal span list."""
        # BUG FIX: the original called the helpers without `self.`,
        # which raised NameError at runtime.
        self.scan_block_comments()
        self.scan_line_comments()

    def scan_ignore(self):
        pass

    def scan(self, startwith, endwith, block):
        """Scan the source for comments delimited by startwith/endwith.

        startwith - opening delimiter string.
        endwith   - closing delimiter string (unused for line comments).
        block     - True for block comments, False for line comments.

        Returns a list of (start, end) position tuples.
        Raises SyntacticError when a block comment is never closed.
        """
        result = []
        while not self.__scanner.eos():
            if self.__scanner.skip_until(re.escape(startwith)) is None:
                # No further opener: consume the rest of the input.
                self.__scanner.skip('.*$', re.S)
                continue
            spos = self.__scanner.pos - len(startwith)

            if block is True:  # block comment: find the closing delimiter
                if self.__scanner.skip_until(re.escape(endwith)) is None:
                    # Unterminated block comment. (The original had dead
                    # code after this raise; it has been removed.)
                    raise SyntacticError
            else:  # line comment: runs to end of line
                self.__scanner.skip_until(r'.*$', re.M)

            epos = self.__scanner.pos
            result.append((spos, epos))
            # Debug trace of each captured span (print() is valid in
            # both Python 2 and 3 for a single argument).
            print('>>>> %d:%d' % (spos, epos))
            print(self.__scanner.string[spos:epos])
            print('<<<<')
        return result
Esempio n. 16
0
    def extract_sgml_tokens(self, data):
        """
        Internal: Extract tokens from inside SGML tag.

        data - SGML tag String.

            Examples

              extract_sgml_tokens("<a href='' class=foo>")
              # => ["<a>", "href="]

        Returns Array (list) of token Strings.
        """
        s = StringScanner(data)
        tokens = []

        # NOTE(review): is_eos, terminate and getch are used without call
        # parens, so they are presumably properties on this port - confirm.
        while not s.is_eos:
            # Emit start token, e.g. "<a" or "</a", normalized with ">".
            token = s.scan(r'<\/?[^\s>]+')
            if token:
                tokens.append(token + '>')
                continue

            # Emit attributes with trailing =
            token = s.scan(r'\w+=')
            if token:
                tokens.append(token)

                # Then skip over attribute value
                if s.scan('"'):
                    # Double-quoted: skip to the first unescaped quote.
                    s.skip_until(r'[^\\]"')
                    continue
                if s.scan("'"):
                    s.skip_until(r"[^\\]'")
                    continue
                # Bare (unquoted) value: skip the word.
                s.skip_until(r'\w+')
                continue

            # Emit lone attributes
            token = s.scan(r'\w+')
            if token:
                tokens.append(token)

            # Stop at the end of the tag
            if s.scan('>'):
                s.terminate
                continue

            # Nothing matched: advance one character.
            s.getch

        return tokens
Esempio n. 17
0
    def extract_tokens(self, data):
        """
        Internal: Extract generic tokens from data.

        Walks the input with a StringScanner, emitting shebang markers,
        punctuation, identifiers and operators while skipping over comments,
        string literals and number literals.

        data - String to scan.

        Examples

          extract_tokens("printf('Hello')")
          # => ['printf', '(', ')']

        Returns Array (list) of token Strings.
        """
        s = StringScanner(data)
        tokens = []
        # NOTE(review): is_eos and getch are used without call parens below, so
        # they are presumably properties on this StringScanner port - confirm.
        while not s.is_eos:
            # Cap the amount of input inspected per document.
            if s.pos >= BYTE_LIMIT: break
            # Shebang line, e.g. "#!/usr/bin/env ruby" => "SHEBANG#!ruby".
            token = s.scan(r'^#!.+')
            if token:
                name = self.extract_shebang(token)
                if name:
                    tokens.append('SHEBANG#!%s' % name)
                    continue

            # Single line comment: discard through end of line.
            if s.is_beginning_of_line and s.scan(START_SINGLE_LINE_COMMENT):
                s.skip_until(r'\n|\Z')
                continue

            # Multiline comments
            token = s.scan(START_MULTI_LINE_COMMENT)
            if token:
                # Look up the matching close delimiter for the open token.
                close_token = dict(MULTI_LINE_COMMENTS).get(token)
                s.skip_until(re.compile(re.escape(close_token)))
                continue

            # Skip single or double quoted strings
            if s.scan(r'"'):
                if s.peek(1) == '"':
                    # Empty string literal: just consume the closing quote.
                    s.getch
                else:
                    # Skip to the first unescaped closing quote.
                    s.skip_until(r'[^\\]"')
            if s.scan(r"'"):
                if s.peek(1) == "'":
                    s.getch
                else:
                    s.skip_until(r"[^\\]'")

            # Skip number literals (decimal or 0x hex; no token emitted).
            if s.scan(r'(0x)?\d(\d|\.)*'):
                continue

            # SGML style brackets: delegate tag internals to the SGML tokenizer.
            token = s.scan(r'<[^\s<>][^<>]*>')
            if token:
                for t in self.extract_sgml_tokens(token):
                    tokens.append(t)
                continue

            # Common programming punctuation
            token = s.scan(r';|\{|\}|\(|\)|\[|\]')
            if token:
                tokens.append(token)
                continue

            # Regular token (identifiers, paths, annotations, globs)
            token = s.scan(r'[\w\.@#\/\*]+')
            if token:
                tokens.append(token)
                continue

            # Common operators
            token = s.scan(r'<<?|\+|\-|\*|\/|%|&&?|\|\|?')
            if token:
                tokens.append(token)
                continue

            # Nothing matched: advance one character and try again.
            s.getch
        return tokens
Esempio n. 18
0
def scanner():
    """Return a fresh StringScanner over the fixture text "bar foobar"."""
    fixture = "bar foobar"
    return StringScanner(fixture)