Beispiel #1
0
class BstParser(Scanner):
    """Scanner subclass that reads a BST file command by command.

    Only the token definitions, the lookup tables and the top-level
    ``parse`` loop are defined here; ``parse`` relies on a
    ``parse_command`` method that is not part of this excerpt.
    """

    # Token patterns.  The regular expressions are spelled as plain ``u''``
    # literals with doubled backslashes instead of ``ur''``: the ``ur``
    # prefix is a syntax error on Python 3, while this form yields exactly
    # the same unicode pattern text on both Python 2 and Python 3.
    LBRACE = Literal(u'{')
    RBRACE = Literal(u'}')
    STRING = Pattern(u'"[^\\"]*"', 'string')
    INTEGER = Pattern(u'#-?\\d+', 'integer')
    NAME = Pattern(u'[^#\\"\\{\\}\\s]+', 'name')

    # Known command names (upper case) mapped to an integer.
    # NOTE(review): presumably the number of brace-delimited arguments each
    # command takes -- the consumer of this table is not visible here.
    COMMANDS = {
        'ENTRY': 3,
        'EXECUTE': 1,
        'FUNCTION': 2,
        'INTEGERS': 1,
        'ITERATE': 1,
        'MACRO': 2,
        'READ': 0,
        'REVERSE': 1,
        'SORT': 0,
        'STRINGS': 1,
    }

    # Token pattern -> callable that converts the matched text.
    LITERAL_TYPES = {
        STRING: process_string_literal,
        INTEGER: process_int_literal,
        NAME: process_identifier,
    }

    def parse(self):
        """Yield every command as a list until the input is exhausted.

        ``EOFError`` raised while reading a command ends the iteration
        normally.  Any other exception, ``PybtexSyntaxError`` included,
        propagates to the caller unchanged.
        """
        while True:
            try:
                yield list(self.parse_command())
            except EOFError:
                break
Beispiel #2
0
class BstParser(Scanner):
    """Scanner subclass that reads a BST file command by command.

    ``parse`` yields one list per command: the command name followed by
    one list of literals for every brace group that was found.
    """

    # Token patterns (all regular expressions are raw strings).
    LBRACE = Literal('{')
    RBRACE = Literal('}')
    STRING = Pattern(r'"[^"]*"', 'string')
    INTEGER = Pattern(r'#-?\d+', 'integer')
    NAME = Pattern(r'[^#\"\{\}\s]+', 'name')

    # Known commands (upper case) -> maximum number of brace groups that
    # ``parse_command`` reads after the command name.
    COMMANDS = {
        'ENTRY': 3,
        'EXECUTE': 1,
        'FUNCTION': 2,
        'INTEGERS': 1,
        'ITERATE': 1,
        'MACRO': 2,
        'READ': 0,
        'REVERSE': 1,
        'SORT': 0,
        'STRINGS': 1,
    }

    # Token pattern -> callable that converts the matched text.
    LITERAL_TYPES = {
        STRING: process_string_literal,
        INTEGER: process_int_literal,
        NAME: process_identifier,
    }

    def parse(self):
        """Yield every command as a list until the input is exhausted.

        ``EOFError`` raised while reading a command ends the iteration
        normally.  Any other exception, ``PybtexSyntaxError`` included,
        propagates to the caller unchanged.
        """
        while True:
            try:
                yield list(self.parse_command())
            except EOFError:
                break

    def parse_group(self):
        """Yield the contents of a brace group whose ``{`` is already consumed.

        A nested ``{`` is parsed recursively and yielded as a single
        ``FunctionLiteral``; the matching ``}`` ends the group; every other
        token is converted through ``LITERAL_TYPES``.
        """
        while True:
            token = self.required([self.NAME, self.STRING, self.INTEGER, self.LBRACE, self.RBRACE])
            if token.pattern is self.LBRACE:
                yield FunctionLiteral(list(self.parse_group()))
            elif token.pattern is self.RBRACE:
                break
            else:
                yield self.LITERAL_TYPES[token.pattern](token.value)

    def parse_command(self):
        """Yield the command name, then one list per brace group argument.

        The name is looked up case-insensitively in ``COMMANDS``.

        Raises:
            TokenRequired: the name is not a known BST command.
        """
        command_name = self.required([self.NAME], 'BST command', allow_eof=True).value
        try:
            arity = self.COMMANDS[command_name.upper()]
        except KeyError:
            raise TokenRequired('BST command', self)
        yield command_name
        for _ in range(arity):
            # Fewer groups than the declared arity is tolerated: stop at the
            # first position where no opening brace follows.
            if not self.optional([self.LBRACE]):
                break
            yield list(self.parse_group())
Beispiel #3
0
class LaTeXParser(Scanner):
    """Scanner subclass that turns brace-structured text into a ``Text`` tree."""

    LBRACE = Literal(u'{')
    RBRACE = Literal(u'}')

    def parse(self, level=0):
        """Parse the input and return it wrapped in a ``Text`` object.

        Each ``{...}`` group becomes a nested ``Protected`` object.

        >>> from nose.tools import assert_raises

        >>> LaTeXParser('abc').parse()
        Text('abc')

        >>> LaTeXParser('abc{def}').parse()
        Text('abc', Protected('def'))

        >>> LaTeXParser('abc{def {xyz}} !').parse()
        Text('abc', Protected('def ', Protected('xyz')), ' !')

        >>> assert_raises(PybtexSyntaxError, LaTeXParser('abc{def}}').parse)
        >>> assert_raises(PybtexSyntaxError, LaTeXParser('abc{def}{').parse)
        """
        parts = self.iter_string_parts(level=level)
        return Text(*parts)

    def iter_string_parts(self, level=0):
        """Yield the ``String`` / ``Protected`` pieces of the current group.

        ``level`` is the brace nesting depth of the group being read; 0 is
        the top level.

        Raises:
            PybtexSyntaxError: the input ends inside a group, or a closing
                brace appears at the top level.
        """
        while True:
            found = self.skip_to([self.LBRACE, self.RBRACE])

            if not found:
                # No more braces: emit whatever text is left, then make sure
                # we are not still inside an open group.
                tail = self.get_remainder()
                if tail:
                    yield String(tail)
                if level != 0:
                    raise PybtexSyntaxError('unbalanced braces', self)
                return

            # The last character of the matched value is dropped before it
            # is emitted as text.
            # NOTE(review): presumably that character is the brace itself --
            # confirm against Scanner.skip_to.
            yield String(found.value[:-1])

            if found.pattern is self.LBRACE:
                yield Protected(*self.iter_string_parts(level=level + 1))
                continue

            # Closing brace: it ends the current group, which must exist.
            if level == 0:
                raise PybtexSyntaxError('unbalanced braces', self)
            return
Beispiel #4
0
class LaTeXParser(Scanner):
    """Scanner subclass that turns text with braces and ``$`` math into a tree.

    ``{...}`` groups become ``Protected`` objects and ``$...$`` spans become
    ``Math`` objects; everything else is emitted as ``String`` pieces.
    """

    LBRACE = Literal(u'{')
    RBRACE = Literal(u'}')
    DOLLAR = Literal(u'$')

    def parse(self, level=0):
        """Parse the input and return it wrapped in a ``Text`` object."""
        return Text(*self.iter_string_parts(level=level))

    def iter_string_parts(self, level=0, in_math=False):
        """Yield the pieces of the current group or math span.

        Args:
            level: nesting depth of the construct being read (0 = top level).
            in_math: true while reading the inside of a ``$...$`` span.

        Raises:
            PybtexSyntaxError: the input ends inside an open group or math
                span, or a closing delimiter appears at the top level.
        """
        while True:
            # there will be no Protected inside Math,
            # since we need to preserve all braces
            if in_math:
                token = self.skip_to([self.DOLLAR])
            else:
                token = self.skip_to([self.LBRACE, self.RBRACE, self.DOLLAR])

            if not token:
                remainder = self.get_remainder()
                if remainder:
                    yield String(remainder)
                if level != 0:
                    # Name the construct that was actually left open: an
                    # unterminated ``$`` used to be reported as
                    # "unbalanced braces" even with no brace in the input.
                    message = 'unbalanced math' if in_math else 'unbalanced braces'
                    raise PybtexSyntaxError(message, self)
                break

            # Every delimiter branch first emits the matched value minus its
            # final character.
            # NOTE(review): presumably that character is the delimiter
            # itself -- confirm against Scanner.skip_to.
            yield String(token.value[:-1])

            if token.pattern is self.DOLLAR:
                if in_math:
                    # Closing ``$`` of the span being read.
                    if level == 0:
                        raise PybtexSyntaxError('unbalanced math', self)
                    break
                yield Math(
                    *self.iter_string_parts(level=level + 1, in_math=True))
            elif token.pattern is self.LBRACE:
                yield Protected(*self.iter_string_parts(level=level + 1))
            else:  # token.pattern is self.RBRACE
                if level == 0:
                    raise PybtexSyntaxError('unbalanced braces', self)
                break
Beispiel #5
0
class NameFormatParser(Scanner):
    """Scanner subclass that parses a name format string.

    Top-level text is returned as ``Text`` objects and every top-level
    ``{...}`` group as a ``NamePart`` built from the tuple that
    ``parse_name_part`` returns.
    """

    LBRACE = Literal(u'{')
    RBRACE = Literal(u'}')
    TEXT = Pattern(r'[^{}]+', 'text')
    NON_LETTERS = Pattern(r'[^{}\w]|\d+', 'non-letter characters', flags=re.IGNORECASE | re.UNICODE)
    FORMAT_CHARS = Pattern(r'[^\W\d_]+', 'format chars', flags=re.IGNORECASE | re.UNICODE)

    lineno = None

    def parse(self):
        """Yield top-level items until ``parse_toplevel`` raises ``EOFError``."""
        while True:
            try:
                yield self.parse_toplevel()
            except EOFError:
                return

    def parse_toplevel(self):
        """Read one top-level token and return the matching object.

        Raises:
            UnbalancedBraceError: a closing brace appears at the top level.
        """
        token = self.required([self.TEXT, self.LBRACE, self.RBRACE], allow_eof=True)
        kind = token.pattern
        if kind is self.RBRACE:
            raise UnbalancedBraceError(self)
        if kind is self.LBRACE:
            return NamePart(self.parse_name_part())
        if kind is self.TEXT:
            return Text(token.value)
        return None

    def parse_braced_string(self):
        """Yield the text of a brace group whose ``{`` is already consumed.

        Nested groups are yielded as one string with their braces restored.

        Raises:
            UnbalancedBraceError: the input ends before the closing brace.
        """
        while True:
            try:
                token = self.required([self.TEXT, self.RBRACE, self.LBRACE])
            except PrematureEOF:
                raise UnbalancedBraceError(self)

            kind = token.pattern
            if kind is self.RBRACE:
                return
            if kind is self.TEXT:
                yield token.value
            elif kind is self.LBRACE:
                nested = ''.join(self.parse_braced_string())
                yield u'{{{0}}}'.format(nested)
            else:
                raise ValueError(token)

    def parse_name_part(self):
        """Parse the inside of a top-level group.

        Returns:
            A 4-tuple ``(prefix, format_chars, delimiter, postfix)``.
            ``prefix`` and ``postfix`` are the verbatim text collected before
            and after the format letters; ``format_chars`` and ``delimiter``
            stay ``None`` when absent.

        Raises:
            PybtexSyntaxError: the format letters are not one of f, l, v, j
                (case-insensitive, optionally doubled), or appear twice.
            UnbalancedBraceError: the input ends before the closing brace.
        """
        prefix_parts = []
        postfix_parts = []
        # Verbatim text is collected into the prefix until the format
        # letters are seen, and into the postfix afterwards.
        target = prefix_parts
        format_chars = None
        delimiter = None

        while True:
            try:
                token = self.required([self.LBRACE, self.NON_LETTERS, self.FORMAT_CHARS, self.RBRACE])
            except PrematureEOF:
                raise UnbalancedBraceError(self)

            kind = token.pattern
            if kind is self.RBRACE:
                return ''.join(prefix_parts), format_chars, delimiter, ''.join(postfix_parts)

            if kind is self.LBRACE:
                inner = ''.join(self.parse_braced_string())
                target.append(u'{{{0}}}'.format(inner))
            elif kind is self.FORMAT_CHARS:
                letters = token.value.lower()
                is_legal = (
                    format_chars is None
                    and len(letters) in [1, 2]
                    and letters[0] == letters[-1]
                    and letters[0] in 'flvj'
                )
                if not is_legal:
                    raise PybtexSyntaxError(u'name format string "{0}" has illegal brace-level-1 letters: {1}'.format(self.text, token.value), self)
                format_chars = token.value
                target = postfix_parts
                # A brace group directly after the letters is the delimiter.
                if self.optional([self.LBRACE]):
                    delimiter = ''.join(self.parse_braced_string())
            elif kind is self.NON_LETTERS:
                target.append(token.value)
            else:
                raise ValueError(token)

    def eat_whitespace(self):
        """Do nothing.

        NOTE(review): presumably overrides a Scanner hook so that whitespace
        is not skipped between tokens -- confirm against Scanner.
        """
        pass
Beispiel #6
0
class LowLevelParser(Scanner):
    """Scanner subclass that iterates over the ``@`` commands of a BibTeX text.

    Iterating over an instance yields one tuple per command:

    * ``(command, (macro_name, value_parts))`` for ``@string``
    * ``(command, (value_parts,))`` for ``@preamble``
    * ``(command, (entry_key, fields))`` for every other command, where
      ``fields`` is a list of ``(field_name, value_parts)`` pairs

    ``@comment`` commands, and entries rejected by ``want_entry``, are
    skipped.
    """

    NAME = Pattern(
        r'[{0}][{1}]*'.format(re.escape(NAME_CHARS),
                              re.escape(NAME_CHARS + digits)), 'a valid name')
    KEY_PAREN = Pattern(r'[^\s\,]+', 'entry key')
    KEY_BRACE = Pattern(r'[^\s\,}]+', 'entry key')
    NUMBER = Pattern(r'[{0}]+'.format(digits), 'a number')
    LBRACE = Literal(u'{')
    RBRACE = Literal(u'}')
    LPAREN = Literal(u'(')
    RPAREN = Literal(u')')
    QUOTE = Literal(u'"')
    COMMA = Literal(u',')
    EQUALS = Literal(u'=')
    HASH = Literal(u'#')
    AT = Literal(u'@')

    # Per-command parsing state, reset by parse_command().
    command_start = None
    current_command = None  # not used by this class; kept for compatibility
    current_entry_key = None
    current_fields = None
    current_field_name = None
    current_field_value = None  # not used by this class; kept for compatibility
    current_value = None

    def __init__(self,
                 text,
                 keyless_entries=False,
                 macros=month_names,
                 handle_error=None,
                 want_entry=None,
                 filename=None):
        """Create a parser for ``text``.

        Args:
            text: the text to parse (passed on to ``Scanner``).
            keyless_entries: when true, entries are parsed without a key.
            macros: mapping of macro name to replacement string.  ``@string``
                commands add to it.  A mapping passed by the caller is
                updated in place; the shared default is copied first.
            handle_error: optional callable taking the error; replaces the
                default ``handle_error`` method, which raises it.
            want_entry: optional callable taking an entry key; replaces the
                default ``want_entry`` method, which accepts every key.
            filename: passed on to ``Scanner``.
        """
        super(LowLevelParser, self).__init__(text, filename)
        self.keyless_entries = keyless_entries
        if macros is month_names:
            # parse_string_body() writes into self.macros.  Without a copy,
            # @string definitions would be stored in the module-level
            # default and leak into every later parser instance.
            import copy
            macros = copy.copy(macros)
        self.macros = macros
        if handle_error:
            self.handle_error = handle_error
        if want_entry:
            self.want_entry = want_entry

    def __iter__(self):
        """Return the generator created by ``parse_bibliography``."""
        return self.parse_bibliography()

    def get_error_context_info(self):
        """Return ``(command_start, lineno, pos)`` for the current position."""
        return self.command_start, self.lineno, self.pos

    def get_error_context(self, context_info):
        """Return ``(context, lineno, colno)`` for a saved error position.

        ``context`` is the text from the start of the command up to the end
        of the line holding the error; ``colno`` is the length of the last
        line of the text that precedes the error.
        """
        error_start, lineno, error_pos = context_info
        before_error = self.text[error_start:error_pos]
        if not before_error.endswith('\n'):
            eol = self.NEWLINE.search(self.text, error_pos)
            error_end = eol.end() if eol else self.end_pos
        else:
            error_end = error_pos
        context = self.text[error_start:error_end].rstrip('\r\n')
        # splitlines() is empty when nothing precedes the error; report
        # column 0 instead of failing with IndexError.
        lines_before = before_error.splitlines()
        colno = len(lines_before[-1]) if lines_before else 0
        return context, lineno, colno

    def handle_error(self, error):
        """Default error handler: raise ``error``."""
        raise error

    def want_entry(self, key):
        """Default entry filter: accept every key."""
        return True

    def want_current_entry(self):
        """Return true if the current entry has no key yet or is wanted."""
        return self.current_entry_key is None or self.want_entry(
            self.current_entry_key)

    def parse_bibliography(self):
        """Yield one tuple per ``@`` command until no ``@`` is left.

        Syntax errors go to ``handle_error``; if it returns, parsing goes on
        with the next ``@``.  ``SkipEntry`` silently drops the command.
        """
        while True:
            if not self.skip_to([self.AT]):
                return
            # Error context starts at the character just before the current
            # position.
            # NOTE(review): presumably the '@' itself -- confirm against
            # Scanner.skip_to.
            self.command_start = self.pos - 1
            try:
                yield tuple(self.parse_command())
            except PybtexSyntaxError as error:
                self.handle_error(error)
            except SkipEntry:
                pass

    def parse_command(self):
        """Parse the command after an ``@`` and return its result tuple.

        Raises:
            SkipEntry: the command is ``@comment``, or the entry is not
                wanted.
        """
        self.current_entry_key = None
        self.current_fields = []
        self.current_field_name = None
        self.current_value = []

        name = self.required([self.NAME])
        command = name.value
        body_start = self.required([self.LPAREN, self.LBRACE])
        body_end = self.RBRACE if body_start.pattern == self.LBRACE else self.RPAREN

        # The result is built lazily so that it reflects the parser state
        # after the body has been parsed (or after a handled error).
        command_lower = command.lower()
        if command_lower == 'string':
            parse_body = self.parse_string_body
            make_result = lambda: (
                command, (self.current_field_name, self.current_value))
        elif command_lower == 'preamble':
            parse_body = self.parse_preamble_body
            make_result = lambda: (command, (self.current_value, ))
        elif command_lower == 'comment':
            raise SkipEntry
        else:
            parse_body = self.parse_entry_body
            make_result = lambda: (
                command, (self.current_entry_key, self.current_fields))
        try:
            parse_body(body_end)
            self.required([body_end])
        except PybtexSyntaxError as error:
            self.handle_error(error)
        return make_result()

    def parse_preamble_body(self, body_end):
        """Parse the value of a ``@preamble`` into ``current_value``."""
        self.parse_value()

    def parse_string_body(self, body_end):
        """Parse ``name = value`` of a ``@string`` and store the macro."""
        self.current_field_name = self.required([self.NAME]).value
        self.required([self.EQUALS])
        self.parse_value()
        self.macros[self.current_field_name] = ''.join(self.current_value)

    def parse_entry_body(self, body_end):
        """Parse the key (unless keyless) and the fields of an entry.

        Raises:
            SkipEntry: the entry is rejected by ``want_entry``.
        """
        if not self.keyless_entries:
            # A key inside braces may not contain '}'; one inside parentheses
            # may.
            key_pattern = self.KEY_PAREN if body_end == self.RPAREN else self.KEY_BRACE
            self.current_entry_key = self.required([key_pattern]).value
        self.parse_entry_fields()
        if not self.want_current_entry():
            raise SkipEntry

    def parse_entry_fields(self):
        """Parse comma-separated fields into ``current_fields``.

        Fields without a name or with an empty value are not recorded.
        """
        while True:
            self.current_field_name = None
            self.current_value = []
            self.parse_field()
            if self.current_field_name and self.current_value:
                self.current_fields.append(
                    (self.current_field_name, self.current_value))
            comma = self.optional([self.COMMA])
            if not comma:
                return

    def parse_field(self):
        """Parse one optional ``name = value`` field."""
        name = self.optional([self.NAME])
        if not name:
            return
        self.current_field_name = name.value
        self.required([self.EQUALS])
        self.parse_value()

    def parse_value(self):
        """Parse ``part (# part)*`` and store the parts in ``current_value``."""
        start = True
        concatenation = False
        value_parts = []
        while True:
            if not start:
                concatenation = self.optional([self.HASH])
            if not (start or concatenation):
                break
            value_parts.append(self.parse_value_part())
            start = False
        self.current_value = value_parts

    def parse_value_part(self):
        """Parse a quoted string, braced string, number or macro name."""
        token = self.required(
            [self.QUOTE, self.LBRACE, self.NUMBER, self.NAME],
            description='field value',
        )
        if token.pattern is self.QUOTE:
            value_part = self.flatten_string(
                self.parse_string(string_end=self.QUOTE))
        elif token.pattern is self.LBRACE:
            value_part = self.flatten_string(
                self.parse_string(string_end=self.RBRACE))
        elif token.pattern is self.NUMBER:
            value_part = token.value
        else:
            value_part = self.substitute_macro(token.value)
        return value_part

    def flatten_string(self, parts):
        """Join the values of ``parts`` and drop the final character.

        NOTE(review): presumably the dropped character is the closing
        delimiter carried by the last part -- confirm against
        Scanner.skip_to.
        """
        return ''.join(part.value for part in parts)[:-1]

    def substitute_macro(self, name):
        """Return the replacement text for macro ``name``.

        An unknown macro is reported through ``handle_error`` (only when the
        current entry is wanted) and replaced by an empty string.
        """
        try:
            return self.macros[name]
        except KeyError:
            if self.want_current_entry():
                self.handle_error(UndefinedMacro(name, self))
            return ''

    def parse_string(self, string_end, level=0, max_level=100):
        """Yield the tokens of a string up to and including ``string_end``.

        Args:
            string_end: the ``QUOTE`` or ``RBRACE`` pattern ending the string.
            level: current brace nesting depth inside the string.
            max_level: maximum nesting depth; now enforced at every depth.

        Raises:
            PybtexSyntaxError: nesting exceeds ``max_level``, or a closing
                brace appears at level 0 of a quoted string.
            PrematureEOF: the input ends before ``string_end``.
        """
        if level > max_level:
            raise PybtexSyntaxError('too many nested braces', self)

        special_chars = [self.RBRACE, self.LBRACE]
        if string_end is self.QUOTE:
            # Quotes are only looked for when the string is quote-delimited.
            special_chars = [self.QUOTE] + special_chars
        while True:
            part = self.skip_to(special_chars)
            if not part:
                raise PrematureEOF(self)
            if part.pattern is string_end:
                yield part
                break
            elif part.pattern is self.LBRACE:
                yield part
                # Forward max_level so a caller-supplied limit also applies
                # to nested groups (it used to fall back to the default).
                for subpart in self.parse_string(self.RBRACE, level + 1,
                                                 max_level):
                    yield subpart
            elif part.pattern is self.RBRACE and level == 0:
                raise PybtexSyntaxError('unbalanced braces', self)
Beispiel #7
0
class BibTeXEntryIterator(Scanner):
    """Scanner subclass that iterates over the ``@`` commands of a BibTeX text.

    The body-parsing methods that ``parse_command`` dispatches to
    (``parse_string_body``, ``parse_preamble_body``, ``parse_entry_body``)
    are not part of this excerpt.
    """

    # Regular expressions are spelled as ``u''`` literals with doubled
    # backslashes instead of ``ur''``: the ``ur`` prefix is a syntax error
    # on Python 3, while this form yields exactly the same unicode pattern
    # text on both Python 2 and Python 3.
    NAME = Pattern(
        u'[{0}][{1}]*'.format(re.escape(NAME_CHARS),
                              re.escape(NAME_CHARS + digits)), 'a valid name')
    KEY_PAREN = Pattern(u'[^\\s\\,]+', 'entry key')
    KEY_BRACE = Pattern(u'[^\\s\\,}]+', 'entry key')
    NUMBER = Pattern(u'[{0}]+'.format(digits), 'a number')
    LBRACE = Literal(u'{')
    RBRACE = Literal(u'}')
    LPAREN = Literal(u'(')
    RPAREN = Literal(u')')
    QUOTE = Literal(u'"')
    COMMA = Literal(u',')
    EQUALS = Literal(u'=')
    HASH = Literal(u'#')
    AT = Literal(u'@')

    # Per-command parsing state, reset by parse_command().
    command_start = None
    current_command = None
    current_entry_key = None
    current_fields = None
    current_field_name = None
    current_field_value = None

    def __init__(self,
                 text,
                 keyless_entries=False,
                 macros=month_names,
                 handle_error=None,
                 want_entry=None,
                 filename=None):
        """Create an iterator over the commands in ``text``.

        Args:
            text: the text to parse (passed on to ``Scanner``).
            keyless_entries: stored as ``self.keyless_entries``.
            macros: stored as ``self.macros`` without copying.
            handle_error: optional callable taking the error; replaces the
                default ``handle_error`` method, which raises it.
            want_entry: optional callable taking an entry key; replaces the
                default ``want_entry`` method, which accepts every key.
            filename: passed on to ``Scanner``.
        """
        super(BibTeXEntryIterator, self).__init__(text, filename)
        self.keyless_entries = keyless_entries
        # NOTE(review): the default is a shared module-level object; if any
        # method outside this excerpt writes to self.macros, those writes
        # leak between instances -- confirm and copy if so.
        self.macros = macros
        if handle_error:
            self.handle_error = handle_error
        if want_entry:
            self.want_entry = want_entry

    def __iter__(self):
        """Return the generator created by ``parse_bibliography``."""
        return self.parse_bibliography()

    def get_error_context_info(self):
        """Return ``(command_start, lineno, pos)`` for the current position."""
        return self.command_start, self.lineno, self.pos

    def get_error_context(self, context_info):
        """Return ``(context, lineno, colno)`` for a saved error position.

        ``context`` is the text from the start of the command up to the end
        of the line holding the error; ``colno`` is the length of the last
        line of the text that precedes the error.
        """
        error_start, lineno, error_pos = context_info
        before_error = self.text[error_start:error_pos]
        if not before_error.endswith('\n'):
            eol = self.NEWLINE.search(self.text, error_pos)
            error_end = eol.end() if eol else self.end_pos
        else:
            error_end = error_pos
        context = self.text[error_start:error_end].rstrip('\r\n')
        # splitlines() is empty when nothing precedes the error; report
        # column 0 instead of failing with IndexError.
        lines_before = before_error.splitlines()
        colno = len(lines_before[-1]) if lines_before else 0
        return context, lineno, colno

    def handle_error(self, error):
        """Default error handler: raise ``error``."""
        raise error

    def want_entry(self, key):
        """Default entry filter: accept every key."""
        return True

    def want_current_entry(self):
        """Return true if the current entry has no key yet or is wanted."""
        return self.current_entry_key is None or self.want_entry(
            self.current_entry_key)

    def parse_bibliography(self):
        """Yield one tuple per ``@`` command until no ``@`` is left.

        Syntax errors go to ``handle_error``; if it returns, parsing goes on
        with the next ``@``.  ``SkipEntry`` silently drops the command.
        """
        while True:
            if not self.skip_to([self.AT]):
                return
            # Error context starts at the character just before the current
            # position.
            # NOTE(review): presumably the '@' itself -- confirm against
            # Scanner.skip_to.
            self.command_start = self.pos - 1
            try:
                yield tuple(self.parse_command())
            except PybtexSyntaxError as error:
                self.handle_error(error)
            except SkipEntry:
                pass

    def parse_command(self):
        """Parse the command after an ``@`` and return its result tuple.

        The result is ``(command, (macro_name, value))`` for ``@string``,
        ``(command, (value,))`` for ``@preamble`` and
        ``(command, (entry_key, fields))`` for every other command.

        Raises:
            SkipEntry: the command is ``@comment``.
        """
        self.current_entry_key = None
        self.current_fields = []
        self.current_field_name = None
        self.current_value = []

        name = self.required([self.NAME])
        command = name.value
        body_start = self.required([self.LPAREN, self.LBRACE])
        body_end = self.RBRACE if body_start.pattern == self.LBRACE else self.RPAREN

        # The result is built lazily so that it reflects the parser state
        # after the body has been parsed (or after a handled error).
        command_lower = command.lower()
        if command_lower == 'string':
            parse_body = self.parse_string_body
            make_result = lambda: (
                command, (self.current_field_name, self.current_value))
        elif command_lower == 'preamble':
            parse_body = self.parse_preamble_body
            make_result = lambda: (command, (self.current_value, ))
        elif command_lower == 'comment':
            raise SkipEntry
        else:
            parse_body = self.parse_entry_body
            make_result = lambda: (
                command, (self.current_entry_key, self.current_fields))
        try:
            parse_body(body_end)
            self.required([body_end])
        except PybtexSyntaxError as error:
            # "except X as name" is valid on Python 2.6+ and Python 3,
            # unlike the old comma form.
            self.handle_error(error)
        return make_result()