Esempio n. 1
0
class Expando(List):
    """Bracketed numeric range, e.g. ``[1-5]`` or ``[1:5]``.

    Captures the ``begin`` and ``end`` numbers around a '-' or ':' separator.
    """

    grammar = (
        '[',
        attr('begin', re.compile(r'\d+')),
        ['-', ':'],
        attr('end', re.compile(r'\d+')),
        ']',
    )

    def _build(self, rr):
        # Copy every parsed element onto the builder's expando list.
        for item in self:
            rr._expandos.append(item)
Esempio n. 2
0
class LessEqualOp(UnaryRule):
    """Less than or Equal to operator.

    Supports queries like date <= 10-2000 or author-count 100-.
    """

    grammar = [
        # Explicit "<=" operator (discarded) followed by the operand value.
        (omit(Literal("<=")), attr('op', SimpleValue)),
        # A number, or numbers joined by '/' or '-', followed by a trailing '-'
        # that must precede whitespace, ')' or end of input — this rejects
        # values such as "1-e".
        (attr('op', re.compile(r"\d+([/-]\d+)*(?=-)")), omit(re.compile(r'-(?=\s|\)|$)'))),
    ]
class LessEqualOp(UnaryRule):
    """Less than or Equal to operator.

    Supports queries like date <= 10-2000 or author-count 100-.
    """

    grammar = [
        # Explicit "<=" operator (discarded) followed by the operand value.
        (omit(Literal("<=")), attr('op', SimpleValue)),
        # A plain number followed by a trailing '-' that must precede
        # whitespace, ')' or end of input — rejects values such as "1-e".
        (attr('op', re.compile(r"\d+")), omit(re.compile(r'-(?=\s|\)|$)'))),
        # Anything without whitespace/(/)/':' followed by " -" or "-",
        # then a '+' that must precede whitespace, ')' or end of input.
        (attr('op', re.compile(r"[^\s():]+(?=( -|-))")),
         omit(re.compile(r'\+(?=\s|\)|$)'))),
    ]
class And(CIKeyword):
    """Conjunction keyword of the DSL ("and", "+" or "&").

    Defined as an Enum grammar of Keywords so that each variant is registered
    in Keyword.table, which is used to check whether terminal symbols are
    actually DSL keywords.
    """

    # Case-insensitive textual forms of the AND operator.
    regex = re.compile(r"(and|\+|&)", re.IGNORECASE)
    grammar = Enum(K("and"), K("+"), K("&"))
Esempio n. 5
0
class Not(CIKeyword):
    """Negation keyword of the DSL ("not" or "-").

    Defined as an Enum grammar of Keywords so that each variant is registered
    in Keyword.table, which is used to check whether terminal symbols are
    actually DSL keywords.
    """

    # Case-insensitive textual forms of the NOT operator.
    regex = re.compile(r"(not|-)", re.IGNORECASE)
    grammar = Enum(K("not"), K("-"))
Esempio n. 6
0
class Operator(Keyword):
    """
    Operator in transaction output condition ("&&", "||", "AND", "OR").
    """

    grammar = Enum(K("&&"), K("||"), K("AND"), K("OR"))
    # The original pattern r"[&&|\|\||\w]+" was a character class with
    # duplicated members; a class matches single characters, so it reduces to
    # the equivalent, clearer form below: runs of '&', '|' or word characters,
    # which covers "&&", "||", "AND" and "OR".
    regex = re.compile(r"[&|\w]+")

    @classmethod
    def token(cls: Type[OperatorType], keyword: str) -> OperatorType:
        """
        Return Operator instance from keyword

        :param keyword: Operator keyword in expression
        :return: Operator instance built from the keyword
        """
        return cls(keyword)

    def compose(
        self, parser: Any = None, grammar: Any = None, attr_of: Any = None
    ) -> str:
        """
        Return the Operator keyword as string format

        :param parser: Parser instance
        :param grammar: Grammar
        :param attr_of: Attribute of...
        """
        return "{0}".format(self.name)
Esempio n. 7
0
class GreaterThanOp(UnaryRule):
    """Greater than operator.

    Supports queries like author-count > 2000 or date after 10-2000.
    """

    # Discard the operator token itself and capture only the operand value.
    grammar = (
        omit(re.compile(r"after|>", re.IGNORECASE)),
        attr('op', SimpleValue),
    )
Esempio n. 8
0
class LessThanOp(UnaryRule):
    """Less than operator.

    Supports queries like author-count < 100 or date before 1984.
    """

    # Discard the operator token itself and capture only the operand value.
    grammar = (
        omit(re.compile(r"before|<", re.IGNORECASE)),
        attr('op', SimpleValue),
    )
Esempio n. 9
0
class Or(CIKeyword):
    """Disjunction keyword of the DSL ("or" or "|").

    Defined as an Enum grammar of Keywords so that each variant is registered
    in Keyword.table, which is used to check whether terminal symbols are
    actually DSL keywords.
    """

    regex = re.compile(r"(or|\|)", re.IGNORECASE)
    grammar = Enum(K("or"), K("|"))

    def __init__(self, *args):
        # Whichever OR variant was matched is normalized to the canonical
        # BooleanOperator.OR; the parsed keyword arguments are ignored.
        super(Or, self).__init__(BooleanOperator.OR)
Esempio n. 10
0
class Query(ListRule):
    """The entry-point for the grammar.

    Find keyword is ignored as the current grammar is an augmentation of SPIRES and Invenio style syntaxes.
    It only serves for backward compatibility with SPIRES syntax.
    """

    grammar = [
        # Optional leading "find" keyword (or an abbreviation), discarded,
        # then a Statement optionally followed by unrecognized trailing words.
        (omit(optional(re.compile(r"(find|fin|fi|f)\s", re.IGNORECASE))),
         (Statement, maybe_some(MalformedQueryWords))),
        # Fallbacks: a query made only of unrecognized words, or nothing.
        MalformedQueryWords,
        EmptyQuery,
    ]
class ComplexValue(LeafRule):
    """Value wrapped in single quotes, double quotes, or slashes (regex).

    Each wrapping carries a different meaning for later parsing phases:
      * Single quotes: partial text matching (text is analyzed before searched)
      * Double quotes: exact text matching
      * Regex (/.../): regex searches

    E.g. t 'Millisecond pulsar velocities'.

    At this stage the three forms are parsed identically; they are only
    distinguished in a later parsing phase.
    """

    # Non-greedy match of a /…/-, '…'- or "…"-delimited token.
    regex = re.compile(r"((/.+?/)|('.*?')|(\".*?\"))")
    grammar = attr('value', regex)
Esempio n. 12
0
class InvenioKeywordQuery(BinaryRule):
    """Keyword queries with colon separator (i.e. Invenio style).

    There needs to be a distinction between Invenio and SPIRES keyword queries, so as the parser is able to recognize
    any terminal as keyword for the former ones.

    Note:
        "arxiv:arxiv_identifier" should be excluded from the generic keyword pattern as it is a special case of
        SimpleValue, since it contains ":".
    E.g. author: ellis, title: boson, or unknown_keyword: foo.
    """

    grammar = (
        # A known INSPIRE keyword, or any colon-free terminal not starting
        # with "arxiv".
        attr('left', [InspireKeyword, re.compile(r"(?!arxiv)[^\s:]+")]),
        omit(':'),
        attr('right', Value),
    )
Esempio n. 13
0
class InspireKeyword(LeafRule):
    """Leaf rule for a recognized INSPIRE keyword.

    A word boundary is expected at the keyword's end, excluding the [.,]
    characters, since these might signify names.
    """

    grammar = re.compile(
        r"({0})(?![,.])(?=(:|\b))".format("|".join(
            INSPIRE_PARSER_KEYWORDS.keys())), re.IGNORECASE)

    def __init__(self, value):
        # Normalize the matched (possibly abbreviated) keyword to its
        # canonical INSPIRE name.
        self.value = INSPIRE_PARSER_KEYWORDS[value.lower()]

    @classmethod
    def parse(cls, parser, text, pos):
        """Parse an InspireKeyword; on failure return the SyntaxError instead."""
        try:
            rest, keyword = parser.parse(text, cls.grammar)
            return rest, InspireKeyword(keyword)
        except SyntaxError as error:
            return text, error
Esempio n. 14
0
class InspireKeyword(LeafRule):
    # InspireKeyword expects a word boundary at its end, excluding [.,] characters, since these might signify names.
    grammar = re.compile(r"({0})(?![,.])(?=(:|\b))".format("|".join(INSPIRE_PARSER_KEYWORDS.keys())), re.IGNORECASE)

    def __init__(self, value):
        # Normalize the matched (possibly abbreviated) keyword to its canonical INSPIRE name.
        self.value = INSPIRE_PARSER_KEYWORDS[value.lower()]

    @classmethod
    def parse(cls, parser, text, pos):
        """Parse InspireKeyword.

        If the keyword is `texkey`, enable the parsing texkey expression flag, since its value contains ':' which
        normally isn't allowed.
        """
        try:
            remaining_text, keyword = parser.parse(text, cls.grammar)
            if keyword.lower() == 'texkey':
                # texkey values embed ':', which value parsing otherwise rejects.
                parser._parsing_texkey_expression = True
            return remaining_text, InspireKeyword(keyword)
        except SyntaxError as e:
            # NOTE(review): the flag is only cleared on failure; a successful
            # non-texkey parse leaves it untouched — confirm the caller resets it.
            parser._parsing_texkey_expression = False
            return text, e
Esempio n. 15
0
# -*- coding: utf-8 -*-
from __future__ import unicode_literals, print_function

from pypeg2 import Symbol, Enum, List, K
from pypeg2 import attr, re, some, maybe_some, optional

# Symbols may contain word characters plus '&' and '-'.
Symbol.regex = re.compile(r'[\w\&\-]+')


class Operator(Symbol):
    """Operator symbol in a range expression: '&' or '-'."""

    grammar = Enum(K("&"), K("-"))

    def _build(self, rr):
        # Record this operator and mark it as the next one to apply.
        rr._ops.append(self[0])
        rr._nextop = self[0]


class Expando(List):
    # Bracketed numeric range, e.g. "[1-5]" or "[1:5]"; captures begin/end.
    grammar = '[', attr('begin', re.compile(r'\d+')), ['-', ':'], \
        attr('end', re.compile(r'\d+')), ']'

    def _build(self, rr):
        # Copy every parsed element onto the builder's expando list.
        for e in self:
            rr._expandos.append(e)
        return


class StringPart(str):
    # Literal text fragment: lowercase letters, digits, '-', '_' and '.'.
    grammar = attr('part', re.compile(r'[\-_a-z0-9\.]+'))
Esempio n. 16
0
class StringPart(str):
    """Literal text fragment made of lowercase letters, digits, '-', '_' and '.'."""

    grammar = attr('part', re.compile(r'[\-_a-z0-9\.]+'))

    def _build(self, rr):
        # Accumulate the matched fragment on the builder.
        rr._strings.append(self[0])
Esempio n. 17
0
class MalformedQueryWords(ListRule):
    """Represents queries that weren't recognized by the main parsing branch of Statements."""

    # One or more runs of non-whitespace characters.
    grammar = some(re.compile(r"[^\s]+", re.UNICODE))

    def __init__(self, children):
        self.children = children
Esempio n. 18
0
class Pubkey(str):
    """Public key token in a transaction output condition."""

    # Compiled from the module-level PUBKEY_REGEX pattern.
    regex = re.compile(PUBKEY_REGEX)
Esempio n. 19
0
class Pattern(str):
    """A token wholly delimited by forward slashes, e.g. ``/abc/``."""

    # The entire token must start and end with '/'.
    grammar = re.compile(r'^\/.*\/$')

    def _build(self, rr):
        # NOTE(review): self[0] on a str subclass is its first character —
        # presumably relies on how pypeg2 stores the match; confirm upstream.
        rr._patterns.append(self[0])
Esempio n. 20
0
class SimpleValueWithColonUnit(SimpleValueUnit):
    # Like SimpleValueUnit.token_regex, but ':' is allowed in all positions
    # except the last character — so tokens containing a colon are accepted.
    token_regex = re.compile(r"[^\s)(]+[^\s:)(]", re.UNICODE)
Esempio n. 21
0
class SimpleRangeValue(LeafRule):
    # Any character except whitespace/parens/'-', or runs of '-' not followed
    # by '>' — so the range arrow "->" terminates the value.
    grammar = attr('value', re.compile(r"([^\s)(-]|-+[^\s)(>])+"))
Esempio n. 22
0
class Hash(str):
    """Hash token in a transaction output condition."""

    # Compiled from the module-level HASH_REGEX pattern.
    regex = re.compile(HASH_REGEX)
Esempio n. 23
0
class SimpleValueUnit(LeafRule):
    """Represents either a terminal symbol (without parentheses) or a parenthesized SimpleValue.

    The parenthesized case (2nd option of SimpleValueUnit) accepts a SimpleValue which is the more generic case of
    plaintext and in turn (its grammar) encapsulates whitespace and SimpleValueUnit recognition.

    """
    # A terminal token: one or more characters that aren't whitespace, ':' or parentheses.
    token_regex = re.compile(r"[^\s:)(]+", re.UNICODE)

    arxiv_token_regex = re.compile(r"(arxiv:)(" + token_regex.pattern + ")",
                                   re.IGNORECASE)
    """Arxiv identifiers are special cases of tokens where the ":" symbol is allowed."""

    # A date-specifier word followed by '-' and a number, whitespace allowed around '-'.
    date_specifiers_regex = re.compile(
        r"({})\s*-\s*\d+".format('|'.join(DATE_SPECIFIERS_COLLECTION)),
        re.UNICODE)

    parenthesized_token_grammar = None  # is set after SimpleValue definition.

    starts_with_colon = re.compile(r"\s*:", re.UNICODE)
    """Used for recognizing whether terminal token is a keyword (i.e. followed by some whitespace and ":"."""
    def __init__(self, args):
        # args is either the matched string (regex options of the grammar) or
        # a 3-element sequence: open paren, SimpleValue node, close paren.
        super(SimpleValueUnit, self).__init__()
        if isinstance(args, six.string_types):
            # Value was recognized by the 1st option of the list grammar (regex)
            self.value = args
        else:
            # Value was recognized by the 2nd option of the list grammar
            self.value = args[0] + args[1].value + args[2]

    @classmethod
    def parse_terminal_token(cls, parser, text):
        """Parses a terminal token that doesn't contain parentheses nor colon symbol.

        Note:
            Handles a special case of tokens where a ':' is needed (for `texkey` queries).

            If we're parsing text not in parentheses, then some DSL keywords (e.g. And, Or, Not, defined above) should
            not be recognized as terminals, thus we check if they are in the Keywords table (namespace like structure
            handled by PyPeg).
            This is done only when we are not parsing a parenthesized SimpleValue.

            Also, helps in supporting more implicit-and queries cases (last two checks).

        Returns:
            ``(remaining_text, matched_token)`` on success, or
            ``(text, SyntaxError)`` when no acceptable token is found.
        """
        token_regex = cls.token_regex

        match = token_regex.match(text)
        if match:
            matched_token = match.group(0)

            # Check if token is a DSL keyword. Disable this check in the case where the parser isn't parsing a
            # parenthesized terminal.
            if not parser._parsing_parenthesized_terminal and matched_token.lower(
            ) in Keyword.table:
                return text, SyntaxError("found DSL keyword: " + matched_token)

            remaining_text = text[len(matched_token):]

            # Attempt to recognize whether current terminal is followed by a ":", which definitely signifies that
            # we are parsing a keyword, and we shouldn't.
            if cls.starts_with_colon.match(remaining_text):
                return text, \
                       SyntaxError("parsing a keyword (token followed by \":\"): \"" + repr(matched_token) + "\"")

            # Attempt to recognize whether current terminal is a non shortened version of Inspire keywords. This is
            # done for supporting implicit-and in case of SPIRES style keyword queries. Using the non shortened version
            # of the keywords, makes this recognition not eager.
            if not parser._parsing_parenthesized_simple_values_expression \
                    and matched_token in INSPIRE_KEYWORDS_SET:
                return text, SyntaxError(
                    "parsing a keyword (non shortened INSPIRE keyword)")

            result = remaining_text, matched_token
        else:
            result = text, SyntaxError("expecting match on " +
                                       repr(cls.token_regex.pattern))
        return result

    @classmethod
    def parse(cls, parser, text, pos):
        """Imitates parsing a list grammar.

        Specifically, this
        grammar = [
            SimpleValueUnit.date_specifiers_regex,
            SimpleValueUnit.arxiv_token_regex,
            SimpleValueUnit.token_regex,
            SimpleValueUnit.parenthesized_token_grammar
        ].

        Parses plaintext which matches date specifiers or arxiv_identifier syntax, or is comprised of either 1) simple
        terminal (no parentheses) or 2) a parenthesized SimpleValue.

        For example, "e(+)" will be parsed in two steps, first, "e" token will be recognized and then "(+)", as a
        parenthesized SimpleValue.
        """
        found = False

        # Attempt to parse date specifier
        match = cls.date_specifiers_regex.match(text)
        if match:
            remaining_text, token, found = text[len(match.group(0)
                                                    ):], match.group(0), True
        else:
            # Attempt to parse arxiv identifier
            match = cls.arxiv_token_regex.match(text)
            if match:
                # Only group(2) (the identifier) is kept; the "arxiv:" prefix is dropped.
                remaining_text, token, found = text[len(match.group(
                )):], match.group(2), True
            else:
                # Attempt to parse a terminal token
                remaining_text, token = cls.parse_terminal_token(parser, text)
                if type(token) != SyntaxError:
                    found = True
                else:
                    # Attempt to parse a terminal with parentheses
                    try:
                        # Enable parsing a parenthesized terminal so that we can accept {+, -, |} as terminals.
                        parser._parsing_parenthesized_terminal = True
                        remaining_text, token = parser.parse(
                            text, cls.parenthesized_token_grammar, pos)

                        found = True
                    except SyntaxError:
                        pass
                    except GrammarValueError:
                        raise
                    except ValueError:
                        pass
                    finally:
                        # Always restore the parser flag, whatever the outcome.
                        parser._parsing_parenthesized_terminal = False

        if found:
            result = remaining_text, cls(token)
        else:
            result = text, SyntaxError("expecting match on " + cls.__name__)

        return result
Esempio n. 24
0
class Int(str):
    """Integer literal in a transaction output condition."""

    # One or more decimal digits.
    regex = re.compile(r"[0-9]+")
class InspireKeyword(LeafRule):
    """Leaf rule matching an INSPIRE keyword that is followed by ':' or whitespace."""

    grammar = re.compile(r"({0})(?=(:|\s))".format("|".join(
        INSPIRE_PARSER_KEYWORDS.keys())))

    def __init__(self, value):
        # Map the matched keyword to its canonical INSPIRE name.
        self.value = INSPIRE_PARSER_KEYWORDS[value]
Esempio n. 26
0
            else:
                result = text[len(match.group(0)):], cls(match.group(0))
        else:
            result = text, SyntaxError("expecting " + repr(cls.__name__))
        return result

    def __str__(self):
        # The human-readable form of a keyword is just its name.
        return self.name

    def __repr__(self):
        """Return a constructor-style representation, e.g. ``And()``."""
        return "{0}()".format(self.__class__.__name__)


# Short alias used by the keyword grammar classes above/below.
CIKeyword = CaseInsensitiveKeyword

# Raw string so "\w" is a regex escape rather than an invalid Python string
# escape (non-raw "\w" raises DeprecationWarning/SyntaxWarning on modern
# Pythons); the compiled pattern is unchanged.
u_word = re.compile(r"\w+", re.UNICODE)
# ########################


class BooleanOperator(object):
    """Namespace enumerating the possible boolean operator cases."""

    AND = 'and'
    OR = 'or'


class LeafRule(ast.Leaf):
    """Base class for leaf (terminal) grammar rules.

    :param value: the matched terminal text; when ``None``, initialization of
        the underlying Leaf is skipped (some subclasses set ``value``
        themselves).
    """

    def __init__(self, value=None):
        # Explicit None check: falsy-but-valid values such as "" or 0 must
        # still initialize the underlying Leaf node (a bare truthiness test
        # would silently skip them).
        if value is not None:
            super(LeafRule, self).__init__(value)

class MalformedQueryText(LeafRule):
    """Represents queries that weren't recognized by the main parsing branch of Statements."""

    # One or more runs of non-whitespace characters.
    grammar = some(re.compile(r"[^\s]+", re.UNICODE))

    def __init__(self, values):
        # Join the unrecognized tokens back into one space-separated string
        # (the intermediate list copy in the original was redundant).
        self.value = ' '.join(values)
Esempio n. 28
0
File: peg.py Project: nrh/pyrange
# -*- coding: utf-8 -*-
from __future__ import unicode_literals, print_function

from pypeg2 import Symbol, Enum, List, K
from pypeg2 import attr, re, some, maybe_some, optional

# Symbols may contain word characters plus '&' and '-'.
Symbol.regex = re.compile(r"[\w\&\-]+")


class Operator(Symbol):
    # Range-expression operator symbol: '&' or '-'.
    grammar = Enum(K("&"), K("-"))

    def _build(self, rr):
        # Record the operator and remember it as the next one to apply.
        rr._ops.append(self[0])
        rr._nextop = self[0]
        return


class Expando(List):
    # Bracketed numeric range, e.g. "[1-5]" or "[1:5]"; captures begin/end.
    grammar = "[", attr("begin", re.compile(r"\d+")), ["-", ":"], attr("end", re.compile(r"\d+")), "]"

    def _build(self, rr):
        # Copy every parsed element onto the builder's expando list.
        for e in self:
            rr._expandos.append(e)
        return


class StringPart(str):
    grammar = attr("part", re.compile(r"[\-_a-z0-9\.]+"))

    def _build(self, rr):