Example #1
    def simple_tokenizer(self, expression):
        """
        Return an iterable of Token describing each token given an expression
        unicode string.

        The split is done on spaces, keywords and parens. Anything else is a
        symbol token, e.g. typically a license key or license id (that contains
        no spaces or parens).

        If symbols were provided when this Licensing object was created, the
        tokenizer will recognize known symbol keys (ignoring case) when
        tokenizing expressions.
        """

        symbols = self.known_symbols_lowercase or {}

        for match in _simple_tokenizer(expression):
            if not match:
                continue
            # set start and end as string indexes
            start, end = match.span()
            end = end - 1
            match_getter = match.groupdict().get

            space = match_getter('space')
            if space:
                yield Token(start, end, space, None)

            lpar = match_getter('lpar')
            if lpar:
                yield Token(start, end, lpar, KW_LPAR)

            rpar = match_getter('rpar')
            if rpar:
                yield Token(start, end, rpar, KW_RPAR)

            sym_or_op = match_getter('symop')
            if sym_or_op:
                sym_or_op_lower = sym_or_op.lower()

                operator = OPERATORS.get(sym_or_op_lower)
                if operator:
                    yield Token(start, end, sym_or_op, operator)
                else:
                    sym = symbols.get(sym_or_op_lower)
                    if not sym:
                        sym = LicenseSymbol(key=sym_or_op)
                    yield Token(start, end, sym_or_op, sym)
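
A note on the scanner used above: the named groups 'space', 'lpar', 'rpar' and 'symop' suggest that the module-level _simple_tokenizer is a compiled regular expression applied with finditer(). The following is only a sketch of what such a pattern could look like, not the library's actual definition:

import re

# Hypothetical stand-in for the module-level _simple_tokenizer callable:
# one named group per token class consumed by simple_tokenizer() above.
_simple_tokenizer_sketch = re.compile(r'''
      (?P<space>\s+)          # runs of whitespace
    | (?P<lpar>\()            # opening parenthesis
    | (?P<rpar>\))            # closing parenthesis
    | (?P<symop>[^\s()]+)     # anything else: an operator keyword or a symbol
''', re.VERBOSE).finditer

for match in _simple_tokenizer_sketch('gpl-2.0 OR (mit AND classpath)'):
    start, end = match.span()
    # end - 1 mirrors the inclusive end index used by simple_tokenizer()
    print(start, end - 1, match.groupdict())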
Example #2
    def build_token_with_symbol():
        """
        Yield a new Token built from the accumulated unmatched tokens, if any,
        followed by any trailing space tokens.
        """
        if not unmatched:
            return
        # strip trailing spaces
        trailing_spaces = []
        while unmatched and not unmatched[-1].string.strip():
            trailing_spaces.append(unmatched.pop())

        if unmatched:
            string = ' '.join(t.string for t in unmatched if t.string.strip())
            start = unmatched[0].start
            end = unmatched[-1].end
            toksym = LicenseSymbol(string)
            unmatched.clear()
            yield Token(start, end, string, toksym)

        for ts in trailing_spaces:
            yield ts
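
This builder reads as a closure over an `unmatched` accumulator (a deque or list of Tokens) that the enclosing tokenizer fills with tokens that did not match the automaton. The following is a sketch of a plausible calling pattern, written under that assumption rather than taken from the library, with Token and LicenseSymbol assumed to be in scope as in the code above:

from collections import deque

def merge_unmatched_runs(tokens):
    # Sketch: tokens whose value is None are "unmatched"; contiguous runs of
    # them are merged into a single LicenseSymbol-bearing Token.
    unmatched = deque()

    def build_token_with_symbol():
        # same logic as the closure shown above, closing over `unmatched`
        if not unmatched:
            return
        trailing_spaces = []
        while unmatched and not unmatched[-1].string.strip():
            trailing_spaces.append(unmatched.pop())
        if unmatched:
            string = ' '.join(t.string for t in unmatched if t.string.strip())
            start = unmatched[0].start
            end = unmatched[-1].end
            toksym = LicenseSymbol(string)
            unmatched.clear()
            yield Token(start, end, string, toksym)
        for ts in trailing_spaces:
            yield ts

    for tok in tokens:
        if tok.value is None:
            # accumulate unmatched tokens, including the spaces between them
            unmatched.append(tok)
        else:
            # flush any pending unmatched run, then yield the matched token
            for merged in build_token_with_symbol():
                yield merged
            yield tok

    # flush whatever remains at the end of the stream
    for merged in build_token_with_symbol():
        yield merged
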
    def test_iter_simple(self):
        t = Trie()
        t.add('AND', 'AND')
        t.add('OR', 'OR')
        t.add('WITH', 'WITH')
        t.add('(', '(')
        t.add(')', ')')
        t.add('GPL-2.0', 'GPL-2.0')
        t.add('mit', 'MIT')
        t.add('Classpath', 'Classpath')
        t.make_automaton()
        test_string = '(GPL-2.0 with Classpath) or (gpl-2.0) and (classpath or  gpl-2.0 OR mit) '
        #                        111111111122222222223333333333444444444455555555556666666666777
        #              0123456789012345678901234567890123456789012345678901234567890123456789012
        result = list(t.iter(test_string))
        expected = [
            Token(0, 0, u'(', u'('),
            Token(1, 7, u'GPL-2.0', u'GPL-2.0'),
            Token(9, 12, u'with', u'WITH'),
            Token(14, 22, u'Classpath', u'Classpath'),
            Token(23, 23, u')', u')'),
            Token(25, 26, u'or', u'OR'),
            Token(28, 28, u'(', u'('),
            Token(29, 35, u'gpl-2.0', u'GPL-2.0'),
            Token(36, 36, u')', u')'),
            Token(38, 40, u'and', u'AND'),
            Token(42, 42, u'(', u'('),
            Token(43, 51, u'classpath', u'Classpath'),
            Token(53, 54, u'or', u'OR'),
            Token(57, 63, u'gpl-2.0', u'GPL-2.0'),
            Token(65, 66, u'OR', u'OR'),
            Token(68, 70, u'mit', u'MIT'),
            Token(71, 71, u')', u')')
        ]

        assert expected == result
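
Start and end positions in a Token are inclusive string indexes, which is why single-character tokens such as parens have start == end. A quick self-contained check of the convention used by the expected tokens above:

test_string = '(GPL-2.0 with Classpath) or (gpl-2.0)'
# 'GPL-2.0' occupies indexes 1 through 7 inclusive, hence Token(1, 7, ...):
assert test_string[1:7 + 1] == 'GPL-2.0'
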
    def test_tokenize_with_unmatched_and_space(self):
        def get_test_automaton():
            words = '( AND ) OR'.split()
            t = Trie()
            for w in words:
                t.add(w, w)
            t.make_automaton()
            return t

        test_string = '((l-a + AND l-b) OR an (l -c+))'
        #                        111111111122222222223
        #              0123456789012345678901234567890
        t = get_test_automaton()
        result = list(
            t.tokenize(test_string, include_unmatched=True,
                       include_space=True))
        expected = [
            Token(0, 0, u'(', u'('),
            Token(1, 1, u'(', u'('),
            Token(2, 4, u'l-a', None),
            Token(5, 5, u' ', None),
            Token(6, 6, u'+', None),
            Token(7, 7, u' ', None),
            Token(8, 10, u'AND', u'AND'),
            Token(11, 11, u' ', None),
            Token(12, 14, u'l-b', None),
            Token(15, 15, u')', u')'),
            Token(16, 16, u' ', None),
            Token(17, 18, u'OR', u'OR'),
            Token(19, 19, u' ', None),
            Token(20, 21, u'an', None),
            Token(22, 22, u' ', None),
            Token(23, 23, u'(', u'('),
            Token(24, 24, u'l', None),
            Token(25, 25, u' ', None),
            Token(26, 28, u'-c+', None),
            Token(29, 29, u')', u')'),
            Token(30, 30, u')', u')')
        ]

        assert expected == result
        assert test_string == ''.join(t.string for t in result)
    def test_iter_vs_tokenize(self):
        def get_test_automaton():
            words = '( AND ) OR'.split()
            t = Trie()
            for w in words:
                t.add(w, w)
            t.make_automaton()
            return t

        test_string = '((l-a + AND l-b) OR (l -c+))'

        t = get_test_automaton()
        result = list(
            t.iter(test_string, include_unmatched=True, include_space=True))
        expected = [
            Token(0, 0, u'(', u'('),
            Token(1, 1, u'(', u'('),
            Token(2, 4, u'l-a', None),
            Token(5, 5, u' ', None),
            Token(6, 6, u'+', None),
            Token(7, 7, u' ', None),
            Token(8, 10, u'AND', u'AND'),
            Token(11, 11, u' ', None),
            Token(12, 14, u'l-b', None),
            Token(15, 15, u')', u')'),
            Token(16, 16, u' ', None),
            Token(17, 18, u'OR', u'OR'),
            Token(19, 19, u' ', None),
            Token(20, 20, u'(', u'('),
            Token(21, 21, u'l', None),
            Token(22, 22, u' ', None),
            Token(23, 25, u'-c+', None),
            Token(26, 26, u')', u')'),
            Token(27, 27, u')', u')')
        ]

        assert expected == result

        result = list(
            t.tokenize(test_string, include_unmatched=True,
                       include_space=True))
        assert expected == result
    def test_iter_should_can_return_non_matches_optionally(self):
        def get_test_automaton():
            words = 'he her hers his she hi him man himan'.split()
            t = Trie()
            for w in words:
                t.add(w, w)
            t.make_automaton()
            return t

        test_string = '  he she junk  himan  other stuffs   '
        #                        111111111122222222223333333
        #              0123456789012345678901234567890123456

        t = get_test_automaton()
        result = list(
            t.iter(test_string, include_unmatched=True, include_space=True))
        expected = [
            Token(0, 1, u'  ', None),
            Token(2, 3, u'he', u'he'),
            Token(4, 4, u' ', None),
            Token(5, 7, u'she', u'she'),
            Token(8, 8, u' ', None),
            Token(9, 12, u'junk', None),
            Token(13, 14, u'  ', None),
            Token(15, 19, u'himan', u'himan'),
            Token(20, 21, u'  ', None),
            Token(22, 26, u'other', None),
            Token(27, 27, u' ', None),
            Token(28, 33, u'stuffs', None),
            Token(34, 36, u'   ', None),
        ]

        assert expected == result
Example #7
def replace_with_subexpression_by_license_symbol(tokens, strict=False):
    """
    Given an iterable of Token, yield tokens, replacing any "XXX WITH ZZZ"
    subexpression by a single LicenseWithExceptionSymbol symbol.

    Check the validity of WITH subexpressions and raise ParseError as needed.

    If `strict` is True, also raise ParseError if the left-hand side
    LicenseSymbol has `is_exception` set to True or if the right-hand side
    LicenseSymbol has `is_exception` set to False.
    """
    token_groups = build_token_groups_for_with_subexpression(tokens)

    for token_group in token_groups:
        len_group = len(token_group)

        if not len_group:
            # This should never happen
            continue

        if len_group == 1:
            # a single token
            token = token_group[0]
            tval = token.value

            if isinstance(tval, Keyword):
                if tval.type == TOKEN_WITH:
                    # keyword
                    # a single group cannot be a single 'WITH' keyword:
                    # this is an error that we catch and raise here.
                    raise ParseError(token_type=TOKEN_WITH,
                                     token_string=token.string,
                                     position=token.start,
                                     error_code=PARSE_INVALID_EXPRESSION)

            elif isinstance(tval, LicenseSymbol):
                if strict and tval.is_exception:
                    raise ParseError(token_type=TOKEN_SYMBOL,
                                     token_string=token.string,
                                     position=token.start,
                                     error_code=PARSE_INVALID_EXCEPTION)

            else:
                # this should not be possible by design
                raise Exception(
                    'Licensing.tokenize is internally confused...:' +
                    repr(tval))

            yield token
            continue

        if len_group != 3:
            # this should never happen
            string = ' '.join([tok.string for tok in token_group])
            start = token_group[0].start
            raise ParseError(TOKEN_SYMBOL, string, start,
                             PARSE_INVALID_EXPRESSION)

        # from now on we have a triple of tokens: a WITH sub-expression such
        # as "A with B", i.e. a sequence of three tokens
        lic_token, WITH, exc_token = token_group

        token_string = ' '.join(
            [lic_token.string,
             WITH.string.strip(), exc_token.string])

        # the left hand side license symbol
        lic_sym = lic_token.value

        # this should not happen
        if not isinstance(lic_sym, LicenseSymbol):
            raise ParseError(TOKEN_SYMBOL, lic_token.string, lic_token.start,
                             PARSE_INVALID_SYMBOL)

        if strict and lic_sym.is_exception:
            raise ParseError(TOKEN_SYMBOL, lic_token.string, lic_token.start,
                             PARSE_INVALID_EXCEPTION)

        # the right hand side exception symbol
        exc_sym = exc_token.value

        if not isinstance(exc_sym, LicenseSymbol):
            raise ParseError(TOKEN_SYMBOL, exc_token.string, exc_token.start,
                             PARSE_INVALID_SYMBOL)

        if strict and not exc_sym.is_exception:
            raise ParseError(TOKEN_SYMBOL, exc_token.string, exc_token.start,
                             PARSE_INVALID_SYMBOL_AS_EXCEPTION)

        lic_exc_sym = LicenseWithExceptionSymbol(lic_sym, exc_sym, strict)

        token = Token(
            lic_token.start,
            exc_token.end,
            token_string,
            lic_exc_sym,
        )
        yield token
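
Taken together with the tokenizers above, this replacement is what collapses a WITH subexpression into a single operand during parsing. A brief usage sketch of the observable result at the Licensing.parse level, assuming this function is wired into that pipeline (the exception key below is only illustrative):

from license_expression import Licensing

licensing = Licensing()
parsed = licensing.parse('GPL-2.0 WITH Classpath-exception-2.0 OR MIT')
# The "GPL-2.0 WITH Classpath-exception-2.0" part is carried as a single
# LicenseWithExceptionSymbol operand of the OR expression.
print(parsed)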