# Assumed context (not part of the original snippet): these tests exercise
# scancode-toolkit's licensedcode.tokenize module and rely on:
#     import io
#     import itertools
#     from time import time
#     from licensedcode.tokenize import matched_query_text_tokenizer
#     from licensedcode.tokenize import query_tokenizer
#     from licensedcode.tokenize import tokens_and_non_tokens

def test_matched_query_text_tokenizer_works_with_spdx_ids(self):
        text = u''' * SPDX-License-Identifier: GPL-2.0+    BSD-3-Clause
         * SPDX-License-Identifier: (BSD-3-Clause OR EPL-1.0 OR Apache-2.0 OR MIT)
        '''
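        # matched_query_text_tokenizer yields (is_token, text) pairs that
        # cover the input exactly: token runs are True pairs, and every
        # separator (punctuation, spaces, newlines) is kept as a False pair.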
        result = list(matched_query_text_tokenizer(text))
        expected = [(False, u' * '), (True, u'SPDX'), (False, u'-'),
                    (True, u'License'), (False, u'-'), (True, u'Identifier'),
                    (False, u': '),
                    (True, u'GPL'), (False, u'-'), (True, u'2'), (False, u'.'),
                    (True, u'0+'), (False, u'    '), (True, u'BSD'),
                    (False, u'-'), (True, u'3'), (False, u'-'),
                    (True, u'Clause'), (False, u'\n         * '),
                    (True, u'SPDX'), (False, u'-'), (True, u'License'),
                    (False, u'-'), (True, u'Identifier'), (False, u': ('),
                    (True, u'BSD'), (False, u'-'), (True, u'3'), (False, u'-'),
                    (True, u'Clause'), (False, u' '), (True, u'OR'),
                    (False, u' '), (True, u'EPL'), (False, u'-'), (True, u'1'),
                    (False, u'.'), (True, u'0'), (False, u' '), (True, u'OR'),
                    (False, u' '), (True, u'Apache'), (False, u'-'),
                    (True, u'2'), (False, u'.'), (True, u'0'), (False, u' '),
                    (True, u'OR'), (False, u' '), (True, u'MIT'),
                    (False, u')\n        ')]

        assert result == expected

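        # tokens_and_non_tokens is the lower-level regex match iterator that
        # matched_query_text_tokenizer builds on: joining every captured
        # group back together must reconstruct the input exactly.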
        result_as_text = u''.join(
            itertools.chain.from_iterable(
                [v for v in m.groupdict().values() if v]
                for m in tokens_and_non_tokens(text)))
        assert result_as_text == text
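
def test_matched_query_text_tokenizer_output_can_rebuild_matched_text(self):
        # A minimal sketch (not an upstream test) of the round-trip property
        # asserted above: because the (is_token, text) pairs cover the input
        # exactly, matched license text can be re-highlighted by decorating
        # only the is_token spans. The bracket markers are hypothetical.
        text = u'SPDX-License-Identifier: MIT'
        highlighted = u''.join(
            u'[%s]' % value if is_token else value
            for is_token, value in matched_query_text_tokenizer(text))
        assert highlighted == u'[SPDX]-[License]-[Identifier]: [MIT]'
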
    def test_matched_query_text_tokenizer_and_query_tokenizer_should_yield_the_same_texts(
            self):
        text = u'''Redistribution+ ;and use in! + 2003 source and +binary forms,
        ()with or without modifi+cation, are permitted with İrəli .\t\n
        \r'''

        mqtt_result = [
            t for is_tok, t in matched_query_text_tokenizer(text) if is_tok
        ]
        qt_result = list(query_tokenizer(text))
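
        # Note: query_tokenizer lowercases its output, while
        # matched_query_text_tokenizer preserves the original case.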
        mqtt_expected = [
            'Redistribution+',
            'and',
            'use',
            'in',
            '2003',
            'source',
            'and',
            'binary',
            'forms',
            'with',
            'or',
            'without',
            'modifi+cation',
            'are',
            'permitted',
            'with',
            'İrəli',
        ]

        qt_expected = [
            'redistribution+',
            'and',
            'use',
            'in',
            '2003',
            'source',
            'and',
            'binary',
            'forms',
            'with',
            'or',
            'without',
            'modifi+cation',
            'are',
            'permitted',
            'with',
            # this is NOT the same as above...
            # See https://github.com/nexB/scancode-toolkit/issues/1872
            'i',
            'rəli'
        ]
        assert mqtt_expected == mqtt_result
        assert qt_expected == qt_result
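
def test_unicode_lowercasing_expands_dotted_capital_i(self):
        # A minimal sketch (not an upstream test) of why the two expectations
        # diverge on u'İrəli' above (see issue #1872): lowercasing U+0130
        # LATIN CAPITAL LETTER I WITH DOT ABOVE yields two code points, u'i'
        # followed by U+0307 COMBINING DOT ABOVE, and the combining mark then
        # acts as a word splitter.
        assert len(u'İ') == 1
        assert u'İ'.lower() == u'i\u0307'
        assert len(u'İ'.lower()) == 2
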
    def test_tokenizers_regex_do_not_choke_on_some_text(self):
        # somehow this text was making the regex choke.
        tf = self.get_test_loc('tokenize/parser.js')
        with io.open(tf, encoding='utf-8') as text:
            content = text.read()

        start = time()
        list(query_tokenizer(content))
        duration = time() - start
        assert duration < 5

        start = time()
        list(matched_query_text_tokenizer(content))
        duration = time() - start
        assert duration < 5
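
        # The 5-second ceilings are a crude guard against catastrophic regex
        # backtracking, where a pathological pattern/input pair makes the re
        # engine explore exponentially many paths. A classic (hypothetical)
        # example, unrelated to the tokenizers' actual patterns:
        #
        #     re.match(r'(a+)+$', 'a' * 30 + 'b')  # can run for minutes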

def test_matched_query_text_tokenizer_yield_properly_all_texts(self):
        text = u'''Redistribution+ ;and use in! 2003 source and binary forms, 
        ()with or without modification, are permitted.\t\n
        \r'''
        result = list(matched_query_text_tokenizer(text))
        expected = [(True, u'Redistribution+'), (False, u' ;'), (True, u'and'),
                    (False, u' '), (True, u'use'),
                    (False, u' '), (True, u'in'), (False, u'! '),
                    (True, u'2003'), (False, u' '), (True, u'source'),
                    (False, u' '), (True, u'and'), (False, u' '),
                    (True, u'binary'), (False, u' '), (True, u'forms'),
                    (False, u', \n        ()'), (True, u'with'), (False, u' '),
                    (True, u'or'), (False, u' '), (True, u'without'),
                    (False, u' '), (True, u'modification'), (False, u', '),
                    (True, u'are'), (False, u' '), (True, u'permitted'),
                    (False, u'.\t\n\n        \r')]
        assert expected == result

        result_as_text = u''.join(v for _t, v in result)
        assert text == result_as_text