def test_matched_query_text_tokenizer_works_with_spdx_ids(self):
    # Check that matched_query_text_tokenizer splits SPDX-License-Identifier
    # lines into the expected (is_token, text) pairs, and that the pieces
    # matched by tokens_and_non_tokens concatenate back to the input text.
    #
    # The line breaks of the sample are written as explicit escapes so they
    # stay visibly in sync with the expected pairs below.
    text = (
        u' * SPDX-License-Identifier: GPL-2.0+ BSD-3-Clause\n'
        u' * SPDX-License-Identifier: '
        u'(BSD-3-Clause OR EPL-1.0 OR Apache-2.0 OR MIT)\n'
        u' '
    )

    expected = [
        # first line
        (False, u' * '),
        (True, u'SPDX'), (False, u'-'),
        (True, u'License'), (False, u'-'),
        (True, u'Identifier'), (False, u': '),
        (True, u'GPL'), (False, u'-'),
        (True, u'2'), (False, u'.'),
        (True, u'0+'), (False, u' '),
        (True, u'BSD'), (False, u'-'),
        (True, u'3'), (False, u'-'),
        (True, u'Clause'),
        # second line
        (False, u'\n * '),
        (True, u'SPDX'), (False, u'-'),
        (True, u'License'), (False, u'-'),
        (True, u'Identifier'), (False, u': ('),
        (True, u'BSD'), (False, u'-'),
        (True, u'3'), (False, u'-'),
        (True, u'Clause'), (False, u' '),
        (True, u'OR'), (False, u' '),
        (True, u'EPL'), (False, u'-'),
        (True, u'1'), (False, u'.'),
        (True, u'0'), (False, u' '),
        (True, u'OR'), (False, u' '),
        (True, u'Apache'), (False, u'-'),
        (True, u'2'), (False, u'.'),
        (True, u'0'), (False, u' '),
        (True, u'OR'), (False, u' '),
        (True, u'MIT'),
        (False, u')\n '),
    ]
    result = list(matched_query_text_tokenizer(text))
    assert result == expected

    # Each match contributes its non-empty named groups, in match order.
    matched_pieces = (
        piece
        for match in tokens_and_non_tokens(text)
        for piece in match.groupdict().values()
        if piece
    )
    result_as_text = u''.join(matched_pieces)
    assert result_as_text == text
def test_matched_query_text_tokenizer_and_query_tokenizer_should_yield_the_same_texts(self):
    # Run the same sample through matched_query_text_tokenizer (keeping only
    # the pieces flagged as tokens) and through query_tokenizer, and compare
    # each token list against its own expected list.
    text = (
        u'Redistribution+ ;and use in! + 2003 source and +binary forms, '
        u'()with or without modifi+cation, are permitted with İrəli .\t\n \r'
    )

    matched_tokens = [
        piece
        for is_token, piece in matched_query_text_tokenizer(text)
        if is_token
    ]
    query_tokens = list(query_tokenizer(text))

    mqtt_expected = [
        'Redistribution+', 'and', 'use', 'in', '2003',
        'source', 'and', 'binary', 'forms',
        'with', 'or', 'without', 'modifi+cation',
        'are', 'permitted', 'with',
        'İrəli',
    ]

    qt_expected = [
        'redistribution+', 'and', 'use', 'in', '2003',
        'source', 'and', 'binary', 'forms',
        'with', 'or', 'without', 'modifi+cation',
        'are', 'permitted', 'with',
        # this is NOT the same as above...
        # See https://github.com/nexB/scancode-toolkit/issues/1872
        'i', 'rəli',
    ]

    assert mqtt_expected == matched_tokens
    assert qt_expected == query_tokens
def test_tokenizers_regex_do_not_choke_on_some_text(self):
    # Somehow this text was making the tokenizer regex choke: every
    # tokenizer run over it must report a measured duration below 5
    # (presumably seconds; depends on what `time` is bound to).
    #
    # NOTE(review): another method with this exact name appears in this
    # file; if both live in the same class only the later definition is
    # kept -- confirm which one is intended.
    test_file = self.get_test_loc('tokenize/parser.js')
    with io.open(test_file, encoding='utf-8') as handle:
        content = handle.read()

    # NOTE(review): query_tokenizer is timed twice back to back; the second
    # run looks redundant -- confirm before dropping it.
    timed_tokenizers = (
        query_tokenizer,
        query_tokenizer,
        matched_query_text_tokenizer,
    )
    for tokenize in timed_tokenizers:
        started = time()
        # list() drains the tokenizer so the whole input is processed.
        list(tokenize(content))
        elapsed = time() - started
        assert elapsed < 5
def test_tokenizers_regex_do_not_choke_on_some_text(self):
    # Somehow this text was making the tokenizer regex choke: each of the
    # three tokenizers run over it must report a measured duration below 5
    # (presumably seconds; depends on what `time` is bound to).
    #
    # NOTE(review): another method with this exact name appears in this
    # file; if both live in the same class only the later definition is
    # kept -- confirm which one is intended.
    sample_path = self.get_test_loc('tokenize/parser.js')
    with codecs.open(sample_path, 'rb', encoding='utf-8') as sample:
        content = sample.read()

    for tokenize in (
        rule_tokenizer,
        query_tokenizer,
        matched_query_text_tokenizer,
    ):
        began = time()
        # list() drains the tokenizer so the whole input is processed.
        list(tokenize(content))
        took = time() - began
        assert took < 5
def matched_query_text_tokenizer_yield_properly_all_texts(self):
    # Check that matched_query_text_tokenizer returns every piece of the
    # input, tokens and non-tokens alike, as (is_token, text) pairs whose
    # texts concatenate back to the input.
    #
    # NOTE(review): the name has no `test_` prefix, so runners using the
    # default naming convention will not collect this method -- confirm
    # whether it is disabled on purpose.
    #
    # Whitespace in the sample is written as explicit escapes so it stays
    # visibly in sync with the expected pairs below.
    text = (
        u'Redistribution+ ;and use in! 2003 source and binary forms, \n'
        u' ()with or without modification, are permitted.\t\n\n'
        u' \r'
    )

    expected = [
        (True, u'Redistribution+'), (False, u' ;'),
        (True, u'and'), (False, u' '),
        (True, u'use'), (False, u' '),
        (True, u'in'), (False, u'! '),
        (True, u'2003'), (False, u' '),
        (True, u'source'), (False, u' '),
        (True, u'and'), (False, u' '),
        (True, u'binary'), (False, u' '),
        (True, u'forms'), (False, u', \n ()'),
        (True, u'with'), (False, u' '),
        (True, u'or'), (False, u' '),
        (True, u'without'), (False, u' '),
        (True, u'modification'), (False, u', '),
        (True, u'are'), (False, u' '),
        (True, u'permitted'), (False, u'.\t\n\n \r'),
    ]
    result = list(matched_query_text_tokenizer(text))
    assert expected == result

    # Dropping the flags and gluing the texts must give the input back.
    result_as_text = u''.join(piece for _is_token, piece in result)
    assert text == result_as_text
def matched_query_text_tokenizer_yield_properly_all_texts(self):
    # Tokenize a sample mixing words, punctuation and assorted whitespace,
    # then verify both the exact (is_token, text) pairs and that joining
    # the texts of all pairs reproduces the sample.
    #
    # NOTE(review): the name has no `test_` prefix, so runners using the
    # default naming convention will not collect this method; a method with
    # this same name also appears elsewhere in this file -- confirm which
    # definition is intended and whether it should run.
    #
    # Line breaks in the sample are spelled out as escapes rather than left
    # to the layout of the source file.
    text = u''.join((
        u'Redistribution+ ;and use in! 2003 source and binary forms, \n',
        u' ()with or without modification, are permitted.\t\n\n',
        u' \r',
    ))
    result = list(matched_query_text_tokenizer(text))

    expected = [
        (True, u'Redistribution+'),
        (False, u' ;'),
        (True, u'and'),
        (False, u' '),
        (True, u'use'),
        (False, u' '),
        (True, u'in'),
        (False, u'! '),
        (True, u'2003'),
        (False, u' '),
        (True, u'source'),
        (False, u' '),
        (True, u'and'),
        (False, u' '),
        (True, u'binary'),
        (False, u' '),
        (True, u'forms'),
        (False, u', \n ()'),
        (True, u'with'),
        (False, u' '),
        (True, u'or'),
        (False, u' '),
        (True, u'without'),
        (False, u' '),
        (True, u'modification'),
        (False, u', '),
        (True, u'are'),
        (False, u' '),
        (True, u'permitted'),
        (False, u'.\t\n\n \r'),
    ]
    assert expected == result

    texts_only = [piece for _flag, piece in result]
    result_as_text = u''.join(texts_only)
    assert text == result_as_text