def to_repr(
        prep_config: PrepConfig,
        token_list: List[ParsedToken],
        bpe_data: Optional[BpeData] = None) -> Tuple[List[str], PreprocessingMetadata]:
    """Convert parsed tokens into their string representation per *prep_config*.

    Falls back to the globally initialized BpeData when none is passed.
    Returns the representation list together with its PreprocessingMetadata.
    """
    if not bpe_data:
        bpe_data = get_global_bpe_data_if_available()
    repr_config = prep_config.get_repr_config(bpe_data)
    repr_list, metadata = to_repr_list(token_list, repr_config)
    if prep_config.is_bpe():
        # BPE output needs the word-end tokens re-inserted after splitting.
        repr_list = insert_and_word_tokens(repr_list, metadata)
    return repr_list, metadata
def preprocess_corpus(path: str, prep_config: PrepConfig, bpe_codes_id: Optional[str] = None,
                      extensions: Optional[str] = None, output_path: Optional[str] = None,
                      calc_vocab: Optional[bool] = False) -> PreprocessedCorpus:
    """Preprocess the corpus at *path* according to *prep_config*.

    Optionally computes the vocabulary as well (``calc_vocab=True``).
    Output defaults to the current working directory.
    """
    if not output_path:
        output_path = os.getcwd()

    # A BPE-based config requires a codes id; non-predefined ids carry a
    # custom merges/cache configuration.
    custom_bpe_config = None
    if prep_config.is_bpe():
        assert bpe_codes_id
        if not is_predefined_id(bpe_codes_id):
            custom_bpe_config = CustomBpeConfig.from_id(bpe_codes_id)

    dataset = Dataset.create(str(path), prep_config, extensions, custom_bpe_config,
                             overriden_path_to_prep_dataset=output_path)

    if calc_vocab:
        stages.run_until_vocab(dataset, custom_bpe_config)
        path_to_vocab = dataset.path_to_vocab_file
    else:
        stages.run_until_preprocessing(dataset, custom_bpe_config)
        path_to_vocab = None

    logger.info(f"Preprocessed dataset is ready at {dataset.preprocessed.path}")
    return PreprocessedCorpus(dataset.preprocessed, path_to_vocab)
def test_to_repr_no_str_no_com():
    """COM='0'/STR='0': strings and comments collapse to placeholders.

    Numbers are still character-split (SPLIT='2') and wrapped in
    word-boundary markers.  Uses the module-level `tokens` fixture.
    """
    prep_config = PrepConfig({
        PrepParam.EN_ONLY: 'U',
        PrepParam.COM: '0',
        PrepParam.STR: '0',
        PrepParam.SPLIT: '2',
        PrepParam.TABS_NEWLINES: '0',
        PrepParam.CASE: 'l'
    })

    actual, actual_metadata = to_repr(prep_config, tokens)

    expected = [
        pl['word_start'], '1', '.', '1', pl['word_end'],
        "*",
        pl['non_eng'],
        pl["string_literal"],
        pl["comment"], pl["comment"], pl["comment"],
        pl["comment"]
    ]

    expected_metadata = PreprocessingMetadata(
        {'*'},
        word_boundaries=[0, 5, 6, 7, 8, 9, 10, 11, 12],
        token_types=[Number, Operator, NonEng, StringLiteral, MultilineComment,
                     MultilineComment, MultilineComment, OneLineComment])

    assert expected == actual
    assert expected_metadata == actual_metadata
def test_to_repr_with_non_eng():
    """EN_ONLY='u': non-English words are kept verbatim (not placeholder-ed).

    SPLIT='2' splits compound identifiers with word-start/end markers and
    capitalization placeholders.  Uses the module-level `tokens` fixture.
    """
    prep_config = PrepConfig({
        PrepParam.EN_ONLY: 'u',
        PrepParam.COM: 'c',
        PrepParam.STR: '1',
        PrepParam.SPLIT: '2',
        PrepParam.TABS_NEWLINES: '0',
        PrepParam.CASE: 'l'
    })

    actual, actual_metadata = to_repr(prep_config, tokens)

    expected = [
        pl['word_start'], '1', '.', '1', pl['word_end'],
        "*",
        'übersetzen',
        '"', pl['word_start'], pl['capitals'], 'a', pl['capital'], 'wirklicä', pl['word_end'], '"',
        '/', '*', 'ц', pl['word_start'], 'blanco', '_', 'english', pl['word_end'], '*', '/',
        '/', '/', pl['word_start'], pl['capitals'], 'dieselbe', "8", pl['word_end'],
        pl['olc_end']
    ]

    expected_metadata = PreprocessingMetadata(
        {'*', '"', "/"},
        word_boundaries=[0, 5, 6, 7, 8, 14, 15, 16, 17, 18, 23, 24, 25, 26, 27, 32, 33],
        token_types=[Number, Operator, SplitContainer] + [StringLiteral] * 3
                    + [MultilineComment] * 6 + [OneLineComment] * 4)

    assert expected == actual
    assert expected_metadata == actual_metadata
def test_to_repr_1_nosep():
    """SPLIT='1' without separators: EN_ONLY='U' collapses non-English tokens
    to the <non_eng> placeholder; no word-boundary markers are emitted.

    Fix: the nonvocab-chars set literal previously listed '*' twice
    ({'*', '"', "/", "*"}); the redundant duplicate was removed (a set
    literal deduplicates anyway, so behavior is unchanged).
    """
    prep_config = PrepConfig({
        PrepParam.EN_ONLY: 'U',
        PrepParam.COM: 'c',
        PrepParam.STR: '1',
        PrepParam.SPLIT: '1',
        PrepParam.TABS_NEWLINES: '0',
        PrepParam.CASE: 'l'
    })

    actual, actual_metadata = to_repr(prep_config, tokens)

    expected = [
        '1.1',
        "*",
        pl['non_eng'],
        '"', pl['non_eng'], '"',
        '/', '*', pl['non_eng'], pl['non_eng'], '*', '/',
        '/', '/', pl['non_eng'],
        pl['olc_end']
    ]

    expected_metadata = PreprocessingMetadata(
        {'*', '"', "/"},
        word_boundaries=list(range(16 + 1)),
        token_types=[Number, Operator, NonEng,
                     StringLiteral, StringLiteral, StringLiteral,
                     MultilineComment, MultilineComment, MultilineComment,
                     MultilineComment, MultilineComment, MultilineComment,
                     OneLineComment, OneLineComment, OneLineComment, OneLineComment])

    assert expected == actual
    assert expected_metadata == actual_metadata
def test_to_repr_0_max_str_length_7():
    """STR='7' caps string-literal length at 7; longer literals are dropped.

    The long string token contributes only its two quote characters, which is
    why word_boundaries jumps from 3 to 5 (two chars in one word).  SPLIT='0'
    keeps identifiers whole; CASE='u' preserves case.  Uses the module-level
    `tokens` fixture.
    """
    prep_config = PrepConfig({
        PrepParam.EN_ONLY: 'u',
        PrepParam.COM: 'c',
        PrepParam.STR: '7',
        PrepParam.SPLIT: '0',
        PrepParam.TABS_NEWLINES: '0',
        PrepParam.CASE: 'u'
    })

    actual, actual_metadata = to_repr(prep_config, tokens)

    expected = [
        '1.1',
        "*",
        'übersetzen',
        '"', '"',
        '/', '*', 'ц', 'blanco_english', '*', '/',
        '/', '/', "DIESELBE8",
        pl['olc_end']
    ]

    expected_metadata = PreprocessingMetadata(
        {'"', "*", "/"},
        word_boundaries=[0, 1, 2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
        token_types=[Number, Operator, SplitContainer, StringLiteral,
                     MultilineComment, MultilineComment, MultilineComment,
                     MultilineComment, MultilineComment, MultilineComment,
                     OneLineComment, OneLineComment, OneLineComment, OneLineComment])

    assert expected == actual
    assert expected_metadata == actual_metadata
def test_non_bpe_split_with_one_extension(get_timestamp_mock, os_exists_mock):
    """Dataset.create with a single extension: verify config and derived paths.

    Bug fix: the last three checks were written as ``assert expr, other`` —
    a truthiness assert with the expected value used as the assertion
    *message* — so they could never fail.  They now perform the intended
    equality comparisons (matching the style used in test_all_custom).
    """
    prep_config = PrepConfig({
        PrepParam.EN_ONLY: 'u',
        PrepParam.COM: 'c',
        PrepParam.STR: '1',
        PrepParam.SPLIT: '0',
        PrepParam.TABS_NEWLINES: 's',
        PrepParam.CASE: 'u'
    })

    actual = Dataset.create(PATH_TO_DATASET_STUB, prep_config, "java", None)

    assert PATH_TO_DATASET_STUB == actual._path
    assert prep_config == actual._prep_config
    assert ['java'] == actual._normalized_extension_list
    assert actual._custom_bpe_config is None
    assert actual._bpe_config is None
    assert '01_01_01' == actual._dataset_last_modified

    assert SubDataset(actual, PATH_TO_DATASET_STUB, '') == actual._original
    assert SubDataset(actual,
                      os.path.join(PARSED_DATASETS_DIR, 'dataset_01_01_01_java'),
                      '.parsed') == actual._parsed
    assert SubDataset(actual,
                      os.path.join(PREP_DATASETS_DIR, 'dataset_01_01_01_java_-_uc10su'),
                      '.prep') == actual._preprocessed
def test_to_repr_0():
    """SPLIT='0' (nosplit), full strings kept: every token stays a single word.

    word_boundaries is therefore the identity sequence 0..16.  Uses the
    module-level `tokens` fixture.
    """
    prep_config = PrepConfig({
        PrepParam.EN_ONLY: 'u',
        PrepParam.COM: 'c',
        PrepParam.STR: '1',
        PrepParam.SPLIT: '0',
        PrepParam.TABS_NEWLINES: '0',
        PrepParam.CASE: 'u'
    })

    actual, actual_metadata = to_repr(prep_config, tokens)

    expected = [
        '1.1',
        "*",
        'übersetzen',
        '"', 'AWirklicä', '"',
        '/', '*', 'ц', 'blanco_english', '*', '/',
        '/', '/', "DIESELBE8",
        pl['olc_end']
    ]

    expected_metadata = PreprocessingMetadata(
        {'"', "*", "/"},
        word_boundaries=list(range(16 + 1)),
        token_types=[Number, Operator, SplitContainer,
                     StringLiteral, StringLiteral, StringLiteral,
                     MultilineComment, MultilineComment, MultilineComment,
                     MultilineComment, MultilineComment, MultilineComment,
                     OneLineComment, OneLineComment, OneLineComment, OneLineComment])

    assert expected == actual
    assert expected_metadata == actual_metadata
def test_true_true_code_bytes(abspath_mock, bpe_learner_mock, dataset_mock):
    """CLI `learn-bpe 1000 --bytes --word-end` wires up a byte-level BPE run.

    Checks that Dataset.create receives the derived PrepConfig/BpeConfig and
    that the learner is invoked with the requested number of merges.
    """
    # given
    abspath_mock.return_value = PATH_TO_DATASET_STUB
    dataset_mock.create = Mock(spec=dataset_mock, return_value=dataset_mock)
    argv = ['learn-bpe', '1000', '-p', PATH_TO_DATASET_STUB, '--bytes', '--word-end']

    # when
    parse_and_run(argv)

    # then
    prep_config = PrepConfig({
        PrepParam.EN_ONLY: 'u',
        PrepParam.COM: '0',
        PrepParam.STR: 'E',
        PrepParam.SPLIT: 'F',
        PrepParam.TABS_NEWLINES: 's',
        PrepParam.CASE: 'u'
    })
    bpe_config = BpeConfig({
        BpeParam.CASE: 'yes',
        BpeParam.WORD_END: True,
        BpeParam.BASE: 'code',
        BpeParam.UNICODE: 'bytes',
    })
    dataset_mock.create.assert_called_with(PATH_TO_DATASET_STUB, prep_config, None, None, bpe_config)
    bpe_learner_mock.run.assert_called_with(dataset_mock, 1000, bpe_config)
def test_to_repr_no_no_sep_with_bpe_no_merges():
    """SPLIT='4' (BPE) with an empty merge list: everything splits to characters.

    The last character of each token carries the `cwe` (compound-word-end)
    suffix.  NOTE(review): '\xf7' and '÷' are the same character written two
    ways, and '</t>' appears to be a hard-coded word-end marker — presumably
    equal to `cwe`; confirm against the placeholders table.  Uses the
    module-level `tokens` fixture.
    """
    prep_config = PrepConfig({
        PrepParam.EN_ONLY: 'U',
        PrepParam.COM: 'c',
        PrepParam.STR: '1',
        PrepParam.SPLIT: '4',
        PrepParam.TABS_NEWLINES: '0',
        PrepParam.CASE: 'u'
    })

    actual, actual_metadata = to_repr(prep_config, tokens,
                                      BpeData(merges_cache={}, merges=MergeList()))

    expected = [
        '1', '.', '1', cwe,
        "*" + cwe,
        '÷', 'b', 'e', 'r', 's', 'e', 't', 'z', 'e', 'n', '</t>',
        '"', 'A', 'W', 'i', 'r', 'k', 'l', 'i', 'c', '\xf7', '\xa0', '"', cwe,
        '/' + cwe, '*' + cwe,
        '\xf7', cwe,
        'b', 'l', 'a', 'n', 'c', 'o', '_', 'e', 'n', 'g', 'l', 'i', 's', 'h', cwe,
        '*' + cwe, '/' + cwe,
        '/' + cwe, '/' + cwe,
        'D', 'I', 'E', 'S', 'E', 'L', 'B', 'E', '8', cwe,
        pl['olc_end'] + cwe
    ]

    assert expected == actual
def test_to_repr_no_nosep():
    """EN_ONLY='U' with SPLIT='2': non-English tokens become <non_eng>
    placeholders while the number is split with word-boundary markers.

    Uses the module-level `tokens` fixture.
    """
    prep_config = PrepConfig({
        PrepParam.EN_ONLY: 'U',
        PrepParam.COM: 'c',
        PrepParam.STR: '1',
        PrepParam.SPLIT: '2',
        PrepParam.TABS_NEWLINES: '0',
        PrepParam.CASE: 'l'
    })

    actual, actual_metadata = to_repr(prep_config, tokens)

    expected = [
        pl['word_start'], '1', '.', '1', pl['word_end'],
        "*",
        pl['non_eng'],
        '"', pl['non_eng'], '"',
        '/', '*', pl['non_eng'], pl['non_eng'], '*', '/',
        '/', '/', pl['non_eng'],
        pl['olc_end']
    ]

    expected_metadata = PreprocessingMetadata(
        {'*', '"', "/"},
        word_boundaries=[0, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
        token_types=[Number, Operator, NonEng] + [StringLiteral] * 3
                    + [MultilineComment] * 6 + [OneLineComment] * 4)

    assert expected == actual
    assert expected_metadata == actual_metadata
def init_bpe_data(prep_config: PrepConfig,
                  custom_bpe_config: Optional[CustomBpeConfig],
                  force_reinit: bool = True):
    """Initialize the module-global `global_bpe_data` (merges + merges cache).

    With a *custom_bpe_config*, merges are read from its codes file (optionally
    truncated to ``n_merges``) and the non-BPE vocabulary is mapped to itself
    in the cache.  Otherwise a predefined merges file is selected from the
    SPLIT and CASE params of *prep_config*.  When ``force_reinit`` is False and
    BPE data already exists, this is a no-op.
    """
    if get_global_bpe_data_if_available() and not force_reinit:
        return  # already initialized
    global global_bpe_data
    global_bpe_data = BpeData()
    if custom_bpe_config:
        logger.info(f'Using bpe merges file: {custom_bpe_config.codes_file}')

        # Prefer the precomputed cache when it is valid for this config.
        if custom_bpe_config.can_use_cache_file():
            global_bpe_data.merges_cache = read_bpe_cache(custom_bpe_config.cache_file)
        else:
            global_bpe_data.merges_cache = {}
        global_bpe_data.merges = read_merges(custom_bpe_config.codes_file,
                                             custom_bpe_config.n_merges)
        if custom_bpe_config.n_merges:
            logger.info(f'Using first {custom_bpe_config.n_merges} merges.')

        # Non-BPE vocabulary entries must never be split: map each to itself.
        nonbpe_vocab = vocabloader.nonbpe(custom_bpe_config.merge_list_id)
        global_bpe_data.merges_cache.update({s: [s] for s in nonbpe_vocab})
    else:
        # SPLIT param value -> number of predefined merges to load.
        bpe_n_merges_dict = {'4': '5k', '5': '1k', '6': '10k', '7': '20k', '8': '0'}
        bpe_n_merges = bpe_n_merges_dict[prep_config.get_param_value(PrepParam.SPLIT)]

        # Case-sensitive ('u') and case-insensitive configs use separate dirs.
        bpe_merges_file = os.path.join(
            DEFAULT_BPE_DIR,
            CASE_DIR if prep_config.get_param_value(PrepParam.CASE) == 'u' else NO_CASE_DIR,
            str(bpe_n_merges), 'merges.txt')
        bpe_merges_cache_file = os.path.join(
            DEFAULT_BPE_CACHE_DIR,
            CASE_DIR if prep_config.get_param_value(PrepParam.CASE) == 'u' else NO_CASE_DIR,
            str(bpe_n_merges), 'merges_cache.txt')

        if os.path.exists(bpe_merges_cache_file):
            global_bpe_data.merges_cache = read_bpe_cache(bpe_merges_cache_file)
        else:
            global_bpe_data.merges_cache = {}
        global_bpe_data.merges = read_merges(bpe_merges_file)
def test_both_enonly_and_nosplit():
    """Combining EN_ONLY='U' with SPLIT='0' (nosplit) is invalid: must raise."""
    with pytest.raises(ValueError):
        invalid_config = PrepConfig({
            PrepParam.EN_ONLY: 'U',
            PrepParam.COM: 'c',
            PrepParam.STR: '1',
            PrepParam.SPLIT: '0',
            PrepParam.TABS_NEWLINES: '0',
            PrepParam.CASE: 'l'
        })
        to_repr(invalid_config, [], BpeData())
def test_all_custom(get_timestamp_mock, os_exists_mock):
    """Dataset.create with everything customized: two extensions, a custom BPE
    config, a BpeConfig, and an overridden prep-output path.

    Verifies the stored config objects and every derived filesystem path.
    """
    prep_config = PrepConfig({
        PrepParam.EN_ONLY: 'u',
        PrepParam.COM: 'c',
        PrepParam.STR: '1',
        PrepParam.SPLIT: '0',
        PrepParam.TABS_NEWLINES: 's',
        PrepParam.CASE: 'u'
    })
    bpe_config = BpeConfig({
        BpeParam.CASE: 'yes',
        BpeParam.WORD_END: False,
        BpeParam.BASE: "code",
        BpeParam.UNICODE: "no",
    })
    custom_bpe_config = CustomBpeConfig("id", 1000, "/codes/file", "/cache/file")

    actual = Dataset.create(PATH_TO_DATASET_STUB, prep_config, "c|java", custom_bpe_config,
                            bpe_config, overriden_path_to_prep_dataset=OVERRIDDEN_PATH)

    assert PATH_TO_DATASET_STUB == actual._path
    assert prep_config == actual._prep_config
    assert ['c', 'java'] == actual._normalized_extension_list
    assert custom_bpe_config == actual._custom_bpe_config
    assert bpe_config == actual._bpe_config
    assert '01_01_01' == actual._dataset_last_modified

    assert SubDataset(actual, PATH_TO_DATASET_STUB, '') == actual.original
    assert SubDataset(actual,
                      os.path.join(PARSED_DATASETS_DIR, 'dataset_01_01_01_c_java'),
                      '.parsed') == actual.parsed
    assert SubDataset(actual,
                      os.path.join(OVERRIDDEN_PATH,
                                   'dataset_01_01_01_c_java_-_uc10su_id-1000_-_prep'),
                      '.prep') == actual.preprocessed
    assert os.path.join(USER_CONFIG_DIR, VOCAB_DIR,
                        'dataset_01_01_01_c_java_-_U0EFsu') == actual.base_bpe_vocab_path
    assert os.path.join(USER_CONFIG_DIR, BPE_DIR,
                        'dataset_01_01_01_c_java_-_nounicode') == actual.bpe_path
    assert os.path.join(USER_CACHE_DIR, 'file_lists',
                        'dataset_01_01_01_c_java') == actual.path_to_file_list_folder
    assert os.path.join(USER_CONFIG_DIR, VOCAB_DIR,
                        'dataset_01_01_01_c_java_-_uc10su_id-1000') == actual.vocab_path
def test_xxxFsx(api_mock):
    """CLI `nosplit --full-strings` maps to SPLIT='F' with spaces kept ('s')."""
    cli_args = ['nosplit', 'str', '-e', 'java', '--full-strings']

    parse_and_run(cli_args)

    expected_config = PrepConfig({
        PrepParam.EN_ONLY: 'u',
        PrepParam.COM: 'c',
        PrepParam.STR: '1',
        PrepParam.SPLIT: 'F',
        PrepParam.TABS_NEWLINES: 's',
        PrepParam.CASE: 'u'
    })
    api_mock.text.preprocess.assert_called_with("str", expected_config, None, extension="java")
def test_xxx1xu(api_mock):
    """CLI `basic --no-spaces` maps to SPLIT='1' with whitespace dropped ('0')."""
    cli_args = ['basic', 'str', '-e', 'java', '--no-spaces']

    parse_and_run(cli_args)

    expected_config = PrepConfig({
        PrepParam.EN_ONLY: 'u',
        PrepParam.COM: 'c',
        PrepParam.STR: '1',
        PrepParam.SPLIT: '1',
        PrepParam.TABS_NEWLINES: '0',
        PrepParam.CASE: 'u'
    })
    api_mock.text.preprocess.assert_called_with("str", expected_config, None, extension="java")
def to_prep_config(self):
    """Derive the PrepConfig that corresponds to this BPE configuration.

    Only EN_ONLY varies (driven by the UNICODE param); the remaining params
    are fixed for BPE learning: no comments, full strings, full split,
    spaces kept, case preserved.
    """
    unicode_handling = self.get_param_value(BpeParam.UNICODE)
    en_only = 'U' if unicode_handling == 'no' else 'u'
    return PrepConfig({
        PrepParam.EN_ONLY: en_only,
        PrepParam.COM: '0',
        PrepParam.STR: 'E',
        PrepParam.SPLIT: 'F',
        PrepParam.TABS_NEWLINES: 's',
        PrepParam.CASE: 'u'
    })
def test_all_short_config_options(api_mock):
    """Bundled short flags '-0lSCU' toggle no-unicode, lowercase, no-str,
    no-com and no-spaces all at once."""
    cli_args = ['basic', 'str', '-e', 'java', '-0lSCU']

    parse_and_run(cli_args)

    expected_config = PrepConfig({
        PrepParam.EN_ONLY: 'U',
        PrepParam.COM: '0',
        PrepParam.STR: '0',
        PrepParam.SPLIT: '1',
        PrepParam.TABS_NEWLINES: '0',
        PrepParam.CASE: 'l'
    })
    api_mock.text.preprocess.assert_called_with("str", expected_config, None, extension="java")
def test_path_short(api_mock):
    """`-p <path>` routes through the corpus API with default keyword args."""
    cli_args = ['nosplit', '-p', PATH_TO_DATASET_STUB, '--no-spaces']

    parse_and_run(cli_args)

    expected_config = PrepConfig({
        PrepParam.EN_ONLY: 'u',
        PrepParam.COM: 'c',
        PrepParam.STR: '1',
        PrepParam.SPLIT: '0',
        PrepParam.TABS_NEWLINES: '0',
        PrepParam.CASE: 'u'
    })
    api_mock.corpus.preprocess_corpus.assert_called_with(
        PATH_TO_DATASET_STUB, expected_config, None,
        calc_vocab=False, extensions=None, output_path=None)
def preprocess(text: str, config: PrepConfig, bpe_codes_id: Optional[str] = None,
               extension: Optional[str] = None, return_metadata: bool = False,
               force_reinit_bpe_data: bool = True, append_eof: bool = False) \
        -> Union[List[str], Tuple[List[str], PreprocessingMetadata]]:
    """Preprocess a text snippet according to *config*.

    Parses *text* (optionally guided by a file *extension*), strips the
    trailing newline token, optionally appends an end-of-content token, and
    converts to the string representation.  For BPE configs a *bpe_codes_id*
    is required and the global BPE data is (re)initialized first.  Returns
    the token list, plus metadata when ``return_metadata`` is True.
    """
    parsed = [parsed_token for parsed_token in convert_text(text, extension)]
    parsed = remove_trailing_newline(parsed)
    if append_eof:
        parsed.append(SpecialToken(placeholders['ect']))
    if config.is_bpe():
        # BPE needs merges loaded before representation can be computed.
        assert bpe_codes_id
        custom_bpe_config = None if is_predefined_id(bpe_codes_id) \
            else CustomBpeConfig.from_id(bpe_codes_id)
        init_bpe_data(config, custom_bpe_config, force_reinit_bpe_data)
    prep_tokens, metadata = to_repr(config, parsed)
    if return_metadata:
        return prep_tokens, metadata
    else:
        return prep_tokens
def create_prep_config_from_args(arguments: Dict) -> PrepConfig:
    """Build a PrepConfig from parsed CLI *arguments* (docopt-style dict)."""
    raw_max_str = get_option(arguments, '--max-str-length')
    max_str_length = sys.maxsize if raw_max_str is None else int(raw_max_str)

    # Each boolean flag flips one param away from its default value.
    en_only = 'U' if is_option_true(arguments, '--no-unicode') else 'u'
    com = '0' if is_option_true(arguments, '--no-com') else 'c'
    tabs_newlines = '0' if is_option_true(arguments, '--no-spaces') else 's'
    case = 'l' if is_option_true(arguments, '--no-case') else 'u'

    return PrepConfig({
        PrepParam.EN_ONLY: en_only,
        PrepParam.COM: com,
        PrepParam.STR: create_str_value(is_option_true(arguments, '--no-str'), max_str_length),
        PrepParam.SPLIT: create_split_value_from_args(arguments),
        PrepParam.TABS_NEWLINES: tabs_newlines,
        PrepParam.CASE: case,
    })
def test_bpe_string_literal_performance():
    """BPE over a 10k-character string literal must finish in under a second.

    Guards against quadratic behavior when a merge rule ('a'+'a') applies
    repeatedly across a long literal.

    Cleanup: the single merge was previously appended inside a pointless
    one-iteration ``for i in range(1)`` loop; it is now appended directly
    (identical behavior).
    """
    prep_config = PrepConfig({
        PrepParam.EN_ONLY: 'u',
        PrepParam.COM: 'c',
        PrepParam.STR: '1',
        PrepParam.SPLIT: '4',
        PrepParam.TABS_NEWLINES: 's',
        PrepParam.CASE: 'u'
    })
    n = 10000
    tokens = [StringLiteral(['a' * n], n)]
    merge_list = MergeList()
    merge_list.append(Merge(('a', 'a'), 10))

    start = time.perf_counter()
    to_repr(prep_config, tokens,
            BpeData(merges=merge_list, merges_cache={'Whi@@le@': ['Whi@@le@']}))
    assert (time.perf_counter() - start) < 1
def test_1():
    """A token found in the merges cache is emitted as a single unit with the
    compound-word-end marker appended."""
    config = PrepConfig({
        PrepParam.EN_ONLY: 'u',
        PrepParam.COM: 'c',
        PrepParam.STR: '1',
        PrepParam.SPLIT: '4',
        PrepParam.TABS_NEWLINES: 's',
        PrepParam.CASE: 'u'
    })
    input_tokens = [SplitContainer.from_single_token("Whi@le")]
    bpe_data = BpeData(merges_cache={'Whi@@le@': ['Whi@@le@']})

    actual, actual_metadata = to_repr(config, input_tokens, bpe_data)

    assert actual == ["Whi@le" + placeholders['compound_word_end']]
    assert actual_metadata == PreprocessingMetadata(word_boundaries=[0, 1],
                                                    token_types=[SplitContainer])
def test_merges_no_cache():
    """With an empty merges cache, BPE applies merges from scratch: only the
    ('W','h') merge fires, the rest stays character-split, and the token ends
    with the compound-word-end placeholder."""
    prep_config = PrepConfig({
        PrepParam.EN_ONLY: 'U',
        PrepParam.COM: 'c',
        PrepParam.STR: '1',
        PrepParam.SPLIT: '4',
        PrepParam.TABS_NEWLINES: 's',
        PrepParam.CASE: 'u'
    })
    tokens = [SplitContainer.from_single_token("Whi@l@@e@")]

    actual, actual_metadata = to_repr(
        prep_config, tokens,
        BpeData(merges=MergeList().append(Merge(('W', 'h'), 10)),
                merges_cache={}))

    expected = ["Wh", "i", '@', "l", '@', '@', "e", '@', pl["compound_word_end"]]
    expected_metadata = PreprocessingMetadata(word_boundaries=[0, 9],
                                              token_types=[SplitContainer])

    assert expected == actual
    assert expected_metadata == actual_metadata
import os from unittest import mock from unittest.mock import Mock from codeprep.api.corpus import preprocess_corpus from codeprep.prepconfig import PrepConfig, PrepParam PATH_TO_CUR_DIR_STUB = os.path.join('path', 'to', 'curdir') PATH_TO_DATASET_STUB = os.path.join('path', 'to', 'dataset') PATH_TO_OUTPUT_STUB = os.path.join('path', 'to', 'output') DEFAULT_PREP_CONFIG = PrepConfig({ PrepParam.EN_ONLY: 'u', PrepParam.COM: 'c', PrepParam.STR: '1', PrepParam.SPLIT: '0', PrepParam.TABS_NEWLINES: '0', PrepParam.CASE: 'u', }) @mock.patch('codeprep.api.corpus.Dataset', autospec=True) @mock.patch('codeprep.api.corpus.stages', autospec=True) @mock.patch('codeprep.cli.impl.os.getcwd', autospec=True, return_value=PATH_TO_CUR_DIR_STUB) def test_simple(os_mock, stages_mock, dataset_mock): # given dataset_mock.create = Mock(spec=dataset_mock, return_value=dataset_mock) # when
def test_to_repr_with_enonlycontents1():
    """EN_ONLY='U' on a locally built fixture: every non-English word (in the
    identifier, the string literal, and both comments) becomes <non_eng>.

    The string literal's 12 German words map to 12 placeholders between the
    quotes; SpaceInString tokens contribute nothing to the output.
    """
    prep_config = PrepConfig({
        PrepParam.EN_ONLY: 'U',
        PrepParam.COM: 'c',
        PrepParam.STR: '1',
        PrepParam.SPLIT: '2',
        PrepParam.TABS_NEWLINES: '0',
        PrepParam.CASE: 'l'
    })

    # Local fixture (independent of the module-level `tokens`).
    tokens = [
        Number("1.1"),
        Operator("*"),
        NonEng(SplitContainer([Word.from_("dinero")])),
        StringLiteral([
            NonCodeChar('"'),
            NonEng(SplitContainer([Word.from_("ich")])), SpaceInString(),
            NonEng(SplitContainer([Word.from_("weiss")])), SpaceInString(),
            NonEng(SplitContainer([Word.from_("nicht")])), SpaceInString(),
            NonEng(SplitContainer([Word.from_("was")])), SpaceInString(),
            NonEng(SplitContainer([Word.from_("soll")])), SpaceInString(),
            NonEng(SplitContainer([Word.from_("es")])), SpaceInString(),
            NonEng(SplitContainer([Word.from_("bedeuten")])), SpaceInString(),
            NonEng(SplitContainer([Word.from_("dass")])), SpaceInString(),
            NonEng(SplitContainer([Word.from_("ich")])), SpaceInString(),
            NonEng(SplitContainer([Word.from_("so")])), SpaceInString(),
            NonEng(SplitContainer([Word.from_("traurig")])), SpaceInString(),
            NonEng(SplitContainer([Word.from_("bin")])),
            NonCodeChar('"'),
        ], 62),
        NewLine(),
        MultilineComment([NonCodeChar('/'), NonCodeChar('*')]),
        MultilineComment([
            NonEng(SplitContainer([Word.from_('ц')])),
            NonEng(
                SplitContainer([
                    Word.from_("blanco"),
                    Underscore(),
                    Word.from_("english")
                ])
            ),
        ]),
        MultilineComment([NonCodeChar('*'), NonCodeChar('/')]),
        NewLine(),
        Tab(),
        OneLineComment([NonCodeChar('/'), NonCodeChar('/'),
                        NonEng(
                            SplitContainer([
                                Word.from_("DIESELBE"),
                                Word.from_("8")
                            ])
                        )
                        ])
    ]

    actual, actual_metadata = to_repr(prep_config, tokens)

    expected = [
        pl['word_start'], '1', '.', '1', pl['word_end'],
        "*",
        pl['non_eng'],
        '"',
        pl["non_eng"], pl["non_eng"], pl["non_eng"], pl["non_eng"],
        pl["non_eng"], pl["non_eng"], pl["non_eng"], pl["non_eng"],
        pl["non_eng"], pl["non_eng"], pl["non_eng"], pl["non_eng"],
        '"',
        '/', '*', pl['non_eng'], pl['non_eng'], '*', '/',
        '/', '/', pl['non_eng'],
        pl['olc_end']
    ]

    # NOTE(review): '*' appears twice in this set literal — redundant (sets
    # deduplicate) but harmless; kept as-is.
    expected_metadata = PreprocessingMetadata(
        {'*', '"', "/", "*"},
        word_boundaries=[0] + list(range(5, 32)),
        token_types=[Number, Operator, NonEng] + [StringLiteral] * 14
                    + [MultilineComment] * 6 + [OneLineComment] * 4)

    assert expected == actual
    assert expected_metadata == actual_metadata