def to_repr(prep_config: PrepConfig, token_list: List,
            n_gramm_splitting_config: Optional[NgramSplitConfig] = None):
    """Build the representation of ``token_list`` described by ``prep_config``.

    Args:
        prep_config: Preprocessing configuration. It is passed to
            ``get_types_to_be_repr`` and its ``EN_ONLY`` and ``CAPS``
            parameters are read here.
        token_list: Tokens forwarded unchanged to ``to_repr_list``.
        n_gramm_splitting_config: Splitting configuration to use. When it is
            falsy (the default is ``None``), the result of
            ``get_global_n_gramm_splitting_config()`` is used instead.

    Returns:
        The value returned by ``to_repr_list`` for the assembled
        ``ReprConfig``.
    """
    repr_config = ReprConfig(
        get_types_to_be_repr(prep_config),
        # Fall back to the module-wide config when none is supplied.
        n_gramm_splitting_config or get_global_n_gramm_splitting_config(),
        # Third argument: true unless EN_ONLY is set to 3.
        prep_config.get_param_value(PrepParam.EN_ONLY) != 3,
        # Fourth argument: true only when CAPS is set to 1.
        prep_config.get_param_value(PrepParam.CAPS) == 1,
    )
    return to_repr_list(token_list, repr_config)
def init_splitting_config(dataset: str, prep_config: PrepConfig,
                          bpe_base_repr: Optional[str],
                          bpe_n_merges: Optional[int],
                          splitting_file: Optional[str]):
    """Re-initialise the module-level n-gram splitting configuration.

    ``global_n_gramm_splitting_config`` is always replaced with a fresh
    ``NgramSplitConfig`` first and is then filled in according to the value
    of the ``SPLIT`` parameter of ``prep_config``:

    * 4-9: BPE splitting. The merges and the merges cache are read from
      ``DEFAULT_PARSED_DATASETS_DIR/<dataset>/METADATA_DIR/<repr>/BPE_DIR/
      <n_merges>/`` (``merges.txt`` and ``merges_cache.txt``).
    * 3: custom splittings are read from ``splitting_file``.
    * 2: only the splitting type ``ONLY_NUMBERS`` is set.
    * any other value: the fresh config is left untouched.

    Args:
        dataset: Dataset name used for the merges path when
            ``bpe_base_repr`` does not name a dataset itself.
        prep_config: Preprocessing configuration whose ``SPLIT`` parameter
            selects the branch above.
        bpe_base_repr: Either ``"<repr>"`` or ``"<dataset>/<repr>"``. When
            falsy, ``prep_config.get_base_bpe_prep_config()`` is used.
        bpe_n_merges: Number of BPE merges. Only honoured for ``SPLIT == 9``;
            for 4-8 it is overwritten by a fixed per-value number.
        splitting_file: Path of the custom splittings file (``SPLIT == 3``).

    Raises:
        ValueError: If ``SPLIT == 9`` and ``bpe_n_merges`` is falsy, if
            ``SPLIT == 3`` and ``splitting_file`` is falsy, or if
            ``bpe_base_repr`` contains more than one ``/``.
    """
    global global_n_gramm_splitting_config
    # Publish the fresh config before any validation, so that a failed call
    # still leaves a default config behind (same as before).
    config = NgramSplitConfig()
    global_n_gramm_splitting_config = config

    # Read the parameter once instead of on every comparison.
    split_value = prep_config.get_param_value(PrepParam.SPLIT)

    if split_value in (4, 5, 6, 7, 8, 9):
        if not bpe_base_repr:
            bpe_base_repr = prep_config.get_base_bpe_prep_config()

        if split_value == 9:
            # Only this value takes the number of merges from the caller.
            if not bpe_n_merges:
                raise ValueError(
                    "--bpe-n-merges must be specified for repr **9**")
        else:
            fixed_n_merges = {4: 5000, 5: 1000, 6: 10000, 7: 20000, 8: 0}
            bpe_n_merges = fixed_n_merges[split_value]

        if "/" in bpe_base_repr:
            parts = bpe_base_repr.split("/")
            if len(parts) != 2:
                # Previously surfaced as an opaque tuple-unpacking ValueError.
                raise ValueError(
                    "bpe base repr must have the form '<repr>' or "
                    f"'<dataset>/<repr>', got: {bpe_base_repr!r}")
            bpe_base_dataset, bpe_base_repr = parts
        else:
            bpe_base_dataset = dataset

        logger.info(f'Using bpe base dataset: {bpe_base_dataset}')
        logger.info(f'Using bpe base repr: {bpe_base_repr}')
        logger.info(f'Using bpe_n_merges: {bpe_n_merges}')

        path_to_merges_dir = os.path.join(DEFAULT_PARSED_DATASETS_DIR,
                                          bpe_base_dataset,
                                          METADATA_DIR,
                                          bpe_base_repr,
                                          BPE_DIR,
                                          str(bpe_n_merges))
        bpe_merges_file = os.path.join(path_to_merges_dir, 'merges.txt')
        bpe_merges_cache = os.path.join(path_to_merges_dir,
                                        'merges_cache.txt')

        config.merges_cache = read_dict_from_2_columns(bpe_merges_cache,
                                                       val_type=list)
        config.merges = read_merges(bpe_merges_file)
        config.set_splitting_type(NgramSplittingType.BPE)
    elif split_value == 3:
        if not splitting_file:
            raise ValueError("--splitting-file must be specified")
        config.sc_splittings = read_dict_from_2_columns(splitting_file,
                                                        val_type=list,
                                                        delim='|')
        config.set_splitting_type(NgramSplittingType.NUMBERS_AND_CUSTOM)
    elif split_value == 2:
        config.set_splitting_type(NgramSplittingType.ONLY_NUMBERS)