Ejemplo n.º 1
0
def to_repr(prep_config: PrepConfig, token_list: List, n_gramm_splitting_config: Optional[NgramSplitConfig] = None):
    """Convert ``token_list`` into its representation for ``prep_config``.

    A ``ReprConfig`` is assembled from the preprocessing parameters and
    passed, together with the tokens, to ``to_repr_list``.

    Args:
        prep_config: Preprocessing configuration; its ``EN_ONLY`` and ``CAPS``
            parameter values are read, and it is passed to
            ``get_types_to_be_repr``.
        token_list: Tokens forwarded unchanged to ``to_repr_list``.
        n_gramm_splitting_config: Splitting configuration to use. When it is
            falsy (including the default ``None``), the result of
            ``get_global_n_gramm_splitting_config()`` is used instead.

    Returns:
        The value returned by ``to_repr_list``.
    """
    repr_types = get_types_to_be_repr(prep_config)

    # Fall back to the global splitting config only when none was supplied.
    if n_gramm_splitting_config:
        active_splitting = n_gramm_splitting_config
    else:
        active_splitting = get_global_n_gramm_splitting_config()

    repr_config = ReprConfig(
        repr_types,
        active_splitting,
        # Dictionary-based non-English handling is on unless EN_ONLY is 3.
        prep_config.get_param_value(PrepParam.EN_ONLY) != 3,
        # Lowercasing is enabled only when CAPS is 1.
        prep_config.get_param_value(PrepParam.CAPS) == 1,
    )
    return to_repr_list(token_list, repr_config)
Ejemplo n.º 2
0
def init_splitting_config(dataset: str, prep_config: PrepConfig,
                          bpe_base_repr: Optional[str],
                          bpe_n_merges: Optional[int],
                          splitting_file: Optional[str]):
    """Reset and populate the module-level n-gram splitting configuration.

    ``global_n_gramm_splitting_config`` is always replaced with a fresh
    ``NgramSplitConfig``; it is then filled in according to the ``SPLIT``
    parameter of ``prep_config``:

    * 4-9: BPE splitting. Merges and the merges cache are read from
      ``<DEFAULT_PARSED_DATASETS_DIR>/<base dataset>/<METADATA_DIR>/
      <base repr>/<BPE_DIR>/<n merges>/``.
    * 3: numbers plus custom splittings read from ``splitting_file``.
    * 2: numbers only.
    * anything else: the fresh config is left as constructed.

    Args:
        dataset: Dataset name used as the BPE base dataset when
            ``bpe_base_repr`` does not contain a ``/``.
        prep_config: Preprocessing configuration queried for ``SPLIT`` and,
            if needed, for the default BPE base representation.
        bpe_base_repr: Either ``"<repr>"`` or ``"<dataset>/<repr>"``. When
            falsy, ``prep_config.get_base_bpe_prep_config()`` is used.
        bpe_n_merges: Number of BPE merges. Only honoured when ``SPLIT`` is 9;
            for 4-8 it is overridden by a fixed per-mode value.
        splitting_file: Path to the custom splittings file (``SPLIT == 3``).

    Raises:
        ValueError: If ``SPLIT`` is 9 and ``bpe_n_merges`` is ``None``, if
            ``SPLIT`` is 3 and ``splitting_file`` is falsy, or (from tuple
            unpacking) if ``bpe_base_repr`` contains more than one ``/``.
    """
    global global_n_gramm_splitting_config
    global_n_gramm_splitting_config = NgramSplitConfig()

    # Query the split mode once instead of on every branch test.
    split_mode = prep_config.get_param_value(PrepParam.SPLIT)

    if split_mode in (4, 5, 6, 7, 8, 9):
        if not bpe_base_repr:
            bpe_base_repr = prep_config.get_base_bpe_prep_config()

        if split_mode == 9:
            # Compare against None rather than using truthiness: 0 is a
            # legitimate merge count (mode 8 uses exactly that) and must not
            # be reported as "not specified".
            if bpe_n_merges is None:
                raise ValueError(
                    "--bpe-n-merges must be specified for repr **9**")
        else:
            # Modes 4-8 each imply a fixed number of merges.
            bpe_n_merges_dict = {4: 5000, 5: 1000, 6: 10000, 7: 20000, 8: 0}
            bpe_n_merges = bpe_n_merges_dict[split_mode]

        if "/" not in bpe_base_repr:
            bpe_base_dataset = dataset
        else:
            bpe_base_dataset, bpe_base_repr = bpe_base_repr.split("/")
        logger.info(f'Using bpe base dataset: {bpe_base_dataset}')
        logger.info(f'Using bpe base repr: {bpe_base_repr}')
        logger.info(f'Using bpe_n_merges: {bpe_n_merges}')
        path_to_merges_dir = os.path.join(DEFAULT_PARSED_DATASETS_DIR,
                                          bpe_base_dataset, METADATA_DIR,
                                          bpe_base_repr, BPE_DIR,
                                          str(bpe_n_merges))
        bpe_merges_file = os.path.join(path_to_merges_dir, 'merges.txt')
        bpe_merges_cache = os.path.join(path_to_merges_dir, 'merges_cache.txt')

        global_n_gramm_splitting_config.merges_cache = read_dict_from_2_columns(
            bpe_merges_cache, val_type=list)
        global_n_gramm_splitting_config.merges = read_merges(bpe_merges_file)
        global_n_gramm_splitting_config.set_splitting_type(
            NgramSplittingType.BPE)
    elif split_mode == 3:
        if not splitting_file:
            raise ValueError("--splitting-file must be specified")

        splittings = read_dict_from_2_columns(splitting_file,
                                              val_type=list,
                                              delim='|')
        global_n_gramm_splitting_config.sc_splittings = splittings
        global_n_gramm_splitting_config.set_splitting_type(
            NgramSplittingType.NUMBERS_AND_CUSTOM)
    elif split_mode == 2:
        global_n_gramm_splitting_config.set_splitting_type(
            NgramSplittingType.ONLY_NUMBERS)