Example #1
def __process_data(
    text_path: str,
    dst_folder: str,
    vocab_size: int,
    tokenizer_type: str,
    spe_type: str,
    spe_character_coverage: float,
    lower_case: bool,
):
    """
    Converts flac to wav and build manifests's json
    Args:
        text_path: source with text lines
        dst_folder: where wav files will be stored
        vocab_size: vocabular size used in encoding the text
        tokenizer_type: type of tokenization to perform - wpe or spe
        spe_type: type of tokenization model used for spe.
        spe_character_coverage: float value between 0 and 1 (as a percentage). For languages with a vast charset,
            can be < 1.0, but for all other languages, it should be set as 1.0
        lower_case: whether to tokenize with lower case character set only (for english)

    Returns:
    """
    if tokenizer_type == 'spe':
        tokenizer_dir = os.path.join(dst_folder, 'tokenizer_{}_{}_v{}').format(tokenizer_type, spe_type, vocab_size)

        if not os.path.exists(tokenizer_dir):
            os.makedirs(tokenizer_dir)

        if os.path.exists(os.path.join(tokenizer_dir, 'tokenizer.model')):
            logging.warning("Model file already exists, overriding old model file !")
            os.remove(os.path.join(tokenizer_dir, 'tokenizer.model'))

        tokenizer_path, vocab_path = create_spt_model(
            data_file=text_path,
            vocab_size=vocab_size,
            sample_size=-1,
            do_lower_case=lower_case,
            output_dir=tokenizer_dir,
            tokenizer_type=spe_type,
            character_coverage=spe_character_coverage,
        )

    else:
        tokenizer_dir = os.path.join(dst_folder, 'tokenizer_{}_v{}').format(tokenizer_type, vocab_size)

        if not os.path.exists(tokenizer_dir):
            os.makedirs(tokenizer_dir)

        tokenizer = tokenizers.BertWordPieceTokenizer(lowercase=lower_case)

        tokenizer.train(text_path, vocab_size=vocab_size)
        tokenizer.save_model(tokenizer_dir)

    return tokenizer_dir
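The snippet above is shown without its imports. Below is a minimal usage sketch, assuming it lives in a NeMo-style tokenizer-building script; the import path for create_spt_model and the corpus/output names are assumptions for illustration, not part of the original example.

import os
import logging

import tokenizers  # HuggingFace tokenizers package
from nemo.collections.common.tokenizers.sentencepiece_tokenizer import create_spt_model  # assumed import path

# Train a unigram SentencePiece tokenizer on a hypothetical line-per-sentence corpus.
tokenizer_dir = __process_data(
    text_path='corpus.txt',        # hypothetical input corpus
    dst_folder='tokenizers',       # hypothetical output folder
    vocab_size=1024,
    tokenizer_type='spe',
    spe_type='unigram',
    spe_character_coverage=1.0,
    lower_case=True,
)
print(tokenizer_dir)  # e.g. tokenizers/tokenizer_spe_unigram_v1024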
Example #2
def __process_data(text_path: str, dst_folder: str, vocab_size: int,
                   tokenizer_type: str, spe_type: str):
    """
    Converts flac to wav and build manifests's json
    Args:
        text_path: source with text lines
        dst_folder: where wav files will be stored
        vocab_size: vocabular size used in encoding the text
        tokenizer_type: type of tokenization to perform - wpe or spe
        spe_type: type of tokenization model used for spe.

    Returns:
    """
    tokenizer_dir = os.path.join(dst_folder, 'tokenizer_{}_v{}').format(
        tokenizer_type, vocab_size)

    if not os.path.exists(tokenizer_dir):
        os.makedirs(tokenizer_dir)

    if tokenizer_type == 'spe':
        if os.path.exists(os.path.join(tokenizer_dir, 'tokenizer.model')):
            logging.warning(
                "Model file already exists, overriding old model file !")
            os.remove(os.path.join(tokenizer_dir, 'tokenizer.model'))

        tokenizer_path, vocab_path = create_spt_model(
            data_file=text_path,
            vocab_size=vocab_size,
            sample_size=-1,
            do_lower_case=True,
            output_dir=tokenizer_dir,
            tokenizer_type=spe_type,
        )

    else:
        tokenizer = tokenizers.BertWordPieceTokenizer(lowercase=True)

        tokenizer.train(text_path, vocab_size=vocab_size)
        tokenizer.save_model(tokenizer_dir)

    return tokenizer_dir
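Example #2 is a reduced variant of the same helper: lower-casing is hard-coded and both tokenizer types share one directory naming scheme. Calling it is the same minus the SPE-specific arguments (file names below are hypothetical):

tokenizer_dir = __process_data('corpus.txt', 'tokenizers', 1024, 'spe', 'unigram')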
Example #3
    def train_tokenizers(
        out_dir,
        src_fname,
        tgt_fname,
        shared_tokenizer,
        encoder_tokenizer_name,
        encoder_tokenizer_vocab_size,
        encoder_tokenizer_coverage,
        decoder_tokenizer_name,
        decoder_tokenizer_vocab_size,
        decoder_tokenizer_coverage,
        global_rank,
        encoder_training_sample_size=-1,
        decoder_training_sample_size=-1,
        encoder_special_tokens=None,
        decoder_special_tokens=None,
        spt_symbols=None,
        multilingual=False,
    ):
        encoder_tokenizer_model = None
        decoder_tokenizer_model = None
        os.makedirs(out_dir, exist_ok=True)

        supported_train_tokenizers = ['yttm', 'sentencepiece']

        if encoder_special_tokens:
            if isinstance(encoder_special_tokens, dict):
                encoder_special_tokens = list(encoder_special_tokens.values())
                print(encoder_special_tokens)

        if decoder_special_tokens:
            if isinstance(decoder_special_tokens, dict):
                decoder_special_tokens = list(decoder_special_tokens.values())

        if multilingual and encoder_tokenizer_name != 'sentencepiece':
            raise NotImplementedError(
                f"Currently we only support training setencepiece tokenizer for multilingual model."
            )

        if shared_tokenizer:
            if (
                encoder_tokenizer_name not in supported_train_tokenizers
                or decoder_tokenizer_name not in supported_train_tokenizers
            ):
                raise NotImplementedError(
                    f"Currently we only support tokenizers in {supported_train_tokenizers} for shared tokenizer."
                )

            encoder_tokenizer_model = os.path.join(
                out_dir, 'shared_tokenizer.%d.BPE.model' % (encoder_tokenizer_vocab_size)
            )
            decoder_tokenizer_model = encoder_tokenizer_model
            if global_rank == 0:
                if os.path.isfile(encoder_tokenizer_model):
                    logging.info(
                        f'Shared tokenizer model {encoder_tokenizer_model} already exists. Remove file if training a new tokenizer model.'
                    )
                else:
                    logging.info(
                        f'Shared tokenizer model {encoder_tokenizer_model} not found. Training tokenizer model.'
                    )
                    with tempfile.TemporaryDirectory() as tmp:
                        concat_data_path = os.path.join(tmp, 'concat_dataset.txt')
                        os.system('cat %s %s > %s' % (src_fname, tgt_fname, concat_data_path))
                        if encoder_tokenizer_name == "yttm":
                            yttm.BPE.train(
                                data=concat_data_path,
                                vocab_size=encoder_tokenizer_vocab_size,
                                model=os.path.join(out_dir, encoder_tokenizer_model),
                                coverage=encoder_tokenizer_coverage,
                                n_threads=-1,
                            )
                        else:
                            create_spt_model(
                                data_file=concat_data_path,
                                vocab_size=encoder_tokenizer_vocab_size,
                                sample_size=encoder_training_sample_size,
                                do_lower_case=False,
                                tokenizer_type='bpe',
                                character_coverage=encoder_tokenizer_coverage,
                                output_dir=out_dir,
                                bos=True,
                                eos=True,
                                pad=True,
                                control_symbols=spt_symbols,
                                user_defined_symbols=encoder_special_tokens,
                            )
                            os.rename(
                                os.path.join(out_dir, 'tokenizer.model'),
                                os.path.join(out_dir, encoder_tokenizer_model),
                            )
        else:
            if encoder_tokenizer_name in supported_train_tokenizers:
                encoder_tokenizer_model = os.path.join(
                    out_dir, 'tokenizer.encoder.%d.BPE.model' % (encoder_tokenizer_vocab_size)
                )
                if global_rank == 0:
                    if os.path.isfile(encoder_tokenizer_model):
                        logging.info(
                            f'Encoder tokenizer model {encoder_tokenizer_model} already exists. Remove file if training a new tokenizer model.'
                        )
                    else:
                        logging.info(
                            f'Encoder tokenizer model {encoder_tokenizer_model} not found. Training tokenizer model.'
                        )
                        if encoder_tokenizer_name == "yttm":
                            yttm.BPE.train(
                                data=src_fname,
                                vocab_size=encoder_tokenizer_vocab_size,
                                model=encoder_tokenizer_model,
                                coverage=encoder_tokenizer_coverage,
                                n_threads=-1,
                            )
                        else:
                            dir_name = os.path.dirname(encoder_tokenizer_model)
                            create_spt_model(
                                data_file=src_fname,
                                vocab_size=encoder_tokenizer_vocab_size,
                                sample_size=encoder_training_sample_size,
                                do_lower_case=False,
                                tokenizer_type='bpe',
                                character_coverage=encoder_tokenizer_coverage,
                                output_dir=dir_name,
                                bos=True,
                                eos=True,
                                pad=True,
                                control_symbols=spt_symbols,
                                user_defined_symbols=encoder_special_tokens,
                            )
                            os.rename(os.path.join(dir_name, 'tokenizer.model'), os.path.join(encoder_tokenizer_model))

            if decoder_tokenizer_name in supported_train_tokenizers:
                decoder_tokenizer_model = os.path.join(
                    out_dir, 'tokenizer.decoder.%d.BPE.model' % (decoder_tokenizer_vocab_size)
                )
                if global_rank == 0:
                    if os.path.isfile(decoder_tokenizer_model):
                        logging.info(
                            f'Decoder tokenizer model {decoder_tokenizer_model} already exists. Remove file if training a new tokenizer model.'
                        )
                    else:
                        logging.info(
                            f'Decoder tokenizer model {decoder_tokenizer_model} not found. Training tokenizer model.'
                        )
                        if decoder_tokenizer_name == "yttm":
                            yttm.BPE.train(
                                data=tgt_fname,
                                vocab_size=decoder_tokenizer_vocab_size,
                                model=decoder_tokenizer_model,
                                coverage=decoder_tokenizer_coverage,
                                n_threads=-1,
                            )
                        else:
                            dir_name = os.path.dirname(decoder_tokenizer_model)
                            create_spt_model(
                                data_file=tgt_fname,
                                vocab_size=decoder_tokenizer_vocab_size,
                                sample_size=decoder_training_sample_size,
                                do_lower_case=False,
                                tokenizer_type='bpe',
                                character_coverage=decoder_tokenizer_coverage,
                                output_dir=dir_name,
                                bos=True,
                                eos=True,
                                pad=True,
                                control_symbols=spt_symbols,
                                user_defined_symbols=decoder_special_tokens,
                            )
                            os.rename(os.path.join(dir_name, 'tokenizer.model'), os.path.join(decoder_tokenizer_model))

        return encoder_tokenizer_model, decoder_tokenizer_model
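A sketch of how this routine might be called to train a shared YTTM BPE tokenizer for a translation pair. The snippet shows only the method body, so the surrounding class, the import alias for youtokentome, and all file names below are assumptions:

import os
import logging
import tempfile

import youtokentome as yttm  # assumed alias used by the yttm.BPE.train calls above

encoder_model, decoder_model = train_tokenizers(
    out_dir='tokenizers',
    src_fname='train.en',              # hypothetical source-language corpus
    tgt_fname='train.de',              # hypothetical target-language corpus
    shared_tokenizer=True,
    encoder_tokenizer_name='yttm',
    encoder_tokenizer_vocab_size=32000,
    encoder_tokenizer_coverage=0.999,
    decoder_tokenizer_name='yttm',
    decoder_tokenizer_vocab_size=32000,
    decoder_tokenizer_coverage=0.999,
    global_rank=0,                     # only rank 0 actually trains and writes the model files
)

Because the shared branch reuses one model for both sides, encoder_model and decoder_model point to the same shared_tokenizer.32000.BPE.model path.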
Example #4
def __process_data(
    text_path: str,
    dst_folder: str,
    vocab_size: int,
    tokenizer_type: str,
    spe_type: str,
    spe_character_coverage: float,
    spe_train_extremely_large_corpus: bool,
    spe_sample_size: int,
    spe_max_sentencepiece_length: int,
    lower_case: bool,
):
    """
    Converts flac to wav and build manifests's json
    Args:
        text_path: source with text lines
        dst_folder: where wav files will be stored
        vocab_size: vocabular size used in encoding the text
        tokenizer_type: type of tokenization to perform - wpe or spe
        spe_type: type of tokenization model used for spe.
        spe_character_coverage: float value between 0 and 1 (as a percentage). For languages with a vast charset,
            can be < 1.0, but for all other languages, it should be set as 1.0
        spe_sample_size: int, default of -1. If positive integer is used, samples the dataset
            by given sample size.
        spe_train_extremely_large_corpus: bool. If dataset is too large, and user has sufficient RAM,
            this flag can be set to try to trained the tokenizer. Will silently fail if it runs out of RAM.
        spe_max_sentencepiece_length: Limits the maximum length of the SentencePiece subword that can be constructed.
            By default, no limit is placed.
        lower_case: whether to tokenize with lower case character set only (for english)

    Returns:
    """
    if tokenizer_type == 'spe':
        tokenizer_dir = os.path.join(dst_folder, 'tokenizer_{}_{}_v{}').format(
            tokenizer_type, spe_type, vocab_size)

        if not os.path.exists(tokenizer_dir):
            os.makedirs(tokenizer_dir)

        if os.path.exists(os.path.join(tokenizer_dir, 'tokenizer.model')):
            logging.warning(
                "Model file already exists, overriding old model file !")
            os.remove(os.path.join(tokenizer_dir, 'tokenizer.model'))

        tokenizer_path, vocab_path = create_spt_model(
            data_file=text_path,
            vocab_size=vocab_size,
            sample_size=spe_sample_size,
            do_lower_case=lower_case,
            output_dir=tokenizer_dir,
            tokenizer_type=spe_type,
            character_coverage=spe_character_coverage,
            train_extremely_large_corpus=spe_train_extremely_large_corpus,
            max_sentencepiece_length=spe_max_sentencepiece_length,
        )

    else:
        tokenizer_dir = os.path.join(dst_folder, 'tokenizer_{}_v{}').format(
            tokenizer_type, vocab_size)

        if not os.path.exists(tokenizer_dir):
            os.makedirs(tokenizer_dir)

        tokenizer = tokenizers.BertWordPieceTokenizer(lowercase=lower_case)

        tokenizer.train(text_path, vocab_size=vocab_size)
        tokenizer.save_model(tokenizer_dir)

    return tokenizer_dir
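This last variant exposes additional SentencePiece options (corpus sampling, the extremely-large-corpus mode, and a cap on subword length). A hedged usage sketch, with hypothetical file names and the -1 values the wrapper appears to use for 'use everything / no limit':

tokenizer_dir = __process_data(
    text_path='corpus.txt',
    dst_folder='tokenizers',
    vocab_size=4096,
    tokenizer_type='spe',
    spe_type='bpe',
    spe_character_coverage=1.0,
    spe_train_extremely_large_corpus=False,
    spe_sample_size=-1,               # -1: train on the full corpus
    spe_max_sentencepiece_length=-1,  # -1: no explicit length limit (assumed default)
    lower_case=True,
)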