Example #1
0
        decoder_tokenizer_legacy=args.decoder_tokenizer_legacy,
    )

    # Run MTDataPreproc.preprocess_parallel_dataset on the source/target files
    # named in `args`, forwarding the tokenizer, sequence-length, batching and
    # tar-file options. The call returns a 2-tuple; both elements are
    # deliberately discarded here.
    # NOTE(review): encoder_tokenizer_model / decoder_tokenizer_model are local
    # variables, not `args` attributes -- they must be assigned earlier in the
    # enclosing function (not visible in this excerpt).
    _, _ = MTDataPreproc.preprocess_parallel_dataset(
        clean=args.clean,
        src_fname=args.src_fname,
        tgt_fname=args.tgt_fname,
        out_dir=args.out_dir,
        encoder_tokenizer_name=args.encoder_tokenizer_name,
        encoder_model_name=args.encoder_model_name,
        encoder_tokenizer_model=encoder_tokenizer_model,
        encoder_bpe_dropout=args.encoder_tokenizer_bpe_dropout,
        encoder_tokenizer_r2l=args.encoder_tokenizer_r2l,
        decoder_tokenizer_name=args.decoder_tokenizer_name,
        decoder_model_name=args.decoder_model_name,
        decoder_tokenizer_model=decoder_tokenizer_model,
        decoder_tokenizer_r2l=args.decoder_tokenizer_r2l,
        decoder_bpe_dropout=args.decoder_tokenizer_bpe_dropout,
        max_seq_length=args.max_seq_length,
        min_seq_length=args.min_seq_length,
        tokens_in_batch=args.tokens_in_batch,
        lines_per_dataset_fragment=args.lines_per_dataset_fragment,
        num_batches_per_tarfile=args.num_batches_per_tarfile,
        tar_file_prefix=args.tar_file_prefix,
        # Rank and world size are hard-coded rather than read from `args`.
        global_rank=0,
        world_size=1,
        n_jobs=args.n_preproc_jobs,
        encoder_tokenizer_legacy=args.encoder_tokenizer_legacy,
        decoder_tokenizer_legacy=args.decoder_tokenizer_legacy,
    )
            global_rank=0,
        )
    else:
        encoder_tokenizer_model, decoder_tokenizer_model = args.encoder_tokenizer_model, args.decoder_tokenizer_model

    # Obtain the encoder and decoder tokenizer objects in one call. Tokenizer
    # names and BPE-dropout values come from `args`; the tokenizer model values
    # are the locals encoder_tokenizer_model / decoder_tokenizer_model set by
    # the preceding if/else (in the else branch they are copied from `args`).
    encoder_tokenizer, decoder_tokenizer = MTDataPreproc.get_enc_dec_tokenizers(
        encoder_tokenizer_name=args.encoder_tokenizer_name,
        encoder_tokenizer_model=encoder_tokenizer_model,
        encoder_bpe_dropout=args.encoder_tokenizer_bpe_dropout,
        decoder_tokenizer_name=args.decoder_tokenizer_name,
        decoder_tokenizer_model=decoder_tokenizer_model,
        decoder_bpe_dropout=args.decoder_tokenizer_bpe_dropout,
    )

    # Run MTDataPreproc.preprocess_parallel_dataset on the source/target files
    # named in `args`, passing the already-constructed encoder/decoder
    # tokenizer objects (rather than tokenizer names) together with the
    # sequence-length and batching options. The call returns a 2-tuple; both
    # elements are deliberately discarded here.
    _, _ = MTDataPreproc.preprocess_parallel_dataset(
        clean=args.clean,
        src_fname=args.src_fname,
        tgt_fname=args.tgt_fname,
        out_dir=args.out_dir,
        encoder_tokenizer=encoder_tokenizer,
        decoder_tokenizer=decoder_tokenizer,
        max_seq_length=args.max_seq_length,
        min_seq_length=args.min_seq_length,
        tokens_in_batch=args.tokens_in_batch,
        lines_per_dataset_fragment=args.lines_per_dataset_fragment,
        num_batches_per_tarfile=args.num_batches_per_tarfile,
        pkl_file_prefix=args.pkl_file_prefix,
        # Rank and world size are hard-coded rather than read from `args`.
        global_rank=0,
        world_size=1,
    )