Ejemplo n.º 1
0
def main():
    """Preprocess the raw data and write it out as pickled files.

    Everything is driven by the module-level ``args`` object: the data
    directory comes from ``args.data_dir`` and the pickles are written to
    ``args.output_dir``, falling back to the data directory when no output
    directory was given.
    """
    source_dir = args.data_dir
    # Without an explicit output directory, pickle next to the raw data.
    target_dir = source_dir if args.output_dir is None else args.output_dir
    tx.utils.maybe_create_dir(target_dir)

    # Obtain the pretrained GPT-2 checkpoint; the value returned here is
    # what the encoder factory below is pointed at.
    checkpoint_dir = tx.modules.PretrainedGPT2Mixin.download_checkpoint(
        pretrained_model_name=args.pretrained_model_name)

    # Data pre-processor used for, e.g., BPE encoding.
    encoder = processor.get_encoder(checkpoint_dir)

    # The training config is a module located by the dotted name stored in
    # ``args.config_train``.
    train_config = importlib.import_module(args.config_train)

    # Write the pickle files.
    data_utils.prepare_pickle_data(
        data_dir=source_dir,
        max_seq_length=args.max_seq_length,
        encoder=encoder,
        output_dir=target_dir,
        feature_original_types=train_config.feature_original_types,
    )
Ejemplo n.º 2
0
def prepare_data():
    r"""Preprocess the raw data and write it out as pickle files.

    Reads its settings from the module-level ``args`` object. Pickles are
    written to ``args.output_dir`` when it is set, otherwise to
    ``args.data_dir``.
    """
    source_dir = args.data_dir
    # Without an explicit output directory, pickle next to the raw data.
    target_dir = source_dir if args.output_dir is None else args.output_dir
    tx.utils.maybe_create_dir(target_dir)

    # Load the pretrained GPT-2 model, using a fixed local cache directory;
    # the value returned here is what the encoder factory is pointed at.
    model_dir = tx.modules.load_pretrained_gpt2(
        pretrained_model_name=args.pretrained_model_name,
        cache_dir='gpt2_pretrained_models',
    )

    # Data pre-processor used for, e.g., BPE encoding.
    encoder = processor.get_encoder(model_dir)

    # Imported here, at call time, rather than at module import time.
    from configs.config_train import feature_original_types

    # Write the pickle files.
    data_utils.prepare_pickle_data(
        data_dir=source_dir,
        max_seq_length=args.max_seq_length,
        encoder=encoder,
        output_dir=target_dir,
        feature_original_types=feature_original_types,
    )
Ejemplo n.º 3
0
def main() -> None:
    """Preprocess the raw data and write it out as pickled files.

    Everything is driven by the module-level ``args`` object: the data
    directory comes from ``args.data_dir`` and the pickles are written to
    ``args.output_dir``, falling back to the data directory when no output
    directory was given.
    """
    source_dir = args.data_dir
    # Without an explicit output directory, pickle next to the raw data.
    target_dir = source_dir if args.output_dir is None else args.output_dir
    tx.utils.maybe_create_dir(target_dir)

    # GPT-2 tokenizer (BPE encoding) for the requested pretrained model.
    gpt2_tokenizer = tx.data.GPT2Tokenizer(
        pretrained_model_name=args.pretrained_model_name)

    # The training config is a module located by the dotted name stored in
    # ``args.config_train``; typed as ``Any`` because its attributes are not
    # known statically.
    train_config: Any = importlib.import_module(args.config_train)

    # Write the pickle files.
    data_utils.prepare_pickle_data(
        data_dir=source_dir,
        max_seq_length=args.max_seq_length,
        tokenizer=gpt2_tokenizer,
        output_dir=target_dir,
        feature_types=train_config.feature_types,
    )