Ejemplo n.º 1
0
def parse_args():
    """Register the command-line options and parse the process arguments.

    The options cover TPU/bucket location, the pretrain data folder,
    checkpoint initialisation, optimizer/schedule hyper-parameters and
    logging frequency, plus the ``use_tpu`` and ``do_eval`` boolean switches.

    Returns:
        The namespace returned by ``parser.parse_args()``.
    """
    parser = ArgParseDefault()

    # (flag, keyword arguments for add_argument); registered in listed order.
    option_specs = [
        ('--tpu_ip', dict(required=False, help='IP-address of the TPU')),
        ('--bucket_name', dict(required=True, help='Bucket name')),
        ('--tpu_name', dict(required=False, help='Name of the TPU')),
        ('--tpu_name_project', dict(required=False, help='Name of the TPU project')),
        ('--pretrain_data', dict(
            required=True,
            type=str,
            help='Folder which contains pretrain data. Should be located under gs://{bucket_name}/{project_name}/pretrain/pretrain_data/')),
        ('--run_prefix', dict(help='Prefix to be added to all runs. Useful to group runs')),
        ('--model_class', dict(
            default='bert_large_uncased_wwm',
            choices=PRETRAINED_MODELS.keys(),
            help='Model class to use')),
        ('--project_name', dict(default='covid-bert', help='Name of subfolder in Google bucket')),
        ('--num_gpus', dict(default=1, type=int, help='Number of GPUs to use')),
        ('--eval_steps', dict(
            default=1000,
            type=int,
            help='Number eval steps to run (only active when --do_eval flag is provided)')),
        # The run of spaces inside the help text is part of the original string.
        ('--init_checkpoint', dict(
            default=None,
            help='Run name to initialize checkpoint from. Example: "run2/ctl_step_8000.ckpt-8". or "run2/pretrained/bert_model_8000.ckpt-8". The first contains the mlm/nsp layers. '
                 '            By default using a pretrained model from gs://{bucket_name}/pretrained_models/')),
        # NOTE(review): no type/action is given, so any supplied value arrives as a string.
        ('--load_mlm_nsp_weights', dict(
            default=None,
            help="If set to True it will load the mlm/nsp-layers. The init_checkpoint should then be set to a model containing these. Usually in base run-directory named 'ctl_step*'.")),
        ('--set_trainstep', dict(
            default=None,
            help="If set this will set the trainstep. This is only needed when restarting from an old checkpoint and you would like to get the scheduler/optimiser to start at the correct point.")),
        ('--optimizer_type', dict(default='adamw', choices=['adamw', 'lamb'], type=str, help='Optimizer')),
        ('--train_batch_size', dict(default=32, type=int, help='Training batch size')),
        ('--eval_batch_size', dict(default=32, type=int, help='Eval batch size')),
        ('--num_epochs', dict(default=3, type=int, help='Number of epochs')),
        ('--num_steps_per_epoch', dict(default=1000, type=int, help='Number of steps per epoch')),
        ('--warmup_steps', dict(default=10000, type=int, help='Warmup steps')),
        ('--warmup_proportion', dict(default=None, type=float, help='If set overwrites warmup_steps.')),
        ('--learning_rate', dict(default=2e-5, type=float, help='Learning rate')),
        ('--end_lr', dict(default=0, type=float, help='Final learning rate')),
        ('--max_seq_length', dict(
            default=96,
            type=int,
            help='Maximum sequence length. Sequences longer than this will be truncated, and sequences shorter than this will be padded.')),
        ('--max_predictions_per_seq', dict(default=14, type=int, help='Maximum predictions per sequence_output.')),
        ('--dtype', dict(default='fp32', choices=['fp32', 'bf16', 'fp16'], type=str, help='Data type')),
        ('--steps_per_loop', dict(default=10, type=int, help='Steps per loop')),
        ('--time_history_log_steps', dict(
            default=1000,
            type=int,
            help='Frequency with which to log timing information with TimeHistory.')),
    ]
    for flag, options in option_specs:
        parser.add_argument(flag, **options)

    # (name, default, help) for the boolean switches added through add_bool_arg.
    bool_specs = [
        ('use_tpu', True, 'Use TPU'),
        ('do_eval', False, 'Run evaluation (make sure eval data is present in tfrecords folder)'),
    ]
    for name, default_value, help_text in bool_specs:
        add_bool_arg(parser, name, default=default_value, help=help_text)

    return parser.parse_args()
Ejemplo n.º 2
0
def parse_args():
    """Parse the command line for the huggingface / tf checkpoint options.

    Both ``--path_to_huggingface`` and ``--path_to_tf`` are required; the
    remaining options have defaults.

    Returns:
        The namespace returned by ``parser.parse_args()``.
    """
    parser = argparse.ArgumentParser()
    register = parser.add_argument

    # Required model locations.
    register('--path_to_huggingface', type=str, required=True,
             help='Path to folder containing pytorch model/config.json for huggingface')
    register('--path_to_tf', type=str, required=True,
             help='Path to folder containing tf checkpoint and bert_config.json')

    # Optional settings.
    register('--input_text', default='this is some example test', required=False, help='Test example')
    register('--model_class', default='bert_large_uncased_wwm', choices=PRETRAINED_MODELS.keys(),
             help='Model class to use')
    register('--model_class_huggingface', default='bert-large-uncased',
             help='Model class to use for huggingface tokenizer')

    return parser.parse_args()
Ejemplo n.º 3
0
def parse_args():
    """Parse bucket, checkpoint, model class, project and output-type options.

    Returns:
        The namespace returned by ``parser.parse_args()``.
    """
    parser = ArgParseDefault()
    output_types = ['tf_hub', 'huggingface']

    # Flag -> add_argument keyword arguments (insertion order is registration order).
    option_specs = {
        '--bucket_name': dict(required=True, help='Bucket name'),
        '--init_checkpoint': dict(help='Path to checkpoint'),
        '--model_class': dict(
            default='bert_large_uncased_wwm',
            choices=PRETRAINED_MODELS.keys(),
            help='Model class to use'),
        '--project_name': dict(default='covid-bert', help='Name of subfolder in Google bucket'),
        # Separate list objects for default and choices, as in a literal-per-argument call.
        '--output': dict(
            default=list(output_types),
            choices=list(output_types),
            nargs='+',
            help='Generate output for those model types'),
    }
    for flag, options in option_specs.items():
        parser.add_argument(flag, **options)

    return parser.parse_args()
Ejemplo n.º 4
0
def parse_args():
    """Parse the options selecting a finetune run and the inputs to predict on.

    The parser is created with ``usage=USAGE_DESCRIPTION``. Inputs can be
    given as raw text, local txt files or tfrecord files; ``interactive_mode``
    and ``use_tpu`` are boolean switches that default to False.

    Returns:
        The namespace returned by ``parser.parse_args()``.
    """
    parser = ArgParseDefault(usage=USAGE_DESCRIPTION)

    # (flag, keyword arguments for add_argument); registered in listed order.
    option_specs = [
        ('--run_name', dict(
            required=True,
            help='Finetune run name. The model will be loaded from gs://{bucket_name}/{project_name}/finetune/runs/{run_name}.')),
        ('--bucket_name', dict(required=True, help='Bucket name')),
        ('--project_name', dict(required=False, default='covid-bert', help='Name of subfolder in Google bucket')),
        ('--input_text', dict(
            required=False,
            help='Predict arbitrary input text and print prediction to stdout')),
        ('--input_txt_files', dict(
            nargs='+',
            required=False,
            help='Predict text from local txt files. One example per line.')),
        ('--input_tfrecord_files', dict(
            nargs='+',
            required=False,
            help='Predict text from tfrecord files (local or on bucket).')),
        ('--tpu_ip', dict(required=False, help='IP-address of the TPU')),
        ('--model_class', dict(
            default='bert_large_uncased_wwm',
            choices=PRETRAINED_MODELS.keys(),
            help='Model class to use')),
        ('--num_gpus', dict(default=1, type=int, help='Number of GPUs to use')),
        ('--eval_batch_size', dict(default=32, type=int, help='Eval batch size')),
        ('--label_name', dict(default='label', type=str, help='Name of label to predicted')),
    ]
    for flag, options in option_specs:
        parser.add_argument(flag, **options)

    # Boolean switches handled by the add_bool_arg helper.
    add_bool_arg(parser, 'interactive_mode', default=False, help='Interactive mode')
    add_bool_arg(
        parser,
        'use_tpu',
        default=False,
        help='Use TPU (only works when using input_tfrecord_files stored on a Google bucket)',
    )

    return parser.parse_args()
def parse_args():
    """Parse the text-preprocessing options.

    ``--input_data`` is required; the remaining value options have defaults.
    The boolean switches toggle individual preprocessing steps.

    Returns:
        The namespace returned by ``parser.parse_args()``.
    """
    parser = ArgParseDefault()

    # Value options. The run of spaces inside the --input_data help text is
    # part of the original string.
    value_specs = [
        ('--input_data', dict(
            required=True,
            help='Path to folder with txt files. '
                 '            Folder may contain train/dev/test subfolders. Each txt file contains the text of a single tweet per line.')),
        ('--run_prefix', dict(help='Prefix to be added to all runs. Useful to identify runs')),
        ('--model_class', dict(
            default='bert_large_uncased_wwm',
            choices=PRETRAINED_MODELS.keys(),
            help='Model class')),
        ('--username_filler', dict(default='twitteruser', type=str, help='Username filler')),
        ('--url_filler', dict(
            default='twitterurl',
            type=str,
            help='URL filler (ignored when replace_urls option is false)')),
        ('--num_logged_samples', dict(default=10, type=int, help='Log first n samples to output')),
    ]
    for flag, options in value_specs:
        parser.add_argument(flag, **options)

    # Boolean switches as (name, default, help), in registration order.
    switch_specs = [
        ('run_in_parallel', True, 'Run script in parallel'),
        ('replace_usernames', True, 'Replace usernames with filler'),
        ('replace_urls', True, 'Replace URLs with filler'),
        ('asciify_emojis', True, 'Asciifyi emojis'),
        ('replace_multiple_usernames', True, 'Replace "@user @user" with "2 <username_filler>"'),
        ('replace_multiple_urls', True, 'Replace "http://... http://.." with "2 <url_filler>"'),
        ('remove_unicode_symbols', True, 'After preprocessing remove characters which belong to unicode category "So"'),
        ('remove_accented_characters', False, 'Remove accents/asciify everything. Probably not recommended.'),
    ]
    for name, default_value, help_text in switch_specs:
        add_bool_arg(parser, name, default=default_value, help=help_text)

    return parser.parse_args()
def parse_args():
    """Register the finetuning command-line options and parse the arguments.

    ``--finetune_data`` and ``--bucket_name`` are required. The remaining
    options (TPU selection, preemptible TPU settings, checkpoint
    initialisation, training hyper-parameters, logging frequency) are
    optional, and ``use_tpu`` is a boolean switch that defaults to True.

    Returns:
        The namespace returned by ``parser.parse_args()``.
    """
    parser = ArgParseDefault()

    # (flag, keyword arguments for add_argument); registered in listed order.
    # Runs of spaces inside some help texts are part of the original strings.
    option_specs = [
        ('--finetune_data', dict(
            required=True,
            help='Finetune data folder sub path. Path has to be in gs://{bucket_name}/{project_name}/finetune/finetune_data/{finetune_data}.'
                 '                    This folder includes a meta.json (containing meta info about the dataset), and a file label_mapping.json. '
                 '                    TFrecord files (train.tfrecords and dev.tfrecords) should be located in a '
                 '                    subfolder gs://{bucket_name}/{project_name}/finetune/finetune_data/{finetune_data}/tfrecords/')),
        ('--bucket_name', dict(required=True, help='Bucket name')),
        ('--tpu_ip', dict(required=False, help='IP-address of the TPU')),
        # Help text names the CLI tool the user must install ("gcloud").
        ('--preemptible_tpu', dict(
            default=False,
            action='store_true',
            required=False,
            help='Dynamically create preemptible TPU (this requires you to have gcloud installed with suitable permissions)')),
        ('--preemptible_tpu_zone', dict(
            default='us-central1-f',
            type=str,
            required=False,
            help='Preemptible TPU zone (only if --preemptible_tpu flag is provided)')),
        ('--preemptible_tpu_name', dict(
            default=None,
            type=str,
            required=False,
            help='Preemptible TPU name (only if --preemptible_tpu flag is provided)')),
        ('--preemptible_tpu_version', dict(
            default='nightly',
            choices=['nightly', '2.1'],
            type=str,
            required=False,
            help='Preemptible TPU version (only if --preemptible_tpu flag is provided)')),
        ('--run_prefix', dict(help='Prefix to be added to all runs. Useful to group runs')),
        ('--project_name', dict(default='covid-bert', help='Name of subfolder in Google bucket')),
        ('--model_class', dict(
            default='bert_large_uncased_wwm',
            choices=PRETRAINED_MODELS.keys(),
            help='Model class to use')),
        ('--num_gpus', dict(default=1, type=int, help='Number of GPUs to use')),
        ('--init_checkpoint', dict(
            default=None,
            help='Run name to initialize checkpoint from. Example: "run2/ctl_step_8000.ckpt-8". '
                 '            By default using a pretrained model from gs://{bucket_name}/pretrained_models/')),
        ('--init_checkpoint_index', dict(
            type=int,
            help='Checkpoint index. This argument is ignored and only added for reporting.')),
        ('--repeats', dict(default=1, type=int, help='Number of times the script should run. Default is 1')),
        ('--num_epochs', dict(default=3, type=int, help='Number of epochs')),
        ('--limit_train_steps', dict(
            type=int,
            help='Limit the number of train steps per epoch. Useful for testing.')),
        ('--limit_eval_steps', dict(
            type=int,
            help='Limit the number of eval steps per epoch. Useful for testing.')),
        ('--train_batch_size', dict(default=32, type=int, help='Training batch size')),
        ('--eval_batch_size', dict(default=32, type=int, help='Eval batch size')),
        ('--learning_rate', dict(default=2e-5, type=float, help='Learning rate')),
        ('--end_lr', dict(default=0, type=float, help='Final learning rate')),
        ('--warmup_proportion', dict(default=0.1, type=float, help='Learning rate warmup proportion')),
        ('--max_seq_length', dict(default=96, type=int, help='Maximum sequence length')),
        ('--early_stopping_epochs', dict(
            default=-1,
            type=int,
            help="Stop when loss hasn't decreased during n epochs")),
        ('--optimizer_type', dict(default='adamw', choices=['adamw', 'lamb'], type=str, help='Optimizer')),
        ('--dtype', dict(default='fp32', choices=['fp32', 'bf16', 'fp16'], type=str, help='Data type')),
        ('--steps_per_loop', dict(
            default=10,
            type=int,
            help='Steps per loop (unavailable for Keras fit in TF 2.2, will be added in later version)')),
        ('--time_history_log_steps', dict(
            default=10,
            type=int,
            help='Frequency with which to log timing information with TimeHistory.')),
    ]
    for flag, options in option_specs:
        parser.add_argument(flag, **options)

    add_bool_arg(parser, 'use_tpu', default=True, help='Use TPU')

    return parser.parse_args()
Ejemplo n.º 7
0
def parse_args():
    """Parse the options for preprocessing a set of input txt files.

    All value options are optional; the boolean switches toggle individual
    preprocessing steps and whether an output file is written.

    Returns:
        The namespace returned by ``parser.parse_args()``.
    """
    parser = ArgParseDefault()
    add = parser.add_argument

    # Value options.
    add('--input_txt_files', type=str, nargs='+', help='Input txt files to process.')
    add('--model_class', default='bert_large_uncased_wwm', choices=PRETRAINED_MODELS.keys(), help='Model class')
    add('--run_prefix', help='Run prefix')
    add('--max_seq_length', default=96, type=int, help='Maximum sequence length')
    add('--username_filler', default='twitteruser', type=str, help='Username filler')
    add('--url_filler', default='twitterurl', type=str, help='URL filler (ignored when replace_urls option is false)')

    # Boolean switches as (name, default, help), in registration order.
    switch_specs = (
        ('replace_usernames', True, 'Replace usernames with filler'),
        ('replace_urls', True, 'Replace URLs with filler'),
        ('asciify_emojis', True, 'Asciifyi emojis'),
        ('replace_multiple_usernames', True, 'Replace "@user @user" with "2 <username_filler>"'),
        ('replace_multiple_urls', True, 'Replace "http://... http://.." with "2 <url_filler>"'),
        ('remove_unicode_symbols', True, 'After preprocessing remove characters which belong to unicode category "So"'),
        ('standardize_punctuation', True, 'Standardize (asciifyi) special punctuation'),
        ('remove_accented_characters', False, 'Remove accents/asciify everything. Probably not recommended.'),
        ('write_preprocessed_file', True, 'Write preprocess output file'),
        ('run_in_parallel', True, 'Run script in parallel'),
    )
    for name, default_value, help_text in switch_specs:
        add_bool_arg(parser, name, default=default_value, help=help_text)

    args = parser.parse_args()
    return args
def parse_args():
    """Parse the options controlling tfrecord generation for a run.

    ``--run_name`` is required; the masking, sequence-length, seeding and
    CPU-count options all have defaults. ``--gzipped`` is a store-true flag
    and ``run_in_parallel`` is a boolean switch that defaults to True.

    Returns:
        The namespace returned by ``parser.parse_args()``.
    """
    parser = ArgParseDefault()
    add = parser.add_argument

    # The runs of spaces inside the two long help texts are part of the
    # original strings.
    add('--run_name', required=True,
        help='Run name to create tf record files for. Run folder has to be located under '
             '            data/pretrain/{run_name}/preprocessed/ and must contain one or multiple txt files. May also contain train and dev subfolders with txt files.')
    add('--max_seq_length', default=96, type=int, help='Maximum sequence length')
    add('--model_class', default='bert_large_uncased_wwm', choices=PRETRAINED_MODELS.keys(), help='Model class to use')
    add('--dupe_factor', default=10, type=int,
        help='Number of times to duplicate the input data (with different masks).')
    add('--short_seq_prob', default=0.1, type=float,
        help='Probability of creating sequences which are shorter than the maximum length.')
    add('--max_predictions_per_seq', default=14, type=int,
        help='Maximum number of masked LM predictions per sequence.')
    add('--random_seed', default=42, type=int, help='Random seed')
    add('--masked_lm_prob', default=0.15, type=float, help='Masked LM probabibility')
    add('--gzipped', action='store_true', default=False, help='Create gzipped tfrecords files')
    add('--num_logged_samples', default=10, type=int, help='Log first n samples to output')
    add('--max_num_cpus', default=10, type=int,
        help='Adapt this number based on the available memory/size of input files. '
             '            This code was tested on a machine with a lot of memory (250GB). Decrease this number if you run into memory issues.')

    add_bool_arg(parser, 'run_in_parallel', default=True, help='Run script in parallel')

    args = parser.parse_args()
    return args
Ejemplo n.º 9
0
def parse_args():
    """Parse the required input string and the model class to use.

    Returns:
        The namespace returned by ``parser.parse_args()``.
    """
    parser = argparse.ArgumentParser()
    model_choices = list(PRETRAINED_MODELS.keys())

    parser.add_argument(
        '-i',
        '--input',
        required=True,
        type=str,
        help="Input to tokenize",
    )
    parser.add_argument(
        '--model_class',
        default='covid-twitter-bert',
        choices=model_choices,
        help="Model class",
    )
    return parser.parse_args()
Ejemplo n.º 10
0
        type='laptop',
        folds=FOLDS,
        return_val_idxs=True)
    VAL_IDX = []
    LB, GT = [], []
    for idxs in val_idxs:
        VAL_IDX.extend(idxs)
    for train, val in cv_loader:
        for ((rv_raw, lb_raw), x, y) in val:
            LB.extend(lb_raw)
            GT.extend(rv_raw)
    tokenizers = dict([
        (model_name,
         BertTokenizer.from_pretrained(model_config['path'],
                                       do_lower_case=True))
        for model_name, model_config in PRETRAINED_MODELS.items()
    ])
    # print(tokenizers)

    cv_loaders = dict([
        (model_name,
         get_data_loaders_cv(rv_path='../data/TRAIN/Train_laptop_reviews.csv',
                             lb_path='../data/TRAIN/Train_laptop_labels.csv',
                             tokenizer=tokenizers[model_name],
                             batch_size=args.bs,
                             type='laptop',
                             folds=FOLDS))
        for model_name, model_config in PRETRAINED_MODELS.items()
    ])

    PRED = []
def parse_args():
    """Register the command-line options and parse the process arguments.

    ``--tpu_ip``, ``--bucket_name`` and ``--pretrain_data`` are required.
    The remaining options (TPU naming, model class, optimizer/schedule
    hyper-parameters, logging frequency) have defaults, and ``use_tpu`` /
    ``do_eval`` are boolean switches.

    Returns:
        The namespace returned by ``parser.parse_args()``.
    """
    parser = ArgParseDefault()

    # (flag, keyword arguments for add_argument); registered in listed order.
    option_specs = [
        ('--tpu_ip', dict(required=True, help='IP-address of the TPU')),
        ('--bucket_name', dict(required=True, help='Bucket name')),
        ('--tpu_name', dict(required=False, help='Name of the TPU')),
        ('--tpu_name_project', dict(required=False, help='Name of the TPU project')),
        ('--pretrain_data', dict(
            required=True,
            type=str,
            help='Folder which contains pretrain data. Should be located under gs://{bucket_name}/{project_name}/pretrain/pretrain_data/')),
        ('--run_prefix', dict(help='Prefix to be added to all runs. Useful to group runs')),
        ('--model_class', dict(
            default='bert_large_uncased_wwm',
            choices=PRETRAINED_MODELS.keys(),
            help='Model class to use')),
        ('--project_name', dict(default='covid-bert', help='Name of subfolder in Google bucket')),
        ('--num_gpus', dict(default=1, type=int, help='Number of GPUs to use')),
        ('--eval_steps', dict(
            default=1000,
            type=int,
            help='Number eval steps to run (only active when --do_eval flag is provided)')),
        ('--optimizer_type', dict(default='adamw', choices=['adamw', 'lamb'], type=str, help='Optimizer')),
        ('--train_batch_size', dict(default=32, type=int, help='Training batch size')),
        ('--eval_batch_size', dict(default=32, type=int, help='Eval batch size')),
        ('--num_epochs', dict(default=3, type=int, help='Number of epochs')),
        ('--num_steps_per_epoch', dict(default=1000, type=int, help='Number of steps per epoch')),
        ('--warmup_steps', dict(default=10000, type=int, help='Warmup steps')),
        ('--learning_rate', dict(default=2e-5, type=float, help='Learning rate')),
        ('--end_lr', dict(default=0, type=float, help='Final learning rate')),
        ('--max_seq_length', dict(
            default=96,
            type=int,
            help='Maximum sequence length. Sequences longer than this will be truncated, and sequences shorter than this will be padded.')),
        ('--max_predictions_per_seq', dict(default=14, type=int, help='Maximum predictions per sequence_output.')),
        ('--dtype', dict(default='fp32', choices=['fp32', 'bf16', 'fp16'], type=str, help='Data type')),
        ('--steps_per_loop', dict(default=10, type=int, help='Steps per loop')),
        ('--time_history_log_steps', dict(
            default=1000,
            type=int,
            help='Frequency with which to log timing information with TimeHistory.')),
    ]
    for flag, options in option_specs:
        parser.add_argument(flag, **options)

    # (name, default, help) for the boolean switches added through add_bool_arg.
    bool_specs = [
        ('use_tpu', True, 'Use TPU'),
        ('do_eval', False, 'Run evaluation (make sure eval data is present in tfrecords folder)'),
    ]
    for name, default_value, help_text in bool_specs:
        add_bool_arg(parser, name, default=default_value, help=help_text)

    return parser.parse_args()
Ejemplo n.º 12
0
def parse_args():
    """Parse the options for preprocessing finetune datasets.

    All value options are optional; the boolean switches toggle individual
    text-normalisation steps.

    Returns:
        The namespace returned by ``parser.parse_args()``.
    """
    parser = ArgParseDefault()

    # Value options. The run of spaces inside the --finetune_datasets help
    # text is part of the original string.
    value_specs = [
        ('--finetune_datasets', dict(
            type=str,
            nargs='+',
            help='Finetune dataset(s) to process. These correspond to folder names in data/finetune. '
                 '            Data should be located in data/finetune/originals/{finetune_dataset}/[train.tsv/dev.tsv/test.tsv]. By default runs all datasets.')),
        ('--model_class', dict(
            default='bert_large_uncased_wwm',
            choices=PRETRAINED_MODELS.keys(),
            help='Model class')),
        ('--run_prefix', dict(help='Prefix to be added to all runs. Useful to identify runs')),
        ('--max_seq_length', dict(default=96, type=int, help='Maximum sequence length')),
        ('--username_filler', dict(default='twitteruser', type=str, help='Username filler')),
        ('--url_filler', dict(
            default='twitterurl',
            type=str,
            help='URL filler (ignored when replace_urls option is false)')),
    ]
    for flag, options in value_specs:
        parser.add_argument(flag, **options)

    # Boolean switches as (name, default, help), in registration order.
    switch_specs = [
        ('replace_usernames', True, 'Replace usernames with filler'),
        ('replace_urls', True, 'Replace URLs with filler'),
        ('asciify_emojis', True, 'Asciifyi emojis'),
        ('replace_multiple_usernames', True, 'Replace "@user @user" with "2 <username_filler>"'),
        ('replace_multiple_urls', True, 'Replace "http://... http://.." with "2 <url_filler>"'),
        ('standardize_punctuation', True, 'Standardize (asciifyi) special punctuation'),
        ('remove_unicode_symbols', True, 'After preprocessing remove characters which belong to unicode category "So"'),
        ('remove_accented_characters', False, 'Remove accents/asciify everything. Probably not recommended.'),
    ]
    for name, default_value, help_text in switch_specs:
        add_bool_arg(parser, name, default=default_value, help=help_text)

    args = parser.parse_args()
    return args