def parse_args():
    """Declare the pretraining command-line options and parse them.

    Returns:
        The namespace returned by ``parser.parse_args()``.
    """
    cli = ArgParseDefault()

    # TPU, bucket and data location.
    cli.add_argument('--tpu_ip', help='IP-address of the TPU', required=False)
    cli.add_argument('--bucket_name', help='Bucket name', required=True)
    cli.add_argument('--tpu_name', help='Name of the TPU', required=False)
    cli.add_argument('--tpu_name_project', help='Name of the TPU project', required=False)
    cli.add_argument(
        '--pretrain_data',
        type=str,
        required=True,
        help='Folder which contains pretrain data. Should be located under '
             'gs://{bucket_name}/{project_name}/pretrain/pretrain_data/',
    )
    cli.add_argument('--run_prefix', help='Prefix to be added to all runs. Useful to group runs')
    cli.add_argument(
        '--model_class',
        choices=PRETRAINED_MODELS.keys(),
        default='bert_large_uncased_wwm',
        help='Model class to use',
    )
    cli.add_argument('--project_name', default='covid-bert', help='Name of subfolder in Google bucket')
    cli.add_argument('--num_gpus', type=int, default=1, help='Number of GPUs to use')
    cli.add_argument(
        '--eval_steps',
        type=int,
        default=1000,
        help='Number eval steps to run (only active when --do_eval flag is provided)',
    )

    # Checkpoint initialisation / restart options.
    cli.add_argument(
        '--init_checkpoint',
        default=None,
        help='Run name to initialize checkpoint from. Example: "run2/ctl_step_8000.ckpt-8". '
             'or "run2/pretrained/bert_model_8000.ckpt-8". The first contains the mlm/nsp layers. '
             'By default using a pretrained model from gs://{bucket_name}/pretrained_models/',
    )
    # NOTE(review): no type/action is given, so any value passed here arrives
    # as a non-empty string -- confirm how the caller interprets it.
    cli.add_argument(
        '--load_mlm_nsp_weights',
        default=None,
        help="If set to True it will load the mlm/nsp-layers. The init_checkpoint should then be "
             "set to a model containing these. Usually in base run-directory named 'ctl_step*'.",
    )
    # NOTE(review): parsed as a string (no type) -- confirm the caller converts it.
    cli.add_argument(
        '--set_trainstep',
        default=None,
        help="If set this will set the trainstep. This is only needed when restarting from an old "
             "checkpoint and you would like to get the scheduler/optimiser to start at the correct point.",
    )

    cli.add_argument('--optimizer_type', type=str, choices=['adamw', 'lamb'], default='adamw', help='Optimizer')

    # Plain numeric options, registered in their original order.
    schedule_options = (
        ('--train_batch_size', int, 32, 'Training batch size'),
        ('--eval_batch_size', int, 32, 'Eval batch size'),
        ('--num_epochs', int, 3, 'Number of epochs'),
        ('--num_steps_per_epoch', int, 1000, 'Number of steps per epoch'),
        ('--warmup_steps', int, 10000, 'Warmup steps'),
        ('--warmup_proportion', float, None, 'If set overwrites warmup_steps.'),
        ('--learning_rate', float, 2e-5, 'Learning rate'),
        ('--end_lr', float, 0, 'Final learning rate'),
    )
    for flag, caster, fallback, text in schedule_options:
        cli.add_argument(flag, default=fallback, type=caster, help=text)

    cli.add_argument(
        '--max_seq_length',
        type=int,
        default=96,
        help='Maximum sequence length. Sequences longer than this will be truncated, '
             'and sequences shorter than this will be padded.',
    )
    cli.add_argument(
        '--max_predictions_per_seq',
        type=int,
        default=14,
        help='Maximum predictions per sequence_output.',
    )
    cli.add_argument('--dtype', type=str, choices=['fp32', 'bf16', 'fp16'], default='fp32', help='Data type')
    cli.add_argument('--steps_per_loop', type=int, default=10, help='Steps per loop')
    cli.add_argument(
        '--time_history_log_steps',
        type=int,
        default=1000,
        help='Frequency with which to log timing information with TimeHistory.',
    )

    # Boolean switches registered through the project helper.
    add_bool_arg(cli, 'use_tpu', default=True, help='Use TPU')
    add_bool_arg(
        cli,
        'do_eval',
        default=False,
        help='Run evaluation (make sure eval data is present in tfrecords folder)',
    )

    return cli.parse_args()
def prepare_predict(self):
    """Parse the sub-command options and hand them to ``utils.task_helpers.prepare_predict``.

    Options are read from ``sys.argv[2:]``, i.e. everything after the first
    two entries of ``sys.argv``.
    """
    # Imported lazily under an alias so it does not shadow this method's name.
    from utils.task_helpers import prepare_predict as run_prepare_predict

    cli = ArgParseDefault(
        description='Prepare data for prediction with the text-classification library. '
                    'This function generates two files (1 for text 1 for IDs/created_at) under data/other. '
                    'The text.csv file can then be predicted.'
    )

    # Optional date window.
    cli.add_argument('--start_date', default=None, required=False, help='Filter start date')
    cli.add_argument('--end_date', default=None, required=False, help='Filter end date')

    # Anonymisation switch and the fillers it uses.
    add_bool_arg(cli, 'anonymize', default=True, help='Replace usernames and URLs with filler (@user and <url>)')
    cli.add_argument('--url_filler', default='<url>', required=False, help='Filler for urls (if anonymize)')
    cli.add_argument('--user_filler', default='@user', required=False, help='Filler for user names (if anonymize)')

    options = cli.parse_args(sys.argv[2:])
    run_prepare_predict(options)
def parse_args():
    """Parse the command-line options for the input-folder to output-folder script.

    ``--shards`` is declared with ``type=int`` so that a value supplied on the
    command line has the same type as the integer default (previously a
    user-supplied value arrived as ``str`` while the default was ``int``).

    Returns:
        argparse.Namespace: The parsed arguments.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-i', '--input_folder',
        required=True,
        help='Path to input folder. All files ending with *.txt will be parsed.',
    )
    parser.add_argument(
        '-o', '--output_folder',
        required=True,
        help='Output folder. Will be created if it does not exist',
    )
    parser.add_argument(
        '-s', '--shards',
        required=False,
        default=1,
        type=int,  # keep CLI-supplied values consistent with the int default
        help='Number of shards',
    )
    parser.add_argument(
        '-n', '--output_name',
        required=False,
        default="output",
        help='Specify the name of the output file',
    )
    add_bool_arg(parser, 'randomize', default=True, help='Randomizes all articles before segmentation.')
    add_bool_arg(parser, 'deduplicate', default=True, help='Deduplicates all articles before sentence segmenation.')
    args = parser.parse_args()
    return args
def sample(self):
    """Parse the sampling options from ``sys.argv[2:]`` and call ``sample_tweets.run``."""
    import utils.processing.sample_tweets as sample_tweets

    cli = ArgParseDefault(description='Sample cleaned data to generate `data/2_sampled`')

    # Sample size and strategy.
    cli.add_argument('-s', '--size', dest='size', type=int, required=True, help='Number of tweets to sample')
    cli.add_argument('-bs', '--bin_size', type=int, required=False, help='Number of tweets per bin')
    cli.add_argument(
        '-m', '--mode',
        choices=['monthly', 'random'],
        default='random',
        required=False,
        help='Sampling mode. Random: Sample randomly. Monthly: Try to sample evenly within months.',
    )

    # Filters.
    cli.add_argument('-l', '--langs', nargs='+', default=[], required=False, help='Filter by language(s)')
    cli.add_argument(
        '--contains_keywords',
        action='store_true',
        default=False,
        help='Only sample from tweets which include keywords',
    )
    cli.add_argument('--min_token_count', type=int, default=3, required=False, help='Minimum number of tokens')
    cli.add_argument('--include_replies', action='store_true', default=False, help='Include replies')

    # Reproducibility / extension of an earlier sample.
    cli.add_argument('--seed', type=int, default=None, required=False, help='Random state split')
    cli.add_argument(
        '--extend',
        action='store_true',
        help='Extending existing sample given by seed by removing already labelled tweets. '
             'If size is <= original sample size this has no effect except removing labelled tweets',
    )
    add_bool_arg(cli, 'anonymize', default=True, help='Replace usernames and URLs with filler (@user and <url>)')

    # Date window.
    cli.add_argument('--max_date', default=None, required=False, help='Sample until date (YYYY-MM-DD), default: No max')
    cli.add_argument('--min_date', default=None, required=False, help='Sample from date (YYYY-MM-DD), default: No min')

    opts = cli.parse_args(sys.argv[2:])
    sample_tweets.run(
        size=opts.size,
        contains_keywords=opts.contains_keywords,
        anonymize=opts.anonymize,
        min_token_count=opts.min_token_count,
        langs=opts.langs,
        include_replies=opts.include_replies,
        mode=opts.mode,
        seed=opts.seed,
        extend=opts.extend,
        bin_size=opts.bin_size,
        min_date=opts.min_date,
        max_date=opts.max_date,
    )
def parse_args():
    """Declare the prediction command-line options and parse them.

    Returns:
        The namespace returned by ``parser.parse_args()``.
    """
    cli = ArgParseDefault(usage=USAGE_DESCRIPTION)

    # Which finetuned model to load.
    cli.add_argument(
        '--run_name',
        required=True,
        help='Finetune run name. The model will be loaded from '
             'gs://{bucket_name}/{project_name}/finetune/runs/{run_name}.',
    )
    cli.add_argument('--bucket_name', required=True, help='Bucket name')
    cli.add_argument(
        '--project_name',
        default='covid-bert',
        required=False,
        help='Name of subfolder in Google bucket',
    )

    # Input sources.
    cli.add_argument(
        '--input_text',
        required=False,
        help='Predict arbitrary input text and print prediction to stdout',
    )
    cli.add_argument(
        '--input_txt_files',
        nargs='+',
        required=False,
        help='Predict text from local txt files. One example per line.',
    )
    cli.add_argument(
        '--input_tfrecord_files',
        nargs='+',
        required=False,
        help='Predict text from tfrecord files (local or on bucket).',
    )

    # Hardware and model settings.
    cli.add_argument('--tpu_ip', required=False, help='IP-address of the TPU')
    cli.add_argument(
        '--model_class',
        choices=PRETRAINED_MODELS.keys(),
        default='bert_large_uncased_wwm',
        help='Model class to use',
    )
    cli.add_argument('--num_gpus', type=int, default=1, help='Number of GPUs to use')
    cli.add_argument('--eval_batch_size', type=int, default=32, help='Eval batch size')
    cli.add_argument('--label_name', type=str, default='label', help='Name of label to predicted')

    # Boolean switches, registered in their original order.
    switches = (
        ('interactive_mode', False, 'Interactive mode'),
        ('use_tpu', False, 'Use TPU (only works when using input_tfrecord_files stored on a Google bucket)'),
    )
    for switch_name, switch_default, switch_help in switches:
        add_bool_arg(cli, switch_name, default=switch_default, help=switch_help)

    return cli.parse_args()
def parse_args():
    """Declare the finetuning command-line options and parse them.

    Returns:
        The namespace returned by ``parser.parse_args()``.
    """
    cli = ArgParseDefault()

    # Data and storage location.
    cli.add_argument(
        '--finetune_data',
        required=True,
        help='Finetune data folder sub path. Path has to be in '
             'gs://{bucket_name}/{project_name}/finetune/finetune_data/{finetune_data}. '
             'This folder includes a meta.json (containing meta info about the dataset), '
             'and a file label_mapping.json. '
             'TFrecord files (train.tfrecords and dev.tfrecords) should be located in a '
             'subfolder gs://{bucket_name}/{project_name}/finetune/finetune_data/{finetune_data}/tfrecords/',
    )
    cli.add_argument('--bucket_name', required=True, help='Bucket name')

    # TPU configuration, including the optional preemptible TPU.
    cli.add_argument('--tpu_ip', required=False, help='IP-address of the TPU')
    cli.add_argument(
        '--preemptible_tpu',
        action='store_true',
        default=False,
        required=False,
        help='Dynamically create preemptible TPU (this requires you to have glcoud installed '
             'with suitable permissions)',
    )
    cli.add_argument(
        '--preemptible_tpu_zone',
        type=str,
        default='us-central1-f',
        required=False,
        help='Preemptible TPU zone (only if --preemptible_tpu flag is provided)',
    )
    cli.add_argument(
        '--preemptible_tpu_name',
        type=str,
        default=None,
        required=False,
        help='Preemptible TPU name (only if --preemptible_tpu flag is provided)',
    )
    cli.add_argument(
        '--preemptible_tpu_version',
        type=str,
        choices=['nightly', '2.1'],
        default='nightly',
        required=False,
        help='Preemptible TPU version (only if --preemptible_tpu flag is provided)',
    )

    # Run identification and model selection.
    cli.add_argument('--run_prefix', help='Prefix to be added to all runs. Useful to group runs')
    cli.add_argument('--project_name', default='covid-bert', help='Name of subfolder in Google bucket')
    cli.add_argument(
        '--model_class',
        choices=PRETRAINED_MODELS.keys(),
        default='bert_large_uncased_wwm',
        help='Model class to use',
    )
    cli.add_argument('--num_gpus', type=int, default=1, help='Number of GPUs to use')
    cli.add_argument(
        '--init_checkpoint',
        default=None,
        help='Run name to initialize checkpoint from. Example: "run2/ctl_step_8000.ckpt-8". '
             'By default using a pretrained model from gs://{bucket_name}/pretrained_models/',
    )
    cli.add_argument(
        '--init_checkpoint_index',
        type=int,
        help='Checkpoint index. This argument is ignored and only added for reporting.',
    )

    # Training schedule.
    cli.add_argument(
        '--repeats',
        type=int,
        default=1,
        help='Number of times the script should run. Default is 1',
    )
    cli.add_argument('--num_epochs', type=int, default=3, help='Number of epochs')
    cli.add_argument(
        '--limit_train_steps',
        type=int,
        help='Limit the number of train steps per epoch. Useful for testing.',
    )
    cli.add_argument(
        '--limit_eval_steps',
        type=int,
        help='Limit the number of eval steps per epoch. Useful for testing.',
    )

    # Plain numeric options, registered in their original order.
    numeric_options = (
        ('--train_batch_size', int, 32, 'Training batch size'),
        ('--eval_batch_size', int, 32, 'Eval batch size'),
        ('--learning_rate', float, 2e-5, 'Learning rate'),
        ('--end_lr', float, 0, 'Final learning rate'),
        ('--warmup_proportion', float, 0.1, 'Learning rate warmup proportion'),
        ('--max_seq_length', int, 96, 'Maximum sequence length'),
        ('--early_stopping_epochs', int, -1, "Stop when loss hasn't decreased during n epochs"),
    )
    for flag, caster, fallback, text in numeric_options:
        cli.add_argument(flag, default=fallback, type=caster, help=text)

    cli.add_argument('--optimizer_type', type=str, choices=['adamw', 'lamb'], default='adamw', help='Optimizer')
    cli.add_argument('--dtype', type=str, choices=['fp32', 'bf16', 'fp16'], default='fp32', help='Data type')
    cli.add_argument(
        '--steps_per_loop',
        type=int,
        default=10,
        help='Steps per loop (unavailable for Keras fit in TF 2.2, will be added in later version)',
    )
    cli.add_argument(
        '--time_history_log_steps',
        type=int,
        default=10,
        help='Frequency with which to log timing information with TimeHistory.',
    )
    add_bool_arg(cli, 'use_tpu', default=True, help='Use TPU')

    return cli.parse_args()
def parse(self):
    """Parse the parsing-stage options from ``sys.argv[2:]`` and call ``parse_tweets.run``."""
    import utils.processing.parse_tweets as parse_tweets

    cli = ArgParseDefault(description='Preprocess raw data to create parquet files in `data/1_parsed`')
    cli.add_argument(
        '--no-parallel',
        action='store_true',
        default=False,
        dest='no_parallel',
        help='Do not run in parallel',
    )
    cli.add_argument(
        '--extend',
        action='store_true',
        default=False,
        dest='extend',
        help='Extend existing parsed data',
    )
    cli.add_argument(
        '--ray_num_cpus',
        type=int,
        default=None,
        help='Limit the number of worker processes for Ray during the memory intensive merge phase '
             '(by default using maximum worker processes)',
    )

    # Boolean switches, registered in their original order.
    switches = (
        ('extract_retweets', True, 'Extract top-level retweets'),
        ('extract_quotes', True, 'Extract top-level quotes'),
        ('omit_last_day', True, 'Omit parsing data from the last day'),
    )
    for switch_name, switch_default, switch_help in switches:
        add_bool_arg(cli, switch_name, default=switch_default, help=switch_help)

    opts = cli.parse_args(sys.argv[2:])
    parse_tweets.run(
        no_parallel=opts.no_parallel,
        extract_retweets=opts.extract_retweets,
        extract_quotes=opts.extract_quotes,
        extend=opts.extend,
        omit_last_day=opts.omit_last_day,
        ray_num_cpus=opts.ray_num_cpus,
    )
def parse_args():
    """Declare the article-filtering command-line options and parse them.

    Returns:
        argparse.Namespace: The parsed arguments.
    """
    cli = argparse.ArgumentParser()

    # Mandatory input/output locations.
    cli.add_argument(
        '-i', '--input_folder',
        required=True,
        help='Path to input folder. All files ending with *.txt will be parsed.',
    )
    cli.add_argument('-o', '--output_file', required=True, help='Output file. Will overwrite it exists')

    # Optional filters and fallbacks: (short flag, long flag, default, help).
    # NOTE(review): no ``type`` is given, so every value -- including the
    # numeric-looking defaults -- is a string; presumably converted by the caller.
    optional_settings = (
        ('-s', '--min_ocr_date', '01-01-2009',
         'Will drop all articles that was ocr-ed prior to this date'),
        ('-p', '--min_publish_year', '1814',
         'Will drop all articles published prior to this year'),
        ('-l', '--language', '',
         'If set, only articles in this language will be included'),
        ('-L', '--unknown_language', 'nob',
         'Any unknown language is set to this value'),
        ('-y', '--unknown_year', '1900',
         'Any unknown year is set to this value'),
        ('-C', '--min_confidence_article', '0.9',
         'Will drop all articles with lower average word confidence'),
        ('-c', '--min_confidence_paragraph', '0.8',
         'Will drop all paragraphs with lower average word confidence'),
        ('-a', '--min_words_paragraph', '5.0',
         'Minimum average number of words per paragraph in the entire article/book'),
        ('-w', '--min_words_article', '20',
         'Minimum words in the entire article/book'),
    )
    for short_flag, long_flag, fallback, text in optional_settings:
        cli.add_argument(short_flag, long_flag, required=False, default=fallback, help=text)

    add_bool_arg(cli, 'debug', default=False, help='Print debug info about paragraphs.')
    add_bool_arg(cli, 'clean', default=False, help='Run precedure for cleaning text. Specified in sub-routine.')

    return cli.parse_args()
def parse_args():
    """Declare the text-preprocessing command-line options and parse them.

    Returns:
        The namespace returned by ``parser.parse_args()``.
    """
    cli = ArgParseDefault()
    cli.add_argument(
        '--input_data',
        required=True,
        help='Path to folder with txt files. '
             'Folder may contain train/dev/test subfolders. '
             'Each txt file contains the text of a single tweet per line.',
    )
    cli.add_argument('--run_prefix', help='Prefix to be added to all runs. Useful to identify runs')
    cli.add_argument(
        '--model_class',
        choices=PRETRAINED_MODELS.keys(),
        default='bert_large_uncased_wwm',
        help='Model class',
    )
    cli.add_argument('--username_filler', type=str, default='twitteruser', help='Username filler')
    cli.add_argument(
        '--url_filler',
        type=str,
        default='twitterurl',
        help='URL filler (ignored when replace_urls option is false)',
    )
    cli.add_argument('--num_logged_samples', type=int, default=10, help='Log first n samples to output')

    # Boolean switches, registered in their original order.
    switches = (
        ('run_in_parallel', True, 'Run script in parallel'),
        ('replace_usernames', True, 'Replace usernames with filler'),
        ('replace_urls', True, 'Replace URLs with filler'),
        ('asciify_emojis', True, 'Asciifyi emojis'),
        ('replace_multiple_usernames', True, 'Replace "@user @user" with "2 <username_filler>"'),
        ('replace_multiple_urls', True, 'Replace "http://... http://.." with "2 <url_filler>"'),
        ('remove_unicode_symbols', True,
         'After preprocessing remove characters which belong to unicode category "So"'),
        ('remove_accented_characters', False,
         'Remove accents/asciify everything. Probably not recommended.'),
        ('standardize_punctuation', True, 'Standardize (asciifyi) special punctuation'),
    )
    for switch_name, switch_default, switch_help in switches:
        add_bool_arg(cli, switch_name, default=switch_default, help=switch_help)

    return cli.parse_args()
def parse_args():
    """Declare the text-cleaning command-line options and parse them.

    ``--max_words_in_section`` is declared with ``type=int`` so that a value
    supplied on the command line has the same type as the integer default
    (previously a user-supplied value arrived as ``str``). Two help strings
    with unbalanced quotes are also repaired.

    Returns:
        The namespace returned by ``parser.parse_args()``.
    """
    parser = ArgParseDefault()

    # Input/output files.
    parser.add_argument('--input_file', required=True, help='Path to input file.')
    parser.add_argument('--output_file', required=True, help='Path to output file.')

    # Filler values used by the replace_* switches below.
    parser.add_argument(
        '--username_filler',
        default='@user',
        type=str,
        help='Username filler (ignored when replace_username option is false)',
    )
    parser.add_argument(
        '--url_filler',
        default='http://domain.com',
        type=str,
        help='URL filler (ignored when replace_urls option is false)',
    )
    parser.add_argument(
        '--email_filler',
        default='*****@*****.**',
        type=str,
        help='Email filler (ignored when replace_email option is false)',
    )

    parser.add_argument(
        '--digibok',
        default='keep',
        type=str,
        help='Handling of digibok_ids. "keep", "remove" or "auto". Last option relies on other settings in script',
    )
    parser.add_argument(
        '--min_alphawords',
        default=2,
        type=int,
        help='The minimum number of letter-only- words with a length of at least 2. Keeps empty lines.',
    )
    parser.add_argument(
        '--max_words_in_section',
        required=False,
        default=1000,
        type=int,  # keep CLI-supplied values consistent with the int default
        help='After reaching this maximum number of words, the next paragraph will be split into a new section.',
    )

    # Boolean switches registered through the project helper.
    add_bool_arg(parser, 'replace_usernames', default=False, help='Replace usernames with filler. Mainly for tweets')
    add_bool_arg(parser, 'replace_urls', default=False, help='Replace URLs with filler')
    add_bool_arg(parser, 'replace_email', default=True, help='Replace emails with filler')
    add_bool_arg(
        parser,
        'fix_unicode',
        default=True,
        help='Use ftfy to fix and standardise unicode. Converts it all to valid utf-8',
    )
    # NOTE(review): the help text says "On by default" but default is False --
    # confirm which is intended before changing either.
    add_bool_arg(
        parser,
        'asciify_emojis',
        default=False,
        help='Asciifyi emojis. On by default but mainly useful for social media',
    )
    add_bool_arg(
        parser,
        'replace_multiple_usernames',
        default=False,
        help='Replace "@user @user" with "2 <username_filler>". Mainly for use on tweets',
    )
    add_bool_arg(
        parser,
        'standardize',
        default=True,
        help='Standardize text. Remove all control characters.',
    )
    add_bool_arg(
        parser,
        'replace_multiple_urls',
        default=False,
        help='Replace "http://... http://.." with "2 <url_filler>". Mainly for use on tweets',
    )
    add_bool_arg(
        parser,
        'remove_unicode_symbols',
        default=True,
        help='After preprocessing remove characters which belong to unicode category "So"',
    )
    add_bool_arg(
        parser,
        'remove_accented_characters',
        default=False,
        help='Remove accents/asciify everything. Probably not recommended.',
    )
    add_bool_arg(
        parser,
        'standardize_punctuation',
        default=True,
        help='Standardize (asciifyi) special punctuation',
    )
    add_bool_arg(parser, 'do_lower_case', default=False, help='Convert text to lower case')

    args = parser.parse_args()
    return args
def parse_args():
    """Declare the txt-file preprocessing command-line options and parse them.

    Returns:
        The namespace returned by ``parser.parse_args()``.
    """
    cli = ArgParseDefault()
    cli.add_argument('--input_txt_files', nargs='+', type=str, help='Input txt files to process.')
    cli.add_argument(
        '--model_class',
        choices=PRETRAINED_MODELS.keys(),
        default='bert_large_uncased_wwm',
        help='Model class',
    )
    cli.add_argument('--run_prefix', help='Run prefix')
    cli.add_argument('--max_seq_length', type=int, default=96, help='Maximum sequence length')
    cli.add_argument('--username_filler', type=str, default='twitteruser', help='Username filler')
    cli.add_argument(
        '--url_filler',
        type=str,
        default='twitterurl',
        help='URL filler (ignored when replace_urls option is false)',
    )

    # Boolean switches, registered in their original order.
    switches = (
        ('replace_usernames', True, 'Replace usernames with filler'),
        ('replace_urls', True, 'Replace URLs with filler'),
        ('asciify_emojis', True, 'Asciifyi emojis'),
        ('replace_multiple_usernames', True, 'Replace "@user @user" with "2 <username_filler>"'),
        ('replace_multiple_urls', True, 'Replace "http://... http://.." with "2 <url_filler>"'),
        ('remove_unicode_symbols', True,
         'After preprocessing remove characters which belong to unicode category "So"'),
        ('standardize_punctuation', True, 'Standardize (asciifyi) special punctuation'),
        ('remove_accented_characters', False,
         'Remove accents/asciify everything. Probably not recommended.'),
        ('write_preprocessed_file', True, 'Write preprocess output file'),
        ('run_in_parallel', True, 'Run script in parallel'),
    )
    for switch_name, switch_default, switch_help in switches:
        add_bool_arg(cli, switch_name, default=switch_default, help=switch_help)

    parsed = cli.parse_args()
    return parsed
def parse_args():
    """Declare the tfrecord-creation command-line options and parse them.

    Returns:
        The namespace returned by ``parser.parse_args()``.
    """
    cli = ArgParseDefault()
    cli.add_argument(
        '--run_name',
        required=True,
        help='Run name to create tf record files for. Run folder has to be located under '
             'data/pretrain/{run_name}/preprocessed/ and must contain one or multiple txt files. '
             'May also contain train and dev subfolders with txt files.',
    )
    cli.add_argument('--max_seq_length', type=int, default=96, help='Maximum sequence length')
    cli.add_argument(
        '--model_class',
        choices=PRETRAINED_MODELS.keys(),
        default='bert_large_uncased_wwm',
        help='Model class to use',
    )

    # Masking / duplication parameters, registered in their original order.
    masking_options = (
        ('--dupe_factor', int, 10,
         'Number of times to duplicate the input data (with different masks).'),
        ('--short_seq_prob', float, 0.1,
         'Probability of creating sequences which are shorter than the maximum length.'),
        ('--max_predictions_per_seq', int, 14,
         'Maximum number of masked LM predictions per sequence.'),
        ('--random_seed', int, 42, 'Random seed'),
        ('--masked_lm_prob', float, 0.15, 'Masked LM probabibility'),
    )
    for flag, caster, fallback, text in masking_options:
        cli.add_argument(flag, default=fallback, type=caster, help=text)

    cli.add_argument('--gzipped', default=False, action='store_true', help='Create gzipped tfrecords files')
    cli.add_argument('--num_logged_samples', type=int, default=10, help='Log first n samples to output')
    cli.add_argument(
        '--max_num_cpus',
        type=int,
        default=10,
        help='Adapt this number based on the available memory/size of input files. '
             'This code was tested on a machine with a lot of memory (250GB). '
             'Decrease this number if you run into memory issues.',
    )
    add_bool_arg(cli, 'run_in_parallel', default=True, help='Run script in parallel')

    parsed = cli.parse_args()
    return parsed
def parse_args():
    """Declare the pretraining command-line options (TPU IP mandatory) and parse them.

    Returns:
        The namespace returned by ``parser.parse_args()``.
    """
    cli = ArgParseDefault()

    # TPU, bucket and data location.
    cli.add_argument('--tpu_ip', help='IP-address of the TPU', required=True)
    cli.add_argument('--bucket_name', help='Bucket name', required=True)
    cli.add_argument('--tpu_name', help='Name of the TPU', required=False)
    cli.add_argument('--tpu_name_project', help='Name of the TPU project', required=False)
    cli.add_argument(
        '--pretrain_data',
        type=str,
        required=True,
        help='Folder which contains pretrain data. Should be located under '
             'gs://{bucket_name}/{project_name}/pretrain/pretrain_data/',
    )

    # Run identification and model selection.
    cli.add_argument('--run_prefix', help='Prefix to be added to all runs. Useful to group runs')
    cli.add_argument(
        '--model_class',
        choices=PRETRAINED_MODELS.keys(),
        default='bert_large_uncased_wwm',
        help='Model class to use',
    )
    cli.add_argument('--project_name', default='covid-bert', help='Name of subfolder in Google bucket')
    cli.add_argument('--num_gpus', type=int, default=1, help='Number of GPUs to use')
    cli.add_argument(
        '--eval_steps',
        type=int,
        default=1000,
        help='Number eval steps to run (only active when --do_eval flag is provided)',
    )
    cli.add_argument('--optimizer_type', type=str, choices=['adamw', 'lamb'], default='adamw', help='Optimizer')

    # Plain numeric options, registered in their original order.
    schedule_options = (
        ('--train_batch_size', int, 32, 'Training batch size'),
        ('--eval_batch_size', int, 32, 'Eval batch size'),
        ('--num_epochs', int, 3, 'Number of epochs'),
        ('--num_steps_per_epoch', int, 1000, 'Number of steps per epoch'),
        ('--warmup_steps', int, 10000, 'Warmup steps'),
        ('--learning_rate', float, 2e-5, 'Learning rate'),
        ('--end_lr', float, 0, 'Final learning rate'),
    )
    for flag, caster, fallback, text in schedule_options:
        cli.add_argument(flag, default=fallback, type=caster, help=text)

    cli.add_argument(
        '--max_seq_length',
        type=int,
        default=96,
        help='Maximum sequence length. Sequences longer than this will be truncated, '
             'and sequences shorter than this will be padded.',
    )
    cli.add_argument(
        '--max_predictions_per_seq',
        type=int,
        default=14,
        help='Maximum predictions per sequence_output.',
    )
    cli.add_argument('--dtype', type=str, choices=['fp32', 'bf16', 'fp16'], default='fp32', help='Data type')
    cli.add_argument('--steps_per_loop', type=int, default=10, help='Steps per loop')
    cli.add_argument(
        '--time_history_log_steps',
        type=int,
        default=1000,
        help='Frequency with which to log timing information with TimeHistory.',
    )

    # Boolean switches registered through the project helper.
    add_bool_arg(cli, 'use_tpu', default=True, help='Use TPU')
    add_bool_arg(
        cli,
        'do_eval',
        default=False,
        help='Run evaluation (make sure eval data is present in tfrecords folder)',
    )

    return cli.parse_args()
def parse_args():
    """Declare the finetune-dataset preprocessing command-line options and parse them.

    Returns:
        The namespace returned by ``parser.parse_args()``.
    """
    cli = ArgParseDefault()
    cli.add_argument(
        '--finetune_datasets',
        nargs='+',
        type=str,
        help='Finetune dataset(s) to process. These correspond to folder names in data/finetune. '
             'Data should be located in '
             'data/finetune/originals/{finetune_dataset}/[train.tsv/dev.tsv/test.tsv]. '
             'By default runs all datasets.',
    )
    cli.add_argument(
        '--model_class',
        choices=PRETRAINED_MODELS.keys(),
        default='bert_large_uncased_wwm',
        help='Model class',
    )
    cli.add_argument('--run_prefix', help='Prefix to be added to all runs. Useful to identify runs')
    cli.add_argument('--max_seq_length', type=int, default=96, help='Maximum sequence length')
    cli.add_argument('--username_filler', type=str, default='twitteruser', help='Username filler')
    cli.add_argument(
        '--url_filler',
        type=str,
        default='twitterurl',
        help='URL filler (ignored when replace_urls option is false)',
    )

    # Boolean switches, registered in their original order.
    switches = (
        ('replace_usernames', True, 'Replace usernames with filler'),
        ('replace_urls', True, 'Replace URLs with filler'),
        ('asciify_emojis', True, 'Asciifyi emojis'),
        ('replace_multiple_usernames', True, 'Replace "@user @user" with "2 <username_filler>"'),
        ('replace_multiple_urls', True, 'Replace "http://... http://.." with "2 <url_filler>"'),
        ('standardize_punctuation', True, 'Standardize (asciifyi) special punctuation'),
        ('remove_unicode_symbols', True,
         'After preprocessing remove characters which belong to unicode category "So"'),
        ('remove_accented_characters', False,
         'Remove accents/asciify everything. Probably not recommended.'),
    )
    for switch_name, switch_default, switch_help in switches:
        add_bool_arg(cli, switch_name, default=switch_default, help=switch_help)

    parsed = cli.parse_args()
    return parsed