def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--out_data_dir",
        default="/webdata-nfs/jialliu/dpr/ann/ann_multi_data_256/",
        type=str,
        help="The output data dir",
    )
    parser.add_argument(
        "--model_type",
        default="dpr",
        type=str,
        help="Model type selected in the list: " + ", ".join(MSMarcoConfigDict.keys()),
    )
    parser.add_argument(
        "--model_name_or_path",
        default="bert-base-uncased",
        type=str,
        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS),
    )
    parser.add_argument(
        "--max_seq_length",
        default=256,
        type=int,
        help="The maximum total input sequence length after tokenization. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.",
    )
    parser.add_argument(
        "--data_type",
        default=0,
        type=int,
        help="0 is nq, 1 is trivia, 2 is both",
    )
    parser.add_argument(
        "--question_dir",
        type=str,
        help="location of the raw QnA question data",
    )
    parser.add_argument(
        "--wiki_dir",
        type=str,
        help="location of the wiki corpus",
    )
    parser.add_argument(
        "--answer_dir",
        type=str,
        help="location of the QnA answers for evaluation",
    )
    args = parser.parse_args()

    if not os.path.exists(args.out_data_dir):
        os.makedirs(args.out_data_dir)
    preprocess(args)
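# Usage sketch for the preprocessing entry point above (illustrative only:
# the script name and every path below are assumptions, not taken from the
# original source; the module is assumed to import argparse, os,
# MSMarcoConfigDict, ALL_MODELS, and preprocess at top level).
#
#   python DPR_data.py \
#       --model_type dpr \
#       --model_name_or_path bert-base-uncased \
#       --max_seq_length 256 \
#       --data_type 2 \
#       --question_dir /path/to/qna/questions \
#       --wiki_dir /path/to/wiki/corpus \
#       --answer_dir /path/to/qna/answers \
#       --out_data_dir /path/to/out/ann_multi_data_256/
#
# With --data_type 2, both the NQ and TriviaQA question sets are preprocessed
# into out_data_dir, which is created if it does not exist.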
def get_arguments():
    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help="The input data dir. Should contain the .tsv files (or other data files) for the task.",
    )
    parser.add_argument(
        "--training_dir",
        default=None,
        type=str,
        required=True,
        help="Training dir, will look for latest checkpoint dir in here",
    )
    parser.add_argument(
        "--init_model_dir",
        default=None,
        type=str,
        required=True,
        help="Initial model dir, will use this if no checkpoint is found in model_dir",
    )
    parser.add_argument(
        "--last_checkpoint_dir",
        default="",
        type=str,
        help="Last checkpoint used, this is for rerunning this script when some ann data is already generated",
    )
    parser.add_argument(
        "--model_type",
        default=None,
        type=str,
        required=True,
        help="Model type selected in the list: " + ", ".join(MSMarcoConfigDict.keys()),
    )
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help="The output directory where the training data will be written",
    )
    parser.add_argument(
        "--cache_dir",
        default=None,
        type=str,
        required=True,
        help="The directory where cached data will be written",
    )
    parser.add_argument(
        "--end_output_num",
        default=-1,
        type=int,
        help="Stop after this number of data versions has been generated; by default, run forever",
    )
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help="The maximum total input sequence length after tokenization. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.",
    )
    parser.add_argument(
        "--max_query_length",
        default=64,
        type=int,
        help="The maximum total query sequence length after tokenization. Queries longer "
        "than this will be truncated, queries shorter will be padded.",
    )
    parser.add_argument(
        "--max_doc_character",
        default=10000,
        type=int,
        help="Truncate documents to this many characters before tokenization, to save tokenizer latency",
    )
    parser.add_argument(
        "--per_gpu_eval_batch_size",
        default=128,
        type=int,
        help="Batch size per GPU for inference (embedding generation)",
    )
    parser.add_argument(
        "--ann_chunk_factor",
        default=5,  # for 500k queries, divided into 100k chunks for each epoch
        type=int,
        help="divide training queries into this many chunks",
    )
    parser.add_argument(
        "--topk_training",
        default=500,
        type=int,
        help="top k from which negative samples are collected",
    )
    parser.add_argument(
        "--negative_sample",
        default=5,
        type=int,
        help="at each resample, how many negative samples to use per query",
    )
    parser.add_argument(
        "--ann_measure_topk_mrr",
        default=False,
        action="store_true",
        help="whether to measure MRR over the top-k ANN retrieval results",
    )
    parser.add_argument(
        "--only_keep_latest_embedding_file",
        default=False,
        action="store_true",
        help="keep only the most recent embedding file, deleting older ones to save disk space",
    )
    parser.add_argument(
        "--no_cuda",
        action="store_true",
        help="Avoid using CUDA when available",
    )
    parser.add_argument(
        "--local_rank",
        type=int,
        default=-1,
        help="For distributed training: local_rank",
    )
    parser.add_argument(
        "--server_ip",
        type=str,
        default="",
        help="For distant debugging.",
    )
    parser.add_argument(
        "--server_port",
        type=str,
        default="",
        help="For distant debugging.",
    )
    parser.add_argument(
        "--inference",
        default=False,
        action="store_true",
        help="only run inference if specified",
    )

    args = parser.parse_args()

    return args
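# Usage sketch for the ANN data generation job above (illustrative only; the
# script name and all paths are assumptions). Since the parser accepts
# --local_rank, the driver can be launched under torch.distributed. With the
# default --ann_chunk_factor of 5, e.g. 500k training queries are processed in
# 100k-query chunks per epoch, and --topk_training / --negative_sample control
# how negatives are resampled from the top-k ANN results.
#
#   python -m torch.distributed.launch --nproc_per_node=4 run_ann_data_gen.py \
#       --data_dir /path/to/preprocessed_data \
#       --training_dir /path/to/training_checkpoints \
#       --init_model_dir /path/to/warmup_checkpoint \
#       --model_type dpr \
#       --output_dir /path/to/ann_data \
#       --cache_dir /path/to/cache \
#       --topk_training 500 \
#       --negative_sample 5 \
#       --only_keep_latest_embedding_file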
def get_arguments():
    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help="The input data dir. Should contain the cached passage and query files",
    )
    parser.add_argument(
        "--ann_dir",
        default=None,
        type=str,
        required=True,
        help="The ann training data dir. Should contain the output of the ann data generation job",
    )
    parser.add_argument(
        "--model_type",
        default=None,
        type=str,
        required=True,
        help="Model type selected in the list: " + ", ".join(MSMarcoConfigDict.keys()),
    )
    parser.add_argument(
        "--model_name_or_path",
        default=None,
        type=str,
        required=True,
        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS),
    )
    parser.add_argument(
        "--task_name",
        default=None,
        type=str,
        required=True,
        help="The name of the task to train selected in the list: " + ", ".join(processors.keys()),
    )
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help="The output directory where the model predictions and checkpoints will be written.",
    )
    parser.add_argument(
        "--num_epoch",
        default=0,
        type=int,
        help="Number of epochs to train; if specified, training data is used instead of ann data",
    )

    # Other parameters
    parser.add_argument(
        "--config_name",
        default="",
        type=str,
        help="Pretrained config name or path if not the same as model_name",
    )
    parser.add_argument(
        "--tokenizer_name",
        default="",
        type=str,
        help="Pretrained tokenizer name or path if not the same as model_name",
    )
    parser.add_argument(
        "--cache_dir",
        default="",
        type=str,
        help="Where do you want to store the pre-trained models downloaded from s3",
    )
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help="The maximum total input sequence length after tokenization. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.",
    )
    parser.add_argument(
        "--max_query_length",
        default=64,
        type=int,
        help="The maximum total query sequence length after tokenization. Queries longer "
        "than this will be truncated, queries shorter will be padded.",
    )
    parser.add_argument(
        "--triplet",
        default=False,
        action="store_true",
        help="Whether to train with triplet loss",
    )
    parser.add_argument(
        "--log_dir",
        default=None,
        type=str,
        help="Tensorboard log dir",
    )
    parser.add_argument(
        "--optimizer",
        default="adamW",
        type=str,
        help="Optimizer - lamb or adamW",
    )
    parser.add_argument(
        "--per_gpu_train_batch_size",
        default=8,
        type=int,
        help="Batch size per GPU/CPU for training.",
    )
    parser.add_argument(
        "--gradient_accumulation_steps",
        type=int,
        default=1,
        help="Number of update steps to accumulate before performing a backward/update pass.",
    )
    parser.add_argument(
        "--learning_rate",
        default=5e-5,
        type=float,
        help="The initial learning rate for Adam.",
    )
    parser.add_argument(
        "--weight_decay",
        default=0.0,
        type=float,
        help="Weight decay if we apply some.",
    )
    parser.add_argument(
        "--adam_epsilon",
        default=1e-8,
        type=float,
        help="Epsilon for Adam optimizer.",
    )
    parser.add_argument(
        "--max_grad_norm",
        default=2.0,
        type=float,
        help="Max gradient norm.",
    )
    parser.add_argument(
        "--max_steps",
        default=300000,
        type=int,
        help="If > 0: set total number of training steps to perform",
    )
    parser.add_argument(
        "--warmup_steps",
        default=0,
        type=int,
        help="Linear warmup over warmup_steps.",
    )
    parser.add_argument(
        "--logging_steps",
        type=int,
        default=500,
        help="Log every X update steps.",
    )
    parser.add_argument(
        "--save_steps",
        type=int,
        default=500,
        help="Save checkpoint every X update steps.",
    )
    parser.add_argument(
        "--no_cuda",
        action="store_true",
        help="Avoid using CUDA when available",
    )
    parser.add_argument(
        "--seed",
        type=int,
        default=42,
        help="random seed for initialization",
    )
    parser.add_argument(
        "--fp16",
        action="store_true",
        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
    )
    parser.add_argument(
        "--fp16_opt_level",
        type=str,
        default="O1",
        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', 'O3']. "
        "See details at https://nvidia.github.io/apex/amp.html",
    )

    # ----------------- ANN HyperParam ------------------
    parser.add_argument(
        "--load_optimizer_scheduler",
        default=False,
        action="store_true",
        help="load optimizer and scheduler from checkpoint or not",
    )
    parser.add_argument(
        "--single_warmup",
        # Note: with default=True and action="store_true", this flag is
        # effectively always True as written; kept as-is.
        default=True,
        action="store_true",
        help="use single warmup instead of re-warmup",
    )
    # ----------------- End of Doc Ranking HyperParam ------------------

    parser.add_argument(
        "--local_rank",
        type=int,
        default=-1,
        help="For distributed training: local_rank",
    )
    parser.add_argument(
        "--server_ip",
        type=str,
        default="",
        help="For distant debugging.",
    )
    parser.add_argument(
        "--server_port",
        type=str,
        default="",
        help="For distant debugging.",
    )

    args = parser.parse_args()

    return args
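# Usage sketch for the training driver above (illustrative only; the script
# name, task name, and all paths are assumptions). The effective batch size is
# per_gpu_train_batch_size * n_gpu * gradient_accumulation_steps, so the
# command below trains with an effective batch of 8 * 4 * 2 = 64.
#
#   python -m torch.distributed.launch --nproc_per_node=4 run_ann.py \
#       --data_dir /path/to/preprocessed_data \
#       --ann_dir /path/to/ann_data \
#       --model_type dpr \
#       --model_name_or_path bert-base-uncased \
#       --task_name <one of processors.keys()> \
#       --output_dir /path/to/model_output \
#       --per_gpu_train_batch_size 8 \
#       --gradient_accumulation_steps 2 \
#       --learning_rate 5e-5 \
#       --max_steps 300000 \
#       --warmup_steps 5000 \
#       --triplet --fp16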
def get_arguments():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help="The input data dir",
    )
    parser.add_argument(
        "--out_data_dir",
        default=None,
        type=str,
        required=True,
        help="The output data dir",
    )
    parser.add_argument(
        "--model_type",
        default=None,
        type=str,
        required=True,
        help="Model type selected in the list: " + ", ".join(MSMarcoConfigDict.keys()),
    )
    parser.add_argument(
        "--model_name_or_path",
        default=None,
        type=str,
        required=True,
        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS),
    )
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help="The maximum total input sequence length after tokenization. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.",
    )
    parser.add_argument(
        "--max_query_length",
        default=64,
        type=int,
        help="The maximum total query sequence length after tokenization. Queries longer "
        "than this will be truncated, queries shorter will be padded.",
    )
    parser.add_argument(
        "--max_doc_character",
        default=10000,
        type=int,
        help="Truncate documents to this many characters before tokenization, to save tokenizer latency",
    )
    parser.add_argument(
        "--data_type",
        default=0,
        type=int,
        help="0 for doc, 1 for passage",
    )
    args = parser.parse_args()
    return args
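# Usage sketch for the MS MARCO preprocessing parser above (illustrative
# only; the script name and paths are assumptions). --data_type selects the
# corpus flavor: 0 preprocesses documents, 1 preprocesses passages.
#
#   python msmarco_data.py \
#       --data_dir /path/to/raw_msmarco \
#       --out_data_dir /path/to/preprocessed_data \
#       --model_type dpr \
#       --model_name_or_path bert-base-uncased \
#       --max_seq_length 128 \
#       --max_query_length 64 \
#       --data_type 1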