def get_args():
    options = Options()
    options.add_index_options()
    args = options.parse()

    coarse = 'hnsw' if args.hnsw else 'flat'
    args.index_name = f'{args.num_clusters}_{coarse}_{args.fine_quant}{"_first" if args.first_passage else ""}'
    if args.index_filter != -1e8:  # other than default
        args.index_name = args.index_name + f'_ft{int(args.index_filter)}'
    args.index_dir = os.path.join(args.dump_dir, 'start', args.index_name)

    args.quantizer_path = os.path.join(args.index_dir, args.quantizer_path)
    args.trained_index_path = os.path.join(args.index_dir, args.trained_index_path)
    args.inv_path = os.path.join(args.index_dir, args.inv_path)
    args.subindex_dir = os.path.join(args.index_dir, args.subindex_name)
    if args.dump_paths is None:
        args.index_path = os.path.join(args.index_dir, args.index_path)
        args.idx2id_path = os.path.join(args.index_dir, args.idx2id_path)
    else:
        args.dump_paths = [
            os.path.join(args.dump_dir, args.phrase_dir, path)
            for path in args.dump_paths.split(',')
        ]
        args.index_path = os.path.join(args.subindex_dir, '%d.faiss' % args.offset)
        args.idx2id_path = os.path.join(args.subindex_dir, '%d.hdf5' % args.offset)

    logger.info(f"Creating {args.index_name}...")
    return args
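# --- Illustrative sketch (not part of the excerpted source) ---
# Shows how the index name above is composed, using hypothetical option values:
# with --hnsw unset, 1048576 clusters, and OPQ96 fine quantization, the index
# directory becomes <dump_dir>/start/1048576_flat_OPQ96.
num_clusters, hnsw, fine_quant, first_passage = 1048576, False, 'OPQ96', False
coarse = 'hnsw' if hnsw else 'flat'
index_name = f'{num_clusters}_{coarse}_{fine_quant}{"_first" if first_passage else ""}'
print(index_name)  # -> 1048576_flat_OPQ96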
def __init__(self, load_dir, dump_dir, index_name='start/1048576_flat_OPQ96',
             device='cuda', verbose=False, **kwargs):
    print("This could take up to 15 mins depending on the file reading speed of HDD/SSD")

    # Turn off loggers
    if not verbose:
        logging.getLogger("densephrases").setLevel(logging.WARNING)
        logging.getLogger("transformers").setLevel(logging.WARNING)

    # Get default options
    options = Options()
    options.add_model_options()
    options.add_index_options()
    options.add_retrieval_options()
    options.add_data_options()
    self.args = options.parse()

    # Set options
    self.args.load_dir = load_dir
    self.args.dump_dir = dump_dir
    self.args.cache_dir = os.environ['CACHE_DIR']
    self.args.index_name = index_name
    self.args.cuda = True if device == 'cuda' else False
    self.args.__dict__.update(kwargs)

    # Load encoder
    self.set_encoder(load_dir, device)

    # Load MIPS
    self.mips = load_phrase_index(self.args, ignore_logging=not verbose)

    # Others
    self.truecase = TrueCaser(os.path.join(os.environ['DATA_DIR'], self.args.truecase_path))
    print("Loading DensePhrases Completed!")
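# --- A minimal usage sketch (not part of the excerpted source) ---
# Assumes this __init__ belongs to the DensePhrases class exported by the
# densephrases package, and that the CACHE_DIR / DATA_DIR environment variables
# are set as the constructor expects. The load_dir / dump_dir values below are
# placeholders; index_name repeats the default shown above.
from densephrases import DensePhrases

model = DensePhrases(
    load_dir='/path/to/query-finetuned-encoder',  # hypothetical encoder checkpoint dir
    dump_dir='/path/to/phrase/dump',              # hypothetical phrase dump dir
    index_name='start/1048576_flat_OPQ96',        # default from __init__ above
    device='cuda',
)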
    out_file = os.path.join(
        os.environ['SAVE_DIR'], os.path.basename(args.load_dir), 'pred',
        os.path.splitext(os.path.basename(pred_path))[0] +
        f'_{"sent" if args.return_sent else "psg"}-top{args.psg_top_k}{"_mark" if args.mark_phrase else ""}.json'
    )
    logger.info(f"dump to {out_file}")
    json.dump(my_target, open(out_file, 'w'), indent=4)

    # Call subprocess for evaluation
    command = f'python scripts/postprocess/recall.py --k_values 1,5,20,100 --results_file {out_file} --ans_fn string'
    subprocess.run(command.split(' '))


if __name__ == '__main__':
    # See options in densephrases.options
    options = Options()
    options.add_model_options()
    options.add_index_options()
    options.add_retrieval_options()
    options.add_data_options()
    options.add_question_type_options()
    args = options.parse()

    # Seed for reproducibility
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(args.seed)

    # Set wandb
    # Annotate for L_doc
    if 'doc' in args.label_strat.split(','):
        p_targets = [
            [any(phrase['title'][0].lower() == tit.lower() for tit in title) for phrase in phrase_group]
            for phrase_group, title in zip(phrase_groups, titles)
        ]
        p_targets = [
            [ii if val else None for ii, val in enumerate(target)]
            for target in p_targets
        ]

    return start_vecs, end_vecs, targets, p_targets


if __name__ == '__main__':
    # See options in densephrases.options
    options = Options()
    options.add_model_options()
    options.add_index_options()
    options.add_retrieval_options()
    options.add_data_options()
    options.add_qsft_options()
    args = options.parse()

    # Seed for reproducibility
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(args.seed)

    if args.run_mode == 'train_query':
def main():
    # See options in densephrases.options
    options = Options()
    options.add_model_options()
    options.add_data_options()
    options.add_rc_options()
    args = options.parse()

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
    else:
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend="nccl")
        args.n_gpu = 1
    args.device = device

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank,
        device,
        args.n_gpu,
        bool(args.local_rank != -1),
        args.fp16,
    )

    # Set seed
    set_seed(args)

    # Load config, tokenizer
    if args.local_rank not in [-1, 0]:
        # Make sure only the first process in distributed training will download model & vocab
        torch.distributed.barrier()

    args.model_type = args.model_type.lower()
    config, unused_kwargs = AutoConfig.from_pretrained(
        args.config_name if args.config_name else args.pretrained_name_or_path,
        cache_dir=args.cache_dir if args.cache_dir else None,
        output_hidden_states=False,
        return_unused_kwargs=True,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        args.tokenizer_name if args.tokenizer_name else args.pretrained_name_or_path,
        do_lower_case=args.do_lower_case,
        cache_dir=args.cache_dir if args.cache_dir else None,
    )

    if args.local_rank == 0:
        # Make sure only the first process in distributed training will download model & vocab
        torch.distributed.barrier()

    logger.info("Dump parameters %s", args)

    # Before we do anything with models, we want to ensure that we get fp16 execution of torch.einsum
    # if args.fp16 is set. Otherwise it'll default to "promote" mode, and we'll get fp32 operations.
    # Note that running `--fp16_opt_level="O2"` will remove the need for this code, but it is still valid.
    if args.fp16:
        try:
            import apex
            apex.amp.register_half_function(torch, "einsum")
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")

    # Create phrase vectors
    if args.do_dump:
        assert args.load_dir
        model, tokenizer, config = load_encoder(device, args, phrase_only=True)

        args.draft = False
        dump_phrases(args, model, tokenizer, filter_only=args.filter_only)
def main():
    # See options in densephrases.options
    options = Options()
    options.add_model_options()
    options.add_data_options()
    options.add_rc_options()
    args = options.parse()

    if args.local_rank != -1:
        if args.local_rank > 0:
            logger.setLevel(logging.WARN)

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
    else:
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend="nccl")
        args.n_gpu = 1
    args.device = device

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank,
        device,
        args.n_gpu,
        bool(args.local_rank != -1),
        args.fp16,
    )

    # Set seed
    set_seed(args)

    # Set wandb
    if args.do_train or args.do_eval:
        wandb.init(project="DensePhrases (single)", mode="online" if args.wandb else "disabled")
        wandb.config.update(args)

    # Load config, tokenizer
    if args.local_rank not in [-1, 0]:
        # Make sure only the first process in distributed training will download model & vocab
        torch.distributed.barrier()

    # Initialize or load encoder
    model, tokenizer, config = load_encoder(device, args)

    if args.local_rank == 0:
        # Make sure only the first process in distributed training will download model & vocab
        torch.distributed.barrier()

    logger.info("Training/evaluation parameters %s", args)

    # Before we do anything with models, we want to ensure that we get fp16 execution of torch.einsum
    # if args.fp16 is set. Otherwise it'll default to "promote" mode, and we'll get fp32 operations.
    # Note that running `--fp16_opt_level="O2"` will remove the need for this code, but it is still valid.
    if args.fp16:
        try:
            import apex
            apex.amp.register_half_function(torch, "einsum")
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )

    # Training
    global_step = 1
    tr_loss = 99999
    if args.do_train:
        # Load pre-trained cross encoder
        if args.lambda_kl > 0 and args.do_train:
            cross_encoder = torch.load(
                os.path.join(args.teacher_dir, "pytorch_model.bin"),
                map_location=torch.device('cpu'),
            )
            new_qd = {n[len('bert') + 1:]: p for n, p in cross_encoder.items() if 'bert' in n}
            new_linear = {n[len('qa_outputs') + 1:]: p for n, p in cross_encoder.items() if 'qa_outputs' in n}
            qd_config, unused_kwargs = AutoConfig.from_pretrained(
                args.pretrained_name_or_path,
                cache_dir=args.cache_dir if args.cache_dir else None,
                return_unused_kwargs=True,
            )
            qd_pretrained = AutoModel.from_pretrained(
                args.pretrained_name_or_path,
                config=qd_config,
                cache_dir=args.cache_dir if args.cache_dir else None,
            )
            model.cross_encoder = qd_pretrained
            model.cross_encoder.load_state_dict(new_qd)
            model.qa_outputs = torch.nn.Linear(config.hidden_size, 2)
            model.qa_outputs.load_state_dict(new_linear)
            logger.info(f'Distill with teacher model {args.teacher_dir}')

        # Train model
        model.to(args.device)
        train_dataset = load_and_cache_examples(
            args, tokenizer, evaluate=False, output_examples=False, skip_no_answer=False
        )
        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)

    # Save the trained model and the tokenizer
    if (args.do_train) and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        # Create output directory if needed
        if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(args.output_dir)

        # Remove teacher before saving
        if args.lambda_kl > 0:
            del model.cross_encoder
            del model.qa_outputs

        logger.info("Saving model checkpoint to %s", args.output_dir)
        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
        # Take care of distributed/parallel training
        model_to_save = model.module if hasattr(model, "module") else model
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)

        # Good practice: save your training arguments together with the trained model
        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))

        # Load a trained model and vocabulary that you have fine-tuned
        model.load_state_dict(
            backward_compat(
                torch.load(os.path.join(args.output_dir, "pytorch_model.bin"), map_location=torch.device('cpu'))
            )
        )
        tokenizer = AutoTokenizer.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
        model.to(args.device)

        # Set load_dir to trained model
        args.load_dir = args.output_dir
        logger.info(f'Will load {args.load_dir} that was trained.')

    # Test filter
    if args.do_filter_test:
        assert args.load_dir
        model, tokenizer, _ = load_encoder(device, args)
        filter_test(args, model, tokenizer)

    # Evaluation
    if args.do_eval and args.local_rank in [-1, 0]:
        assert args.load_dir
        model, tokenizer, _ = load_encoder(device, args)
        result, _ = evaluate(args, model, tokenizer, prefix='final')
        result = dict((k + "_final", v) for k, v in result.items())
        wandb.log(
            {"Eval EM": result['exact_final'], "Eval F1": result['f1_final'], "loss": tr_loss},
            step=global_step,
        )
        logger.info("Results: {}".format(result))
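# --- Illustrative sketch (not part of the excerpted source) ---
# Demonstrates the key-renaming trick used when loading the teacher cross encoder
# above: parameters saved under 'bert.*' and 'qa_outputs.*' are split into two
# state dicts with their prefixes stripped, so they can be loaded into a bare
# AutoModel backbone and a fresh nn.Linear head respectively. The toy state dict
# below uses placeholder values instead of real tensors.
state = {
    'bert.embeddings.word_embeddings.weight': 0,
    'qa_outputs.weight': 1,
    'qa_outputs.bias': 2,
}
new_qd = {n[len('bert') + 1:]: p for n, p in state.items() if 'bert' in n}
new_linear = {n[len('qa_outputs') + 1:]: p for n, p in state.items() if 'qa_outputs' in n}
print(new_qd)      # {'embeddings.word_embeddings.weight': 0}
print(new_linear)  # {'weight': 1, 'bias': 2}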