def __init__(self, cfg: DictConfig, save_temp_conf=False):
    self.shard_id = cfg.local_rank if cfg.local_rank != -1 else 0
    self.distributed_factor = cfg.distributed_world_size or 1

    if save_temp_conf:
        import pickle

        with open('/content/drive/MyDrive/conf.pickle', 'wb') as f:
            pickle.dump(cfg, f)

    logger.info("***** Initializing components for training *****")

    # graded dataset settings
    self.trainer_type = 'binary' if cfg.binary_trainer else 'graded'
    logger.info(f'trainer_type: {self.trainer_type}')
    self.relations = cfg.relations
    logger.info(f'relations: {self.relations}')

    # if model file is specified, encoder parameters from saved state should be used for initialization
    model_file = get_model_file(cfg, cfg.checkpoint_file_name)
    saved_state = None
    if model_file:
        saved_state = load_states_from_checkpoint(model_file)
        set_cfg_params_from_state(saved_state.encoder_params, cfg)

    tensorizer, model, optimizer = init_biencoder_components(
        cfg.encoder.encoder_model_type, cfg
    )

    model, optimizer = setup_for_distributed_mode(
        model,
        optimizer,
        cfg.device,
        cfg.n_gpu,
        cfg.local_rank,
        cfg.fp16,
        cfg.fp16_opt_level,
    )
    self.biencoder = model
    self.optimizer = optimizer
    self.tensorizer = tensorizer
    self.start_epoch = 0
    self.start_batch = 0
    self.scheduler_state = None
    self.best_validation_result = None
    self.best_cp_name = None
    self.cfg = cfg
    self.ds_cfg = BiencoderDatasetsCfg(cfg)
    if saved_state:
        self._load_saved_state(saved_state)

    self.dev_iterator = None
def __init__(self, cfg: DictConfig):
    self.cfg = cfg
    self.shard_id = cfg.local_rank if cfg.local_rank != -1 else 0
    self.distributed_factor = cfg.distributed_world_size or 1

    logger.info("***** Initializing components for training *****")

    model_file = get_model_file(self.cfg, self.cfg.checkpoint_file_name)
    saved_state = None
    if model_file:
        saved_state = load_states_from_checkpoint(model_file)
        set_cfg_params_from_state(saved_state.encoder_params, cfg)

    gradient_checkpointing = getattr(self.cfg, "gradient_checkpointing", False)
    tensorizer, reader, optimizer = init_reader_components(
        cfg.encoder.encoder_model_type,
        cfg,
        gradient_checkpointing=gradient_checkpointing,
    )

    reader, optimizer = setup_for_distributed_mode(
        reader,
        optimizer,
        cfg.device,
        cfg.n_gpu,
        cfg.local_rank,
        cfg.fp16,
        cfg.fp16_opt_level,
        gradient_checkpointing=gradient_checkpointing,
    )
    self.reader = reader
    self.optimizer = optimizer
    self.tensorizer = tensorizer
    self.debugging = getattr(self.cfg, "debugging", False)
    self.wiki_data = None
    self.dev_iterator = None
    self.start_epoch = 0
    self.start_batch = 0
    self.scheduler_state = None
    self.best_validation_result = None
    self.best_cp_name = None
    if saved_state:
        self._load_saved_state(saved_state)
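# Hedged sketch (not part of the original code): one common way a
# gradient_checkpointing=True flag is honored inside an init_reader_components-style
# factory when the underlying encoder is a Hugging Face transformer. The helper name
# maybe_enable_gradient_checkpointing is hypothetical; gradient_checkpointing_enable()
# is a real Hugging Face PreTrainedModel method, guarded here with hasattr in case a
# different encoder class is used.
def maybe_enable_gradient_checkpointing(encoder, enabled: bool):
    if enabled and hasattr(encoder, "gradient_checkpointing_enable"):
        # Trades extra forward recomputation for a lower activation-memory footprint.
        encoder.gradient_checkpointing_enable()
    return encoder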
def __init__(self, cfg: DictConfig):
    self.shard_id = cfg.local_rank if cfg.local_rank != -1 else 0
    self.distributed_factor = cfg.distributed_world_size or 1

    logger.info("***** Initializing components for training *****")

    # if model file is specified, encoder parameters from saved state should be used for initialization
    model_file = get_model_file(cfg, cfg.checkpoint_file_name)
    saved_state = None
    if model_file:
        saved_state = load_states_from_checkpoint(model_file)
        set_cfg_params_from_state(saved_state.encoder_params, cfg)

    tensorizer, model, optimizer = init_biencoder_components(
        cfg.encoder.encoder_model_type, cfg
    )

    model, optimizer = setup_for_distributed_mode(
        model,
        optimizer,
        cfg.device,
        cfg.n_gpu,
        cfg.local_rank,
        cfg.fp16,
        cfg.fp16_opt_level,
    )
    self.biencoder = model
    self.optimizer = optimizer
    self.tensorizer = tensorizer
    self.start_epoch = 0
    self.start_batch = 0
    self.scheduler_state = None
    self.best_validation_result = None
    self.best_cp_name = None
    self.cfg = cfg
    self.ds_cfg = BiencoderDatasetsCfg(cfg)
    if saved_state:
        self._load_saved_state(saved_state)

    self.dev_iterator = None
def __init__(self, args):
    self.args = args
    self.shard_id = args.local_rank if args.local_rank != -1 else 0
    self.distributed_factor = args.distributed_world_size or 1

    logger.info("***** Initializing components for training *****")

    # if model file is specified, encoder parameters from saved state should be used for initialization
    model_file = get_model_file(self.args, self.args.checkpoint_file_name)
    saved_state = None
    if model_file:
        saved_state = load_states_from_checkpoint(model_file)
        set_encoder_params_from_state(saved_state.encoder_params, args)

    tensorizer, model, optimizer = init_biencoder_components(
        args.encoder_model_type, args
    )

    model, optimizer = setup_for_distributed_mode(
        model,
        optimizer,
        args.device,
        args.n_gpu,
        args.local_rank,
        args.fp16,
        args.fp16_opt_level,
    )
    self.biencoder = model
    self.optimizer = optimizer
    self.tensorizer = tensorizer
    self.start_epoch = 0
    self.start_batch = 0
    self.scheduler_state = None
    self.best_validation_result = None
    self.best_cp_name = None
    if saved_state:
        self._load_saved_state(saved_state)
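# Hedged sketch (not part of the original code): a generic approximation of what a
# setup_for_distributed_mode-style helper, as called in every variant above, usually
# does — move the model to the target device and, when launched with a distributed
# local_rank, wrap it in DistributedDataParallel (or DataParallel for single-process
# multi-GPU). This illustrates the pattern only; it is not the repository's actual
# implementation and omits the fp16 / fp16_opt_level handling the real helper performs.
import torch


def setup_for_distributed_mode_sketch(model, optimizer, device, n_gpu: int, local_rank: int):
    model.to(device)
    if local_rank != -1:
        # One process per GPU, launched via torch.distributed.
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[local_rank], output_device=local_rank
        )
    elif n_gpu > 1:
        # Single process driving several GPUs.
        model = torch.nn.DataParallel(model)
    return model, optimizer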
def __init__(self, cfg: DictConfig):
    self.shard_id = cfg.local_rank if cfg.local_rank != -1 else 0
    self.distributed_factor = cfg.distributed_world_size or 1

    logger.info("***** Initializing components for training *****")

    # if model file is specified, encoder parameters from saved state should be used for initialization
    model_file = get_model_file(cfg, cfg.checkpoint_file_name)
    saved_state = None
    if model_file:
        saved_state = load_states_from_checkpoint_legacy(model_file)
        set_cfg_params_from_state(saved_state.encoder_params, cfg)

        if isinstance(saved_state, CheckpointStateOFA):
            self.mode = "normal"
            # Initialize everything
            gradient_checkpointing = getattr(cfg, "gradient_checkpointing", False)
            tensorizer, model, biencoder_optimizer, reader_optimizer, forward_fn = init_ofa_model(
                cfg.encoder.encoder_model_type,
                cfg,
                gradient_checkpointing=gradient_checkpointing,
            )
        else:
            # Only allowed during evaluation-only mode
            assert isinstance(saved_state, CheckpointState)
            assert cfg.train_datasets is None or len(cfg.train_datasets) == 0

            # Convert from old state to OFA state
            saved_state, self.mode = convert_from_old_state_to_ofa(saved_state)

            if self.mode == "biencoder":
                # Sanity check
                assert cfg.evaluate_retriever and (not cfg.evaluate_reader)
                # Initialize everything
                tensorizer, biencoder, _ = init_biencoder_components(
                    cfg.encoder.encoder_model_type,
                    cfg,
                    inference_only=True,
                )
                reader = None
            else:
                # Sanity check
                assert cfg.evaluate_reader and (not cfg.evaluate_retriever)
                # Initialize everything
                tensorizer, reader, _ = init_reader_components(
                    cfg.encoder.encoder_model_type,
                    cfg,
                    inference_only=True,
                )
                biencoder = None

            # Create a "fake" one-for-all model
            model = SimpleOneForAllModel(
                biencoder=biencoder,
                reader=reader,
                tensorizer=tensorizer,
            )

            # Modify config
            cfg.ignore_checkpoint_optimizer = True
            cfg.ignore_checkpoint_offset = True
            cfg.gradient_checkpointing = False
            cfg.fp16 = False

            # Place holder for backward compatibility
            gradient_checkpointing = False
            biencoder_optimizer = None
            reader_optimizer = None
            forward_fn = ofa_simple_fw_pass  # always the simplest
    else:
        self.mode = "normal"
        # Initialize everything
        gradient_checkpointing = getattr(cfg, "gradient_checkpointing", False)
        tensorizer, model, biencoder_optimizer, reader_optimizer, forward_fn = init_ofa_model(
            cfg.encoder.encoder_model_type,
            cfg,
            gradient_checkpointing=gradient_checkpointing,
        )

    model, (biencoder_optimizer, reader_optimizer) = setup_for_distributed_mode(
        model,
        [biencoder_optimizer, reader_optimizer],
        cfg.device,
        cfg.n_gpu,
        cfg.local_rank,
        cfg.fp16,
        cfg.fp16_opt_level,
        gradient_checkpointing=gradient_checkpointing,
    )

    self.forward_fn = forward_fn
    self.model = model
    self.cfg = cfg
    self.ds_cfg = OneForAllDatasetsCfg(cfg)
    self.biencoder_optimizer = biencoder_optimizer
    self.biencoder_scheduler_state = None
    self.reader_optimizer = reader_optimizer
    self.reader_scheduler_state = None

    self.clustering = cfg.biencoder.clustering
    if self.clustering:
        cfg.global_loss_buf_sz = 72000000  # this requires a lot of memory

    self.tensorizer = tensorizer
    self.start_epoch = 0
    self.start_batch = 0
    self.best_validation_result = None
    self.best_cp_name = None

    # Biencoder loss function (note that reader loss is automatically computed)
    self.biencoder_loss_function: BiEncoderNllLoss = init_loss(cfg.encoder.encoder_model_type, cfg)

    if saved_state:
        self._load_saved_state(saved_state)

    self.dev_iterator = None
def __init__(self, cfg: DictConfig):
    self.shard_id = cfg.local_rank if cfg.local_rank != -1 else 0
    self.distributed_factor = cfg.distributed_world_size or 1

    logger.info("***** Initializing components for training *****")

    # if model file is specified, encoder parameters from saved state should be used for initialization
    model_file = get_model_file(cfg, cfg.checkpoint_file_name)
    saved_state = None
    if model_file:
        saved_state = load_states_from_checkpoint(model_file)
        set_cfg_params_from_state(saved_state.encoder_params, cfg)

    tensorizer, model, optimizer = init_biencoder_components(
        cfg.encoder.encoder_model_type, cfg
    )

    if cfg.deepspeed:
        model.half()
        # XXX
        # no_decay = ["bias", "LayerNorm.weight"]
        #
        # optimizer_grouped_parameters = [
        #     {
        #         "params": [
        #             p
        #             for n, p in model.named_parameters()
        #             if not any(nd in n for nd in no_decay)
        #         ],
        #         "weight_decay": cfg.train.weight_decay,
        #     },
        #     {
        #         "params": [
        #             p
        #             for n, p in model.named_parameters()
        #             if any(nd in n for nd in no_decay)
        #         ],
        #         "weight_decay": 0.0,
        #     },
        # ]
        optimizer = DeepSpeedCPUAdam(
            optimizer.param_groups,
            lr=cfg.train.learning_rate,
            weight_decay=cfg.train.weight_decay,
        )

    model, optimizer = setup_for_distributed_mode(
        model,
        optimizer,
        cfg.device,
        cfg.n_gpu,
        cfg.local_rank,
        cfg.fp16,
        cfg.fp16_opt_level,
    )
    self.biencoder = model
    self.optimizer = optimizer
    self.tensorizer = tensorizer
    self.start_epoch = 0
    self.start_batch = 0
    self.scheduler_state = None
    self.best_validation_result = None
    self.best_cp_name = None
    self.cfg = cfg
    self.ds_cfg = BiencoderDatasetsCfg(cfg)
    if saved_state:
        self._load_saved_state(saved_state)

    self.dev_iterator = None
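# Hedged sketch (not part of the original code): what the commented-out "# XXX" block
# above would do if re-enabled — split parameters into decay / no-decay groups and pass
# them to DeepSpeedCPUAdam instead of reusing the existing optimizer's param_groups.
# The config keys cfg.train.learning_rate and cfg.train.weight_decay follow the code
# above; the helper name build_cpu_adam is an assumption for illustration only.
from deepspeed.ops.adam import DeepSpeedCPUAdam


def build_cpu_adam(model, cfg):
    # Bias and LayerNorm weights are conventionally excluded from weight decay.
    no_decay = ["bias", "LayerNorm.weight"]
    grouped_params = [
        {
            "params": [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay": cfg.train.weight_decay,
        },
        {
            "params": [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.0,
        },
    ]
    return DeepSpeedCPUAdam(grouped_params, lr=cfg.train.learning_rate)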