def set_eval_inference_latency_mode(self):
    """
    Evaluate Inference Latency Mode

    - Pipeline
        1. read raw_data (DataReader)
        2. load vocabs from checkpoint (DataReader, Token)
        3. define raw_to_tensor_fn (DataReader, Token)
        4. define and load model
        5. run!

    Returns:
        A ``(raw_examples, raw_to_tensor_fn)`` tuple: the ``"examples"``
        entry of the ``"valid"`` helper returned by the data reader, and
        the function produced by ``TextHandler.raw_to_tensor_fn``.
    """
    reader, makers = self._create_data_and_token_makers()

    # Give every token maker the vocabulary stored in the checkpoint.
    checkpoint_vocabs = utils.load_vocabs(self.model_checkpoint)
    for name, maker in makers.items():
        maker.set_vocab(checkpoint_vocabs[name])

    handler = TextHandler(makers, lazy_indexing=False)

    # Only the helpers are needed here; the data itself is discarded.
    _, read_helpers = reader.read()
    valid_examples = read_helpers["valid"]["examples"]

    # Use the first configured CUDA device when GPU usage is enabled.
    if self.config.use_gpu:
        target_device = self.config.cuda_devices[0]
    else:
        target_device = None
    to_tensor_fn = handler.raw_to_tensor_fn(reader, cuda_device=target_device)

    # Build the model from the checkpoint and register it with the trainer.
    loaded_model = self._create_model(makers, checkpoint=self.model_checkpoint)
    self.set_trainer(loaded_model)

    return valid_examples, to_tensor_fn
def test(config):
    """Restore vocabularies from a fixed NSML session and bind the model to NSML.

    - Pipeline
        1. create token makers and the data reader from ``config``
        2. load vocabs from the NSML checkpoint into the token makers
        3. define raw_to_tensor_fn
        4. create the model and a ``Trainer`` (``metric_key="f1"``)
        5. bind to NSML and pause when ``config.nsml.pause`` is set

    Args:
        config: experiment configuration. This function reads ``seed_num``,
            ``token``, ``data_reader``, ``model`` and ``nsml.pause``, and
            overwrites ``config.data_reader.tokenizers``.

    NOTE: local variable names are kept stable on purpose because
    ``locals()`` is handed to ``nsml.paused`` as its scope.
    """
    # NOTE: the NSML session and checkpoint need to be hard-coded.
    NSML_SESSEION = 'team_6/19_tcls_qa/80'
    NSML_CHECKPOINT = '13800'

    assert NSML_CHECKPOINT is not None, "You must insert NSML Session's checkpoint for submit"
    assert NSML_SESSEION is not None, "You must insert NSML Session's name for submit"

    set_global_seed(config.seed_num)

    token_makers = create_by_factory(TokenMakersFactory, config.token)
    # The factory result also carries the tokenizers; move them to the reader config.
    tokenizers = token_makers["tokenizers"]
    del token_makers["tokenizers"]

    config.data_reader.tokenizers = tokenizers
    data_reader = create_by_factory(DataReaderFactory, config.data_reader)

    def bind_load_vocabs(config, token_makers):
        """Register an NSML ``load`` hook that restores vocabs into ``token_makers``."""
        CHECKPOINT_FNAME = "checkpoint.bin"

        def load(dir_path):
            checkpoint_path = os.path.join(dir_path, CHECKPOINT_FNAME)
            checkpoint = torch.load(checkpoint_path)

            vocabs = {}
            token_config = config.token
            for token_name in token_config.names:
                token = getattr(token_config, token_name, {})
                vocab_config = getattr(token, "vocab", {})
                texts = checkpoint["vocab_texts"][token_name]

                # isinstance (not an exact type comparison) so that dict
                # subclasses are used as-is instead of going through vars().
                if not isinstance(vocab_config, dict):
                    vocab_config = vars(vocab_config)
                vocabs[token_name] = Vocab(token_name, **vocab_config).from_texts(texts)

            # Every vocab must exist before it is assigned to its token maker.
            for token_name, token_maker in token_makers.items():
                token_maker.set_vocab(vocabs[token_name])
            return token_makers

        nsml.bind(load=load)

    bind_load_vocabs(config, token_makers)
    nsml.load(checkpoint=NSML_CHECKPOINT, session=NSML_SESSEION)

    # Raw to Tensor Function
    text_handler = TextHandler(token_makers, lazy_indexing=False)
    raw_to_tensor_fn = text_handler.raw_to_tensor_fn(
        data_reader,
        cuda_device=device,
    )

    # Model & Optimizer
    model = create_model(token_makers, ModelFactory, config.model, device)
    trainer = Trainer(model, metric_key="f1")

    if nsml.IS_ON_NSML:
        bind_nsml(model, trainer=trainer, raw_to_tensor_fn=raw_to_tensor_fn)

    # NOTE(review): the source layout was flattened; confirm whether this
    # check was meant to sit inside the IS_ON_NSML branch above.
    if config.nsml.pause:
        nsml.paused(scope=locals())
def set_predict_mode(self, preload=False):
    """
    Predict Mode

    - Pipeline
        1. read raw_data (Argument)
        2. load vocabs from checkpoint (DataReader, Token)
        3. define raw_to_tensor_fn (DataReader, Token)
        4. define and load model
        5. run!

    Args:
        preload: when true, store ``raw_to_tensor_fn`` and the argument dict
            in ``self.predict_settings`` and return nothing.

    Returns:
        ``(raw_features, raw_to_tensor_fn, arguments)`` when ``preload`` is
        false, otherwise ``None``.
    """
    reader, makers = self._create_data_and_token_makers()

    # Give every token maker the vocabulary stored in the checkpoint.
    checkpoint_vocabs = utils.load_vocabs(self.model_checkpoint)
    for name, maker in makers.items():
        maker.set_vocab(checkpoint_vocabs[name])
    handler = TextHandler(makers, lazy_indexing=False)

    # Set predict config
    if self.argument.interactive:
        # Interactive mode starts with an empty string for every text column.
        features = dict.fromkeys(reader.text_columns, "")
    else:
        # A text column without a matching argument is left as None
        # (no validation is performed here).
        features = {
            column: getattr(self.argument, column, None)
            for column in reader.text_columns
        }

    # Use the first configured CUDA device when GPU usage is enabled.
    if self.config.use_gpu:
        target_device = self.config.cuda_devices[0]
    else:
        target_device = None
    to_tensor_fn = handler.raw_to_tensor_fn(
        reader,
        cuda_device=target_device,
        helper=self.model_checkpoint.get("predict_helper", {}),
    )

    # Build the model from the checkpoint and register it with the trainer.
    loaded_model = self._create_model(makers, checkpoint=self.model_checkpoint)
    self.set_trainer(loaded_model)

    argument_dict = vars(self.argument)

    if not preload:
        return features, to_tensor_fn, argument_dict

    self.predict_settings = {
        "raw_to_tensor_fn": to_tensor_fn,
        "arguments": argument_dict,
    }
def set_eval_mode(self):
    """
    Evaluate Mode

    - Pipeline
        1. read raw_data (DataReader)
        2. load vocabs from checkpoint (DataReader, Token)
        3. indexing tokens (DataReader, Token)
        4. convert to DataSet (DataReader)
        5. create DataLoader (DataLoader)
        6. define and load model
        7. run!

    Returns:
        The second of the three loaders produced by ``DataLoaderFactory``
        (the validation loader).
    """
    reader, makers = self._create_data_and_token_makers()

    # DataReader
    raw_datas, read_helpers = reader.read()

    # Give every token maker the vocabulary stored in the checkpoint.
    checkpoint_vocabs = utils.load_vocabs(self.model_checkpoint)
    for name, maker in makers.items():
        maker.set_vocab(checkpoint_vocabs[name])

    handler = TextHandler(makers, lazy_indexing=False)
    handler.index(raw_datas, reader.text_columns)

    # The dataset conversion receives the first vocabulary in the mapping.
    first_vocab_name = next(iter(checkpoint_vocabs))
    first_vocab = checkpoint_vocabs[first_vocab_name]
    converted = reader.convert_to_dataset(
        raw_datas, first_vocab, helpers=read_helpers
    )

    # The iterator config gets the same CUDA devices as the experiment config.
    self.config.iterator.cuda_devices = self.config.cuda_devices
    loaders = self._create_by_factory(
        DataLoaderFactory,
        self.config.iterator,
        param={"datasets": converted},
    )
    _, valid_loader, _ = loaders

    # Build the model from the checkpoint and register it with the trainer.
    loaded_model = self._create_model(makers, checkpoint=self.model_checkpoint)
    self.set_trainer(loaded_model)

    return valid_loader
def train_and_evaluate(config):
    """Train a new model and evaluate it on the validation split.

    - Pipeline
        1. create token makers and the data reader, then read the data
        2. build vocabs and index the tokens
        3. convert to datasets and create the train/valid data loaders
        4. create the model and an Adam optimizer over its trainable params
        5. bind to NSML (when on NSML) and run ``Trainer.train_and_evaluate``

    Args:
        config: experiment configuration. This function reads ``token``,
            ``data_reader``, ``iterator.batch_size``, ``model`` and
            ``trainer``. ``config.data_reader.tokenizers`` is overwritten,
            and on NSML the train/valid file paths are rewritten to live
            under ``DATASET_PATH``.
    """
    token_makers = create_by_factory(TokenMakersFactory, config.token)
    # The factory result also carries the tokenizers; move them to the reader config.
    config.data_reader.tokenizers = token_makers.pop("tokenizers")

    if nsml.IS_ON_NSML:
        # On NSML the data files are located under the dataset mount point.
        nsml_data_dir = os.path.join(DATASET_PATH, "train", "train_data")
        config.data_reader.train_file_path = os.path.join(
            nsml_data_dir, config.data_reader.train_file_path)
        config.data_reader.valid_file_path = os.path.join(
            nsml_data_dir, config.data_reader.valid_file_path)

    data_reader = create_by_factory(DataReaderFactory, config.data_reader)
    datas, helpers = data_reader.read()

    # Vocab & Indexing
    text_handler = TextHandler(token_makers, lazy_indexing=True)
    texts = data_reader.filter_texts(datas)

    token_counters = text_handler.make_token_counters(texts)
    text_handler.build_vocabs(token_counters)
    text_handler.index(datas, data_reader.text_columns)

    # Iterator
    datasets = data_reader.convert_to_dataset(datas, helpers=helpers)
    train_loader = create_data_loader(
        datasets["train"],
        batch_size=config.iterator.batch_size,
        shuffle=True,
        cuda_device_id=device,
    )
    valid_loader = create_data_loader(
        datasets["valid"],
        batch_size=config.iterator.batch_size,
        shuffle=False,
        cuda_device_id=device,
    )

    # Model & Optimizer
    model = create_model(token_makers, ModelFactory, config.model, device, helpers=helpers)
    # Only parameters that require gradients are handed to the optimizer.
    model_parameters = [
        param for param in model.parameters() if param.requires_grad
    ]
    optimizer = get_optimizer_by_name("adam")(model_parameters)

    if IS_ON_NSML:
        bind_nsml(model, optimizer=optimizer)

    # Trainer
    # vars() returns the namespace's live __dict__; copy it so that adding
    # the model does not inject a ``model`` attribute into config.trainer.
    trainer_config = dict(vars(config.trainer))
    trainer_config["model"] = model
    trainer = Trainer(**trainer_config)
    trainer.train_and_evaluate(train_loader, valid_loader, optimizer)
def set_train_mode(self):
    """
    Training Mode

    - Pipeline
        1. read raw_data (DataReader)
        2. build vocabs (DataReader, Token)
        3. indexing tokens (DataReader, Token)
        4. convert to DataSet (DataReader)
        5. create DataLoader (DataLoader)
        6. define model and optimizer
        7. run!

    Returns:
        ``(train_loader, valid_loader, optimizer)`` where ``optimizer`` is
        the ``"optimizer"`` entry of the dict built by ``OptimizerFactory``.
    """
    config_dump = pretty_json_dumps(self.config_dict)
    logger.info("Config. \n" + config_dump + "\n")

    reader, makers = self._create_data_and_token_makers()
    raw_datas, read_helpers = reader.read()

    # Token & Vocab
    handler = TextHandler(makers, lazy_indexing=True)
    filtered_texts = reader.filter_texts(raw_datas)

    counters = handler.make_token_counters(filtered_texts, config=self.config)
    handler.build_vocabs(counters)
    handler.index(raw_datas, reader.text_columns)

    # iterator
    converted = reader.convert_to_dataset(raw_datas, helpers=read_helpers)

    # The iterator config gets the same CUDA devices as the experiment config.
    self.config.iterator.cuda_devices = self.config.cuda_devices
    loaders = self._create_by_factory(
        DataLoaderFactory,
        self.config.iterator,
        param={"datasets": converted},
    )
    train_loader, valid_loader, _test_loader = loaders

    # An existing checkpoint directory may contain model and optimizer state.
    checkpoint_dir = Path(self.config.trainer.log_dir) / "checkpoint"
    checkpoints = (
        self._load_exist_checkpoints(checkpoint_dir)
        if checkpoint_dir.exists()
        else None
    )
    resume = checkpoints is not None

    # Fresh runs build the model from the helpers; resumed runs from the checkpoint.
    if resume:
        model = self._create_model(makers, checkpoint=checkpoints)
    else:
        model = self._create_model(makers, helpers=read_helpers)

    op_dict = self._create_by_factory(
        OptimizerFactory,
        self.config.optimizer,
        param={"model": model},
    )
    if resume:
        utils.load_optimizer_checkpoint(op_dict["optimizer"], checkpoints)

    self.set_trainer(model, op_dict=op_dict)
    return train_loader, valid_loader, op_dict["optimizer"]
def re_train_and_evaluate(config):
    """Resume training from a fixed NSML session checkpoint, then evaluate.

    - Pipeline
        1. create token makers and the data reader, then read the data
        2. build vocabs and index the tokens
        3. load the vocabs stored in the NSML checkpoint into the token makers
        4. convert to datasets and create the train/valid data loaders
        5. create the model and an Adam optimizer, then restore both from
           the NSML checkpoint
        6. bind to NSML (when on NSML) and run ``Trainer.train_and_evaluate``

    Args:
        config: experiment configuration. This function reads ``token``,
            ``data_reader``, ``iterator.batch_size``, ``model`` and
            ``trainer``. ``config.data_reader.tokenizers`` is overwritten,
            and on NSML the train/valid file paths are rewritten to live
            under ``DATASET_PATH``.
    """
    # NOTE: the NSML session and checkpoint need to be hard-coded.
    NSML_SESSION = 'team_6/19_tcls_qa/258'
    NSML_CHECKPOINT = '1'

    assert NSML_CHECKPOINT is not None, "You must insert NSML Session's checkpoint for submit"
    assert NSML_SESSION is not None, "You must insert NSML Session's name for submit"

    token_makers = create_by_factory(TokenMakersFactory, config.token)
    # The factory result also carries the tokenizers; move them to the reader config.
    config.data_reader.tokenizers = token_makers.pop("tokenizers")

    if nsml.IS_ON_NSML:
        # On NSML the data files are located under the dataset mount point.
        nsml_data_dir = os.path.join(DATASET_PATH, "train", "train_data")
        config.data_reader.train_file_path = os.path.join(
            nsml_data_dir, config.data_reader.train_file_path)
        config.data_reader.valid_file_path = os.path.join(
            nsml_data_dir, config.data_reader.valid_file_path)

    data_reader = create_by_factory(DataReaderFactory, config.data_reader)
    datas, helpers = data_reader.read()

    # Vocab & Indexing
    text_handler = TextHandler(token_makers, lazy_indexing=True)
    texts = data_reader.filter_texts(datas)

    token_counters = text_handler.make_token_counters(texts)
    text_handler.build_vocabs(token_counters)
    text_handler.index(datas, data_reader.text_columns)

    def bind_load_vocabs(config, token_makers):
        """Register an NSML ``load`` hook that restores vocabs into ``token_makers``."""
        CHECKPOINT_FNAME = "checkpoint.bin"

        def load(dir_path):
            checkpoint_path = os.path.join(dir_path, CHECKPOINT_FNAME)
            checkpoint = torch.load(checkpoint_path)

            vocabs = {}
            token_config = config.token
            for token_name in token_config.names:
                token = getattr(token_config, token_name, {})
                vocab_config = getattr(token, "vocab", {})
                texts = checkpoint["vocab_texts"][token_name]

                # isinstance (not an exact type comparison) so that dict
                # subclasses are used as-is instead of going through vars().
                if not isinstance(vocab_config, dict):
                    vocab_config = vars(vocab_config)
                vocabs[token_name] = Vocab(token_name, **vocab_config).from_texts(texts)

            # Every vocab must exist before it is assigned to its token maker.
            for token_name, token_maker in token_makers.items():
                token_maker.set_vocab(vocabs[token_name])
            return token_makers

        nsml.bind(load=load)

    bind_load_vocabs(config, token_makers)
    nsml.load(checkpoint=NSML_CHECKPOINT, session=NSML_SESSION)

    # Raw to Tensor Function
    # NOTE(review): raw_to_tensor_fn is not used further in this function;
    # the call is kept in case it has side effects -- confirm and remove.
    text_handler = TextHandler(token_makers, lazy_indexing=False)
    raw_to_tensor_fn = text_handler.raw_to_tensor_fn(
        data_reader,
        cuda_device=device,
    )

    # Iterator
    datasets = data_reader.convert_to_dataset(datas, helpers=helpers)
    train_loader = create_data_loader(
        datasets["train"],
        batch_size=config.iterator.batch_size,
        shuffle=True,
        cuda_device_id=device,
    )
    valid_loader = create_data_loader(
        datasets["valid"],
        batch_size=config.iterator.batch_size,
        shuffle=False,
        cuda_device_id=device,
    )

    # Model & Optimizer
    model = create_model(token_makers, ModelFactory, config.model, device, helpers=helpers)
    # Only parameters that require gradients are handed to the optimizer.
    model_parameters = [
        param for param in model.parameters() if param.requires_grad
    ]
    optimizer = get_optimizer_by_name("adam")(model_parameters)

    def bind_load_model(config, model, **kwargs):
        """Register an NSML ``load`` hook that restores the model (and optimizer)."""
        CHECKPOINT_FNAME = "checkpoint.bin"

        def load(dir_path):
            checkpoint_path = os.path.join(dir_path, CHECKPOINT_FNAME)
            checkpoint = torch.load(checkpoint_path)

            model.load_state_dict(checkpoint["weights"])
            model.config = checkpoint["config"]
            model.metrics = checkpoint["metrics"]
            # Assign the stored values directly; a trailing comma here would
            # wrap each of them in a one-element tuple.
            model.init_params = checkpoint["init_params"]
            model.predict_helper = checkpoint["predict_helper"]
            model.train_counter = TrainCounter(display_unit=1000)

            if "optimizer" in kwargs:
                # NOTE(review): presumably the optimizer state was saved as a
                # one-element sequence, hence the [0]; verify against the
                # code that writes the checkpoint.
                kwargs["optimizer"].load_state_dict(checkpoint["optimizer"][0])
            print(f"Model reload checkpoints...! {checkpoint_path}")

        nsml.bind(load=load)

    # This replaces the vocab ``load`` hook bound above with the model one.
    bind_load_model(config, model, optimizer=optimizer)
    nsml.load(checkpoint=NSML_CHECKPOINT, session=NSML_SESSION)

    if IS_ON_NSML:
        bind_nsml(model, optimizer=optimizer)

    # Trainer
    # vars() returns the namespace's live __dict__; copy it so that adding
    # the model does not inject a ``model`` attribute into config.trainer.
    trainer_config = dict(vars(config.trainer))
    trainer_config["model"] = model
    trainer = Trainer(**trainer_config)
    trainer.train_and_evaluate(train_loader, valid_loader, optimizer)