def preprocess_data(dataset_name: str, split_name: str, nr_samples: int,
                    tokenizer_name) -> Dict:
    data_set_dir = os.path.join(DATA_DIR, dataset_name)
    assert check_make_dir(data_set_dir), \
        f"Data set '{dataset_name}' is not a directory in '{DATA_DIR}'. " \
        "Please store the data there!"

    # prefer the unfiltered tensor directory, fall back to the filtered one
    tensor_dir = os.path.join(data_set_dir, tokenizer_name)
    try:
        assert check_make_dir(tensor_dir) and os.listdir(tensor_dir)
    except AssertionError:
        unfiltered_dir = tensor_dir
        tensor_dir += "_filtered"
        assert check_make_dir(tensor_dir) and os.listdir(tensor_dir), \
            f"Neither '{unfiltered_dir}' nor '{tensor_dir}' exists or it is empty!"

    source_path = os.path.join(tensor_dir, f"{split_name}_source.pt")
    target_path = os.path.join(tensor_dir, f"{split_name}_target.pt")
    assert os.path.isfile(source_path) and os.path.isfile(target_path), \
        f"Data pair '{source_path}' or '{target_path}' does not exist!"

    data_dict = {
        "source": torch.load(open(source_path, "rb")),
        "target": torch.load(open(target_path, "rb"))
    }
    return limit_data(data_dict, nr_samples)
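# Hedged usage sketch (not part of the original module): load the tokenized
# training tensors of an assumed dataset folder and cap them at 1000 samples.
# "cnn_dailymail" and "t5-base" are placeholder names for illustration only.
#
#   train_dict = preprocess_data(dataset_name="cnn_dailymail",
#                                split_name="train",
#                                nr_samples=1000,
#                                tokenizer_name="t5-base")
#   # train_dict["source"] / train_dict["target"] hold the saved tokenizer outputs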
def saveEmbeddingsOfDatatype(self, datasetPath: str, subsetType: str,
                             dataType: str, savePath: str):
    saveFilePath = join(savePath, f'{subsetType}_{dataType}.pt')
    embeddings = self.embedDataSubsetOfDatatype(datasetPath, subsetType,
                                                dataType)
    io_utils.check_make_dir(savePath, create_dir=True)
    torch.save(embeddings, open(saveFilePath, 'wb'))
def save_data_frame(data_frame: pd.DataFrame,
                    output_path: str,
                    file_format: Optional[str] = "csv"):
    # check if output directory exists
    out_dir = os.path.dirname(output_path)
    check_make_dir(out_dir, create_dir=True)

    if file_format == "csv":
        output_path += ".csv"
        data_frame.to_csv(output_path, sep=";")
    else:
        with pd.ExcelWriter(output_path) as writer:
            data_frame.to_excel(writer, "Overview")
def __init__(self,
             model_dir: str,
             language: str,
             status: Optional[str] = 'base'):
    """set arguments to initialize the model used for summarization

    Args:
        model_dir (str): directory to load/store the model
        language (str): supported language
        status (Optional[str], optional): sets if the model is already
            fine-tuned or not. Defaults to 'base'.
    """
    self.model_path = model_dir
    assert language in ["english", "german"], \
        f"{language} is not a supported language!"
    self.language = language

    # available models
    # t5-base: for english texts
    # WikinewsSum/t5-base-multi-de-wiki-news: for german texts
    if language == "english":
        self.model_name = 't5-base'
        self.short_name = 't5'
    elif language == "german":
        self.model_name = 'WikinewsSum/t5-base-multi-de-wiki-news'
        self.short_name = 't5-de'

    assert status in ['base', 'fine-tuned']
    self.status = status
    if self.status != 'base':
        assert check_make_dir(self.model_path), \
            f"Directory '{self.model_path}' doesn't exist! " \
            "Please follow this folder structure."

    # initialize the model and tokenizer
    # based on parameters
    self.model, self.tokenizer = self.initialize_model()

    # init the spacy language model
    # for post processing output
    if language == "english":
        self.nlp = spacy.load("en")
    else:
        self.nlp = spacy.load("de")

    if torch.cuda.is_available():
        self.device = torch.device("cuda")
    else:
        self.device = torch.device("cpu")
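# Illustrative construction of the summarizer (a sketch, assuming the class is
# AbstractiveSummarizer as used by initialize_trainer below); the model
# directory is a placeholder, not a path defined by the project:
#
#   summarizer = AbstractiveSummarizer(model_dir="./results/t5-de/0",
#                                      language="german",
#                                      status="fine-tuned")
#   # status='fine-tuned' requires model_dir to exist and hold a checkpoint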
def provide_data(dataset_name: str,
                 tokenizer_name: str,
                 model_name: str,
                 size: Optional[int] = None,
                 create_splits: Optional[bool] = False,
                 splits2tokenize: Optional[list] = SPLIT_NAMES,
                 filtering: Optional[bool] = True):
    """Provides tokenized data for training

    Args:
        dataset_name (str): folder name in the datasets directory
        tokenizer_name (str): huggingface tokenizer name (same as model name)
        model_name (str): huggingface model name
        size (Optional[int], optional): Limits the amount of samples that are
            taken for tokenization for each split. Defaults to None.
        create_splits (Optional[bool], optional): Split the dataset into train,
            validation and test splits. Has to be provided as a dict containing
            the keys `train` and `val` and values between 0 and 1. If `True`,
            uses a default 80/10/10 split. Defaults to False.
        splits2tokenize (Optional[list], optional): Can be set to only tokenize
            certain splits. Defaults to SPLIT_NAMES.
        filtering (Optional[bool], optional): Examples longer than the maximum
            token size are filtered out; otherwise they are truncated.
            Defaults to True.

    Raises:
        ValueError: incorrect inputs
        IOError: incompatible text and summary number
    """
    # checking input
    if not model_name in MODEL_NAMES:
        raise ValueError('unknown model')
    if not tokenizer_name in TOKENIZER_NAMES:
        raise ValueError('unknown tokenizer')
    if size and size < 1:
        raise ValueError('wrong size')

    dataset_dir = f'dataProvider/datasets/{dataset_name}/'
    assertDirExistent(dataset_dir)

    if create_splits:
        if create_splits == True:
            # fractions are relative to the remaining entries: 0.8 of all
            # entries for train, 0.5 of the rest for val, the remainder
            # for test -> 80/10/10
            create_splits = {'train': 0.8, 'val': 0.5}
        for split_key in create_splits:
            if not split_key in SPLIT_NAMES:
                raise ValueError(
                    f'unknown key {split_key} - create_splits has to be a '
                    'dictionary containing the keys `train` and `val` '
                    'and values between 0 and 1')
        data = {}
        data['source'] = read_single_txt(dataset_dir + 'sources.txt')
        data['target'] = read_single_txt(dataset_dir + 'targets.txt')
        entries = len(data['source'])
        assert entries == len(
            data['target']
        ), "Source and target must have the same amount of lines"
        # the test split takes all entries left over after train and val
        create_splits['test'] = 1.
        for text_name in ['source', 'target']:
            text = data[text_name]
            previous_split_index = 0
            for split_name in SPLIT_NAMES:
                split_fraction = create_splits[split_name]
                if not 0 <= split_fraction <= 1:  # check split values
                    raise ValueError('incorrect split sizes')
                split_index = int((entries - previous_split_index) *
                                  split_fraction + previous_split_index)
                split = text[previous_split_index:split_index]
                if len(split):
                    write_txt(
                        '{}{}.{}'.format(dataset_dir, split_name, text_name),
                        split)
                previous_split_index = split_index
            assert previous_split_index == entries, \
                f'{previous_split_index} != {entries}'

    # tokenizing
    tokenizer = transformers.AutoTokenizer.from_pretrained(tokenizer_name)
    max_token_size = tokenizer.max_model_input_sizes[model_name]
    if filtering:
        filtered = '_filtered'
    else:
        filtered = ''
    tensor_dir = f'{dataset_dir}{tokenizer_name}{filtered}/'
    check_make_dir(tensor_dir, True)

    for split_name in splits2tokenize:
        source = read_single_txt('{}{}.{}'.format(dataset_dir, split_name,
                                                  'source'))
        target = read_single_txt('{}{}.{}'.format(dataset_dir, split_name,
                                                  'target'))
        text_length = len(source)
        assert text_length == len(target)
        assert text_length > 0, f"split {split_name} has no entries"
        if size:  # optional limitation of samples for tokenization
            source = source[:size]
            target = target[:size]

        log(f'tokenizing target batch for {split_name} of {text_length} samples')
        if filtering:
            target_tokens = tokenizer(target, padding=True)
        else:
            target_tokens = tokenizer(target, padding=True, return_tensors="pt")
        if len(target_tokens['attention_mask'][0]) > max_token_size:
            num_target_tokens = len(target_tokens['attention_mask'][0])
            raise IOError(
                f'target contains more than {max_token_size} tokens: '
                f'{num_target_tokens}')

        log(f'tokenizing source batch for {split_name}')
        if filtering:
            source_tokens = tokenizer(source,
                                      padding='max_length',
                                      truncation=True,
                                      max_length=max_token_size + 1)
        else:
            source_tokens = tokenizer(source,
                                      padding='max_length',
                                      truncation=True,
                                      return_tensors='pt')
        if filtering:
            # finding tokenizations that are too long
            tokens_deletes = []
            for i, attention in enumerate(source_tokens['attention_mask']):
                if len(attention) < max_token_size:
                    break
                if attention[max_token_size]:
                    tokens_deletes.append(i)
            deleted_samples = len(tokens_deletes)
            log('{} ({}%) of samples were filtered because they were too long'.
                format(
                    deleted_samples,
                    round((deleted_samples /
                           len(source_tokens['attention_mask'])) * 100, 2)))

        for text_name, tokens in [('source', source_tokens),
                                  ('target', target_tokens)]:
            # creating filtered PyTorch tensors from
            # tokenization lists and replacing them
            if filtering:
                for key in tokens:  # tokens contains `input_ids` and `attention_mask`
                    tokens_list = tokens[key]
                    for i in sorted(tokens_deletes, reverse=True):
                        # actual filtering
                        del tokens_list[i]
                    # truncate the remaining samples to the model's input size
                    tokens_tensor = torch.LongTensor(
                        np.array(tokens_list)[:, :512])
                    tokens[key] = tokens_tensor
            tensor_path = f'{tensor_dir}{split_name}_{text_name}.pt'
            log(f'{tensor_path} with output size:',
                tokens[list(tokens.keys())[0]].size())
            assertFileInxestent(tensor_path)
            torch.save(tokens, tensor_path)
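# Hedged usage sketch for provide_data (names are illustrative, not from the
# project): tokenize a dataset folder under dataProvider/datasets/ with the
# t5-base tokenizer, create the default 80/10/10 splits and filter over-long
# samples:
#
#   provide_data(dataset_name="my_dataset",    # hypothetical folder name
#                tokenizer_name="t5-base",
#                model_name="t5-base",
#                create_splits=True,           # -> {'train': 0.8, 'val': 0.5}
#                filtering=True)
#   # writes <split>.source / <split>.target text files and the corresponding
#   # t5-base_filtered/<split>_<text>.pt tensors into the dataset folder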
def initialize_trainer(dataset_name: str,
                       model_name: str,
                       filtered: bool = True,
                       config_name: Optional[str] = "fine_tuning_config.ini"):
    """fine tuning pipeline initialization

    Args:
        dataset_name (str): name of the dataset used for training
        model_name (str): model to fine tune on
        filtered (bool, optional): choose filtered or unfiltered tensors
            for training. Defaults to True.
        config_name (Optional[str], optional): name of the config file.
            Defaults to "fine_tuning_config.ini".
    """
    ###################################
    # Perform checks
    ###################################
    # check data path exists
    assert check_make_dir(DATA_DIRECTORY), \
        f"Make sure directory {DATA_DIRECTORY} exists!"

    # check that there are data folders
    dataset_names = [
        folder for folder in os.listdir(DATA_DIRECTORY)
        if os.path.isdir(os.path.join(DATA_DIRECTORY, folder))
    ]

    # check dataset folders
    assert len(dataset_names) > 0, \
        f"Directory '{DATA_DIRECTORY}' is empty!"

    # check data name available
    assert dataset_name in dataset_names, \
        f"'{dataset_name}' not in available datasets: {dataset_names}"

    # check tensors folder
    dataset_dir = os.path.join(DATA_DIRECTORY, dataset_name)
    if filtered:
        filter_str = "_filtered"
    else:
        filter_str = ""
    tensor_dir = os.path.join(dataset_dir, model_name + filter_str)
    assert os.path.isdir(tensor_dir), \
        f"Tensor directory '{tensor_dir}' does not exist!"

    data_files = [file for file in os.listdir(tensor_dir) if '.pt' in file]
    # check training files
    assert len(data_files) > 0, \
        f"'{tensor_dir}' is empty! Please provide '.pt' files!"

    data_dict = dict()
    for split_name in SPLIT_NAMES:
        if "test" not in split_name:
            files = list()
            for text_name in TEXT_NAMES:
                files.append(f"{split_name}_{text_name}.pt")
            if all([
                    check_make_dir(os.path.join(tensor_dir, file))
                    for file in files
            ]):
                data_dict[split_name] = dict()
                for text_name in TEXT_NAMES:
                    file_path = os.path.join(tensor_dir,
                                             f"{split_name}_{text_name}.pt")
                    data_dict[split_name][text_name] = torch.load(
                        open(file_path, "rb"))

    # check model is supported
    assert model_name in MODEL_NAMES, \
        f"'{model_name}' not supported. Please choose one of {MODEL_NAMES}"

    # set to default config if not given
    if config_name is None:
        config_path = "fine_tuning_config.ini"
    else:
        # check .ini file
        assert ".ini" in config_name, \
            "Config has to be an '.ini' file!"
        config_path = os.path.join("./modelTrainer/config", config_name)

    ###################################
    # Read from config
    ###################################
    MODEL, TRAINING = read_config(config_path)

    # model parameters
    model_parameters = dict()
    for parameter_name in MODEL_CONFIG:
        if MODEL[parameter_name]:
            model_parameters[parameter_name] = MODEL[parameter_name]

    # check if output directory exists
    check_make_dir(model_parameters["output_directory"], create_dir=True)

    ###################################
    # Initialize Model
    ###################################
    # initialize summary model
    model = AbstractiveSummarizer(model_parameters["model_directory"],
                                  model_parameters["language"],
                                  model_parameters["status"])
    if model_parameters["freezed_components"] != "None":
        model.freeze_model_layers(
            model_parameters["freezed_components"].strip().split(";"))

    ###################################
    # Run fine tuning
    ###################################
    # training parameters
    training_parameters = dict()
    for parameter_name in TRAINING_CONFIG:
        if TRAINING[parameter_name]:
            training_parameters[parameter_name] = TRAINING[parameter_name]

    fine_tune_model(model, model_parameters["output_directory"], data_dict,
                    training_parameters)
def fine_tune_model(summary_model: AbstractiveSummarizer, results_path: str,
                    data_dict: dict, parameters: dict):
    """fine tuning pipeline that runs the training

    Args:
        summary_model (AbstractiveSummarizer): model to train on
        results_path (str): store resulting models and checkpoints
        data_dict (dict): data for training and optional evaluation
        parameters (dict): training parameters
    """
    # limit samples taken into
    # account for training
    data_dict["train"] = limit_data(data_dict["train"],
                                    int(parameters["number_samples"]))

    # get dataset
    if "val" in data_dict:
        train_data, val_data = create_dataset(
            (data_dict["train"]["source"], data_dict["train"]["target"]),
            (data_dict["val"]["source"], data_dict["val"]["target"]))
    else:
        train_data, val_data = create_dataset(
            (data_dict["train"]["source"], data_dict["train"]["target"]))

    if parameters["limit_val_data"] != "-1":
        val_data = val_data[:min(len(val_data),
                                 int(parameters["limit_val_data"]))]

    # recursively create output directory
    check_make_dir(results_path)
    model_type = summary_model.short_name
    check_make_dir(results_path + "/" + model_type)
    model_version = 0
    final_path = os.path.join(results_path, model_type, str(model_version))
    logs_path = os.path.join(results_path, model_type, "logs",
                             str(model_version))
    while check_make_dir(final_path, create_dir=True):
        if len(os.listdir(final_path)) == 0:
            break
        model_version += 1
        final_path = os.path.join(results_path, model_type,
                                  str(model_version))
        logs_path = os.path.join(results_path, model_type, "logs",
                                 str(model_version))

    # prepare path for logs
    check_make_dir(logs_path, create_dir=True)

    # initialize the training parameters
    training_args = TrainingArguments(
        output_dir=final_path,
        num_train_epochs=int(parameters["epochs"]),
        per_device_train_batch_size=int(parameters["train_batch_size"]),
        per_device_eval_batch_size=int(parameters["val_batch_size"])
        if val_data else None,
        do_eval=bool(val_data),
        eval_steps=int(parameters["eval_steps"]),
        evaluate_during_training=bool(val_data),
        warmup_steps=int(parameters["warmup_steps"]),
        weight_decay=float(parameters["weight_decay"]),
        logging_dir=logs_path,
        logging_steps=int(parameters["logging_steps"]),
        logging_first_step=True,
        save_steps=int(parameters["checkpoint_steps"]),
        do_train=True)

    # initialize the trainer class
    trainer = Trainer(
        model=summary_model.model,
        args=training_args,
        train_dataset=train_data,
        eval_dataset=val_data if val_data else None,
        prediction_loss_only=True  # loss for logging
    )

    log("started training")
    # perform the training
    try:
        training_history = trainer.train()
        # save training history
        with open(final_path + "/training_history.pickle",
                  "wb") as history_file:
            pickle.dump(training_history, history_file)
    finally:
        # save info file
        info_dict = {
            "language": summary_model.language,
            "model_name": summary_model.model_name,
            "run_name": summary_model.short_name + "/" + str(model_version),
            "total_iterations":
            int(len(train_data) / int(parameters["train_batch_size"])) *
            int(parameters["epochs"])
        }
        with open(final_path + "/model_info.yml", "w") as info_file:
            yaml.dump(info_dict, info_file)

        # save the fine tuned model
        check_make_dir(final_path, True)
        trainer.save_model(final_path)
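# End-to-end sketch of the fine-tuning entry point, assuming the tokenized
# tensors already exist under DATA_DIRECTORY and a fine_tuning_config.ini with
# the expected model/training sections is present; the dataset name is a
# placeholder:
#
#   initialize_trainer(dataset_name="my_dataset",   # hypothetical folder name
#                      model_name="t5-base",
#                      filtered=True,
#                      config_name="fine_tuning_config.ini")
#   # initialize_trainer loads the tensors, builds the AbstractiveSummarizer
#   # and hands everything to fine_tune_model above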