def preprocess_data(dataset_name: str, split_name: str, nr_samples: int, tokenizer_name: str) -> Dict:
    data_set_dir = os.path.join(DATA_DIR, dataset_name)
    assert check_make_dir(data_set_dir), \
        f"Dataset '{dataset_name}' is not a directory in '{DATA_DIR}'. " \
        "Please store the data there!"

    tensor_dir = os.path.join(
        data_set_dir,
        tokenizer_name
    )
    base_tensor_dir = tensor_dir
    try:
        assert check_make_dir(tensor_dir) and os.listdir(tensor_dir)
    except Exception:
        # fall back to the filtered tensors if the unfiltered ones are missing or empty
        tensor_dir = base_tensor_dir + "_filtered"
        assert check_make_dir(tensor_dir) and os.listdir(tensor_dir), \
            f"Neither '{base_tensor_dir}' nor '{tensor_dir}' exists or it is empty!"

    source_path = os.path.join(tensor_dir, f"{split_name}_source.pt")
    target_path = os.path.join(tensor_dir, f"{split_name}_target.pt")
    assert os.path.isfile(source_path) and os.path.isfile(target_path), f"Data pair '{source_path}' or '{target_path}' does not exist!"

    data_dict = {
        "source": torch.load(open(source_path, "rb")),
        "target": torch.load(open(target_path, "rb"))
    }
    return limit_data(data_dict, nr_samples)
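
# Minimal usage sketch (hypothetical): the dataset folder and tokenizer name
# below are placeholders, not taken from this snippet. preprocess_data expects
# DATA_DIR/<dataset>/<tokenizer>[_filtered]/<split>_source.pt and
# <split>_target.pt to exist and returns a dict limited to `nr_samples` pairs.
#
# data = preprocess_data(
#     dataset_name="my_dataset",   # placeholder folder under DATA_DIR
#     split_name="train",
#     nr_samples=1000,
#     tokenizer_name="t5-base"     # placeholder tokenizer folder
# )
# source_tensors, target_tensors = data["source"], data["target"]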
Example 2
def saveEmbeddingsOfDatatype(self, datasetPath: str, subsetType: str,
                             dataType: str, savePath: str):
    saveFilePath = join(savePath, f'{subsetType}_{dataType}.pt')
    embeddings = self.embedDataSubsetOfDatatype(datasetPath, subsetType,
                                                dataType)
    # make sure the output directory exists before saving the embedding tensors
    io_utils.check_make_dir(savePath, create_dir=True)
    torch.save(embeddings, saveFilePath)
Example 3
def save_data_frame(data_frame: pd.DataFrame,
                    output_path: str,
                    file_format: Optional[str] = "csv"):
    # check if output directory exists
    out_dir = os.path.dirname(output_path)
    check_make_dir(out_dir, create_dir=True)
    if file_format == "csv":
        output_path += ".csv"
        data_frame.to_csv(output_path, sep=";")
    else:
        # pandas infers the Excel engine from the file extension
        output_path += ".xlsx"
        with pd.ExcelWriter(output_path) as writer:
            data_frame.to_excel(writer, "Overview")
Example 4
    def __init__(self,
                 model_dir: str,
                 language: str,
                 status: Optional[str] = 'base'):
        """set arguments to initialize the model used for summarization

        Args:
            model_dir (str): directory to load/store the model
            language (str): supported language
            status (Optional[str], optional): whether the model is
                already fine-tuned or not. Defaults to 'base'.
        """
        self.model_path = model_dir

        assert language in ["english", "german"], \
            f"{language} is not a supported language!"
        self.language = language
        # available models:
        # t5-base: for English texts
        # WikinewsSum/t5-base-multi-de-wiki-news: for German texts
        if language == "english":
            self.model_name = 't5-base'
            self.short_name = 't5'

        elif language == "german":
            self.model_name = 'WikinewsSum/t5-base-multi-de-wiki-news'
            self.short_name = 't5-de'

        assert status in ['base', 'fine-tuned']
        self.status = status
        if self.status != 'base':
            assert check_make_dir(self.model_path), \
                f"Directory '{self.model_path}' doesn't exist! " \
                "Please follow this folder structure."

        # initialize the model and tokenizer
        # based on parameters
        self.model, self.tokenizer = self.initialize_model()

        # init the spacy language model
        # for post processing output
        if language == "english":
            self.nlp = spacy.load("en")
        else:
            self.nlp = spacy.load("de")

        if torch.cuda.is_available():
            self.device = torch.device("cuda")
        else:
            self.device = torch.device("cpu")
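
# Hypothetical usage sketch of this constructor; the class name
# `AbstractiveSummarizer` is taken from the trainer example further below,
# while the model directory is a placeholder:
#
# summarizer = AbstractiveSummarizer(
#     model_dir="./results/t5",    # placeholder model directory
#     language="english",          # "english" or "german"
#     status="base"                # "base" or "fine-tuned"
# )
# summary_model, tokenizer = summarizer.model, summarizer.tokenizer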
Example 5
def provide_data(dataset_name: str,
                 tokenizer_name: str,
                 model_name: str,
                 size: Optional[int] = None,
                 create_splits: Optional[bool] = False,
                 splits2tokenize: Optional[list] = SPLIT_NAMES,
                 filtering: Optional[bool] = True):
    """Provides tokenized data for training
    Args:
        dataset_name (str): foldername in datasets directory
        tokenizer_name (str): huggingface tokenizer name (same as model name)
        model_name (str): huggingface model name
        size (Optional[int], optional): {Limits the amount of samples that are taken for tokenization for each split.
        create_splits (Optional[bool], optional): Split the dataset into train, validation and test splits. Has to be provided as a dict containing the keys `train` and `val` and values between 0 and 1. If `True` uses a default 80/10/10 split. Defaults to False.
        splits2tokenize (Optional[list], optional): Can be set to only tokenize certain splits. Defaults to SPLIT_NAMES.
        filtering (Optional[bool], optional): Longer examples than the maximum token size are filtered, else they are truncated. Defaults to True.
    Raises:
        ValueError: incorrect inputs
        IOError: incompatible text and summary number"""
    # checking input
    if model_name not in MODEL_NAMES:
        raise ValueError('unknown model')
    if tokenizer_name not in TOKENIZER_NAMES:
        raise ValueError('unknown tokenizer')
    if size and size < 1:
        raise ValueError('wrong size')
    dataset_dir = f'dataProvider/datasets/{dataset_name}/'
    assertDirExistent(dataset_dir)

    if create_splits:
        if create_splits is True:
            # `val` is a fraction of the remaining data,
            # so this corresponds to the default 80/10/10 split
            create_splits = {'train': 0.8, 'val': 0.5}
        for split_key in create_splits:
            if split_key not in SPLIT_NAMES:
                raise ValueError(
                    f'unknown key {split_key} - create_splits has to be a '
                    'dictionary containing the keys `train` and `val` '
                    'and values between 0 and 1')
        data = {}
        data['source'] = read_single_txt(dataset_dir + 'sources.txt')
        data['target'] = read_single_txt(dataset_dir + 'targets.txt')
        entries = len(data['source'])
        assert entries == len(
            data['target']
        ), "Source and target must have the same amount of lines"
        for text_name in ['source', 'target']:
            text = data[text_name]
            previous_split_index = 0
            create_splits['test'] = 1.
            for split_name in SPLIT_NAMES:
                split_fraction = create_splits[split_name]
                if not 0 <= split_fraction <= 1:  # check split values
                    raise ValueError('incorrect split sizes')
                split_index = int((entries - previous_split_index) *
                                  split_fraction + previous_split_index)
                split = text[previous_split_index:split_index]
                if len(split):
                    write_txt(
                        '{}{}.{}'.format(dataset_dir, split_name, text_name),
                        split)
                    previous_split_index = split_index
            assert previous_split_index == entries, f'{previous_split_index} != {entries}'

    # tokenizing
    tokenizer = transformers.AutoTokenizer.from_pretrained(tokenizer_name)
    max_token_size = tokenizer.max_model_input_sizes[model_name]
    if filtering:
        filtered = '_filtered'
    else:
        filtered = ''
    tensor_dir = f'{dataset_dir}{tokenizer_name}{filtered}/'
    check_make_dir(tensor_dir, True)
    for split_name in splits2tokenize:
        source = read_single_txt('{}{}.{}'.format(dataset_dir, split_name,
                                                  'source'))
        target = read_single_txt('{}{}.{}'.format(dataset_dir, split_name,
                                                  'target'))
        text_length = len(source)
        assert text_length == len(target)
        assert text_length > 0, f"split {split_name} has no entries"
        if size:  # optional limitation of samples for tokenization
            source = source[:size]
            target = target[:size]
        log(f'tokenizing target batch for {split_name} of {text_length} samples')
        if filtering:
            target_tokens = tokenizer(target, padding=True)
        else:
            target_tokens = tokenizer(target,
                                      padding=True,
                                      return_tensors="pt")
        if len(target_tokens['attention_mask'][0]) > max_token_size:
            target_length = len(target_tokens['attention_mask'][0])
            raise IOError(
                f'target contains more than {max_token_size} tokens: {target_length}'
            )
        log(f'tokenizing source batch for {split_name}')
        if filtering:
            source_tokens = tokenizer(source,
                                      padding='max_length',
                                      truncation=True,
                                      max_length=max_token_size + 1)
        else:
            source_tokens = tokenizer(source,
                                      padding='max_length',
                                      truncation=True,
                                      return_tensors='pt')
        if filtering:  # finding tokenizations that are too long
            tokens_deletes = []
            for i, attention in enumerate(source_tokens['attention_mask']):
                if len(attention) < max_token_size:
                    break
                if attention[max_token_size]:
                    tokens_deletes.append(i)
            deleted_samples = len(tokens_deletes)
            log('{} ({}%) of samples were filtered because they were too long'.
                format(
                    deleted_samples,
                    round((deleted_samples /
                           len(source_tokens['attention_mask'])) * 100, 2)))
        for text_name, tokens in [('source', source_tokens),
                                  ('target', target_tokens)]:
            # creating filtered PyTorch tensors from
            # tokenization lists and replacing them
            if filtering:
                for key in tokens:  # tokens contains `input_ids` and `attention_mask`
                    tokens_list = tokens[key]
                    for i in sorted(tokens_deletes,
                                    reverse=True):  # actual filtering
                        del tokens_list[i]
                    # truncate to the model's maximum input size
                    tokens_tensor = torch.LongTensor(
                        np.array(tokens_list)[:, :max_token_size])
                    tokens[key] = tokens_tensor
            tensor_path = f'{tensor_dir}{split_name}_{text_name}.pt'
            log(f'{tensor_path} with output size:',
                tokens[list(tokens.keys())[0]].size())
            assertFileInxestent(tensor_path)
            torch.save(tokens, tensor_path)
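
# Hypothetical usage sketch; the dataset folder name is a placeholder and the
# tokenizer/model names must be contained in TOKENIZER_NAMES/MODEL_NAMES:
#
# provide_data(
#     dataset_name="my_dataset",                 # placeholder folder in dataProvider/datasets/
#     tokenizer_name="t5-base",
#     model_name="t5-base",
#     create_splits={'train': 0.8, 'val': 0.5},  # `val` is a fraction of the remainder -> 80/10/10
#     filtering=True
# )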
Example 6
def initialize_trainer(dataset_name: str,
                       model_name: str,
                       filtered: bool = True,
                       config_name: Optional[str] = "fine_tuning_config.ini"):
    """fine tuning pipeline initialization

    Args:
        dataset_name (str): name of the dataset used for training
        model_name (str): model to fine tune on
        filtered (bool, optional): choose filtered or unfiltered tensors for training. Defaults to True.
        config_name (Optional[str], optional): name of config file.
        Defaults to "fine_tuning_config.ini".
    """
    ###################################
    # Perform checks
    ###################################

    # check data path exists
    assert check_make_dir(DATA_DIRECTORY), \
        f"Make sure directory {DATA_DIRECTORY} exists!"

    # check that there are data folders
    dataset_names = [
        folder for folder in os.listdir(DATA_DIRECTORY)
        if os.path.isdir(os.path.join(DATA_DIRECTORY, folder))
    ]

    # check dataset folders
    assert len(dataset_names) > 0, \
        f"Directory '{DATA_DIRECTORY}' is empty!"

    # check data name available
    assert dataset_name in dataset_names, \
        f"'{dataset_name}' not in available datasets: {dataset_names}"

    # check tensors folder
    dataset_dir = os.path.join(DATA_DIRECTORY, dataset_name)
    if filtered:
        filter_str = "_filtered"
    else:
        filter_str = ""
    base_dir = os.path.join(dataset_dir, model_name)
    tensor_dir = base_dir + filter_str
    assert os.path.isdir(tensor_dir), \
        f"Neither '{base_dir}' nor '{base_dir}_filtered' exists!"

    data_files = [file for file in os.listdir(tensor_dir) if '.pt' in file]

    # check training files
    assert len(data_files) > 0, \
        f"'{tensor_dir}' is empty! Please provide '.pt' files!"

    data_dict = dict()
    for split_name in SPLIT_NAMES:
        if "test" not in split_name:
            files = list()
            for text_name in TEXT_NAMES:
                files.append(f"{split_name}_{text_name}.pt")

            if all([check_make_dir(os.path.join(tensor_dir, file))
                    for file in files]):
                data_dict[split_name] = dict()
                for text_name in TEXT_NAMES:
                    file_path = os.path.join(tensor_dir,
                                             f"{split_name}_{text_name}.pt")
                    data_dict[split_name][text_name] = torch.load(
                        open(file_path, "rb"))

    # check model is supported
    assert model_name in MODEL_NAMES, \
        f"'{model_name}' not supported. Please choose one of {MODEL_NAMES}"

    # set to default config if not given
    if config_name is None:
        config_path = "fine_tuning_config.ini"
    else:
        # check .ini file
        assert ".ini" in config_name, \
            "Config has to be an '.ini' file!"
        config_path = os.path.join("./modelTrainer/config", config_name)

    ###################################
    # Read from config
    ###################################
    MODEL, TRAINING = read_config(config_path)

    # model parameters
    model_parameters = dict()
    for parameter_name in MODEL_CONFIG:
        if MODEL[parameter_name]:
            model_parameters[parameter_name] = MODEL[parameter_name]

    # check if output directory exists
    check_make_dir(model_parameters["output_directory"], create_dir=True)

    ###################################
    # Initialize Model
    ###################################

    # initialize summary model
    model = AbstractiveSummarizer(model_parameters["model_directory"],
                                  model_parameters["language"],
                                  model_parameters["status"])
    if model_parameters["freezed_components"] != "None":
        model.freeze_model_layers(
            model_parameters["freezed_components"].strip().split(";"))
    ###################################
    # Run fine tuning
    ###################################

    # training parameters
    training_parameters = dict()
    for parameter_name in TRAINING_CONFIG:
        if TRAINING[parameter_name]:
            training_parameters[parameter_name] = TRAINING[parameter_name]

    fine_tune_model(model, model_parameters["output_directory"], data_dict,
                    training_parameters)
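
# Hypothetical usage sketch; the dataset folder name is a placeholder, the
# config file is expected in ./modelTrainer/config and the model name must be
# in MODEL_NAMES:
#
# initialize_trainer(
#     dataset_name="my_dataset",               # placeholder folder under DATA_DIRECTORY
#     model_name="t5-base",
#     filtered=True,
#     config_name="fine_tuning_config.ini"
# )
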
def fine_tune_model(summary_model: AbstractiveSummarizer, results_path: str,
                    data_dict: dict, parameters: dict):
    """fine tuning pipeline that runs the training

    Args:
        summary_model (AbstractiveSummarizer): model to train on
        results_path (str): store resulting models and checkpoints
        data_dict (dict): data for training and optional evaluation
        parameters (dict): training parameters
    """

    # limit samples taken into
    # account for training
    data_dict["train"] = limit_data(data_dict["train"],
                                    int(parameters["number_samples"]))

    # get dataset
    if "val" in data_dict:
        train_data, val_data = create_dataset(
            (data_dict["train"]["source"], data_dict["train"]["target"]),
            (data_dict["val"]["source"], data_dict["val"]["target"]))
    else:
        train_data, val_data = create_dataset(
            (data_dict["train"]["source"], data_dict["train"]["target"]))

    if parameters["limit_val_data"] != "-1":
        val_data = val_data[:min(len(val_data),
                                 int(parameters["limit_val_data"]))]

    # recursively create output directory
    check_make_dir(results_path)
    model_type = summary_model.short_name
    check_make_dir(results_path + "/" + model_type)
    model_version = 0

    final_path = os.path.join(results_path, model_type, str(model_version))
    logs_path = os.path.join(results_path, model_type, "logs",
                             str(model_version))
    while check_make_dir(final_path, create_dir=True):
        if len(os.listdir(final_path)) == 0:
            break
        model_version += 1
        final_path = os.path.join(results_path, model_type, str(model_version))
        logs_path = os.path.join(results_path, model_type, "logs",
                                 str(model_version))

    # prepare path for logs
    check_make_dir(logs_path, create_dir=True)

    # initialize the training parameters
    training_args = TrainingArguments(
        output_dir=final_path,
        num_train_epochs=int(parameters["epochs"]),
        per_device_train_batch_size=int(parameters["train_batch_size"]),
        per_device_eval_batch_size=int(parameters["val_batch_size"])
        if val_data else None,
        do_eval=bool(val_data),
        eval_steps=int(parameters["eval_steps"]),
        evaluate_during_training=bool(val_data),
        warmup_steps=int(parameters["warmup_steps"]),
        weight_decay=float(parameters["weight_decay"]),
        logging_dir=logs_path,
        logging_steps=int(parameters["logging_steps"]),
        logging_first_step=True,
        save_steps=int(parameters["checkpoint_steps"]),
        do_train=True)

    # initialize the trainer class
    trainer = Trainer(
        model=summary_model.model,
        args=training_args,
        train_dataset=train_data,
        eval_dataset=val_data if val_data else None,
        prediction_loss_only=True  # loss for logging
    )

    log("started training")
    # perform the training
    try:
        training_history = trainer.train()
        # save training history
        with open(final_path + "/training_history.pickle",
                  "wb") as history_file:
            pickle.dump(training_history, history_file)

    finally:
        # save info file
        info_dict = {
            "language": summary_model.language,
            "model_name": summary_model.model_name,
            "run_name": summary_model.short_name + "/" + str(model_version),
            "total_iterations":
                int(len(train_data) / int(parameters["train_batch_size"])) *
                int(parameters["epochs"])
        }

        with open(final_path + "/model_info.yml", "w") as info_file:
            yaml.dump(info_dict, info_file)
        # save the fine tuned model
        check_make_dir(final_path, True)
        trainer.save_model(final_path)
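
# Hypothetical usage sketch for fine_tune_model: the numeric values are
# placeholders; only the keys are taken from the code above (values are kept
# as strings, matching how the function casts them with int()/float()):
#
# parameters = {
#     "number_samples": "1000",
#     "limit_val_data": "-1",        # -1 keeps all validation samples
#     "epochs": "3",
#     "train_batch_size": "4",
#     "val_batch_size": "4",
#     "eval_steps": "500",
#     "warmup_steps": "100",
#     "weight_decay": "0.01",
#     "logging_steps": "100",
#     "checkpoint_steps": "1000",
# }
# fine_tune_model(model, "./results", data_dict, parameters)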