# Imports inferred from the calls below; project-local helpers (log,
# read_single_txt, read_txt, read_csv, read_pickle, write_txt, check_make_dir,
# assertDirExistent, assertFileInxestent, config_parser, SPLIT_NAMES,
# MODEL_NAMES, TOKENIZER_NAMES) are assumed to be provided by the surrounding
# package.
import os
import pickle
from typing import List, Optional, Tuple, Union

import numpy as np
import pandas as pd
import torch
import transformers


def write_table(dictionary: dict,
                output_dir: str,
                file_name: str,
                file_format: str = "csv"):
    """write to excel or csv file given a dict

    Args:
        dictionary (dict): information to write
        output_dir (str): output diretory
        file_name (str): file name with ending
        file_format (str, optional): switch for excel or csv.
        Defaults to "csv".
    """

    assert file_format in [
        "csv", "excel"
    ], "'file_format' has to be either 'excel' or 'csv'!"
    output_path = os.path.join(output_dir, file_name)
    log("\nWrite results to", output_path)
    df = pd.DataFrame.from_dict(dictionary, orient="columns")
    if file_format == "csv":
        output_path += ".csv"
        df.to_csv(output_path, sep=";")
    else:
        with pd.ExcelWriter(output_path) as writer:
            df.to_excel(writer)
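
# Hedged usage sketch (not part of the original module): writing a small score
# table as CSV. The directory name "results" is a hypothetical placeholder and
# must exist before the call.
def _example_write_table():
    scores = {"model": ["bart", "t5"], "rouge1": [0.41, 0.39]}
    # produces results/scores.csv with ';' as separator
    write_table(scores, output_dir="results", file_name="scores", file_format="csv")
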
def read_data(in_path: str,
              text_name: Optional[str] = None,
              summary_name: Optional[str] = None,
              limit: Optional[int] = None) -> List[Union[Tuple, str]]:
    """general function to call all types of import functions

    Args:
        in_path (str): file path to read from
        text_name (Optional[str], optional): name of the text file.
        Defaults to None.
        summary_name (Optional[str], optional): name of the summary file.
        Defaults to None.
        limit (Optional[int], optional): data limitation.
        Defaults to None.

    Returns:
        List[Union[Tuple, str]]: all textes as list
    """
    if text_name is None and summary_name is None:
        return read_single_txt(in_path)
    else:
        if all(".txt" in item for item in [text_name, summary_name]):
            return read_txt(in_path, text_name, summary_name, limit)
        elif all(".csv" in item for item in [text_name, summary_name]):
            return read_csv(in_path, text_name, summary_name, limit)
        elif all(".pickle" in item for item in [text_name, summary_name]):
            return read_pickle(in_path, text_name, summary_name, limit)
        else:
            log(f"{text_name} or {summary_name} is not supported!")
            exit()
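
# Hedged usage sketch (assumption, not from the source): read_data picks the
# reader by the file extension of text_name/summary_name. The file names below
# are hypothetical placeholders.
def _example_read_data():
    # paired CSV files -> read_csv branch, limited to the first 100 samples
    pairs = read_data("data", text_name="texts.csv", summary_name="summaries.csv", limit=100)
    # no file names given -> read_single_txt branch on the path itself
    lines = read_data("data/corpus.txt")
    return pairs, lines
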
def read_config(config_path: str) -> Tuple[dict, dict]:
    """read the .ini file which provides
       the configurations for the different
       pipeline runs

    Args:
        config_path (str): path to config .ini file

    Raises:
        FileNotFoundError: path is empty

    Returns:
        Tuple[dict, dict]: returns parameters 
        for config sections
    """
    config_dict = dict()
    log("Read from config", config_path)

    if not os.path.isfile(config_path):
        raise FileNotFoundError(config_path)

    config_parser.read(config_path)
    for section in config_parser.sections():
        config_dict.update({section: dict()})

        for entry in config_parser[section]:
            config_dict[section].update({entry: config_parser[section][entry]})
    try:
        return config_dict['MODEL'], config_dict['TRAINING']
    except KeyError:
        return config_dict['MODEL'], config_dict['EVALUATION']
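
# Hedged usage sketch (assumed .ini layout, not from the source): a config file
# that defines a [MODEL] and a [TRAINING] section. The path is a hypothetical
# placeholder.
def _example_read_config():
    model_params, training_params = read_config("configs/train_config.ini")
    return model_params, training_params
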
def write_pickle(obj: Union[object, list], file_name: str, file_path: str):
    """write python object to binary

    Args:
        obj (Union[object, list]): object to save
        file_name (str): name of the binary
        file_path (str): path for the binary
    """
    if not file_name.endswith('.pickle'):
        file_name += ".pickle"

    pickle_path = os.path.join(file_path, file_name)
    log("\nSave pickle to", pickle_path)
    with open(pickle_path, mode="wb") as file_handle:
        pickle.dump(obj, file_handle)
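
# Hedged usage sketch (not part of the original module): pickling a list of
# generated summaries; the '.pickle' suffix is appended automatically when it
# is missing. "results" is a hypothetical existing directory.
def _example_write_pickle():
    predictions = ["summary one", "summary two"]
    write_pickle(predictions, file_name="predictions", file_path="results")
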
def provide_data(dataset_name: str,
                 tokenizer_name: str,
                 model_name: str,
                 size: Optional[int] = None,
                 create_splits: Optional[bool] = False,
                 splits2tokenize: Optional[list] = SPLIT_NAMES,
                 filtering: Optional[bool] = True):
    """Provides tokenized data for training
    Args:
        dataset_name (str): foldername in datasets directory
        tokenizer_name (str): huggingface tokenizer name (same as model name)
        model_name (str): huggingface model name
        size (Optional[int], optional): {Limits the amount of samples that are taken for tokenization for each split.
        create_splits (Optional[bool], optional): Split the dataset into train, validation and test splits. Has to be provided as a dict containing the keys `train` and `val` and values between 0 and 1. If `True` uses a default 80/10/10 split. Defaults to False.
        splits2tokenize (Optional[list], optional): Can be set to only tokenize certain splits. Defaults to SPLIT_NAMES.
        filtering (Optional[bool], optional): Longer examples than the maximum token size are filtered, else they are truncated. Defaults to True.
    Raises:
        ValueError: incorrect inputs
        IOError: incompatible text and summary number"""
    # checking input
    if model_name not in MODEL_NAMES:
        raise ValueError('unknown model')
    if tokenizer_name not in TOKENIZER_NAMES:
        raise ValueError('unknown tokenizer')
    if size and size < 1:
        raise ValueError('wrong size')
    dataset_dir = f'dataProvider/datasets/{dataset_name}/'
    assertDirExistent(dataset_dir)

    if create_splits:
        if create_splits == True:
            create_splits = {'train': 0.8, 'val': 0.5}
        for split_key in create_splits:
            if split_key not in SPLIT_NAMES:
                raise ValueError(
                    f'unknown key {split_key} - create_splits has to be a '
                    'dictionary containing the keys `train` and `val` '
                    'and values between 0 and 1')
        data = {}
        data['source'] = read_single_txt(dataset_dir + 'sources.txt')
        data['target'] = read_single_txt(dataset_dir + 'targets.txt')
        entries = len(data['source'])
        assert entries == len(
            data['target']
        ), "Source and target must have the same amount of lines"
        for text_name in ['source', 'target']:
            text = data[text_name]
            previous_split_index = 0
            create_splits['test'] = 1.
            for split_name in SPLIT_NAMES:
                split_fraction = create_splits[split_name]
                if not 0 <= split_fraction <= 1:  # check split values
                    raise ValueError('incorrect split sizes')
                split_index = int((entries - previous_split_index) *
                                  split_fraction + previous_split_index)
                split = text[previous_split_index:split_index]
                if len(split):
                    write_txt(
                        '{}{}.{}'.format(dataset_dir, split_name, text_name),
                        split)
                    previous_split_index = split_index
            assert previous_split_index == entries, f'{previous_split_index} != {entries}'

    # tokenizing
    tokenizer = transformers.AutoTokenizer.from_pretrained(tokenizer_name)
    max_token_size = tokenizer.max_model_input_sizes[model_name]
    if filtering:
        filtered = '_filtered'
    else:
        filtered = ''
    tensor_dir = f'{dataset_dir}{tokenizer_name}{filtered}/'
    check_make_dir(tensor_dir, True)
    for split_name in splits2tokenize:
        source = read_single_txt('{}{}.{}'.format(dataset_dir, split_name,
                                                  'source'))
        target = read_single_txt('{}{}.{}'.format(dataset_dir, split_name,
                                                  'target'))
        text_length = len(source)
        assert text_length == len(target)
        assert text_length > 0, f"split {split_name} has no entries"
        if size:  # optional limitation of samples for tokenization
            source = source[:size]
            target = target[:size]
        log(f'tokenizing target batch for {split_name} of {text_length} samples')
        if filtering:
            target_tokens = tokenizer(target, padding=True)
        else:
            target_tokens = tokenizer(target,
                                      padding=True,
                                      return_tensors="pt")
        if len(target_tokens['attention_mask'][0]) > max_token_size:
            target_length = len(target_tokens['attention_mask'][0])
            raise IOError(
                f'target contains more than {max_token_size} tokens: {target_length}'
            )
        log(f'tokenizing source batch for {split_name}')
        if filtering:
            source_tokens = tokenizer(source,
                                      padding='max_length',
                                      truncation=True,
                                      max_length=max_token_size + 1)
        else:
            source_tokens = tokenizer(source,
                                      padding='max_length',
                                      truncation=True,
                                      return_tensors='pt')
        if filtering:  # finding tokenizations that are too long
            tokens_deletes = []
            for i, attention in enumerate(source_tokens['attention_mask']):
                if len(attention) < max_token_size:
                    break
                if attention[max_token_size]:
                    tokens_deletes.append(i)
            deleted_samples = len(tokens_deletes)
            log('{} ({}%) of samples were filtered because they were too long'.
                format(
                    deleted_samples,
                    round((deleted_samples /
                           len(source_tokens['attention_mask'])) * 100, 2)))
        for text_name, tokens in [('source', source_tokens),
                                  ('target', target_tokens)]:
            # creating filtered PyTorch tensors from
            # tokenization lists and replacing them
            if filtering:
                for key in tokens:  # tokens contains `input_ids` and `attention_mask`
                    tokens_list = tokens[key]
                    for i in sorted(tokens_deletes,
                                    reverse=True):  # actual filtering
                        del tokens_list[i]
                    tokens_tensor = torch.LongTensor(
                        np.array(tokens_list)[:, :512])
                    tokens[key] = tokens_tensor
            tensor_path = f'{tensor_dir}{split_name}_{text_name}.pt'
            log(f'{tensor_path} with output size:',
                tokens[list(tokens.keys())[0]].size())
            assertFileInxestent(tensor_path)
            torch.save(tokens, tensor_path)
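
# Hedged usage sketch (assumption, not from the source): tokenizing a dataset
# that lives in dataProvider/datasets/cnn_dailymail/ as sources.txt/targets.txt.
# The dataset, tokenizer and model names are placeholders and must be contained
# in MODEL_NAMES/TOKENIZER_NAMES for the checks above to pass.
def _example_provide_data():
    provide_data('cnn_dailymail',
                 tokenizer_name='t5-base',
                 model_name='t5-base',
                 size=1000,
                 create_splits=True,       # default 80/10/10 split
                 splits2tokenize=['train'],
                 filtering=True)
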
def save_dataframe(self):
    # note: method of an evaluation class; expects `self.eval_path` and
    # `self.analysis_df` to be set beforehand
    self.analysis_table_path = self.eval_path + '/analysis.xlsx'
    self.analysis_df.to_excel(self.analysis_table_path)
    print(self.analysis_df)
    log("saved to", self.analysis_table_path)