def _results_folder(self):
    """
    Determines the folder in which results are stored and creates it if needed.

    The folder is paths.RESULTS_DIR, followed by a subfolder named after the
    benchmark (only if self.benchmark_name is set), followed by the approach
    name, or by self.time_string when no approach name is set.

    :return: path of the results folder (without a trailing slash)
    """
    folder = paths.RESULTS_DIR
    if self.benchmark_name is not None:
        folder = folder + self.benchmark_name + "/"
        if not path_exists(folder):
            make_directory(folder)
    # Unnamed approaches are told apart by the time string instead.
    if self.approach_name is None:
        run_name = self.time_string
    else:
        run_name = self.approach_name
    folder = folder + run_name
    if not path_exists(folder):
        make_directory(folder)
    return folder
def prepare_directories(self):
    """
    Creates the folder skeleton below self.out_directory.

    For each of the splits training, development and test, the folders
    <out_directory><split>, <out_directory><split>/<benchmark_name> and
    <out_directory><split>/<benchmark_name>/texts are created if missing.
    """
    def ensure_directory(path):
        # Create the folder only when it is not there yet; hand the path back
        # so the next level can be derived from it.
        if not path_exists(path):
            make_directory(path)
        return path

    for split in ("training", "development", "test"):
        split_path = ensure_directory(self.out_directory + split)
        benchmark_split_path = ensure_directory(split_path + "/" + self.benchmark_name)
        ensure_directory(benchmark_split_path + "/texts")
def get_sequence_files(self, article, n_article_paragraphs):
    """
    Builds one output file path per paragraph of the given article.

    The split (development, test or training) is chosen by looking up the
    article ID in self.dev_ids and self.test_ids. Within a split, files are
    spread over numbered subfolders holding self.folder_size files each;
    a subfolder is created when it does not exist yet. The per-split file
    counters and self.n_paragraphs are advanced for every path generated.

    :param article: dictionary with at least the keys "id" and "title"
    :param n_article_paragraphs: number of file paths to generate
    :return: list of file paths, one per paragraph
    """
    sequence_files = []
    for sequence_ix in range(n_article_paragraphs):
        article_id = article["id"]
        # Pick the split and the name of the counter that tracks its files.
        if article_id in self.dev_ids:
            split_name, counter_name = "development", "n_development_files"
        elif article_id in self.test_ids:
            split_name, counter_name = "test", "n_test_files"
        else:
            split_name, counter_name = "training", "n_training_files"
        # The subfolder index is derived from the counter before it is advanced.
        n_split_files = getattr(self, counter_name)
        subsplit_ix = n_split_files // self.folder_size
        setattr(self, counter_name, n_split_files + 1)
        subsplit_pattern = "%." + str(SUBSPLIT_ID_LEN) + "i"
        subsplit = subsplit_pattern % subsplit_ix
        folder = f"{self.out_directory}{split_name}/{self.benchmark_name}/texts/{subsplit}/"
        if not path_exists(folder):
            make_directory(folder)
        # Slashes in the title would otherwise be read as path separators.
        title = article["title"][:MAX_TITLE_LEN].replace('/', '_')
        file_name_pattern = "%." + str(PARAGRAPH_ID_LEN) + "i_%s_%s_%." + str(SUB_SEQUENCE_ID_LEN) + "i.txt"
        file_name = file_name_pattern % (self.n_paragraphs, article_id, title, sequence_ix)
        sequence_files.append(folder + file_name)
        self.n_paragraphs += 1
    return sequence_files
def find_wikipedia_dir() -> str:
    """Looks up the extracted Wikipedia files among the pre-defined paths.

    The candidates in anonymous_paths.EXTRACTED_WIKI_DIRS are checked in order.

    :return: The first candidate directory that exists.
    :raises Exception: If none of the candidate directories exists.
    """
    not_found = object()
    existing_dirs = (
        candidate
        for candidate in anonymous_paths.EXTRACTED_WIKI_DIRS
        if path_exists(candidate)
    )
    wikipedia_dir = next(existing_dirs, not_found)
    if wikipedia_dir is not_found:
        raise Exception("Could not find extracted Wikipedia directory.")
    return wikipedia_dir
def corrupt_dataset(directory: str, p: float, splits: List[str], seed: int):
    """
    Generates a corrupt dataset in paragraph format.

    Every sequence file of the "correct" benchmark is read, corrupted and
    written to the mirrored location of the corrupt benchmark. A summary of
    the corruptor is printed at the end.

    The format is as follows:

    directory
    ---| training
    -------| <benchmark_name>
    -----------| texts
    ---------------| 0000
    -------------------| <sequence_file_name>
    -------------------| ...
    ---------------| ...
    ---| development
    ---| test

    :param directory: output directory
    :param p: corruption probability
    :param splits: subset of {training, development, test}, provided as a list
    :param seed: corruption random seed
    """
    corruptor = _corruptor(p, seed)
    benchmark_name = _benchmark_name(p)
    for split in splits:
        benchmark_split_dir = directory + split + "/" + benchmark_name + "/"
        text_dir = benchmark_split_dir + "texts/"
        for required_dir in (benchmark_split_dir, text_dir):
            if not path_exists(required_dir):
                make_directory(required_dir)
        for file in Wikipedia.file_iterator(benchmark_name="correct", split=split):
            corrupted_sequence = corruptor.corrupt(Wikipedia.get_sequence(file))
            # Swap the benchmark component of the source path (fourth from the
            # end, see the layout above) for the corrupt benchmark's name.
            path_components = file.split('/')
            path_components[-4] = benchmark_name
            relative_path = '/'.join(path_components)
            relative_folder = relative_path.rpartition('/')[0]
            # NOTE(review): the split folders above are created below
            # `directory`, whereas the files are written below paths.WIKI_DIR.
            # Presumably callers pass paths.WIKI_DIR as `directory` - confirm.
            folder = paths.WIKI_DIR + relative_folder
            if not path_exists(folder):
                make_directory(folder)
            write_file(paths.WIKI_DIR + relative_path, corrupted_sequence)
    corruptor.print_summary()
def initialize(self, specification, encoder):
    """
    Sets up a fresh estimator model.

    Stores the given specification and encoder on self, deletes the directory
    returned by self.model_dir() if it already exists, builds the estimator via
    self._make_estimator() and finally saves the specification and the encoder.

    :param specification: specification holding the model's hyperparameters
    :param encoder: encoder that encodes text to subword unit labels (and decodes too)
    """
    self.specification, self.encoder = specification, encoder
    # Leftovers of an earlier model at the same location are discarded first.
    model_directory = self.model_dir()
    if path_exists(model_directory):
        remove_dir(model_directory)
    self.estimator = self._make_estimator()
    self._save_specification()
    self._save_encoder()
from typing import Optional
from src.helper.files import path_exists, make_directory
from src.benchmark.subset import Subset


# BASE DIRECTORY
# Candidate locations of the data folder, checked in order. The first one that
# exists on the current machine becomes DUMP_DIR.
DUMP_DIRS = [
    #"/home/hertel/tokenization-repair-dumps/data_naacl2021/",  # repro
    "/home/hertel/tokenization-repair-dumps/data/",  # wunderfitz
    "/local/data/hertelm/tokenization-repair-dumps/data/",  # sirba
    "/data/1/matthias-hertel/tokenization-repair-dumps/data/",  # polyaxon
    "/external/"  # docker
]

DUMP_DIR = None
# The loop variable is private on purpose: a module-level name `dir` would
# shadow the builtin and be exported by star imports of this module.
for _candidate_dir in DUMP_DIRS:
    if path_exists(_candidate_dir):
        DUMP_DIR = _candidate_dir
        print("Located data folder: %s" % DUMP_DIR)
        break

# Every path below is derived from DUMP_DIR, so importing this module fails
# right away when no candidate exists.
if DUMP_DIR is None:
    raise Exception("Unable to locate data folder.")


# MODEL DIRECTORY FOR SERVER
MODEL_FOLDER = DUMP_DIR + "models_server/"


# ESTIMATOR DIRECTORY
ESTIMATORS_DIR = DUMP_DIR + "estimators/"


# DATA DIRECTORY
DATA_DIR = DUMP_DIR + "data/"