Example #1
 def get_sequence_files(self, article, n_article_paragraphs):
     # Assign the article to its split and build one zero-padded file
     # path per paragraph.
     files = []
     for sequence_ix in range(n_article_paragraphs):
         if article["id"] in self.dev_ids:
             split_name = "development"
             subsplit = self.n_development_files // self.folder_size
             self.n_development_files += 1
         elif article["id"] in self.test_ids:
             split_name = "test"
             subsplit = self.n_test_files // self.folder_size
             self.n_test_files += 1
         else:
             split_name = "training"
             subsplit = self.n_training_files // self.folder_size
             self.n_training_files += 1
         subsplit = ("%." + str(SUBSPLIT_ID_LEN) + "i") % subsplit
         folder = self.out_directory + split_name + "/" + self.benchmark_name + "/texts/" + subsplit + "/"
         if not path_exists(folder):
             make_directory(folder)
         file_name_pattern = "%." + str(PARAGRAPH_ID_LEN) + "i_%s_%s_%." + str(SUB_SEQUENCE_ID_LEN) + "i.txt"
         file_name = file_name_pattern % (self.n_paragraphs,
                                          article["id"],
                                          article["title"][:MAX_TITLE_LEN].replace('/', '_'),
                                          sequence_ix)
         files.append(folder + file_name)
         self.n_paragraphs += 1
     return files
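For illustration, the "%.Ni" conversions zero-pad integers to N digits; with hypothetical values for the length constants (the real ones are defined elsewhere in the project):

# Hypothetical constants, for illustration only.
PARAGRAPH_ID_LEN = 7
SUB_SEQUENCE_ID_LEN = 4

pattern = "%." + str(PARAGRAPH_ID_LEN) + "i_%s_%s_%." + str(SUB_SEQUENCE_ID_LEN) + "i.txt"
print(pattern % (42, "12345", "Some_Title", 0))
# prints: 0000042_12345_Some_Title_0000.txt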
Example #2
 def _save_encoder(self):
     """
     Stores the encoder at the model directory.
     File name is encoder.pkl.
     :return:
     """
     make_directory(self.model_dir())
     dump_object(self.encoder, self.model_dir() + "/encoder.pkl")
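make_directory and dump_object are project helpers that these excerpts do not show; a minimal sketch of what they presumably wrap (an assumption, not the project's actual code):

import os
import pickle

def make_directory(path: str):
    # Create the directory (and parents); do nothing if it already exists.
    os.makedirs(path, exist_ok=True)

def dump_object(obj, path: str):
    # Serialize the object to disk with pickle.
    with open(path, 'wb') as f:
        pickle.dump(obj, f)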
Example #3
 def _save_specification(self):
     """
     Stores the specification at the model directory.
     File name is specification.pkl.
     :return:
     """
     make_directory(self.model_dir())
     dump_object(self.specification,
                 self.model_dir() + "/specification.pkl")
Example #4
 def _results_folder(self):
     dir = paths.RESULTS_DIR
     if self.benchmark_name is not None:
         dir += self.benchmark_name + "/"
         if not path_exists(dir):
             make_directory(dir)
     dir += self.time_string if self.approach_name is None else self.approach_name
     if not path_exists(dir):
         make_directory(dir)
     return dir
Example #5
 def write_predicted_sequence_files(self):
     dir = self._results_folder() + "/predicted/"
     make_directory(dir)
     # First pass: truncate existing files (several sequence results can share a file name).
     for sequence_result in self.sequence_results:
         with open(dir + sequence_result.file_name[0], 'w') as file:
             pass
     # Second pass: append each predicted sequence to its file.
     for sequence_result in self.sequence_results:
         with open(dir + sequence_result.file_name[0], 'a') as file:
             file.write(sequence_result.predicted_sequence + '\n')
Example #6
 def prepare_directories(self):
     for split in ["training", "development", "test"]:
         split_path = self.out_directory + split
         if not path_exists(split_path):
             make_directory(split_path)
         benchmark_split_path = split_path + "/" + self.benchmark_name
         if not path_exists(benchmark_split_path):
             make_directory(benchmark_split_path)
         texts_path = benchmark_split_path + "/texts"
         if not path_exists(texts_path):
             make_directory(texts_path)
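The level-by-level check-and-create steps could be collapsed into a single standard-library call; a sketch of an equivalent, assuming the helpers behave like their os counterparts:

import os

def prepare_directories(self):
    for split in ["training", "development", "test"]:
        # os.makedirs creates intermediate directories and, with
        # exist_ok=True, tolerates directories that already exist.
        os.makedirs(os.path.join(self.out_directory, split,
                                 self.benchmark_name, "texts"),
                    exist_ok=True)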
Example #7
def corrupt_dataset(directory: str,
                    p: float,
                    splits: List[str],
                    seed: int):
    """Generates a corrupt dataset in paragraph format.

    The format is as follows:
    directory
    ---| training
    -------| <benchmark_name>
    -----------| texts
    ---------------| 0000
    -------------------| <sequence_file_name>
    -------------------| ...
    ---------------| ...
    ---| development
    ---| test

    :param directory: output directory
    :param p: corruption probability
    :param splits: subset of {training, development, test}, provided as a list
    :param seed: corruption random seed
    """
    corruptor = _corruptor(p, seed)
    benchmark_name = _benchmark_name(p)
    for split in splits:
        benchmark_split_dir = directory + split + "/" + benchmark_name + "/"
        if not path_exists(benchmark_split_dir):
            make_directory(benchmark_split_dir)
        text_dir = benchmark_split_dir + "texts/"
        if not path_exists(text_dir):
            make_directory(text_dir)
        for file in Wikipedia.file_iterator(benchmark_name="correct", split=split):
            sequence = Wikipedia.get_sequence(file)
            corrupt = corruptor.corrupt(sequence)
            path_split = file.split('/')
            path_split[-4] = benchmark_name
            folder = paths.WIKI_DIR + '/'.join(path_split[:-1])
            if not path_exists(folder):
                make_directory(folder)
            path = paths.WIKI_DIR + '/'.join(path_split)
            write_file(path, corrupt)
    corruptor.print_summary()
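For illustration, a hypothetical invocation (the probability, splits, and seed are placeholder values):

corrupt_dataset(directory=paths.WIKI_DIR,
                p=0.1,
                splits=["training", "development", "test"],
                seed=42)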
Example #8
def unify_spacing(text: str) -> str:
    # Collapse runs of multiple spaces into a single space. str.replace
    # returns a new string, so its result must be reassigned.
    while '  ' in text:
        text = text.replace('  ', ' ')
    return text


def preprocess_sentence(sentence: str) -> str:
    sentence = unify_quotation_marks(sentence)
    sentence = unify_spacing(sentence)
    return sentence
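unify_quotation_marks is not shown in this excerpt; a minimal sketch, assuming it normalizes typographic quotation marks to their ASCII equivalents (the project's actual mapping may differ):

def unify_quotation_marks(text: str) -> str:
    # Map curly and angled double quotes to '"'.
    for quote in ['\u201c', '\u201d', '\u201e', '\u00ab', '\u00bb']:
        text = text.replace(quote, '"')
    # Map curly single quotes to "'".
    for quote in ['\u2018', '\u2019', '\u201a']:
        text = text.replace(quote, "'")
    return text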


if __name__ == "__main__":
    TRAINING = sys.argv[1] == "training"
    random.seed(42)
    base_dir = paths.WIKI_DIR + "text/"
    make_directory(paths.WIKI_SENTENCES_DIR)

    training_ids, development_ids, test_ids = get_article_ids()
    tuning_ids = set(random.sample(training_ids, 10000))
    training_ids = set(training_ids)
    development_ids = set(development_ids)
    test_ids = set(test_ids)
    evaluation_ids = development_ids.union(test_ids).union(tuning_ids)

    tuning_sentences, development_sentences, test_sentences = [], [], []

    sentence_splitter = WikiPunktTokenizer()

    if TRAINING:
        training_file = open(paths.WIKI_TRAINING_SENTENCES,
                             'w',