Example 1
    def __init__(
        self,
        corpus_name: str,
        base_directory: Path,
        base_source_url_or_directory: str = "ketos:/projects/korpora/speech/",
        umlaut_decoder: Callable[[str], str] = UmlautDecoder.quote_before_umlaut,
        tar_gz_extension: str = ".tgz",
        mel_frequency_count: int = 128,
        root_compressed_directory_name_to_skip: Optional[str] = None,
        subdirectory_depth: int = 2,
        tags_to_ignore: Iterable[str] = _tags_to_ignore,
        id_filter_regex=re.compile(r'[\s\S]*'),
        training_test_split: Callable[
            [List[LabeledExample]],
            Tuple[List[LabeledExample], List[LabeledExample]]
        ] = TrainingTestSplit.randomly_grouped_by_directory()):
        self.umlaut_decoder = umlaut_decoder

        log("Parsing corpus {}...".format(corpus_name))

        super().__init__(
            base_directory=base_directory,
            base_source_url_or_directory=base_source_url_or_directory,
            corpus_name=corpus_name,
            tar_gz_extension=tar_gz_extension,
            root_compressed_directory_name_to_skip=root_compressed_directory_name_to_skip,
            subdirectory_depth=subdirectory_depth,
            allowed_characters=german_frequent_characters,
            tags_to_ignore=tags_to_ignore,
            id_filter_regex=id_filter_regex,
            mel_frequency_count=mel_frequency_count,
            training_test_split=training_test_split,
            maximum_example_duration_in_s=35,
            minimum_duration_per_character=2 * 2 * 128 / 16000)
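
For reference, the last argument above evaluates to 2 * 2 * 128 / 16000 = 0.032 seconds per character; 128 matches the default mel_frequency_count and 16000 is presumably the audio sample rate in Hz, though this excerpt does not spell out the remaining factor of 2 * 2.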
Example 2
    def __init__(self,
                 training_examples: List[LabeledExample],
                 test_examples: List[LabeledExample],
                 sampled_training_example_count: Optional[int] = None):

        self.training_examples = training_examples if sampled_training_example_count is None else \
            random.Random(42).sample(training_examples, sampled_training_example_count)

        self.sampled_training_example_count = sampled_training_example_count
        self.test_examples = test_examples
        self.examples = training_examples + test_examples

        log("Training on {} examples, testing on {} examples.".format(
            len(self.training_examples), len(self.test_examples)))

        duplicate_training_ids = duplicates(e.id for e in training_examples)
        if len(duplicate_training_ids) > 0:
            raise ValueError("Duplicate ids in training examples: {}".format(
                duplicate_training_ids))

        duplicate_test_ids = duplicates(e.id for e in test_examples)
        if len(duplicate_test_ids) > 0:
            raise ValueError("Duplicate ids in test examples: {}".format(
                duplicate_test_ids))

        overlapping_ids = duplicates(e.id for e in self.examples)

        if len(overlapping_ids) > 0:
            raise ValueError("Overlapping training and test set: {}".format(
                overlapping_ids))
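
The duplicates helper used above is not shown in this excerpt; a minimal sketch of what it presumably does, for illustration only (not the project's actual implementation):

from collections import Counter
from typing import Hashable, Iterable, List


def duplicates(items: Iterable[Hashable]) -> List[Hashable]:
    # Return every value that occurs more than once in the given iterable.
    counts = Counter(items)
    return [item for item, count in counts.items() if count > 1]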
Example 3
    def _load_from_cache(self):
        try:
            return numpy.load(str(self.spectrogram_cache_file))
        except ValueError:
            log("Recalculating cached file {} because loading failed.".format(
                self.spectrogram_cache_file))
            return self._calculate_and_save_spectrogram()
Example 4
    def validate_to_csv(
        model_name: str,
        last_epoch: int,
        configuration: Configuration = Configuration.german(),
        step_count=10,
        first_epoch: int = 0,
        csv_directory: Path = configuration.default_data_directories.test_results_directory
    ) -> List[Tuple[int, ExpectationsVsPredictionsInGroupedBatches]]:

        step_size = (last_epoch - first_epoch) / (step_count - 1)

        epochs = distinct(
            list(
                int(first_epoch + index * step_size)
                for index in range(step_count)))
        log("Testing model {} on epochs {}.".format(model_name, epochs))

        model = configuration.load_model(
            model_name,
            last_epoch,
            allowed_characters_for_loaded_model=configuration.allowed_characters,
            use_kenlm=True,
            language_model_name_extension="-incl-trans")

        def get_result(epoch: int) -> ExpectationsVsPredictionsInGroupedBatches:
            log("Testing epoch {}.".format(epoch))

            model.load_weights(
                allowed_characters_for_loaded_model=configuration.allowed_characters,
                load_model_from_directory=configuration.directories.nets_base_directory / model_name,
                load_epoch=epoch)

            return configuration.test_model_grouped_by_loaded_corpus_name(model)

        results_with_epochs = []

        csv_file = csv_directory / "{}.csv".format(model_name + "-incl-trans")
        import csv
        with csv_file.open('w', encoding='utf8') as opened_csv:
            writer = csv.writer(opened_csv,
                                delimiter=',',
                                quotechar='"',
                                quoting=csv.QUOTE_MINIMAL)

            for epoch in epochs:
                result = get_result(epoch)
                results_with_epochs.append((epoch, result))
                writer.writerow((epoch, result.average_loss,
                                 result.average_letter_error_rate,
                                 result.average_word_error_rate,
                                 result.average_letter_error_count,
                                 result.average_word_error_count))

        return results_with_epochs
Example 5
    def test_and_predict_batches_with_log(
        self, corpus_name: str, batches: Iterable[List[LabeledSpectrogram]]
    ) -> ExpectationsVsPredictionsInBatches:
        result = self.test_and_predict_batches(batches)

        log("{}: {}".format(corpus_name, result))

        return result
Example 6
    def test_and_predict_batch_with_log(
            self, index: int,
            batch: List[LabeledSpectrogram]) -> ExpectationsVsPredictions:
        result = self.test_and_predict_batch(batch)

        log(str(result) + " (batch {})".format(index))

        return result
Example 7
    def _download_if_not_yet_done(self, source_path_or_url: str, target_path: Path) -> Path:
        if not target_path.is_file():
            log("Downloading corpus {} to {}".format(source_path_or_url, target_path))
            if self.base_url_or_directory.startswith("http"):
                request.urlretrieve(source_path_or_url, str(target_path))
            else:
                try:
                    subprocess.check_output(["scp", source_path_or_url, str(target_path)], stderr=subprocess.STDOUT)
                except subprocess.CalledProcessError as e:
                    raise IOError("Copying failed: " + str(e.output))

        return target_path
Example 8
        def get_result(epoch: int) -> ExpectationsVsPredictionsInGroupedBatches:
            log("Testing epoch {}.".format(epoch))

            model.load_weights(
                allowed_characters_for_loaded_model=configuration.allowed_characters,
                load_model_from_directory=configuration.directories.nets_base_directory / model_name,
                load_epoch=epoch)

            return configuration.test_model_grouped_by_loaded_corpus_name(model)
Example 9
    def train_transfer_from_best_english_model(self, frozen_layer_count: int,
                                               reinitialize_trainable_loaded_layers: bool = False):
        run_name = timestamp() + "-adam-small-learning-rate-transfer-to-{}-freeze-{}{}{}".format(
            self.name, frozen_layer_count, "-reinitialize" if reinitialize_trainable_loaded_layers else "",
            self.sampled_training_example_count_extension())

        log("Run: " + run_name)

        wav2letter = self.load_best_english_model(
            frozen_layer_count=frozen_layer_count,
            reinitialize_trainable_loaded_layers=reinitialize_trainable_loaded_layers)

        self.train(wav2letter, run_name=run_name)
Example 10
    def test_model_grouped_by_loaded_corpus_name(self, wav2letter) -> ExpectationsVsPredictionsInGroupedBatches:
        def corpus_name(example: LabeledExampleFromFile) -> str:
            return example.audio_directory.relative_to(self.corpus_directory).parts[0]

        corpus_by_name = self.corpus.grouped_by(corpus_name)

        log([(name, len(corpus.test_examples)) for name, corpus in corpus_by_name.items()])
        result = wav2letter.test_and_predict_grouped_batches(OrderedDict(
            (corpus_name, self.batch_generator_for_corpus(corpus).test_batches()) for corpus_name, corpus in
            corpus_by_name.items()))
        log(result)

        return result
Example 11
    def test(self):
        l1 = LoggedRun(lambda: log("1"), "test1", Path())
        l1()

        self.assertEqual("1\n", l1.result_file.read_text())

        l2 = LoggedRun(lambda: log("2"), "test2", Path())
        l2()

        self.assertEqual("1\n", l1.result_file.read_text())
        self.assertEqual("2\n", l2.result_file.read_text())

        l1.result_file.unlink()
        l2.result_file.unlink()
Example 12
    def _extract_positional_label_by_id(
            self,
            files: Iterable[Path]) -> Dict[str, Union[PositionalLabel, str]]:
        json_ending = "_annot.json"
        json_annotation_files = \
            [file for file in files if file.name.endswith(json_ending) and
             self.id_filter_regex.match(file.name[:-len(json_ending)])]

        json_extracted = OrderedDict(
            (file.name[:-len(json_ending)],
             self._extract_positional_label_from_json(file))
            for file in json_annotation_files)

        par_annotation_files = [
            file for file in files if file.name.lower().endswith(".par") and
            self.id_filter_regex.match(name_without_extension(file).lower())
        ]

        extracted = OrderedDict(
            (name_without_extension(file), self._extract_label_from_par(file))
            for file in par_annotation_files)

        for key in set(extracted.keys()).intersection(
                set(json_extracted.keys())):
            json = json_extracted[key]
            json_label = json if isinstance(json, str) else json.label
            if extracted[key] != json_label:
                log('{}: "{}" extracted from par differs from json "{}"'.format(
                    key, extracted[key], json_label))

        # json has positional information and overrides par
        extracted.update(json_extracted)

        # TODO refactor
        if "ALC" in self.corpus_name:
            # exactly half have no label: can be fixed by using 0061006007_h_00.par or _annot.json instead of 0061006007_m_00_annot.json etc.
            correctly_labeled_id_marker = "_h_"
            empty_labeled_id_marker = "_m_"

            correct_ids = [
                id for id in extracted.keys()
                if correctly_labeled_id_marker in id
            ]
            for correct_id in correct_ids:
                empty_labeled_id = correct_id.replace(
                    correctly_labeled_id_marker, empty_labeled_id_marker)
                extracted[empty_labeled_id] = extracted[correct_id]

        return extracted
Example 13
            def merge_consecutive_ranges(
                    ranges: List[Tuple[int, int]]) -> Tuple[int, int]:
                def is_not_empty(range: Tuple[int, int]):
                    return range[0] + 1 != range[1]

                non_empty = sorted((range for range in ranges if is_not_empty(range)),
                                   key=lambda range: range[0])
                # Compare each non-empty range with its successor in sorted order.
                for range, next_range in zip(non_empty, non_empty[1:]):
                    if range[1] != next_range[0]:
                        log("Ranges {} of a word are not consecutive.".format(
                            non_empty))

                return ranges[0][0], ranges[-1][1]
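
For illustration (hypothetical input): merge_consecutive_ranges([(0, 3), (3, 7), (7, 9)]) finds each range starting where the previous one ends, logs nothing and returns (0, 9); merge_consecutive_ranges([(0, 3), (5, 9)]) logs that the ranges of the word are not consecutive but still returns the overall span (0, 9).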
Example 14
    def fill_cache(self, repair_incorrect: bool = False) -> None:
        with Pool(processes=multiprocessing.cpu_count()) as pool:
            total = len(self.labeled_spectrograms)
            not_yet_cached = [
                s for s in self.labeled_spectrograms if not s.is_cached()
            ]

            to_calculate = self.labeled_spectrograms if repair_incorrect else not_yet_cached

            log("Filling cache with {} spectrograms: {} already cached, {} to calculate."
                .format(total, total - len(not_yet_cached), len(to_calculate)))
            for index, labeled_spectrogram in enumerate(to_calculate):
                pool.apply_async(
                    _repair_cached_spectrogram_if_incorrect if repair_incorrect
                    else _cache_spectrogram, (labeled_spectrogram, ))

            pool.close()
            pool.join()
Example 15
    def indices_to_load_by_target_index(
            allowed_characters_for_loaded_model: List[str],
            allowed_characters: List[str]) -> List[Optional[int]]:

        load_character_set = set(allowed_characters_for_loaded_model)
        target_character_set = set(allowed_characters)

        ignored = load_character_set - target_character_set
        if ignored:
            log("Ignoring characters {} from loaded model.".format(
                sorted(ignored)))

        extra = target_character_set - load_character_set
        if extra:
            log("Initializing extra characters {} not found in model.".format(
                sorted(extra)))

        def character_index_to_load(target_character: str) -> Optional[int]:
            return single_or_none([
                index for index, character in enumerate(
                    allowed_characters_for_loaded_model)
                if character == target_character
            ])

        character_mapping = [
            character_index_to_load(character)
            for character in allowed_characters
        ]

        log("Character mapping: {}".format(character_mapping))

        return character_mapping
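
A hypothetical call, with character sets made up purely for illustration: indices_to_load_by_target_index(allowed_characters_for_loaded_model=["a", "b", "c"], allowed_characters=["a", "c", "d"]) returns [0, 2, None]. 'a' and 'c' are taken from positions 0 and 2 of the loaded model, 'b' is reported as ignored, and 'd' has no counterpart, so it is reported as an extra character to initialize.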
Example 16
    def train(self,
              labeled_spectrogram_batches: Iterable[List[LabeledSpectrogram]],
              preview_labeled_spectrogram_batch: List[LabeledSpectrogram],
              tensor_board_log_directory: Path, net_directory: Path,
              batches_per_epoch: int):
        def print_preview_batch():
            return log(
                self.test_and_predict_batch(preview_labeled_spectrogram_batch))

        print_preview_batch()
        self.loss_net.fit_generator(
            self._loss_inputs_generator(labeled_spectrogram_batches),
            epochs=100000000,
            steps_per_epoch=batches_per_epoch,
            callbacks=self.create_callbacks(
                callback=print_preview_batch,
                tensor_board_log_directory=tensor_board_log_directory,
                net_directory=net_directory),
            initial_epoch=self.load_epoch if self.load_epoch is not None else 0)
Example 17
        freeze8 = (
            "20170525-181412-adam-small-learning-rate-transfer-to-German-freeze-8",
            1924)
        freeze8_100h = (
            "20170525-181449-adam-small-learning-rate-transfer-to-German-freeze-8-50000examples",
            1966)
        freeze8_20h = (
            "20170525-181524-adam-small-learning-rate-transfer-to-German-freeze-8-10000examples",
            2033)

    if gethostname() == "ketos":
        ketos_spectrogram_cache_base_directory = configuration.default_data_directories.data_directory / "ketos-spectrogram-cache"
        ketos_kenlm_base_directory = configuration.default_data_directories.data_directory / "ketos-kenlm"

        log("Running on ketos, using spectrogram cache base directory {} and kenlm base directory {}"
            .format(ketos_spectrogram_cache_base_directory,
                    ketos_kenlm_base_directory))
        configuration.default_data_directories.spectrogram_cache_base_directory = ketos_spectrogram_cache_base_directory
        configuration.default_data_directories.kenlm_base_directory = ketos_kenlm_base_directory
    else:
        restrict_gpu_memory()

    # Configuration.german().train_from_beginning()
    # Configuration.german().train_transfer_from_best_english_model(frozen_layer_count=8, reinitialize_trainable_loaded_layers=True)
    # Configuration.german().train_transfer_from_best_english_model(frozen_layer_count=0)
    # Configuration.german().train_transfer_from_best_english_model(frozen_layer_count=6)
    # Configuration.german().train_transfer_from_best_english_model(frozen_layer_count=9)
    # Configuration.german().train_transfer_from_best_english_model(frozen_layer_count=10)

    # Configuration.german().train_transfer_from_best_english_model(frozen_layer_count=8)
    # Configuration.german(sampled_training_example_count_when_loading_from_cached=50000).train_transfer_from_best_english_model(frozen_layer_count=8)
Example 18
    def test_model(self, wav2letter):
        log(wav2letter.test_and_predict_batch(self.batch_generator.preview_batch()))
        log(wav2letter.test_and_predict_batches(self.batch_generator.test_batches()))
Example 19
    def summarize_and_save_corpus(self):
        log(self.corpus.summary())
        self.corpus.summarize_to_csv(self.corpus_directory / "summary.csv")
        self.save_corpus()
Example 20
    def print_preview_batch():
        return log(
            self.test_and_predict_batch(preview_labeled_spectrogram_batch))
Example 21
    def duration_in_s(self) -> float:
        try:
            return librosa.get_duration(filename=str(self.audio_file))
        except Exception as e:
            log("Failed to get duration of {}: {}".format(self.audio_file, e))
            return 0
Example 22
    def load_weights(self,
                     allowed_characters_for_loaded_model: List[str],
                     load_epoch: int,
                     load_model_from_directory: Path,
                     loaded_first_layers_count: Optional[int] = None):
        if allowed_characters_for_loaded_model is None:
            self.predictive_net.load_weights(
                str(load_model_from_directory /
                    self.model_file_name(load_epoch)))
        else:
            layer_count = len(self.predictive_net.layers)

            if loaded_first_layers_count is None:
                loaded_first_layers_count = layer_count

            original_wav2letter = Wav2Letter(
                input_size_per_time_step=self.input_size_per_time_step,
                allowed_characters=allowed_characters_for_loaded_model,
                use_raw_wave_input=self.use_raw_wave_input,
                activation=self.activation,
                output_activation=self.output_activation,
                optimizer=self.optimizer,
                dropout=self.dropout,
                load_model_from_directory=load_model_from_directory,
                load_epoch=load_epoch,
                frozen_layer_count=self.frozen_layer_count,
                use_asg=self.use_asg,
                asg_initial_probabilities=self.asg_initial_probabilities,
                asg_transition_probabilities=self.asg_transition_probabilities)

            log("Loading first {} layers of {}, epoch {}, reinitializing the last {}."
                .format(loaded_first_layers_count, load_model_from_directory,
                        load_epoch, layer_count - loaded_first_layers_count))

            for index, layer in enumerate(
                    self.predictive_net.layers[:loaded_first_layers_count]):
                original_weights, original_biases = original_wav2letter.predictive_net.layers[
                    index].get_weights()

                if index == len(self.predictive_net.layers) - 1:
                    indices_to_load_by_target_index = self.indices_to_load_by_target_index(
                        allowed_characters_for_loaded_model,
                        self.grapheme_encoding.allowed_characters)

                    def get_grapheme_index_to_load(target_grapheme_index: int):
                        if target_grapheme_index == self.grapheme_encoding.ctc_blank:
                            return original_wav2letter.grapheme_encoding.ctc_blank

                        return indices_to_load_by_target_index[
                            target_grapheme_index]

                    original_shape = original_weights.shape

                    def loaded_character_weights(
                            index: Optional[int]) -> ndarray:
                        # index 0 is a valid position, so test for None explicitly
                        return original_weights[:, :, index:index + 1] if index is not None else \
                            zeros((original_shape[0], original_shape[1], 1))

                    def loaded_character_bias(index: Optional[int]) -> int:
                        return original_biases[index] if index is not None else 0

                    grapheme_indices_to_load = \
                        [get_grapheme_index_to_load(target_grapheme_index)
                         for target_grapheme_index in range(self.grapheme_encoding.grapheme_set_size)]

                    original_weights = numpy.concatenate(
                        [loaded_character_weights(index)
                         for index in grapheme_indices_to_load],
                        axis=2)

                    original_biases = numpy.array([
                        loaded_character_bias(index)
                        for index in grapheme_indices_to_load
                    ])

                layer.set_weights([original_weights, original_biases])
Example 23
    def create_predictive_net(self) -> Sequential:
        """Returns the part of the net that predicts grapheme probabilities given a spectrogram.
        The loss operation is not included.
        As described here: https://arxiv.org/pdf/1609.03193v2.pdf
        """
        def convolution(name: str,
                        filter_count: int,
                        filter_length: int,
                        strides: int = 1,
                        activation: str = self.activation,
                        input_dim: Optional[int] = None,
                        never_dropout: bool = False) -> List[Layer]:
            return ([] if self.dropout is None or never_dropout else [
                Dropout(self.dropout,
                        input_shape=(None, input_dim),
                        name="dropout_before_{}".format(name))
            ]) + [
                Conv1D(filters=filter_count,
                       kernel_size=filter_length,
                       strides=strides,
                       activation=activation,
                       name=name,
                       input_shape=(None, input_dim),
                       padding="same")
            ]

        main_filter_count = 250

        def input_convolutions() -> List[Conv1D]:
            raw_wave_convolution_if_needed = convolution(
                "wave_conv",
                filter_count=main_filter_count,
                filter_length=250,
                strides=160,
                input_dim=self.input_size_per_time_step
            ) if self.use_raw_wave_input else []

            return raw_wave_convolution_if_needed + convolution(
                "striding_conv",
                filter_count=main_filter_count,
                filter_length=48,
                strides=2,
                input_dim=None
                if self.use_raw_wave_input else self.input_size_per_time_step)

        def inner_convolutions() -> List[Conv1D]:
            return [
                layer for i in range(1, 8)
                for layer in convolution("inner_conv_{}".format(i),
                                         filter_count=main_filter_count,
                                         filter_length=7)
            ]

        def output_convolutions() -> List[Conv1D]:
            out_filter_count = 2000
            return [
                layer for conv in [
                    convolution("big_conv_1",
                                filter_count=out_filter_count,
                                filter_length=32,
                                never_dropout=True),
                    convolution("big_conv_2",
                                filter_count=out_filter_count,
                                filter_length=1,
                                never_dropout=True),
                    convolution(
                        "output_conv",
                        filter_count=self.grapheme_encoding.grapheme_set_size,
                        filter_length=1,
                        activation=self.output_activation,
                        never_dropout=True)
                ] for layer in conv
            ]

        layers = input_convolutions() + inner_convolutions() + output_convolutions()

        if self.frozen_layer_count > 0:
            log("All but {} layers frozen.".format(
                len(layers) - self.frozen_layer_count))

        for layer in layers[:self.frozen_layer_count]:
            layer.trainable = False

        return Sequential(layers)