Beispiel #1
0
    def _extract_positional_label_by_id(
            self,
            files: Iterable[Path]) -> Dict[str, Union[PositionalLabel, str]]:
        """Build a mapping from example id to its label from annotation files.

        Two kinds of annotation files are considered:

        * ``<id>_annot.json`` files, read with
          ``self._extract_positional_label_from_json``;
        * ``<id>.par`` files (extension matched case-insensitively), read with
          ``self._extract_label_from_par``.

        Only files whose id matches ``self.id_filter_regex`` are used. When an
        id has both a par and a json annotation, a mismatch between the two
        label texts is logged and the json entry wins.

        :param files: Candidate files; may be any iterable, including a
            one-shot generator.
        :return: Ordered mapping from id to the extracted label.
        """
        # `files` is traversed twice below (json pass, then par pass).
        # Materialize it so a one-shot iterable is not exhausted by the first
        # pass, which would silently drop every par annotation.
        files = list(files)

        json_ending = "_annot.json"
        json_annotation_files = [
            file for file in files
            if file.name.endswith(json_ending)
            and self.id_filter_regex.match(file.name[:-len(json_ending)])
        ]

        json_extracted = OrderedDict(
            (file.name[:-len(json_ending)],
             self._extract_positional_label_from_json(file))
            for file in json_annotation_files)

        # NOTE(review): the id filter is applied to the lower-cased name here,
        # but to the original-case name for json files above, while the keys
        # below keep their original case - confirm this asymmetry is intended.
        par_annotation_files = [
            file for file in files
            if file.name.lower().endswith(".par")
            and self.id_filter_regex.match(name_without_extension(file).lower())
        ]

        extracted = OrderedDict(
            (name_without_extension(file), self._extract_label_from_par(file))
            for file in par_annotation_files)

        # Report ids whose par and json labels disagree. Iterating in par
        # order (instead of over a set intersection) keeps the log order
        # deterministic.
        for key, par_label in extracted.items():
            if key not in json_extracted:
                continue
            json_entry = json_extracted[key]
            # The json extractor may yield either a plain string or an object
            # exposing `.label`.
            json_label = json_entry if isinstance(json_entry, str) \
                else json_entry.label
            if par_label != json_label:
                log('{}: "{}" extracted from par differ from json "{}"'.format(
                    key, par_label, json_label))

        # json has positional information and overrides par
        extracted.update(json_extracted)

        # TODO refactor
        if "ALC" in self.corpus_name:
            # exactly half have no label: can be fixed by using
            # 0061006007_h_00.par or _annot.json instead of
            # 0061006007_m_00_annot.json etc.
            correctly_labeled_id_marker = "_h_"
            empty_labeled_id_marker = "_m_"

            # Snapshot the ids first: the loop below inserts into `extracted`.
            correct_ids = [
                example_id for example_id in extracted
                if correctly_labeled_id_marker in example_id
            ]
            for correct_id in correct_ids:
                empty_labeled_id = correct_id.replace(
                    correctly_labeled_id_marker, empty_labeled_id_marker)
                extracted[empty_labeled_id] = extracted[correct_id]

        return extracted
Beispiel #2
0
    def __init__(self,
                 audio_file: Path,
                 id: Optional[str] = None,
                 sample_rate_to_convert_to: int = 16000,
                 label: Optional[str] = "nolabel",
                 fourier_window_length: int = 512,
                 hop_length: int = 128,
                 mel_frequency_count: int = 128,
                 label_with_tags: Optional[str] = None,
                 positional_label: Optional[PositionalLabel] = None):
        """Create an example whose raw audio is loaded from ``audio_file``.

        :param audio_file: Path of the audio file backing this example.
        :param id: Identifier of the example; defaults to the file name
            without its extension.
        :param sample_rate_to_convert_to: Forwarded to the base class as
            ``sample_rate``.
        :param label: Label text forwarded to the base class.
        :param fourier_window_length: Forwarded to the base class.
        :param hop_length: Forwarded to the base class.
        :param mel_frequency_count: Forwarded to the base class.
        :param label_with_tags: Optional label text forwarded to the base
            class unchanged.
        :param positional_label: Optional positional label forwarded to the
            base class.
        """
        # The default values for hop_length and fourier_window_length are
        # powers of 2 near the values specified in the Wav2Letter paper.

        if id is None:
            id = name_without_extension(audio_file)

        self.audio_file = audio_file

        def load_raw_audio():
            # Evaluated lazily: `self.audio_file` and `self.sample_rate` are
            # read when the loader is called, not when it is defined.
            # `self.sample_rate` is presumably set by the base class from the
            # `sample_rate` argument below - TODO confirm.
            return librosa.load(str(self.audio_file), sr=self.sample_rate)[0]

        super().__init__(id=id,
                         get_raw_audio=load_raw_audio,
                         label=label,
                         sample_rate=sample_rate_to_convert_to,
                         fourier_window_length=fourier_window_length,
                         hop_length=hop_length,
                         mel_frequency_count=mel_frequency_count,
                         label_with_tags=label_with_tags,
                         positional_label=positional_label)
Beispiel #3
0
 def move_incorrect_cached_file_to_backup_location_and_save_error(
         self, error_text: str):
     """Move the cached spectrogram file aside and store why it was rejected.

     The backup directory is a sibling of the cache directory, named like
     the cache directory with an "-incorrect" suffix. The error text is
     written there to "<cache file name without extension>-error.txt"
     before the cache file itself is moved into the same directory.

     :param error_text: Text written to the error file.
     """
     cache_file = self.spectrogram_cache_file
     cache_directory = Path(cache_file.parent)

     backup_directory = cache_directory.parent / (
         cache_directory.name + "-incorrect")
     mkdir(backup_directory)

     error_file = backup_directory / (
         name_without_extension(cache_file) + "-error.txt")
     backup_file = backup_directory / cache_file.name

     # Record the error first, then move the offending cache file.
     write_text(error_file, error_text)
     cache_file.rename(backup_file)
Beispiel #4
0
    def _extract_positional_label_by_id(
            self,
            files: Iterable[Path]) -> Dict[str, Union[PositionalLabel, str]]:
        """Map "<xml name><microphone ending>" ids to labels read from xml files.

        Each ".xml" file whose name without extension matches
        ``self.id_filter_regex`` is paired with every known microphone ending.
        A pair yields an entry only if the file
        "<xml name><microphone ending>.wav" exists next to the xml file. The
        label comes from ``self._extract_label_from_xml``.

        :param files: Candidate files.
        :return: Ordered mapping from id to extracted label.
        """
        xml_ending = ".xml"
        microphone_endings = (
            "_Yamaha", "_Kinect-Beam", "_Kinect-RAW", "_Realtek", "_Samson",
            "_Microsoft-Kinect-Raw",
        )

        # First pass: select the annotation files that pass the id filter.
        xml_files = []
        for candidate in files:
            if not candidate.name.endswith(xml_ending):
                continue
            if self.id_filter_regex.match(name_without_extension(candidate)):
                xml_files.append(candidate)

        # Second pass: one entry per (xml file, microphone) with a recording.
        label_by_id = OrderedDict()
        for xml_file in xml_files:
            base_name = name_without_extension(xml_file)
            directory = Path(xml_file.parent)
            for microphone_ending in microphone_endings:
                example_id = base_name + microphone_ending
                if not (directory / (example_id + ".wav")).exists():
                    continue
                label_by_id[example_id] = self._extract_label_from_xml(
                    xml_file)

        return label_by_id
Beispiel #5
0
        def example(audio_file: Path) -> LabeledExample:
            """Create the labeled example for ``audio_file``.

            The label is looked up in ``positional_label_by_id`` by the file
            name without extension. Tags to ignore are removed and whitespace
            is normalized in the label; the uncorrected text is passed on as
            ``label_with_tags``.
            """
            example_id = name_without_extension(audio_file)

            def normalize_whitespace(text: str) -> str:
                # Collapse every whitespace run into a single space.
                return " ".join(text.split()).strip()

            def correct(label: str) -> str:
                without_tags = self._remove_tags_to_ignore(label)
                return normalize_whitespace(without_tags)

            original = positional_label_by_id[example_id]

            if isinstance(original, PositionalLabel):
                # Positional labels get corrected texts and have their ranges
                # converted using the sample rate of the audio file.
                sample_rate = LabeledExampleFromFile.file_sample_rate(
                    audio_file)
                positional_label = original.with_corrected_labels(
                    correct).convert_range_to_seconds(sample_rate)
                label = positional_label.label
                label_with_tags = original.label
            else:
                # Plain string label: there is no positional information.
                positional_label = None
                label = correct(original)
                label_with_tags = original

            return LabeledExampleFromFile(
                audio_file,
                mel_frequency_count=self.mel_frequency_count,
                label=label,
                label_with_tags=label_with_tags,
                positional_label=positional_label)
Beispiel #6
0
    def __init__(self,
                 base_directory: Path,
                 corpus_name: str,
                 base_source_url_or_directory:
                 str = "http://www.openslr.org/resources/12/",
                 tar_gz_extension: str = ".tar.gz",
                 mel_frequency_count: int = 128,
                 root_compressed_directory_name_to_skip: Optional[
                     str] = "LibriSpeech/",
                 subdirectory_depth: int = 3,
                 allowed_characters: List[str] = english_frequent_characters,
                 tags_to_ignore: Iterable[str] = list(),
                 id_filter_regex=re.compile(r'[\s\S]*'),
                 training_test_split: Callable[[List[LabeledExample]], Tuple[
                     List[LabeledExample],
                     List[LabeledExample]]] = TrainingTestSplit.randomly(),
                 maximum_example_duration_in_s: Optional[int] = None,
                 minimum_duration_per_character: Optional[float] = None):
        """Load a corpus from disk, fetching and unpacking it first if needed.

        The constructor stores its configuration, makes sure the corpus
        directory exists (via ``_download_and_unpack_if_not_yet_done``),
        collects audio and annotation files, builds one example per labeled
        audio file, filters the examples and splits them into training and
        test sets.

        :param base_directory: Directory that is created if necessary.
        :param corpus_name: Name passed to
            ``_download_and_unpack_if_not_yet_done``.
        :param base_source_url_or_directory: Stored as
            ``self.base_url_or_directory``.
        :param tar_gz_extension: Stored as ``self.tar_gz_extension``.
        :param mel_frequency_count: Passed to every created example.
        :param root_compressed_directory_name_to_skip: Stored on the instance.
        :param subdirectory_depth: Number of directory levels to descend below
            the corpus directory before files are collected.
        :param allowed_characters: Stored as ``self.allowed_characters``.
        :param tags_to_ignore: Stored as ``self.tags_to_ignore``.
        :param id_filter_regex: Compiled pattern; audio files whose name
            without extension does not match it are excluded. The default
            matches every name.
        :param training_test_split: Callable that splits the id-sorted
            examples into ``(training_examples, test_examples)``.
        :param maximum_example_duration_in_s: Stored on the instance;
            presumably read by ``is_too_long`` - TODO confirm.
        :param minimum_duration_per_character: Stored as
            ``self.minimum_duration_per_character_in_s``; presumably read by
            ``is_too_short`` - TODO confirm.
        """
        # NOTE(review): the `list()` default of `tags_to_ignore` is a single
        # shared object stored on every instance that relies on the default.
        # It is kept for interface stability; never mutate
        # `self.tags_to_ignore` in place.
        self.base_directory = base_directory
        self.corpus_name = corpus_name
        self.base_url_or_directory = base_source_url_or_directory
        self.tar_gz_extension = tar_gz_extension
        self.mel_frequency_count = mel_frequency_count
        self.root_compressed_directory_name_to_skip = \
            root_compressed_directory_name_to_skip
        self.subdirectory_depth = subdirectory_depth
        self.allowed_characters = allowed_characters
        self.tags_to_ignore = tags_to_ignore
        self.id_filter_regex = id_filter_regex
        self.training_test_split = training_test_split
        self.maximum_example_duration_in_s = maximum_example_duration_in_s
        self.minimum_duration_per_character_in_s = \
            minimum_duration_per_character
        mkdir(base_directory)

        self.corpus_directory = self._download_and_unpack_if_not_yet_done(
            corpus_name=corpus_name)

        # Descend exactly `subdirectory_depth` levels; only the directories
        # on the deepest level contribute files.
        directories = [self.corpus_directory]
        for _ in range(self.subdirectory_depth):
            directories = [
                subdirectory for directory in directories
                for subdirectory in directory.iterdir()
                if subdirectory.is_dir()
            ]

        self.files = [
            file for directory in directories for file in directory.iterdir()
            if file.is_file()
        ]

        audio_extensions = (".flac", ".wav")
        self.unfiltered_audio_files = [
            file for file in self.files
            if file.name.lower().endswith(audio_extensions)
        ]
        audio_files = [
            file for file in self.unfiltered_audio_files
            if self.id_filter_regex.match(name_without_extension(file))
        ]
        self.filtered_out_count = len(
            self.unfiltered_audio_files) - len(audio_files)

        positional_label_by_id = self._extract_positional_label_by_id(
            self.files)

        # Bookkeeping of audio files and labels that have no counterpart.
        found_audio_ids = set(name_without_extension(f) for f in audio_files)
        found_label_ids = positional_label_by_id.keys()
        self.audio_ids_without_label = list(found_audio_ids - found_label_ids)
        self.label_ids_without_audio = list(found_label_ids - found_audio_ids)

        def example(audio_file: Path) -> LabeledExample:
            """Create the labeled example for one audio file that has a label."""
            example_id = name_without_extension(audio_file)

            def correct_whitespace(text: str) -> str:
                # Collapse every whitespace run into a single space.
                return " ".join(text.split()).strip()

            def correct(label: str) -> str:
                return correct_whitespace(self._remove_tags_to_ignore(label))

            original_positional_label = positional_label_by_id[example_id]

            if isinstance(original_positional_label, PositionalLabel):
                # Positional labels get corrected texts and have their ranges
                # converted using the sample rate of the audio file.
                positional_label = \
                    original_positional_label.with_corrected_labels(
                        correct).convert_range_to_seconds(
                            LabeledExampleFromFile.file_sample_rate(
                                audio_file))
                label = positional_label.label
                label_with_tags = original_positional_label.label
            else:
                # Plain string label: there is no positional information.
                positional_label = None
                label = correct(original_positional_label)
                label_with_tags = original_positional_label

            return LabeledExampleFromFile(
                audio_file,
                mel_frequency_count=self.mel_frequency_count,
                label=label,
                label_with_tags=label_with_tags,
                positional_label=positional_label)

        # Each attribute below keeps one intermediate stage of the filter
        # pipeline: all labeled examples -> non-empty label -> not too long
        # -> not too short.
        self.examples_with_empty_and_too_long_or_short = [
            example(file) for file in audio_files
            if name_without_extension(file) in positional_label_by_id
        ]

        self.examples_with_too_long_or_short = [
            e for e in self.examples_with_empty_and_too_long_or_short
            if e.label
        ]

        self.examples_with_too_short = [
            e for e in self.examples_with_too_long_or_short
            if not self.is_too_long(e)
        ]

        examples = [
            e for e in self.examples_with_too_short if not self.is_too_short(e)
        ]

        # Sort by id so the split callable always receives the same order.
        training_examples, test_examples = self.training_test_split(
            sorted(examples, key=lambda x: x.id))

        super().__init__(training_examples=training_examples,
                         test_examples=test_examples)