Example #1
    def run(self):
        with uopen(self.cluster_names, "rt") as f:
            cluster_names = [l.strip() for l in f]

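        # parse the XML cluster map: each <map-item> maps a segment name (key)
        # to a cluster id whose last dot-separated field is a 1-based index
        # into cluster_names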
        clusters = collections.defaultdict(set)
        with uopen(self.cluster_map, "rt") as f:
            t = ET.parse(f)
            for mi in t.findall(".//map-item"):
                k = mi.attrib["key"]
                v = int(mi.attrib["value"].split(".")[-1]) - 1
                clusters[cluster_names[v]].add(k)

        c = corpus.Corpus()
        c.load(tk.uncached_path(self.corpus_file))

        original_segments = {}
        for s in c.segments():
            original_segments[s.fullname()] = s

        audio = {}
        transcriptions = {}
        for cluster_name in clusters:
            clusters[cluster_name] = list(sorted(clusters[cluster_name]))
            transcriptions[cluster_name] = " ".join(
                original_segments[s].orth for s in clusters[cluster_name])
            audio[cluster_name] = []
            for n in clusters[cluster_name]:
                seg = original_segments[n]
                audio[cluster_name].append(
                    (seg.recording.audio, seg.start, seg.end))

        new_c = corpus.Corpus()
        new_c.name = c.name
        for cluster_name, audio_files in audio.items():
            out_path = os.path.join(self.audio_output.get_path(),
                                    cluster_name + ".wav")
            if os.path.exists(out_path):
                os.unlink(out_path)
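            # write an ffmpeg concat-demuxer list file: one file/inpoint entry
            # per segment, plus an outpoint unless the segment end is infinite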
            with open(f"{cluster_name}.txt", "wt") as f:
                for af in audio_files:
                    f.write(f"file {af[0]}\ninpoint {af[1]}\n")
                    if not math.isinf(af[2]):
                        f.write(f"outpoint {af[2]}\n")
            self.sh(
                f"ffmpeg -loglevel fatal -hide_banner -f concat -safe 0 -i '{cluster_name}.txt' '{out_path}'"
            )

            r = corpus.Recording()
            r.name = cluster_name
            r.audio = out_path
            s = corpus.Segment()
            s.name = "1"
            s.start = 0.0
            s.end = float("inf")
            s.orth = transcriptions[cluster_name]
            r.add_segment(s)

            new_c.add_recording(r)

        new_c.dump(self.output_corpus.get_path())
Example #2
    def run(self):
        c = corpus.Corpus()
        nc = corpus.Corpus()
        segment_file_names = []

        c.load(tk.uncached_path(self.bliss_corpus))
        nc.name = self.corpus_name
        nc.speakers = c.speakers
        nc.default_speaker = c.default_speaker
        nc.speaker_name = c.speaker_name
        # copy each recording and shift its segment start times by self.shift
        for r in c.recordings:
            sr = corpus.Recording()
            sr.name = r.name
            sr.segments = r.segments
            sr.speaker_name = r.speaker_name
            sr.speakers = r.speakers
            sr.default_speaker = r.default_speaker
            sr.audio = r.audio
            nc.add_recording(sr)
            for s in sr.segments:
                segment_file_names.append(nc.name + "/" + sr.name + "/" +
                                          s.name + "\n")
                s.start += self.shift

        nc.dump(str(self.out_shifted_corpus))

        with open(str(self.out_segments), "w") as segments_outfile:
            segments_outfile.writelines(segment_file_names)
Example #3
    def run(self):
        c = corpus.Corpus()
        c.load(self.bliss_corpus.get_path())

        dictionary = {}

        segments = None
        if self.segment_file:
            with uopen(self.segment_file, "rt") as f:
                segments = set(line.strip() for line in f)

        for segment in c.segments():
            orth = segment.orth.strip()
            key = segment.fullname()
            if segments:
                if (not self.invert_match and key not in segments
                        and segment.name not in segments):
                    continue
                if self.invert_match and key in segments:
                    continue
            dictionary[key] = orth

        dictionary_string = pprint.pformat(dictionary, width=1000)
        with uopen(self.out_dictionary, "wt") as f:
            f.write(dictionary_string)
Example #4
    def run(self):

        segments = []
        for seg in self.segment_file_list:
            with open(tk.uncached_path(seg)) as f:
                lines = f.readlines()
                segments += [l.strip() for l in lines]

        logging.info("There are {} segments in the segment list.".format(
            len(segments)))
        segments = set(segments)
        c = corpus.Corpus()
        c.load(tk.uncached_path(self.bliss_corpus))
        for rec in c.all_recordings():
            if self.invert_match:
                rec.segments = [
                    x for x in rec.segments
                    if x.fullname() not in segments and x.name not in segments
                ]
            else:
                rec.segments = [
                    x for x in rec.segments
                    if x.fullname() in segments or x.name in segments
                ]

        c.dump(tk.uncached_path(self.out_corpus))
Example #5
    def run(self):
        def maybe_to_lower(s):
            return s if self.case_sensitive else s.lower()

        lex_path = tk.uncached_path(self.lexicon)
        open_func = gzip.open if lex_path.endswith(".gz") else open
        with open_func(lex_path, "rt") as f:
            lex_root = ET.parse(f)
        vocabulary = {
            maybe_to_lower(o.text.strip() if o.text else "")
            for o in lex_root.findall(".//orth")
        }

        c = corpus.Corpus()
        c.load(tk.uncached_path(self.corpus))

        def not_only_unknowns(corpus, recording, segment):
            """
            :param Corpus corpus:
            :param Recording recording:
            :param Segment segment:
            :return: whether the orth of segment contains at least one known word
            :rtype: bool
            """
            orth = segment.orth
            if not orth:
                return True
            words = [maybe_to_lower(o) for o in orth.strip().split(" ")]
            return any(w in vocabulary for w in words)

        c.filter_segments(not_only_unknowns)
        c.dump(self.out_corpus.get_path())
Example #6
    def run(self):
        c = corpus.Corpus()
        c.name = self.name

        with uopen(self.metadata, "rt") as metadata_file:
            for line in metadata_file:
                name, text, processed_text = line.split("|")
                audio_file_path = os.path.join(
                    self.audio_folder.get_path(), name + ".wav"
                )
                assert os.path.isfile(
                    audio_file_path
                ), "Audio file %s was not found in provided audio path %s" % (
                    audio_file_path,
                    self.audio_folder.get_path(),
                )

                recording = corpus.Recording()
                recording.name = name
                recording.audio = audio_file_path
                segment = corpus.Segment()
                segment.orth = processed_text.strip()
                segment.name = name

                wave_info = wave.open(audio_file_path)
                segment.start = 0
                segment.end = wave_info.getnframes() / wave_info.getframerate()
                wave_info.close()

                recording.add_segment(segment)
                c.add_recording(recording)

        c.dump(self.out_bliss_corpus.get_path())
Example #7
    def run(self):
        c = corpus.Corpus()
        c.load(self.bliss_corpus.get_path())

        lex = lexicon.Lexicon()
        lex.load(self.bliss_lexicon.get_path())

        # build lookup dict
        lookup_dict = {}
        for lemma in lex.lemmata:
            for orth in lemma.orth:
                if orth and self.strategy == LexiconStrategy.PICK_FIRST:
                    if len(lemma.phon) > 0:
                        lookup_dict[orth] = lemma.phon[0]

        word_separation_phon = lookup_dict[self.word_separation_orth]
        print("using word separation symbol: %s" % word_separation_phon)
        separator = " %s " % word_separation_phon

        for segment in c.segments():
            try:
                words = [lookup_dict[w] for w in segment.orth.split(" ")]
                segment.orth = separator.join(words)
            except LookupError:
                raise LookupError(
                    "Out-of-vocabulary word detected in segment %s, please make sure that there are no OOVs remaining by e.g. applying G2P"
                    % segment.fullname()
                )

        c.dump(self.out_corpus.get_path())
Example #8
    def run(self):
        self.corpus_object = corpus.Corpus()
        self.corpus_object.load(self.bliss_corpus_file.get_path())
        recordings = list(self.corpus_object.all_recordings())

        print(f"{len(recordings)} recordings detected")
        print(f"launching {self.n_workers} processes")

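        # one task tuple per recording; self.cut_file (defined elsewhere in
        # this job) is assumed to cut the audio to target_length and write the
        # result to out_audio_path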
        tasks = [(r, self.out_audio_path, self.target_length, self.file_extension) for r in recordings]
        with multiprocessing.Pool(processes=self.n_workers) as pool:
            for i, _ in enumerate(pool.imap_unordered(self.cut_file, tasks)):
                if i % 100 == 0:
                    logging.info(f"{i} of {len(tasks)} files done")
Example #9
    def run(self):
        if not os.path.isdir(str(self.out_audio_folder)):
            self.sh("mkdir '{audio_out}'")
        c = corpus.Corpus()
        nc = corpus.Corpus()
        segment_file_names = []

        c.load(tk.uncached_path(self.bliss_corpus))
        nc.name = self.corpus_name
        nc.speakers = c.speakers
        nc.default_speaker = c.default_speaker
        nc.speaker_name = c.speaker_name
        # copy each recording, create speed-perturbed audio and rescale segment times
        for r in c.recordings:
            perturbed_audio_name = "perturbed_" + r.audio.split("/")[-1]

            self.sh(
                "ffmpeg -hide_banner -i '%s' -filter:a \"asetrate={base_frequency}*{speed_factor}\" "
                "-ar {base_frequency} '{audio_out}/%s'" %
                (r.audio, perturbed_audio_name))

            pr = corpus.Recording()
            pr.name = r.name
            pr.segments = r.segments
            pr.speaker_name = r.speaker_name
            pr.speakers = r.speakers
            pr.default_speaker = r.default_speaker
            pr.audio = str(self.out_audio_folder) + "/" + perturbed_audio_name
            nc.add_recording(pr)
            for s in pr.segments:
                segment_file_names.append(nc.name + "/" + pr.name + "/" +
                                          s.name + "\n")
                s.start /= self.speed_factor
                s.end /= self.speed_factor

        nc.dump(str(self.out_corpus))

        with open(str(self.out_segment_file), "w") as segments_outfile:
            segments_outfile.writelines(segment_file_names)
Example #10
    def run(self):
        merged_corpus = corpus.Corpus()
        merged_corpus.name = self.name
        for corpus_path in self.bliss_corpora:
            c = corpus.Corpus()
            c.load(tk.uncached_path(corpus_path))
            if self.merge_strategy == MergeStrategy.SUBCORPORA:
                merged_corpus.add_subcorpus(c)
            elif self.merge_strategy == MergeStrategy.FLAT:
                for rec in c.all_recordings():
                    merged_corpus.add_recording(rec)
                merged_corpus.speakers.update(c.speakers)
            elif self.merge_strategy == MergeStrategy.CONCATENATE:
                for subcorpus in c.top_level_subcorpora():
                    merged_corpus.add_subcorpus(subcorpus)
                for rec in c.top_level_recordings():
                    merged_corpus.add_recording(rec)
                for speaker in c.top_level_speakers():
                    merged_corpus.add_speaker(speaker)
            else:
                assert False, "invalid merge strategy"

        merged_corpus.dump(self.out_merged_corpus.get_path())
Example #11
    def run(self):
        orth_c = corpus.Corpus()
        orth_c.load(self.reference_bliss_corpus.get_path())

        orths = {}
        for r in orth_c.all_recordings():
            for segment in r.segments:
                orths[segment.fullname()] = segment.orth

        c = corpus.Corpus()
        c.load(self.bliss_corpus.get_path())

        for r in c.all_recordings():
            for segment in r.segments:
                tag = segment.fullname()
                assert tag in orths, (
                    "Segment %s not found in reference corpus" % tag)
                segment.orth = orths[tag]

        c.dump(self.out_corpus.get_path())
Example #12
    def run(self):
        inf = float("inf")

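        # segments with an infinite end time (i.e. unknown duration) are always kept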
        def good_duration(corpus, recording, segment):
            length = segment.end - segment.start
            if length == inf:
                return True
            return self.min_duration <= length <= self.max_duration

        c = corpus.Corpus()
        c.load(self.bliss_corpus.get_path())
        c.filter_segments(good_duration)
        c.dump(self.out_corpus.get_path())
Example #13
    def run(self):
        c = libcorpus.Corpus()
        c.load(self.bliss_corpus.get_path())

        words = Counter()
        for s in c.segments():
            words.update(s.orth.strip().split())

        counts = [(v, k) for k, v in words.items()]
        with uopen(self.out_word_counts, "wt") as f:
            f.write(
                "\n".join(
                    "%d\t%s" % t for t in sorted(counts, key=lambda t: (-t[0], t[1]))
                )
            )
Example #14
    def run(self):
        c = corpus.Corpus()
        c.load(self.bliss_corpus.get_path())

        if self.segment_file:
            with uopen(self.segment_file, "rt") as f:
                segments_whitelist = set(l.strip() for l in f.readlines()
                                         if len(l.strip()) > 0)
        else:
            segments_whitelist = None

        with uopen(self.out_txt.get_path(), "wt") as f:
            for segment in c.segments():
                if (not segments_whitelist) or (segment.fullname()
                                                in segments_whitelist):
                    f.write(segment.orth + "\n")
Example #15
    def run(self):
        transcriptions = collections.defaultdict(list)
        with open(tk.uncached_path(self.ctm_path), "rt") as f:
            for line in f:
                if line.startswith(";;"):
                    continue

                fields = line.split()
                if 5 <= len(fields) <= 6:
                    recording = fields[0]
                    start = float(fields[2])
                    word = fields[4]
                    transcriptions[recording].append((start, word))

        for recording, times_and_words in transcriptions.items():
            times_and_words.sort()

        corpus_path = tk.uncached_path(self.bliss_corpus)
        c = corpus.Corpus()
        c.load(corpus_path)

        recordings_to_delete = []

        for recording in c.all_recordings():
            times = [s[0] for s in transcriptions[recording.name]]
            words = [s[1] for s in transcriptions[recording.name]]

            if len(words) == 0 and self.remove_empty_segments:
                recordings_to_delete.append(recording)
                continue

            segments_to_delete = []
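            # binary-search the sorted CTM start times for the words whose
            # start time falls inside [segment.start, segment.end)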
            for idx, segment in enumerate(recording.segments):
                left_idx = bisect.bisect_left(times, segment.start)
                right_idx = bisect.bisect_left(times, segment.end)

                if left_idx == right_idx and self.remove_empty_segments:
                    segments_to_delete.append(idx)
                    continue

                segment.orth = " ".join(words[left_idx:right_idx]).replace(
                    "&", "&amp;")

            for sidx in reversed(segments_to_delete):
                del recording.segments[sidx]

        # remove the recordings that were collected for deletion, also from subcorpora
        def delete_recordings(corpus_part, recordings):
            for subcorpus in corpus_part.subcorpora:
                delete_recordings(subcorpus, recordings)
            corpus_part.recordings = [
                r for r in corpus_part.recordings if r not in recordings
            ]

        delete_recordings(c, recordings_to_delete)

        c.dump(self.output_corpus_path.get_path())
Example #16
    def run(self):
        c = corpus.Corpus()
        c.load(tk.uncached_path(self.corpus_file))

        from multiprocessing import pool

        with pool.Pool(self.rqmt["cpu"]) as p:
            p.map(self._perform_ffmpeg, c.recordings)

        for r in c.recordings:
            audio_filename = self._get_output_filename(r)
            r.audio = os.path.join(self.out_audio_folder.get_path(),
                                   audio_filename)

        if self.recover_duration:
            c.dump("temp_corpus.xml.gz")
        else:
            c.dump(tk.uncached_path(self.out_corpus))
Example #17
def run_duration_recover(source_corpus, target_corpus):
    """
    iterates over a single-segment bliss corpus and uses the soundfile library
    to get the actual recording length

    :param source_corpus:
    :param target_corpus:
    :return:
    """
    c = corpus.Corpus()
    c.load(source_corpus)

    for r in c.all_recordings():
        assert len(r.segments) == 1, "needs to be a single segment recording"
        old_duration = r.segments[0].end
        data, sample_rate = soundfile.read(r.audio)
        new_duration = len(data) / sample_rate
        print("%s: %f vs. %f" %
              (r.segments[0].name, old_duration, new_duration))
        r.segments[0].end = new_duration

    c.dump(target_corpus)
Example #18
    def run_recover_duration(self):
        """
        Open all files with "soundfile" and extract the length information

        :return:
        """
        import soundfile

        c = corpus.Corpus()
        c.load("temp_corpus.xml.gz")

        for r in c.all_recordings():
            assert len(
                r.segments) == 1, "needs to be a single segment recording"
            old_duration = r.segments[0].end
            data, sample_rate = soundfile.read(r.audio)
            new_duration = len(data) / sample_rate
            logging.info("%s: adjusted from %f to %f seconds" %
                         (r.segments[0].name, old_duration, new_duration))
            r.segments[0].end = new_duration

        c.dump(self.out_corpus.get_path())
Example #19
    def run(self):
        c = corpus.Corpus()
        c.load(self.bliss_corpus.get_path())

        if self.segment_file:
            with uopen(self.segment_file.get_path(), "rt") as f:
                segments_whitelist = set(l.strip() for l in f.readlines()
                                         if len(l.strip()) > 0)
            segment_iterator = filter(
                lambda s: s.fullname() in segments_whitelist, c.segments())
        else:
            segment_iterator = c.segments()

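        # assign text lines to segments one-to-one, in corpus order; both
        # iterators must have exactly the same length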
        with uopen(self.text_file, "rt") as f:
            for segment, line in itertools.zip_longest(segment_iterator, f):
                assert (segment is not None
                        ), "there were more text file lines than segments"
                assert line is not None, "there were fewer text file lines than segments"
                assert len(line) > 0
                segment.orth = line.strip()

        c.dump(self.out_corpus.get_path())
Example #20
    def run(self):
        self._get_speakers()
        self._get_transcripts()

        c = corpus.Corpus()
        c.name = os.path.basename(self.corpus_folder.get_path())

        used_speaker_ids = set()  # store which speakers are used

        for transcript in self._transcripts:
            name = "{0}-{1}-{2:04d}".format(transcript["speaker_id"],
                                            transcript["chapter"],
                                            transcript["segment"])
            recording = corpus.Recording()
            recording.name = name
            recording.speaker_name = transcript["speaker_id"]
            recording.audio = "{}/{}.flac".format(transcript["path"], name)

            used_speaker_ids.add(transcript["speaker_id"])

            segment = corpus.Segment()
            segment.name = name
            segment.start = 0
            segment.end = float("inf")
            segment.orth = transcript["orth"].strip()

            recording.segments.append(segment)
            c.recordings.append(recording)

        for speaker_id, speaker_info in sorted(self._speakers.items()):
            if speaker_id not in used_speaker_ids:
                continue
            speaker = corpus.Speaker()
            speaker.name = speaker_id
            speaker.attribs["gender"] = (
                "male" if speaker_info[0] == "M" else "female")
            c.add_speaker(speaker)

        c.dump(self.out_corpus.get_path())
Example #21
    def run(self):
        tag_map = {}

        c = corpus.Corpus()
        c.load(self.bliss_corpus.get_path())

        all_tags = [("d%d" % i, "default%d" % i,
                     "all other segments of category %d" % i)
                    for i in range(len(self.tag_mapping) + 1)]

        for segment in c.segments():
            tag_map[segment.fullname()] = [
                "d%d" % i for i in range(len(self.tag_mapping) + 1)
            ]

        for i, (tag, segments) in enumerate(self.tag_mapping):
            all_tags.append(tag)
            for file in segments.values():
                for segment in uopen(file):
                    if segment.rstrip() in tag_map:
                        tag_map[segment.rstrip()][i] = tag[0]

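        # write one STM line per segment:
        # <recording> <channel> <speaker> <start> <end> <tag-list> <orth>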
        with uopen(self.out_stm_path, "wt") as out:
            for segment in c.segments():
                speaker_name = (segment.speaker().name if segment.speaker()
                                is not None else segment.recording.name)
                segment_track = segment.track + 1 if segment.track else 1
                out.write("%s %d %s %5.2f %5.2f <%s> %s\n" % (
                    segment.recording.name,
                    segment_track,
                    speaker_name,
                    segment.start,
                    segment.end,
                    ",".join(tag_map[segment.fullname()]),
                    segment.orth,
                ))
            for tag in all_tags:
                out.write(';; LABEL "%s" "%s" "%s"\n' % tag)
Example #22
    def run(self):
        c = corpus.Corpus()
        c.load(tk.uncached_path(self.bliss_corpus))
        for recording in c.all_recordings():
            recording.audio = gs.file_caching(recording.audio)
        c.dump(tk.uncached_path(self.cached_corpus))
Example #23
    def run(self):
        c = corpus.Corpus()
        c.name = "switchboard-1"

        rec_to_segs = self._get_rec_to_segs_map()

        rec_to_speaker = {}
        with uopen(self.speakers_list_file) as f:
            for line in f:
                l = line.strip().split()
                assert len(l) == 3
                assert (l[2] not in rec_to_speaker
                        ), "duplicate recording name: {}?".format(l[2])
                assert l[1] in ["F", "M"]

                # "sw0" prefix is added to match recording names
                rec_to_speaker["sw0" + l[2]] = {
                    "speaker_id": l[0],
                    "gender": {
                        "M": "male",
                        "F": "female"
                    }.get(l[1]),
                }

        # assume unique speaker for each recording with no speaker info
        unk_spk_id = 1
        for rec in sorted(rec_to_segs.keys()):
            if rec not in rec_to_speaker:
                rec_to_speaker[rec] = {
                    "speaker_id": "speaker#" + str(unk_spk_id)
                }
                unk_spk_id += 1

        for rec_name, segs in sorted(rec_to_segs.items()):
            recording = corpus.Recording()
            recording.name = rec_name
            recording.audio = os.path.join(self.audio_dir.get_path(),
                                           rec_name + ".wav")

            assert os.path.exists(
                recording.audio), "recording {} does not exist?".format(
                    recording.audio)

            assert (
                rec_name in rec_to_speaker
            ), "recording {} does not have speaker id?".format(rec_name)
            rec_speaker_id = rec_to_speaker[rec_name]["speaker_id"]

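            # one segment per (name, start, end, orth) entry; segments whose
            # filtered orth ends up empty are skipped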
            for seg in segs:
                segment = corpus.Segment()
                segment.name = seg[0]
                segment.start = float(seg[1])
                segment.end = float(seg[2])
                segment.speaker_name = rec_speaker_id
                segment.orth = self._filter_orth(seg[3])
                if len(segment.orth) == 0:
                    continue

                recording.segments.append(segment)
            c.recordings.append(recording)

        # add speakers to corpus
        for speaker_info in rec_to_speaker.values():
            speaker = corpus.Speaker()
            speaker.name = speaker_info["speaker_id"]
            if speaker_info.get("gender", None):
                speaker.attribs["gender"] = speaker_info["gender"]
            c.add_speaker(speaker)

        c.dump(self.out_corpus.get_path())
Example #24
    def cut_audio(self):

        c = corpus.Corpus()
        c.load(tk.uncached_path(self.bliss_corpus))

        groups_dict = pickle.load(open("groups.pkl", "rb"))

        empty_recordings = []

        ffmpeg_commands = []

        for recording in c.all_recordings():

            assert len(recording.segments) == 1
            segment = recording.segments[0]
            in_file = recording.audio

            target_file = "_".join(segment.fullname().split("/"))
            if self.output_format:
                target_file += "." + self.output_format
            else:
                target_file += os.path.splitext(in_file)[1]

            target_file = os.path.join(tk.uncached_path(self.out_audio_folder),
                                       target_file)

            groups = groups_dict[segment.fullname()]

            if len(groups) == 0:
                empty_recordings.append(recording)
                continue

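            # build an ffmpeg filter graph that trims each kept group of audio
            # ([0]atrim=start:end) and concatenates the trimmed parts into one stream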
            ffmpeg_command = ["ffmpeg", "-y", "-i", in_file, "-filter_complex"]

            split_orth = segment.orth.split(" _ ")
            filter_commands = []

            for i, new_group in enumerate(groups[0]):
                command = "[0]atrim=%.3f:%.3f[g%i]" % (new_group[0],
                                                       new_group[1], i)
                filter_commands.append(command)
            split_orth = split_orth[0].split(" ")
            count = 0
            if self.silence_symbol is not None:
                for i, grp in enumerate(groups[1]):
                    word_id = grp[0] + count
                    duration = (
                        int(grp[1]) /
                        (self.silence_symbol_duration / self.window_shift))
                    if (duration - math.floor(duration) < 0.5):
                        duration = math.floor(duration)
                    else:
                        duration = math.ceil(duration)
                    if duration != 0:
                        split_orth.insert(word_id,
                                          self.silence_symbol * duration)
                        count = count + 1
                segment.orth = " ".join(split_orth)

            filter_command = ";".join(filter_commands)
            filter_command += ";" + "".join([
                "[g%i]" % i for i in range(len(groups[0]))
            ]) + "concat=n=%i:v=0:a=1[out]" % (len(groups[0]))

            ffmpeg_command += [filter_command, "-map", "[out]", target_file]

            print(" ".join(ffmpeg_command))
            ffmpeg_commands.append(ffmpeg_command)

            recording.audio = target_file

        def delete_recordings(c, recordings):
            for subcorpus in c.subcorpora:
                delete_recordings(subcorpus, recordings)
            for r in recordings:
                print("tried to delete empty recording %s" % r.name)
                c.recordings.remove(r)

        delete_recordings(c, empty_recordings)

        c.dump("temp_corpus.xml.gz")

        with multiprocessing.Pool(processes=4) as p:
            p.map(self.run_subprocess, ffmpeg_commands)
Example #25
    def run(self):
        id = os.path.basename(self.job_id())
        if not os.path.isdir(f"/dev/shm/{id}"):
            os.mkdir(f"/dev/shm/{id}")
        c = corpus.Corpus()
        nc = corpus.Corpus()
        segment_file_names = []

        c.load(tk.uncached_path(self.bliss_corpus))
        nc.name = self.corpus_name
        nc.speakers = c.speakers
        nc.default_speaker = c.default_speaker
        nc.speaker_name = c.speaker_name

        logging.info("Random seed used: {}".format(self.seed))
        rng = random.Random(self.seed)

        # store the maximum segment end time of each recording
        for r in c.recordings:
            max_seg_end = 0
            for s in r.segments:
                if s.end > max_seg_end:
                    max_seg_end = s.end
            r.max_seg_end = max_seg_end

        # select noise files for each recording
        for i, r in enumerate(c.recordings):
            audio_name = r.audio
            target_length = r.max_seg_end
            reverbed_audio_name = "noised_" + audio_name.split("/")[-1]

            # remove any leftover temporary recordings (otherwise ffmpeg would ask whether to overwrite)
            for p in glob.iglob(f"/dev/shm/{id}/tmp_concat_*.wav"):
                os.unlink(p)

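            # build each noise track by concatenating randomly chosen other
            # recordings until the track covers the length of this recording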
            for n in range(self.n_noise_tracks):
                noise_length = 0
                noise_audios = []

                while noise_length < target_length:
                    random_index = rng.randint(0, len(c.recordings) - 1)
                    while random_index == i:
                        random_index = rng.randint(0, len(c.recordings) - 1)
                    noise_audios.append(c.recordings[random_index])
                    noise_length += c.recordings[random_index].max_seg_end

                # create temp noise file
                temp_noise_track_file = f"/dev/shm/{id}/tmp_concat_{n}.wav"

                self.sh(
                    "ffmpeg -hide_banner -loglevel panic -f concat -safe 0 -i <(%s) '%s'"
                    % (
                        " ".join([
                            'echo "file %s";' % f.audio for f in noise_audios
                        ]),
                        temp_noise_track_file,
                    ),
                    except_return_codes=(1, ),
                )

            if self.n_noise_tracks == 1:
                self.sh(
                    "ffmpeg -hide_banner -i '%s' -i '/dev/shm/%s/tmp_concat_0.wav' "
                    "-filter_complex '[1]volume=-{snr}dB[a];[0][a]amix=duration=first[out]' "
                    "-map '[out]' '{audio_out}/%s'" %
                    (audio_name, id, reverbed_audio_name))
            else:
                ffmpeg_head = "ffmpeg -hide_banner  -i '%s' " % audio_name
                noise_inputs = " ".join([
                    "-i '/dev/shm/%s/tmp_concat_%i.wav'" % (id, i)
                    for i in range(self.n_noise_tracks)
                ])
                filter_head = ' -filter_complex "'
                volume_reduction = (";".join([
                    "[%i]volume=-%idB[a%i]" % (i + 1, self.snr, i + 1)
                    for i in range(self.n_noise_tracks)
                ]) + ";")
                mixer = ("[0]" + "".join(
                    ["[a%i]" % i for i in range(1, self.n_noise_tracks + 1)]) +
                         "amix=duration=first:inputs=%i[out]" %
                         (self.n_noise_tracks + 1))
                filter_tail = '" -map "[out]" "{audio_out}/%s"' % reverbed_audio_name
                command = (ffmpeg_head + noise_inputs + filter_head +
                           volume_reduction + mixer + filter_tail)
                self.sh(command)

            nr = corpus.Recording()
            nr.name = r.name
            nr.segments = r.segments
            nr.speaker_name = r.speaker_name
            nr.default_speaker = r.default_speaker
            nr.speakers = r.speakers
            nr.audio = str(self.out_audio_folder) + "/" + reverbed_audio_name
            nc.add_recording(nr)
            for s in nr.segments:
                segment_file_names.append(nc.name + "/" + nr.name + "/" +
                                          s.name + "\n")

        nc.dump(self.out_corpus.get_path())

        with open(tk.uncached_path(self.out_segment_file),
                  "w") as segments_outfile:
            segments_outfile.writelines(segment_file_names)

        shutil.rmtree(f"/dev/shm/{id}")
Example #26
    def run(self):
        import h5py

        temp_dir = tempfile.TemporaryDirectory(prefix="hdf_reconstruction_")
        ref_linear_data = h5py.File(self.hdf_file.get_path(), 'r')
        rl_inputs = ref_linear_data['inputs']
        rl_tags = ref_linear_data['seqTags']
        rl_lengths = ref_linear_data['seqLengths']

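        # infer the FFT size from the feature dimension of the linear spectrograms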
        n_fft = rl_inputs[0].shape[0]*2
        print("N_FFT from HDF: %i" % n_fft)

        converter = PhaseReconstructor(out_folder=temp_dir.name,
                                       backend=self.backend,
                                       sample_rate=self.sample_rate,
                                       window_shift=self.window_shift,
                                       window_size=self.window_size,
                                       n_fft=n_fft,
                                       iterations=self.iterations,
                                       preemphasis=self.preemphasis,
                                       file_format=self.file_format,
                                       corpus_format="bliss")

        corpus_path = os.path.join(self.out_folder.get_path(), "corpus.xml.gz")
        corpus = bliss_corpus.Corpus()


        # h5py has issues with multithreaded loading, so buffer the spectrograms
        # single-threaded and then distribute them to the workers for conversion

        p = multiprocessing.Pool(self.rqmt['cpu'])

        loaded_spectrograms = []
        offset = 0
        for tag, length in zip(rl_tags, rl_lengths):
            tag = tag if isinstance(tag, str) else tag.decode()
            loaded_spectrograms.append((tag, np.asarray(rl_inputs[offset:offset + length[0]]).T))
            offset += length[0]
            if len(loaded_spectrograms) > 512:
                recordings = p.map(converter.convert, loaded_spectrograms)

                for recording in recordings:
                    corpus.add_recording(recording)

                # force gc to keep the memory footprint minimal
                del loaded_spectrograms
                gc.collect()
                loaded_spectrograms = []

        # process the rest of the buffer
        if len(loaded_spectrograms) > 0:
            recordings = p.map(converter.convert, loaded_spectrograms)
            # put all remaining recordings into the corpus
            for recording in recordings:
                corpus.add_recording(recording)

        p.close()
        p.join()

        corpus.name = tag.split("/")[0]
        corpus.dump("corpus.xml")
        replacement_string = "s:%s:%s:g" % (temp_dir.name, self.out_folder.get_path())
        subprocess.call(["sed", "-i", replacement_string, "corpus.xml"])
        subprocess.call(["gzip", "corpus.xml"])
        shutil.move("corpus.xml.gz", self.out_corpus.get_path())

        for path in glob.glob(temp_dir.name + "/*"):
            shutil.move(path, self.out_folder.get_path())
Example #27
    def run(self):
        c = corpus.Corpus()
        c.load(self.bliss_corpus.get_path())

        assert (
            len(c.subcorpora) == 0
        ), "CompressCorpus is not working for corpus files containing subcorpora"

        # for each recording, extract duration
        total_duration = self.add_duration_to_recordings(c)

        # print useful information
        logging.info(f"corpus name {c.name}")
        logging.info(f"number of recordings: {len(c.recordings)}")
        logging.info(f"total duration: {total_duration} sec")

        # determine split
        split_duration = total_duration / float(self.num_splits)
        logging.info(f"split duration: {split_duration} sec")

        # create new compressed corpus file
        cc = corpus.Corpus()
        cc.name = c.name
        cc.speaker_name = c.speaker_name
        cc.speakers = c.speakers
        cc.default_speaker = c.default_speaker

        sm = corpus.SegmentMap()

        # temporary store of recordings
        split_recordings = []
        current_duration = 0
        current_split_index = 0

        # segment count for verification
        segment_count = 0

        for i, recording in enumerate(c.recordings):
            # append the recording and accumulate its duration for the current split
            split_recordings.append(recording)
            current_duration += recording.duration

            # now we have all recordings in the duration for a single file or it is the last recording
            if current_duration > split_duration or i + 1 == len(c.recordings):
                new_recording_element = corpus.Recording()

                split_name = "split_%i" % current_split_index
                logging.info(
                    f"storing split {split_name} with duration {current_duration}"
                )

                new_recording_element.name = split_name
                output_path = os.path.join(self.audio_folder.get_path(),
                                           f"{split_name}.{self.format}")
                new_recording_element.audio = output_path
                current_timestamp = 0

                # store all audio paths that are to be concatenated for a split
                ffmpeg_inputs = []

                for split_recording in split_recordings:
                    recording_name = split_recording.name
                    for j, segment in enumerate(split_recording.segments):
                        # update the segment times based on the current time
                        segment.start = float(
                            segment.start) + current_timestamp

                        # segment ends can be inf, use the duration of the recording in that case
                        if float(segment.end) == float("inf"):
                            segment.end = split_recording.duration + current_timestamp
                        else:
                            segment.end = float(
                                segment.end) + current_timestamp

                        # add segment keymap entry
                        sm_entry = corpus.SegmentMapItem()
                        # add original name to key
                        sm_entry.key = "/".join(
                            [c.name, recording_name, segment.name])

                        # if a segment has no name, use a 1-based index
                        # of the form corpus_name/split_i/original_recording_name#segment_j
                        # otherwise create entries in the form corpus_name/split_i/original_recording_name#segment_name
                        if segment.name is None:
                            segment.name = recording_name + "#" + str(j + 1)
                        else:
                            segment.name = recording_name + "#" + segment.name

                        # add new name as segment map value
                        sm_entry.value = "/".join(
                            [c.name, split_name, segment.name])
                        sm.map_entries.append(sm_entry)

                        new_recording_element.segments.append(segment)
                        segment_count += 1

                    # update the time stamp with the recording length and add to ffmpeg merge list
                    current_timestamp += split_recording.duration
                    ffmpeg_inputs.append(split_recording.audio)

                # run ffmpeg and add the new recording
                self.run_ffmpeg(ffmpeg_inputs, output_path)
                cc.add_recording(new_recording_element)

                # reset variables
                current_split_index += 1
                split_recordings = []
                current_duration = 0

        logging.info(f"segment count: {segment_count}")
        cc.dump(tk.uncached_path(self.compressed_corpus))

        sm.dump(tk.uncached_path(self.segment_map))