def extract_alignment(self, alignment_path):
    # a .bundle file lists one alignment cache path per line,
    # otherwise alignment_path itself is a single cache
    if alignment_path.endswith(".bundle"):
        with open(alignment_path, "rt") as bundle:
            cache_paths = [line.strip() for line in bundle]
    else:
        cache_paths = [alignment_path]

    for cache_path in cache_paths:
        sprint_cache = FileArchive(cache_path)
        sprint_cache.setAllophones(tk.uncached_path(self.allophone_file))
        keys = [
            str(s) for s in sprint_cache.ft
            if not str(s).endswith(".attribs")
        ]
        for key in keys:
            # only extract time and mix; the HMM state is not needed
            alignment = [[a[0], a[1], sprint_cache.allophones[a[1]]]
                         for a in sprint_cache.read(key, 'align')]
            yield (key, alignment)
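
# Hedged usage sketch (not part of the method above): assuming FileArchive is
# RETURNN's Sprint cache reader, extract_alignment yields one (segment key,
# alignment) pair per segment, each entry being [time frame, mixture index,
# allophone string]. `job` and the cache path below are hypothetical.
for key, alignment in job.extract_alignment("alignment.cache.bundle"):
    n_frames = len(alignment)
    # count frames whose allophone string marks silence (assumed "[SILENCE]...")
    n_silence = sum(1 for _, _, allo in alignment if allo.startswith("[SILENCE]"))
    print("%s: %d frames, %d silence frames" % (key, n_frames, n_silence))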
    def run(self):
        c = corpus.Corpus()
        nc = corpus.Corpus()

        c.load(tk.uncached_path(self.corpus_file))
        nc.name = c.name
        nc.speakers = c.speakers
        nc.default_speaker = c.default_speaker
        nc.speaker_name = c.speaker_name
        # copy each recording, pointing its audio path at the converted file
        for r in c.recordings:
            nr = corpus.Recording()
            nr.name = r.name
            nr.segments = r.segments
            nr.speaker_name = r.speaker_name
            nr.speakers = r.speakers
            nr.default_speaker = r.default_speaker

            audio_name = r.audio.split("/")[-1]

            if self.output_format is not None:
                name, ext = os.path.splitext(audio_name)
                audio_name = name + "." + self.output_format

            nr.audio = os.path.join(tk.uncached_path(self.audio_folder),
                                    audio_name)
            nc.add_recording(nr)

        from multiprocessing import pool
        # convert the audio files in parallel; the pool is cleaned up on exit
        with pool.Pool(4) as p:
            p.map(self.perform_ffmpeg, c.recordings)

        nc.dump(tk.uncached_path(self.out))
  def create_files(self):
    # returnn
    shutil.copy(tk.uncached_path(self.returnn_config_file_in),
                tk.uncached_path(self.returnn_config_file))

    parameter_list = self.get_parameter_list()

    with open('rnn.sh', 'wt') as f:
      f.write('#!/usr/bin/env bash\n%s' % ' '.join(
          [tk.uncached_path(self.returnn_python_exe),
           os.path.join(tk.uncached_path(self.returnn_root), 'rnn.py'),
           self.returnn_config_file.get_path()] + parameter_list))
    os.chmod(
        'rnn.sh', stat.S_IRUSR | stat.S_IRGRP | stat.S_IROTH | stat.S_IWUSR
        | stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH)
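
# Hedged illustration (not part of the job above): the single line written to
# rnn.sh is just the joined command. All paths and parameters below are made
# up; get_parameter_list() is defined further down in this file.
import os

returnn_python_exe = "/usr/bin/python3"          # hypothetical interpreter
returnn_root = "/opt/returnn"                    # hypothetical RETURNN checkout
config_path = "returnn.config"                   # hypothetical config file
parameter_list = ["++learning_rate", "0.001"]    # hypothetical extra arguments

cmd = " ".join([returnn_python_exe, os.path.join(returnn_root, "rnn.py"),
                config_path] + parameter_list)
print("#!/usr/bin/env bash\n%s" % cmd)
# #!/usr/bin/env bash
# /usr/bin/python3 /opt/returnn/rnn.py returnn.config ++learning_rate 0.001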
    def run(self):
        import soundfile
        c = corpus.Corpus()
        c.load(tk.uncached_path(self.bliss_corpus))

        for r in c.all_recordings():
            assert len(
                r.segments) == 1, "needs to be a single segment recording"
            old_duration = r.segments[0].end
            data, sample_rate = soundfile.read(r.audio)
            new_duration = len(data) / sample_rate
            print("%s: %f vs. %f" %
                  (r.segments[0].name, old_duration, new_duration))
            r.segments[0].end = new_duration

        c.dump(tk.uncached_path(self.out))
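
# Minimal standalone sketch of the duration computation above; the file name
# is a placeholder, any audio file readable by soundfile works.
import soundfile

data, sample_rate = soundfile.read("recording.wav")
duration_seconds = len(data) / sample_rate  # len(data) is the number of frames
print("%.3f seconds" % duration_seconds)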
 def run(self):
     with open(self.search_py_output.get_path(), "r") as f:
         d = eval(f.read())
     assert isinstance(d, dict)  # seq_tag -> bpe string
     assert not os.path.exists(self.out_word_search_results.get_path())
     with open(tk.uncached_path(self.out_word_search_results), "w") as out:
         out.write("{\n")
         for seq_tag, txt in sorted(d.items()):
             if "#" in seq_tag:
                 tag_split = seq_tag.split("/")
                 recording_name, segment_name = tag_split[2].split("#")
                 seq_tag = (tag_split[0] + "/" + recording_name + "/" +
                            segment_name)
             out.write(
                 "%r: %r,\n" %
                 (seq_tag, txt.replace(" ", "").replace("▁", " ").strip()))
         out.write("}\n")
    def perform_ffmpeg(self, r):
        audio_name = r.audio.split("/")[-1]

        if self.output_format is not None:
            name, ext = os.path.splitext(audio_name)
            audio_name = name + "." + self.output_format

        target = tk.uncached_path(self.audio_folder) + "/" + audio_name
        seconds = None  # never set below, so perform_ffmpeg always returns None
        if not os.path.exists(target):
            result = self.sh("%s -hide_banner -y -i %s %s {audio_folder}/%s" %
                             (self.ffmpeg_binary, r.audio,
                              self.ffmpeg_option_string, audio_name),
                             include_stderr=True)
        else:
            print("found %s" % target)
        return seconds
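
# Sketch of the shell command assembled in perform_ffmpeg; every value below is
# hypothetical, and the literal "{audio_folder}" placeholder is assumed to be
# filled in later by the sisyphus self.sh() helper.
ffmpeg_binary = "ffmpeg"
ffmpeg_option_string = "-ar 16000"
input_audio = "/data/recording1.wav"
audio_name = "recording1.ogg"

cmd = "%s -hide_banner -y -i %s %s {audio_folder}/%s" % (
    ffmpeg_binary, input_audio, ffmpeg_option_string, audio_name)
print(cmd)
# ffmpeg -hide_banner -y -i /data/recording1.wav -ar 16000 {audio_folder}/recording1.ogg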
  def get_parameter_list(self):
    parameter_list = []
    for k, v in sorted(self.parameter_dict.items()):
      if isinstance(v, tk.Variable):
        v = str(v.get())
      elif isinstance(v, tk.Path):
        v = tk.uncached_path(v)
      elif isinstance(v, list):
        v = "\"%s\"" % str(v).replace(" ", "")
      else:
        v = str(v)

      if v.startswith("-"):
        v = "-- " + v

      parameter_list.append("++%s" % k)
      parameter_list.append(v)

    return parameter_list
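
# Standalone sketch of the same transformation for plain values (tk.Variable
# and tk.Path handling omitted); the parameter dict below is made up.
params = {"learning_rate": 0.001, "dev": ["cv1", "cv2"]}

arg_list = []
for k, v in sorted(params.items()):
  v = "\"%s\"" % str(v).replace(" ", "") if isinstance(v, list) else str(v)
  if v.startswith("-"):
    v = "-- " + v  # keep a leading dash from being parsed as a new option
  arg_list.append("++%s" % k)
  arg_list.append(v)

print(arg_list)  # ['++dev', '"[\'cv1\',\'cv2\']"', '++learning_rate', '0.001']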
 def recover_duration(self):
     run_duration_recover("temp_corpus.xml.gz",
                          tk.uncached_path(self.out_corpus))
    def cut_audio(self):

        c = corpus.Corpus()
        c.load(tk.uncached_path(self.bliss_corpus))

        with open("groups.pkl", "rb") as f:
            groups_dict = pickle.load(f)

        empty_recordings = []

        ffmpeg_commands = []

        for recording in c.all_recordings():

            assert len(recording.segments) == 1
            segment = recording.segments[0]
            in_file = recording.audio

            target_file = "_".join(segment.fullname().split("/"))
            if self.output_format:
                target_file += "." + self.output_format
            else:
                target_file += os.path.splitext(in_file)[1]

            target_file = os.path.join(tk.uncached_path(self.out_audio_folder),
                                       target_file)

            groups = groups_dict[segment.fullname()]

            if len(groups) == 0:
                empty_recordings.append(recording)
                continue

            ffmpeg_command = ["ffmpeg", "-y", "-i", in_file, "-filter_complex"]

            split_orth = segment.orth.split(" _ ")
            filter_commands = []

            for i, new_group in enumerate(groups[0]):
                command = "[0]atrim=%.3f:%.3f[g%i]" % (new_group[0],
                                                       new_group[1], i)
                filter_commands.append(command)
            split_orth = split_orth[0].split(" ")
            count = 0
            if self.silence_symbol is not None:
                for i, grp in enumerate(groups[1]):
                    word_id = grp[0] + count
                    # number of silence symbols to insert: silence frames divided
                    # by the frames one symbol represents, rounded to the nearest
                    duration = (
                        int(grp[1]) /
                        (self.silence_symbol_duration / self.window_shift))
                    if duration - math.floor(duration) < 0.5:
                        duration = math.floor(duration)
                    else:
                        duration = math.ceil(duration)
                    if duration != 0:
                        split_orth.insert(word_id,
                                          self.silence_symbol * duration)
                        count = count + 1
                segment.orth = " ".join(split_orth)

            filter_command = ";".join(filter_commands)
            filter_command += ";" + "".join([
                "[g%i]" % i for i in range(len(groups[0]))
            ]) + "concat=n=%i:v=0:a=1[out]" % (len(groups[0]))

            ffmpeg_command += [filter_command, "-map", "[out]", target_file]

            print(" ".join(ffmpeg_command))
            ffmpeg_commands.append(ffmpeg_command)

            recording.audio = target_file

        def delete_recordings(c, recordings):
            for subcorpus in c.subcorpora:
                delete_recordings(subcorpus, recordings)
            for r in recordings:
                # a recording only lives in one (sub)corpus, so guard the remove
                if r in c.recordings:
                    print("deleting empty recording %s" % r.name)
                    c.recordings.remove(r)

        delete_recordings(c, empty_recordings)

        c.dump("temp_corpus.xml.gz")

        with multiprocessing.Pool(processes=4) as p:
            p.map(self.run_subprocess, ffmpeg_commands)
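
# Sketch of the filter_complex string that cut_audio builds, here for two
# made-up speech groups (start/end in seconds).
groups = [(0.000, 1.250), (2.500, 3.875)]

filter_commands = ["[0]atrim=%.3f:%.3f[g%i]" % (start, end, i)
                   for i, (start, end) in enumerate(groups)]
filter_command = ";".join(filter_commands)
filter_command += (";" + "".join("[g%i]" % i for i in range(len(groups))) +
                   "concat=n=%i:v=0:a=1[out]" % len(groups))
print(filter_command)
# [0]atrim=0.000:1.250[g0];[0]atrim=2.500:3.875[g1];[g0][g1]concat=n=2:v=0:a=1[out]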
    def extract_silence(self):
        """
        TODO: fix the high memory consumption
        :return:
        """
        alignment_path = tk.uncached_path(self.alignment_cache)

        groups_dict = {}
        for key, cache in self.extract_alignment(alignment_path):
            length = len(cache)
            # clamp the allophone index to {0, 1}: index 0 (assumed silence)
            # stays 0, every other allophone becomes 1 (speech)
            indices = numpy.asarray(
                [numpy.minimum(1, entry[1]) for entry in cache])
            # the allophone string ends in "", "@i", "@f" or "@i@f",
            # marking word-initial / word-final phonemes
            word_tokens = []
            for i in range(length):
                word_tokens.append(cache[i][2].split("}")[-1])

            words = 0
            silence_duration = 0
            in_word = False
            in_silence = False
            silence_word_positions = []

            groups = []

            in_group = bool(indices[0])
            group_start = 0
            group_end = 0

            for i, (speech, word_token) in enumerate(zip(indices,
                                                         word_tokens)):
                # dealing with word tokens
                assert word_token in ['', '@i', '@f', '@i@f']
                if word_token == "@i" and in_word == False and in_silence == True:
                    in_word = True
                    in_silence = False
                    words += 1
                if word_token == "@i" and in_word == False:
                    in_word = True
                if word_token == "@i" and silence_duration > 0:
                    # clip the silence duration to the maximum number of frames
                    # we allow, e.g. 500 ms pause / 10 ms shift = 50 frames:
                    # 80 frames of silence after word 12 -> clip to 50 frames
                    silence_word_positions.append(
                        (words,
                         numpy.minimum(silence_duration, self.pause_duration /
                                       self.window_shift)))
                    silence_duration = 0

                if in_word and word_token == "@f":
                    words += 1
                    in_word = False
                if word_token == "@i@f" and speech == 0:
                    silence_duration += 1
                if word_token == "@i@f" and speech == 1:
                    in_silence = True

                # dealing with speech/silence grouping
                if not in_group and speech == 1:
                    if group_start == 0 and group_end == 0:
                        # the very first speech frame opens the first group
                        group_start = i * self.window_shift
                        in_group = True
                    elif (i * self.window_shift -
                          group_end) > self.pause_duration:
                        # the silence gap is longer than pause_duration: close
                        # the previous group half a pause after it ended and
                        # open a new one half a pause before the current frame
                        group_end = group_end + self.pause_duration / 2
                        groups.append((group_start, group_end))
                        group_start = (i * self.window_shift -
                                       self.pause_duration / 2)
                        in_group = True
                    else:
                        in_group = True
                if in_group and speech == 0:
                    group_end = i * self.window_shift
                    in_group = False

            if (group_start < group_end):
                groups.append((group_start, group_end))
            if (group_start > group_end):
                group_end = group_start + self.window_shift
                groups.append((group_start, group_end))

            # store groups and silence, drop the first silence word position
            # as we never have silence in the beginning
            groups_dict[key] = [groups, silence_word_positions[1:]]

        with open("groups.pkl", "wb") as f:
            pickle.dump(groups_dict, f)
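
# Hedged sketch of the resulting groups.pkl content for one segment; all
# numbers are invented. cut_audio above reads the first list as groups[0]
# (speech regions) and the second as groups[1] (word index, silence frames).
groups_dict = {
    "corpus/recording/segment-1": [
        [(0.00, 1.85), (2.30, 4.10)],   # (start, end) in units of window_shift
        [(3, 12.0), (7, 25.0)],         # leading silence position already dropped
    ],
}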