Example #1
import itertools


def join_transcript(transcript: Transcript, join_channels: bool = False):
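    # Merge per-segment entries into one transcript entry per channel, or into
    # a single entry covering all channels when join_channels is True.
    # Transcript, channel_missing, speaker_phrase_separator,
    # collect_speaker_names and audio are assumed to come from the
    # surrounding module.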
    joined_transcripts = []

    if join_channels:
        grouped_t = [(channel_missing, transcript)]
    else:
        channel_key = lambda t: t.get('channel', channel_missing)
        grouped_t = itertools.groupby(sorted(transcript, key=channel_key),
                                      channel_key)

    for channel, transcript in grouped_t:
        transcript = list(transcript)
        audio_path = transcript[0]['audio_path']
        assert all(t['audio_path'] == audio_path for t in transcript)
        ref = speaker_phrase_separator.join(t['ref'].strip()
                                            for t in transcript)
        speaker = [t['speaker'] for t in transcript]
        speaker_name = ','.join(collect_speaker_names(transcript))
        duration = audio.compute_duration(audio_path)
        joined_transcripts.append(
            dict(audio_path=audio_path,
                 ref=ref,
                 begin=0.0,
                 end=duration,
                 speaker=speaker,
                 speaker_name=speaker_name,
                 channel=channel))
    return joined_transcripts
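A minimal usage sketch for join_transcript, assuming entries are plain dicts with the keys read above and that the module-level helpers (collect_speaker_names, audio.compute_duration, channel_missing, speaker_phrase_separator) are available; the file name and field values are illustrative only:

# Hypothetical segment dicts; 'call.wav' and the field values are placeholders.
segments = [
    dict(audio_path='call.wav', channel=0, speaker=1, ref='hello'),
    dict(audio_path='call.wav', channel=1, speaker=2, ref='hi there'),
]
per_channel = join_transcript(segments)                     # one joined entry per channel
whole_call = join_transcript(segments, join_channels=True)  # one joined entry for the whole file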
Example #2
def compute_duration(t, hours=False):
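    # Duration in seconds is taken from explicit begin/end times if present,
    # otherwise from the latest word-level end time in hyp/ref, otherwise from
    # the audio file itself (via the project-level audio module).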
    seconds = None

    if 'begin' in t or 'end' in t:
        seconds = t.get('end', 0) - t.get('begin', 0)
    elif 'hyp' in t or 'ref' in t:
        seconds = max(t_['end'] for k in ['hyp', 'ref'] for t_ in t.get(k, []))
    elif 'audio_path' in t:
        seconds = audio.compute_duration(t['audio_path'])

    assert seconds is not None

    return seconds / (60 * 60) if hours else seconds
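A small usage sketch, assuming the dict layouts implied by the branches above (the values are illustrative):

assert compute_duration(dict(begin=1.5, end=4.0)) == 2.5
assert compute_duration(dict(begin=0, end=7200), hours=True) == 2.0
assert compute_duration(dict(ref=[dict(end=3.0), dict(end=5.5)])) == 5.5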
Example #3
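    # Constructor of a dataset class (the class definition is not part of this
    # excerpt). It assumes itertools, time, typing, torch and the project
    # modules transcripts, utils, audio and language_processing are imported
    # at module level.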
    def __init__(
            self,
            data_paths,
            text_pipelines: typing.List[
                language_processing.ProcessingPipeline],
            sample_rate,
            frontend=None,
            speaker_names=None,
            waveform_transform_debug_dir=None,
            min_duration=None,
            max_duration=None,
            duration_filter=True,
            min_ref_len=None,
            max_ref_len=None,
            max_num_channels=2,
            ref_len_filter=True,
            mono=True,
            audio_dtype='float32',
            segmented=False,
            time_padding_multiple=1,
            audio_backend=None,
            exclude=set(),
            join_transcript=False,
            bucket=None,
            pop_meta=False,
            string_array_encoding='utf_16_le',
            _print=print,
            debug_short_long_records_features_from_whole_normalized_signal=False
    ):
        self.debug_short_long_records_features_from_whole_normalized_signal = debug_short_long_records_features_from_whole_normalized_signal
        self.join_transcript = join_transcript
        self.max_duration = max_duration
        self.text_pipelines = text_pipelines
        self.frontend = frontend
        self.sample_rate = sample_rate
        self.waveform_transform_debug_dir = waveform_transform_debug_dir
        self.segmented = segmented
        self.time_padding_multiple = time_padding_multiple
        self.mono = mono
        self.audio_backend = audio_backend
        self.audio_dtype = audio_dtype

        data_paths = data_paths if isinstance(data_paths, list) else [data_paths]
        exclude = set(exclude)

        tic = time.time()

        transcripts_read = list(map(transcripts.load, data_paths))
        _print('Dataset reading time: ', time.time() - tic)
        tic = time.time()

        # TODO: group only when segmented == True
        segments_by_audio_path = []
        for transcript in transcripts_read:
            transcript = sorted(transcript, key=transcripts.sort_key)
            transcript = itertools.groupby(transcript,
                                           key=transcripts.group_key)
            for _, example in transcript:
                segments_by_audio_path.append(list(example))

        speaker_names_filtered = set()
        examples_filtered = []
        examples_lens = []
        transcript = []

        duration = lambda example: sum(
            map(transcripts.compute_duration, example))
        segments_by_audio_path.sort(key=duration)
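        # segments_by_audio_path is now ordered by increasing total duration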

        # TODO: non-segmented mode may fail if several examples share the same audio_path
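        # keep only recordings that pass the exclude list and the duration
        # filter, filling in defaults for missing per-segment fields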
        for example in segments_by_audio_path:
            exclude_ok = ((not exclude) or
                          (transcripts.audio_name(example[0]) not in exclude))
            duration_ok = (
                (not duration_filter) or
                (min_duration is None or min_duration <= duration(example)) and
                (max_duration is None or duration(example) <= max_duration))

            if duration_ok and exclude_ok:
                b = bucket(example) if bucket is not None else 0
                for t in example:
                    t['bucket'] = b
                    t['ref'] = t.get('ref', transcripts.ref_missing)
                    t['begin'] = t.get('begin', transcripts.time_missing)
                    t['end'] = t.get('end', transcripts.time_missing)
                    t['channel'] = t.get('channel',
                                         transcripts.channel_missing)

                examples_filtered.append(example)
                transcript.extend(example)
                examples_lens.append(len(example))

        self.speaker_names = transcripts.collect_speaker_names(
            transcript,
            speaker_names=speaker_names or [],
            num_speakers=max_num_channels,
            set_speaker=True)

        _print('Dataset construction time: ', time.time() - tic)
        tic = time.time()

        self.bucket = torch.ShortTensor(
            [e[0]['bucket'] for e in examples_filtered])
        self.audio_path = utils.TensorBackedStringArray(
            [e[0]['audio_path'] for e in examples_filtered],
            encoding=string_array_encoding)
        self.ref = utils.TensorBackedStringArray(
            [t['ref'] for t in transcript], encoding=string_array_encoding)
        self.begin = torch.DoubleTensor([t['begin'] for t in transcript])
        self.end = torch.DoubleTensor([t['end'] for t in transcript])
        self.channel = torch.CharTensor([t['channel'] for t in transcript])
        self.speaker = torch.LongTensor([t['speaker'] for t in transcript])
        self.cumlen = torch.ShortTensor(examples_lens).cumsum(
            dim=0, dtype=torch.int64)
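        # cumulative segment counts per example: offsets into the flat
        # per-segment tensors above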
        if pop_meta:
            self.meta = {}
        else:
            self.meta = {self.example_id(t): t for t in transcript}
            if self.join_transcript:
                # TODO: harmonize the dummy transcript with the replace_transcript case (and fix channel)
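                # additionally map a dummy whole-file key (missing time,
                # channel and speaker) to an entry spanning the full audio
                # duration with all refs concatenated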
                self.meta.update({
                    self.example_id(t_src): t_tgt
                    for e in examples_filtered for t_src, t_tgt in [(
                        dict(audio_path=e[0]['audio_path'],
                             begin=transcripts.time_missing,
                             end=transcripts.time_missing,
                             channel=transcripts.channel_missing,
                             speaker=transcripts.speaker_missing),
                        dict(audio_path=e[0]['audio_path'],
                             begin=0.0,
                             end=audio.compute_duration(e[0]['audio_path'],
                                                        backend=None),
                             channel=transcripts.channel_missing,
                             speaker=transcripts.speaker_missing,
                             ref=' '.join(
                                 filter(bool, [t.get('ref', '')
                                               for t in e]))))]
                })

        _print('Dataset tensors creation time: ', time.time() - tic)