def join_transcript(transcript: Transcript, join_channels: bool = False):
	# Merge per-segment entries that share an audio file into one whole-file entry
	# per channel (or a single entry for all channels if join_channels is True).
	joined_transcripts = []
	if join_channels:
		grouped_t = [(channel_missing, transcript)]
	else:
		channel_key = lambda t: t.get('channel', channel_missing)
		grouped_t = itertools.groupby(sorted(transcript, key=channel_key), channel_key)

	for channel, transcript in grouped_t:
		transcript = list(transcript)
		audio_path = transcript[0]['audio_path']
		assert all(t['audio_path'] == audio_path for t in transcript)

		ref = speaker_phrase_separator.join(t['ref'].strip() for t in transcript)
		speaker = [t['speaker'] for t in transcript]
		speaker_name = ','.join(collect_speaker_names(transcript))
		duration = audio.compute_duration(audio_path)
		joined_transcripts.append(
			dict(
				audio_path=audio_path,
				ref=ref,
				begin=0.0,
				end=duration,
				speaker=speaker,
				speaker_name=speaker_name,
				channel=channel
			)
		)
	return joined_transcripts
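# Usage sketch (hypothetical data, not from the repo): join_transcript collapses all
# segments of one audio file into a single whole-file transcript, either one entry
# per channel (default) or one entry for the whole file (join_channels=True).
# The dict keys below mirror the fields join_transcript reads; the values are made up.
#
# segments = [
# 	dict(audio_path='calls/call0.wav', ref='hello', speaker=1, channel=0, begin=0.0, end=1.5),
# 	dict(audio_path='calls/call0.wav', ref='hi there', speaker=2, channel=1, begin=1.5, end=3.0)
# ]
# per_channel = join_transcript(segments)                      # two entries, one per channel
# whole_file  = join_transcript(segments, join_channels=True)  # one entry covering both channels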
def compute_duration(t, hours=False):
	# Duration of a transcript entry in seconds (or hours), taken from the first
	# available source: explicit begin/end times, the last word-level end time in
	# the hyp/ref alignment, or the length of the audio file itself.
	seconds = None
	if 'begin' in t or 'end' in t:
		seconds = t.get('end', 0) - t.get('begin', 0)
	elif 'hyp' in t or 'ref' in t:
		seconds = max(t_['end'] for k in ['hyp', 'ref'] for t_ in t.get(k, []))
	elif 'audio_path' in t:
		seconds = audio.compute_duration(t['audio_path'])
	assert seconds is not None

	return seconds / (60 * 60) if hours else seconds
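# Usage sketch (hypothetical values): compute_duration picks whichever timing source is
# present and optionally converts seconds to hours.
#
# compute_duration(dict(begin=2.0, end=5.5))                  # -> 3.5 (end - begin, in seconds)
# compute_duration(dict(ref=[dict(end=7.2)]))                 # -> 7.2 (last aligned end time)
# compute_duration(dict(begin=0.0, end=7200.0), hours=True)   # -> 2.0 (converted to hours)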
def __init__(
	self,
	data_paths,
	text_pipelines: typing.List[language_processing.ProcessingPipeline],
	sample_rate,
	frontend=None,
	speaker_names=None,
	waveform_transform_debug_dir=None,
	min_duration=None,
	max_duration=None,
	duration_filter=True,
	min_ref_len=None,
	max_ref_len=None,
	max_num_channels=2,
	ref_len_filter=True,
	mono=True,
	audio_dtype='float32',
	segmented=False,
	time_padding_multiple=1,
	audio_backend=None,
	exclude=set(),
	join_transcript=False,
	bucket=None,
	pop_meta=False,
	string_array_encoding='utf_16_le',
	_print=print,
	debug_short_long_records_features_from_whole_normalized_signal=False
):
	self.debug_short_long_records_features_from_whole_normalized_signal = debug_short_long_records_features_from_whole_normalized_signal
	self.join_transcript = join_transcript
	self.max_duration = max_duration
	self.text_pipelines = text_pipelines
	self.frontend = frontend
	self.sample_rate = sample_rate
	self.waveform_transform_debug_dir = waveform_transform_debug_dir
	self.segmented = segmented
	self.time_padding_multiple = time_padding_multiple
	self.mono = mono
	self.audio_backend = audio_backend
	self.audio_dtype = audio_dtype

	data_paths = data_paths if isinstance(data_paths, list) else [data_paths]
	exclude = set(exclude)

	# Read all transcript files.
	tic = time.time()
	transcripts_read = list(map(transcripts.load, data_paths))
	_print('Dataset reading time: ', time.time() - tic)
	tic = time.time()

	# Group segments by audio file.
	# TODO: group only when segmented == True
	segments_by_audio_path = []
	for transcript in transcripts_read:
		transcript = sorted(transcript, key=transcripts.sort_key)
		transcript = itertools.groupby(transcript, key=transcripts.group_key)
		for _, example in transcript:
			segments_by_audio_path.append(list(example))

	speaker_names_filtered = set()
	examples_filtered = []
	examples_lens = []
	transcript = []

	duration = lambda example: sum(map(transcripts.compute_duration, example))
	segments_by_audio_path.sort(key=duration)

	# Filter by exclusion list and duration, fill in missing fields and bucket ids.
	# TODO: not segmented mode may fail if several examples have same audio_path
	for example in segments_by_audio_path:
		exclude_ok = ((not exclude) or (transcripts.audio_name(example[0]) not in exclude))
		duration_ok = (
			(not duration_filter) or
			(min_duration is None or min_duration <= duration(example)) and
			(max_duration is None or duration(example) <= max_duration)
		)

		if duration_ok and exclude_ok:
			b = bucket(example) if bucket is not None else 0
			for t in example:
				t['bucket'] = b
				t['ref'] = t.get('ref', transcripts.ref_missing)
				t['begin'] = t.get('begin', transcripts.time_missing)
				t['end'] = t.get('end', transcripts.time_missing)
				t['channel'] = t.get('channel', transcripts.channel_missing)
			examples_filtered.append(example)
			transcript.extend(example)
			examples_lens.append(len(example))

	self.speaker_names = transcripts.collect_speaker_names(
		transcript, speaker_names=speaker_names or [], num_speakers=max_num_channels, set_speaker=True
	)
	_print('Dataset construction time: ', time.time() - tic)
	tic = time.time()

	# Pack per-example and per-segment fields into tensors and tensor-backed string arrays.
	self.bucket = torch.ShortTensor([e[0]['bucket'] for e in examples_filtered])
	self.audio_path = utils.TensorBackedStringArray(
		[e[0]['audio_path'] for e in examples_filtered], encoding=string_array_encoding
	)
	self.ref = utils.TensorBackedStringArray([t['ref'] for t in transcript], encoding=string_array_encoding)
	self.begin = torch.DoubleTensor([t['begin'] for t in transcript])
	self.end = torch.DoubleTensor([t['end'] for t in transcript])
	self.channel = torch.CharTensor([t['channel'] for t in transcript])
	self.speaker = torch.LongTensor([t['speaker'] for t in transcript])
	self.cumlen = torch.ShortTensor(examples_lens).cumsum(dim=0, dtype=torch.int64)

	if pop_meta:
		self.meta = {}
	else:
		self.meta = {self.example_id(t): t for t in transcript}

	if self.join_transcript:
		# Also index a whole-file joined transcript per audio file.
		# TODO: harmonize with the dummy transcript of the replace_transcript case (and fix channel)
		self.meta.update({
			self.example_id(t_src): t_tgt
			for e in examples_filtered
			for t_src, t_tgt in [(
				dict(
					audio_path=e[0]['audio_path'],
					begin=transcripts.time_missing,
					end=transcripts.time_missing,
					channel=transcripts.channel_missing,
					speaker=transcripts.speaker_missing
				),
				dict(
					audio_path=e[0]['audio_path'],
					begin=0.0,
					end=audio.compute_duration(e[0]['audio_path'], backend=None),
					channel=transcripts.channel_missing,
					speaker=transcripts.speaker_missing,
					ref=' '.join(filter(bool, [t.get('ref', '') for t in e]))
				)
			)]
		})

	_print('Dataset tensors creation time: ', time.time() - tic)
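# Construction sketch (hypothetical names and values; the enclosing class name is not shown
# in this excerpt, so `AudioTextDataset` below is an assumption used only for illustration):
#
# dataset = AudioTextDataset(
# 	data_paths='data/train.json',          # a single transcript path or a list of paths
# 	text_pipelines=[pipeline],             # language_processing.ProcessingPipeline instances
# 	sample_rate=8_000,
# 	min_duration=0.5,
# 	max_duration=20.0,                     # examples outside [min_duration, max_duration] are dropped
# 	join_transcript=True                   # also index whole-file joined transcripts in self.meta
# )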