Example #1
def get_file_length(filepath):
    """
    Returns the length of the sequence in the file specified by `filepath`
    """
    signal_info, encoding_info = torchaudio.info(filepath)
    return signal_info.length
Example #2
    def __getitem__(self, index):
        """

        :return:
        """
        # Check the size of the file
        current_session = self.sessions.iloc[index]

        # TODO is this required ?
        nfo = torchaudio.info(
            f"{self.data_path}/{current_session['file_id']}{self.data_file_extension}"
        )
        original_start = int(current_session['start'])
        if self.overlap > 0:
            lowest_shift = self.overlap / 2
            highest_shift = self.overlap / 2
            if original_start < (current_session['file_start'] *
                                 self.sample_rate + self.sample_number / 2):
                lowest_shift = int(original_start -
                                   current_session['file_start'] *
                                   self.sample_rate)
            if original_start + self.sample_number > (
                    current_session['file_start'] +
                    current_session['file_duration']
            ) * self.sample_rate - self.sample_number / 2:
                highest_shift = int((current_session['file_start'] +
                                     current_session['file_duration']) *
                                    self.sample_rate -
                                    (original_start + self.sample_number))
            start_frame = original_start + int(
                random.uniform(-lowest_shift, highest_shift))
        else:
            start_frame = original_start

        conversion_rate = nfo.sample_rate // self.sample_rate

        if start_frame + conversion_rate * self.sample_number >= nfo.num_frames:
            start_frame = numpy.min(nfo.num_frames -
                                    conversion_rate * self.sample_number - 1)

        speech, speech_fs = torchaudio.load(
            f"{self.data_path}/{current_session['file_id']}{self.data_file_extension}",
            frame_offset=conversion_rate * start_frame,
            num_frames=conversion_rate * self.sample_number)

        if nfo.sample_rate != self.sample_rate:
            speech = torchaudio.transforms.Resample(
                nfo.sample_rate, self.sample_rate).forward(speech)

        speech += 10e-6 * torch.randn(speech.shape)

        if len(self.transform) > 0:
            speech = data_augmentation(speech,
                                       self.sample_rate,
                                       self.transform,
                                       self.transform_number,
                                       noise_df=self.noise_df,
                                       rir_df=self.rir_df)

        speaker_idx = current_session["speaker_idx"]

        if self.output_format == "pytorch":
            return speech, torch.tensor(speaker_idx)
        else:
            return speech, speaker_idx
Example #3
    def __init__(self, root_dir=''):
        super(Aff2CompDataset, self).__init__()
        self.video_dir = root_dir
        self.extracted_dir = os.path.join(self.video_dir, 'extracted')

        self.clip_len = 8
        self.input_size = (112, 112)
        self.dilation = 6
        self.label_frame = self.clip_len * self.dilation

        # audio params
        self.window_size = 20e-3
        self.window_stride = 10e-3
        self.sample_rate = 44100
        num_fft = 2**math.ceil(math.log2(self.window_size * self.sample_rate))
        window_fn = torch.hann_window

        self.sample_len_secs = 10
        self.sample_len_frames = self.sample_len_secs * self.sample_rate
        self.audio_shift_sec = 5
        self.audio_shift_samples = self.audio_shift_sec * self.sample_rate
        # transforms

        self.audio_transform = torchaudio.transforms.MelSpectrogram(
            sample_rate=self.sample_rate,
            n_mels=64,
            n_fft=num_fft,
            win_length=int(self.window_size * self.sample_rate),
            hop_length=int(self.window_stride * self.sample_rate),
            window_fn=window_fn)

        self.audio_spec_transform = ComposeWithInvert(
            [AmpToDB(), Normalize(mean=[-14.8], std=[19.895])])
        self.clip_transform = ComposeWithInvert([
            NumpyToTensor(),
            Normalize(mean=[0.43216, 0.394666, 0.37645, 0.5],
                      std=[0.22803, 0.22145, 0.216989, 0.225])
        ])

        all_videos = find_all_video_files(self.video_dir)
        self.cached_metadata_path = os.path.join(self.video_dir, 'dataset.pkl')

        if not os.path.isfile(self.cached_metadata_path):
            print('creating cached_metadata... ')
            self.image_path = []  # paths relative to self.extracted_dir
            self.video_id = []
            self.frame_id = []
            self.label_au = []
            self.label_ex = []
            self.label_va = []
            self.train_ids = []
            self.val_ids = []
            self.test_ids = []
            self.features = []
            self.feature_names = []
            self.time_stamps = []
            self.mask_available = False
            self.video_db_nr = []
            video_db_nr = 0
            for video in tqdm(all_videos):
                meta = Video(video).meta
                meta['filename'] = get_filename(video)
                meta['path'] = get_path(video)
                meta['extension'] = get_extension(video)
                num_frames_video = meta['num_frames']
                audio_file = os.path.splitext(video)[0] + '.wav'
                si, ei = torchaudio.info(audio_file)
                assert si.rate == 44100
                video_ts_file = os.path.join(
                    meta['path'], meta['filename'] + '_video_ts.txt')
                if not os.path.isfile(video_ts_file):
                    mkvfile = os.path.join(meta['path'], 'temp.mkv')
                    videofile = os.path.join(
                        meta['path'], meta['filename'] + meta['extension'])
                    command = 'mkvmerge -o ' + mkvfile + ' ' + videofile
                    subprocess.call(command, shell=True)
                    command = 'mkvextract ' + mkvfile + ' timestamps_v2 0:' + video_ts_file
                    subprocess.call(command, shell=True)
                    os.remove(mkvfile)
                with open(video_ts_file, 'r') as f:
                    time_stamps = np.genfromtxt(f)[:num_frames_video]
                #os.remove(video_ts_file)
                self.mask_available = True

                extracted_dir = os.path.join(self.extracted_dir,
                                             meta['filename'])

                splits = []
                if 'AU' in meta:
                    au_split = meta['AU']
                    splits.append(au_split)
                if 'EX' in meta:
                    ex_split = meta['EX']
                    splits.append(ex_split)
                if 'VA' in meta:
                    va_split = meta['VA']
                    splits.append(va_split)

                splits = list(set(splits))  # UPDATED 03.06.2020 (was missing)

                for split in splits:
                    self.time_stamps.append(time_stamps)
                    for image_filename in sorted(os.listdir(extracted_dir)):
                        if os.path.isdir(
                                os.path.join(extracted_dir, image_filename)):
                            continue
                        # path relative to self.extracted_dir
                        self.image_path.append(
                            os.path.relpath(
                                os.path.join(extracted_dir, image_filename),
                                self.extracted_dir))
                        self.video_id.append(meta['filename'])
                        self.video_db_nr.append(
                            video_db_nr
                        )  # UPDATED 03.06.2020 (avoids using frames from neighbour videos)
                        frame_id = int(os.path.splitext(image_filename)[0])
                        self.frame_id.append(frame_id)
                        # add your own label loading here if you want to use this for training
                        self.label_au.append(None)
                        self.label_ex.append(None)
                        self.label_va.append(None)
                        self.train_ids.append(1 if split == 'train' else 0)
                        self.val_ids.append(1 if split == 'val' else 0)
                        self.test_ids.append(1 if split == 'test' else 0)
                    video_db_nr += 1

            self.frame_id = np.stack(self.frame_id)
            self.label_au = np.stack(self.label_au)
            self.label_ex = np.stack(self.label_ex)
            self.label_va = np.stack(self.label_va)
            self.train_ids = np.stack(self.train_ids)
            self.val_ids = np.stack(self.val_ids)
            self.test_ids = np.stack(self.test_ids)
            self.time_stamps = np.hstack(self.time_stamps)

            with open(self.cached_metadata_path, 'wb') as f:
                pickle.dump(
                    {
                        'frame_id': self.frame_id,
                        'label_au': self.label_au,
                        'label_ex': self.label_ex,
                        'label_va': self.label_va,
                        'video_id': self.video_id,
                        'image_path': self.image_path,
                        'train_ids': self.train_ids,
                        'val_ids': self.val_ids,
                        'test_ids': self.test_ids,
                        'time_stamps': self.time_stamps,
                        'mask_available': self.mask_available,
                        'video_db_nr': self.video_db_nr
                    }, f)
        else:
            with open(self.cached_metadata_path, 'rb') as f:
                meta = pickle.load(f)
                self.frame_id = meta['frame_id']
                self.label_au = meta['label_au']
                self.label_ex = meta['label_ex']
                self.label_va = meta['label_va']
                self.video_id = meta['video_id']
                self.image_path = meta['image_path']
                self.train_ids = meta['train_ids']
                self.val_ids = meta['val_ids']
                self.time_stamps = meta['time_stamps']
                self.mask_available = meta['mask_available']
                self.test_ids = meta['test_ids']
                self.video_db_nr = meta['video_db_nr']

        self.validation_video_ids()
        self.test_video_ids()
        self.use_mask = self.mask_available
Example #4
def py_info_func(filepath: str) -> torch.classes.torchaudio.SignalInfo:
    return torchaudio.info(filepath)
Example #5
def inspect_file(path):
  print("-" * 10)
  print("Source:", path)
  print("-" * 10)
  print(f" - File size: {os.path.getsize(path)} bytes")
  print_metadata(torchaudio.info(path))
Example #6
def py_info_func(
        filepath: str) -> torchaudio.backend.sox_io_backend.AudioMetaData:
    return torchaudio.info(filepath)
Example #7
 def d(ID):
     info = torchaudio.info(
         f'/work3/s164419/01005WakeWordData/lectures/{ID}.wav')
     return info.num_frames / info.sample_rate
Example #8
def prepare_librispeech(
    corpus_dir: Pathlike,
    dataset_parts: Optional[Tuple[str]] = dataset_parts_mini,
    output_dir: Optional[Pathlike] = None
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions

    :param corpus_dir: Pathlike, the path of the data dir.
    :param dataset_parts: dataset part name, e.g. 'train-clean-100', 'train-clean-5', 'dev-clean'
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a Dict whose key is the dataset part, and the value is Dicts with the keys 'audio' and 'supervisions'.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f'No such directory: {corpus_dir}'
    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
    manifests = defaultdict(dict)
    for part in dataset_parts:
        # Generate a mapping: utt_id -> (audio_path, audio_info, text)
        metadata = {}
        part_path = corpus_dir / part
        for trans_path in part_path.rglob('*.txt'):
            with open(trans_path) as f:
                for line in f:
                    idx, text = line.split(maxsplit=1)
                    audio_path = part_path / Path(idx.replace(
                        '-', '/')).parent / f'{idx}.flac'
                    if audio_path.is_file():
                        # info[0]: info of the raw audio (e.g. channel number, sample rate, duration ... )
                        # info[1]: info about the encoding (e.g. FLAC/ALAW/ULAW ...)
                        info = torchaudio.info(str(audio_path))
                        metadata[idx] = LibriSpeechMetaData(
                            audio_path=audio_path,
                            audio_info=info[0],
                            text=text)
                    else:
                        logging.warning(f'No such file: {audio_path}')

        # Audio
        audio = RecordingSet.from_recordings(
            Recording(id=idx,
                      sources=[
                          AudioSource(type='file',
                                      channels=[0],
                                      source=str(metadata[idx].audio_path))
                      ],
                      sampling_rate=int(metadata[idx].audio_info.rate),
                      num_samples=metadata[idx].audio_info.length,
                      duration=(metadata[idx].audio_info.length /
                                metadata[idx].audio_info.rate))
            for idx in metadata)

        # Supervision
        supervision = SupervisionSet.from_segments(
            SupervisionSegment(id=idx,
                               recording_id=idx,
                               start=0.0,
                               duration=audio.recordings[idx].duration,
                               channel=0,
                               language='English',
                               speaker=re.sub(r'-.*', r'', idx),
                               text=metadata[idx].text.strip())
            for idx in audio.recordings)

        if output_dir is not None:
            supervision.to_json(output_dir / f'supervisions_{part}.json')
            audio.to_json(output_dir / f'recordings_{part}.json')

        manifests[part] = {'recordings': audio, 'supervisions': supervision}

    return manifests
Example #9
def get_file_info(p):
    if isinstance(p, Path): p = p.as_posix()
    signal_info, _ = torchaudio.info(p)
    return signal_info
Example #10
 def test_get_info(self):
     input_path = os.path.join(self.test_dirpath, 'assets', 'sinewave.wav')
     info_expected = (1, 64000, 16000, 32)
     info_load = torchaudio.info(input_path)
     self.assertEqual(info_load, info_expected)
Example #11
    if args.segments is None:
        audio_list = load_wav_scp(args.wav_scp)
    else:
        audio_list = load_wav_segments(args.wav_scp, args.segments)

    count = 0
    with open(args.out_ark, 'wb') as ark_fout, \
         open(args.out_scp, 'w', encoding='utf8') as scp_fout:
        for item in audio_list:
            if len(item) == 2:
                key, wav_path = item
                waveform, sample_rate = torchaudio.load_wav(wav_path)
            else:
                assert len(item) == 4
                key, wav_path, start, end = item
                sample_rate = torchaudio.info(wav_path).sample_rate
                frame_offset = int(start * sample_rate)
                num_frames = int((end - start) * sample_rate)
                waveform, sample_rate = torchaudio.load_wav(
                    wav_path, frame_offset, num_frames)

            mat = kaldi.fbank(waveform,
                              num_mel_bins=args.num_mel_bins,
                              frame_length=args.frame_length,
                              frame_shift=args.frame_shift,
                              dither=args.dither,
                              energy_floor=0.0,
                              sample_frequency=sample_rate)
            mat = mat.detach().numpy()
            kaldi_io.write_ark_scp(key, mat, ark_fout, scp_fout)
            count += 1
Example #12
def prepare_ami(
        data_dir: Pathlike,
        output_dir: Optional[Pathlike] = None,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions

    :param data_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a Dict whose key is ('train', 'dev', 'eval'), and the value is Dicts with keys 'audio' and 'supervisions'.
    """
    data_dir = Path(data_dir)
    assert data_dir.is_dir(), f'No such directory: {data_dir}'
    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    anotation_lists = parse_ami_annotations(data_dir / 'annotations.gzip')
    # Create a mapping from a tuple of (session_id, channel) to the list of annotations.
    # This way we can map the supervisions to the right channels in a multi-channel recording.
    annotation_by_id_and_channel = {
        (filename.split('.')[0], int(filename[-5])): annotations
        for filename, annotations in anotation_lists.items()
    }
    wav_dir = data_dir / 'wav_db'
    audio_paths = wav_dir.rglob('*.wav')
    # Group together multiple channels from the same session.
    # We will use that to create a Recording with multiple sources (channels).
    from cytoolz import groupby
    channel_wavs = groupby(lambda p: p.parts[-3], audio_paths)

    manifests = defaultdict(dict)

    for part in dataset_parts:
        # Audio
        recordings = []
        for session_name, channel_paths in channel_wavs.items():
            if session_name not in dataset_parts[part]:
                continue
            audio_info = torchaudio.info(str(channel_paths[0]))[0]
            recordings.append(Recording(
                id=session_name,
                sources=[
                    AudioSource(
                        type='file',
                        channels=[idx],
                        source=str(audio_path)
                    )
                    for idx, audio_path in enumerate(sorted(channel_paths))
                ],
                sampling_rate=int(audio_info.rate),
                num_samples=audio_info.length,
                duration=audio_info.length / audio_info.rate,
            ))
        audio = RecordingSet.from_recordings(recordings)

        # Supervisions
        segments_by_pause = []
        for recording in audio:
            for source in recording.sources:
                # In AMI "source.channels" will always be a one-element list
                channel, = source.channels
                anotation = annotation_by_id_and_channel.get((recording.id, channel))
                if anotation is None:
                    logging.warning(f'No annotation found for recording "{recording.id}" channel {channel} '
                                    f'(file {source.source})')
                    continue
                for seg_idx, seg_info in enumerate(anotation):
                    for subseg_idx, subseg_info in enumerate(seg_info):
                        duration = subseg_info.end_time - subseg_info.begin_time
                        if duration > 0:
                            segments_by_pause.append(SupervisionSegment(
                                id=f'{recording.id}-{seg_idx}-{subseg_idx}',
                                recording_id=recording.id,
                                start=subseg_info.begin_time,
                                duration=duration,
                                channel=channel,
                                language='English',
                                speaker=subseg_info.speaker,
                                gender=subseg_info.gender,
                                text=subseg_info.text
                            ))
        supervision = SupervisionSet.from_segments(segments_by_pause)
        if output_dir is not None:
            audio.to_json(output_dir / f'recordings_{part}.json')
            supervision.to_json(output_dir / f'supervisions_{part}.json')

        manifests[part] = {
            'recordings': audio,
            'supervisions': supervision
        }

    return manifests
Example #13
 def duration(self):
     if self._sample_rate is not None:
         return len(self.data_signal) / self.sample_rate
     else:
         si, ei = torchaudio.info(str(self.path))
         return si.length / si.rate
Example #14
 def sample_rate(self):
     if not hasattr(self, '_sample_rate') or self._sample_rate is None:
         # Gets metadata from an audio file without loading the signal.
         si, ei = torchaudio.info(str(self.path))
         self._sample_rate = si.rate
     return self._sample_rate
Example #15
def prepare_ljspeech(
        corpus_dir: Pathlike,
        output_dir: Optional[Pathlike] = None
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions

    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: The RecordingSet and SupervisionSet with the keys 'audio' and 'supervisions'.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f'No such directory: {corpus_dir}'
    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    # Generate a mapping: utt_id -> (audio_path, audio_info, text)
    metadata_csv_path = corpus_dir / 'metadata.csv'
    assert metadata_csv_path.is_file(), f'No such file: {metadata_csv_path}'
    metadata = {}
    with open(metadata_csv_path) as f:
        for line in f:
            idx, text, _ = line.split('|')
            audio_path = corpus_dir / 'wavs' / f'{idx}.wav'
            if audio_path.is_file():
                # info[0]: info of the raw audio (e.g. channel number, sample rate, duration ... )
                # info[1]: info about the encoding (e.g. FLAC/ALAW/ULAW ...)
                info = torchaudio.info(str(audio_path))
                metadata[idx] = LJSpeechMetaData(audio_path=audio_path, audio_info=info[0], text=text)
            else:
                logging.warning(f'No such file: {audio_path}')

    # Audio
    audio = RecordingSet.from_recordings(
        Recording(
            id=idx,
            sources=[
                AudioSource(
                    type='file',
                    channels=[0],
                    source=str(metadata[idx].audio_path)
                )
            ],
            sampling_rate=int(metadata[idx].audio_info.rate),
            num_samples=metadata[idx].audio_info.length,
            duration=(metadata[idx].audio_info.length / metadata[idx].audio_info.rate)
        )
        for idx in metadata
    )

    # Supervision
    supervision = SupervisionSet.from_segments(
        SupervisionSegment(
            id=idx,
            recording_id=idx,
            start=0.0,
            duration=audio.recordings[idx].duration,
            channel=0,
            language='English',
            gender='female',
            text=metadata[idx].text
        )
        for idx in audio.recordings
    )

    if output_dir is not None:
        supervision.to_json(output_dir / 'supervisions.json')
        audio.to_json(output_dir / 'audio.json')

    return {'audio': audio, 'supervisions': supervision}
Example #16
 def duration(self):
     if (self.sig is not None): return self.nsamples / self.sr
     else:
         si, ei = torchaudio.info(str(self.path))
         return si.length / si.rate
Example #17
def prepare_mobvoihotwords(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions

    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a Dict whose key is the dataset part, and the value is Dicts with the keys 'audio' and 'supervisions'.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f'No such directory: {corpus_dir}'
    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
    manifests = defaultdict(dict)
    dataset_parts = ['train', 'dev', 'test']
    for part in dataset_parts:
        # Generate a mapping: utt_id -> (audio_path, audio_info, speaker, text)
        metadata = {}
        for prefix in ['p_', 'n_']:
            prefixed_part = prefix + part
            json_path = corpus_dir / 'mobvoi_hotword_dataset_resources' / f'{prefixed_part}.json'
            with open(json_path, 'r', encoding='utf-8') as f:
                json_data = json.load(f)
                for entry in json_data:
                    idx = entry['utt_id']
                    speaker = idx if entry['speaker_id'] is None else entry['speaker_id']
                    audio_path = corpus_dir / 'mobvoi_hotword_dataset' / f'{idx}.wav'
                    text = 'FREETEXT'
                    if entry['keyword_id'] == 0:
                        text = 'HiXiaowen'
                    elif entry['keyword_id'] == 1:
                        text = 'NihaoWenwen'
                    else:
                        assert entry['keyword_id'] == -1
                    if audio_path.is_file():
                        # info[0]: info of the raw audio (e.g. channel number, sample rate, duration ... )
                        # info[1]: info about the encoding (e.g. FLAC/ALAW/ULAW ...)
                        info = torchaudio.info(str(audio_path))
                        metadata[idx] = MobvoiHotwordsMetaData(
                            audio_path=audio_path, audio_info=info[0], speaker=speaker, text=text
                        )
                    else:
                        logging.warning(f'No such file: {audio_path}')

        # Audio
        audio = RecordingSet.from_recordings(
            Recording(
                id=idx,
                sources=[
                    AudioSource(
                        type='file',
                        channels=[0],
                        source=str(metadata[idx].audio_path)
                    )
                ],
                sampling_rate=int(metadata[idx].audio_info.rate),
                num_samples=metadata[idx].audio_info.length,
                duration=(metadata[idx].audio_info.length / metadata[idx].audio_info.rate)
            )
            for idx in metadata
        )

        # Supervision
        supervision = SupervisionSet.from_segments(
            SupervisionSegment(
                id=idx,
                recording_id=idx,
                start=0.0,
                duration=audio.recordings[idx].duration,
                channel=0,
                language='Chinese',
                speaker=metadata[idx].speaker,
                text=metadata[idx].text.strip()
            )
            for idx in audio.recordings
        )

        if output_dir is not None:
            supervision.to_json(output_dir / f'supervisions_{part}.json')
            audio.to_json(output_dir / f'recordings_{part}.json')

        manifests[part] = {
            'recordings': audio,
            'supervisions': supervision
        }

    return manifests
Example #18
######################################################################
# Audio I/O
# =========
#
# torchaudio integrates ``libsox`` and provides a rich set of audio I/O.
#

######################################################################
# Querying audio metadata
# -----------------------
#
# The ``torchaudio.info`` function fetches the metadata of an audio file. You
# can provide a path-like object or a file-like object.
#

metadata = torchaudio.info(SAMPLE_WAV_PATH)
print_metadata(metadata, src=SAMPLE_WAV_PATH)

######################################################################
# Where
#
# -  ``sample_rate`` is the sampling rate of the audio
# -  ``num_channels`` is the number of channels
# -  ``num_frames`` is the number of frames per channel
# -  ``bits_per_sample`` is bit depth
# -  ``encoding`` is the sample coding format
#
# The values ``encoding`` can take are one of the following
#
# -  ``"PCM_S"``: Signed integer linear PCM
# -  ``"PCM_U"``: Unsigned integer linear PCM
Example #19
def extractLength(couple):
    speaker, locPath = couple
    info = torchaudio.info(str(locPath))[0]
    return info.length
Example #20
def create_csv(
    orig_tsv_file,
    csv_file,
    data_folder,
    accented_letters=False,
    language="en",
):
    """
    Creates the csv file given a list of wav files.

    Arguments
    ---------
    orig_tsv_file : str
        Path to the Common Voice tsv file (standard file).
    data_folder : str
        Path of the CommonVoice dataset.
    accented_letters : bool, optional
        Defines if accented letters will be kept as individual letters or
        transformed to the closest non-accented letters.

    Returns
    -------
    None
    """

    # Check if the given file exists
    if not os.path.isfile(orig_tsv_file):
        msg = "\t%s doesn't exist, verify your dataset!" % (orig_tsv_file)
        logger.info(msg)
        raise FileNotFoundError(msg)

    # We load and skip the header
    loaded_csv = open(orig_tsv_file, "r").readlines()[1:]
    nb_samples = str(len(loaded_csv))

    msg = "Preparing CSV files for %s samples ..." % (str(nb_samples))
    logger.info(msg)

    # Adding some Prints
    msg = "Creating csv lists in %s ..." % (csv_file)
    logger.info(msg)

    csv_lines = [["ID", "duration", "wav", "spk_id", "wrd"]]

    # Start processing lines
    total_duration = 0.0
    for line in tzip(loaded_csv):

        line = line[0]

        # The path is at index 1 in Common Voice tsv files, and the .mp3 files
        # are located in datasets/lang/clips/
        mp3_path = data_folder + "/clips/" + line.split("\t")[1]
        file_name = mp3_path.split(".")[-2].split("/")[-1]
        spk_id = line.split("\t")[0]
        snt_id = file_name

        # Reading the signal (to retrieve duration in seconds)
        if os.path.isfile(mp3_path):
            info = torchaudio.info(mp3_path)
        else:
            msg = "\tError loading: %s" % (str(len(file_name)))
            logger.info(msg)
            continue

        duration = info.num_frames / info.sample_rate
        total_duration += duration

        # Getting transcript
        words = line.split("\t")[2]

        # !! Language specific cleaning !!
        # Important: feel free to specify the text normalization
        # corresponding to your alphabet.

        if language in ["en", "fr", "it", "rw"]:
            words = re.sub("[^'A-Za-z0-9À-ÖØ-öø-ÿЀ-ӿ]+", " ", words).upper()
        elif language == "ar":
            HAMZA = "\u0621"
            ALEF_MADDA = "\u0622"
            ALEF_HAMZA_ABOVE = "\u0623"
            letters = ("ابتةثجحخدذرزسشصضطظعغفقكلمنهويءآأؤإئ" + HAMZA +
                       ALEF_MADDA + ALEF_HAMZA_ABOVE)
            words = re.sub("[^" + letters + "]+", " ", words).upper()
        elif language == "ga-IE":
            # Irish lower() is complicated, but upper() is nondeterministic, so use lowercase
            def pfxuc(a):
                return len(a) >= 2 and a[0] in "tn" and a[1] in "AEIOUÁÉÍÓÚ"

            def galc(w):
                return w.lower(
                ) if not pfxuc(w) else w[0] + "-" + w[1:].lower()

            words = re.sub("[^-A-Za-z'ÁÉÍÓÚáéíóú]+", " ", words)
            words = " ".join(map(galc, words.split(" ")))

        # Remove accents if specified
        if not accented_letters:
            nfkd_form = unicodedata.normalize("NFKD", words)
            words = "".join(
                [c for c in nfkd_form if not unicodedata.combining(c)])
            words = words.replace("'", " ")

        # Remove multiple spaces
        words = re.sub(" +", " ", words)

        # Remove spaces at the beginning and the end of the sentence
        words = words.lstrip().rstrip()

        # Getting chars
        chars = words.replace(" ", "_")
        chars = " ".join([char for char in chars][:])

        # Remove too short sentences (or empty):
        if len(words) < 3:
            continue

        # Composition of the csv_line
        csv_line = [snt_id, str(duration), mp3_path, spk_id, str(words)]

        # Adding this line to the csv_lines list
        csv_lines.append(csv_line)

    # Writing the csv lines
    with open(csv_file, mode="w", encoding="utf-8") as csv_f:
        csv_writer = csv.writer(csv_f,
                                delimiter=",",
                                quotechar='"',
                                quoting=csv.QUOTE_MINIMAL)

        for line in csv_lines:
            csv_writer.writerow(line)

    # Final prints
    msg = "%s successfully created!" % (csv_file)
    logger.info(msg)
    msg = "Number of samples: %s " % (str(len(loaded_csv)))
    logger.info(msg)
    msg = "Total duration: %s Hours" % (str(round(total_duration / 3600, 2)))
    logger.info(msg)
Example #21
def extractLength(couple):
    speaker, locPath = couple
    info = torchaudio.info(str(locPath))
    return info.num_frames
Example #22
 def calc_waveform_length(path: str, sample_rate: int) -> int:
     info, _ = ta.info(path)
     return math.ceil(info.length * sample_rate / info.rate / info.channels)
Example #23
def create_csv(wav_list, csv_file):
    """
    Creates the csv file given a list of wav files.

    Arguments
    ---------
    wav_list : list of str
        The list of wav files.
    csv_file : str
        The path of the output csv file.
    """

    # Adding some Prints
    msg = f"Creating csv lists in {csv_file} ..."
    logger.info(msg)

    csv_lines = []

    # Start processing lines
    total_duration = 0.0

    # Starting index
    idx = 0

    for wav_file in tzip(wav_list):
        wav_file = wav_file[0]

        path_parts = wav_file.split(os.path.sep)
        file_name, wav_format = os.path.splitext(path_parts[-1])

        # Peeking at the signal (to retrieve duration in seconds)
        if os.path.isfile(wav_file):
            info = torchaudio.info(wav_file)
        else:
            msg = "\tError loading: %s" % (str(len(file_name)))
            logger.info(msg)
            continue

        audio_duration = info.num_frames / info.sample_rate
        total_duration += audio_duration

        # Actual name of the language
        language = path_parts[-4]

        # Create a row with the whole utterance
        csv_line = [
            idx,  # ID
            wav_file,  # File name
            wav_format,  # File format
            str(info.num_frames / info.sample_rate),  # Duration (sec)
            language,  # Language
        ]

        # Adding this line to the csv_lines list
        csv_lines.append(csv_line)

        # Increment index
        idx += 1

    # CSV column titles
    csv_header = ["ID", "wav", "wav_format", "duration", "language"]

    # Add titles to the list at index 0
    csv_lines.insert(0, csv_header)

    # Writing the csv lines
    with open(csv_file, mode="w", encoding="utf-8") as csv_f:
        csv_writer = csv.writer(csv_f,
                                delimiter=",",
                                quotechar='"',
                                quoting=csv.QUOTE_MINIMAL)

        for line in csv_lines:
            csv_writer.writerow(line)

    # Final prints
    msg = f"{csv_file} sucessfully created!"
    logger.info(msg)
    msg = f"Number of samples: {len(wav_list)}."
    logger.info(msg)
    msg = f"Total duration: {round(total_duration / 3600, 2)} hours."
    logger.info(msg)
Example #24
    def test_save(self):
        # load signal
        x, sr = load(self.test_filepath)

        # check save
        new_filepath = os.path.join(self.test_dirpath, "test.wav")
        save(new_filepath, x, sr)
        self.assertTrue(os.path.isfile(new_filepath))
        os.unlink(new_filepath)

        # check automatic normalization
        x /= 1 << 31
        save(new_filepath, x, sr)
        self.assertTrue(os.path.isfile(new_filepath))
        os.unlink(new_filepath)

        # test save 1d tensor
        x = x[:, 0]  # get mono signal
        x.squeeze_()  # remove channel dim
        save(new_filepath, x, sr)
        self.assertTrue(os.path.isfile(new_filepath))
        os.unlink(new_filepath)

        # don't allow invalid sizes as inputs
        with self.assertRaises(ValueError):
            x.unsqueeze_(0)  # N x L not L x N
            save(new_filepath, x, sr)

        with self.assertRaises(ValueError):
            x.squeeze_()
            x.unsqueeze_(1)
            x.unsqueeze_(0)  # 1 x L x 1
            save(new_filepath, x, sr)

        # automatically convert sr from floating point to int
        x.squeeze_(0)
        save(new_filepath, x, float(sr))
        self.assertTrue(os.path.isfile(new_filepath))
        os.unlink(new_filepath)

        # don't save to folders that don't exist
        with self.assertRaises(OSError):
            new_filepath = os.path.join(self.test_dirpath, "no-path",
                                        "test.wav")
            save(new_filepath, x, sr)

        # save created file
        sinewave_filepath = os.path.join(self.test_dirpath, "assets",
                                         "sinewave.wav")
        sr = 16000
        freq = 440
        volume = 0.3

        y = (torch.cos(2 * math.pi * torch.arange(0, 4 * sr).float() * freq /
                       sr))
        y.unsqueeze_(1)
        # y is between -1 and 1, so must scale
        y = (y * volume * 2**31).long()
        save(sinewave_filepath, y, sr)
        self.assertTrue(os.path.isfile(sinewave_filepath))

        # test precision
        new_filepath = os.path.join(self.test_dirpath, "test.wav")
        si, ei = torchaudio.info(sinewave_filepath)
        save(new_filepath, y, sr, precision=16)
        si16, ei16 = torchaudio.info(new_filepath)
        self.assertEqual(si.precision, 32)
        self.assertEqual(si16.precision, 16)
        os.unlink(new_filepath)
Example #25
    def __getitem__(self, index) -> AudioAndLabels:
        track, audio_paths, tsv_path = self.file_list[index]
        audio = None
        if index < self.max_files_in_memory:
            audio = self.audios[index]

            # The first time the audio needs to be loaded in memory
            if audio is None:
                audio = load_audio(audio_paths, normalize=False)
                self.audios[index] = audio

        labels: Labels = self.labels[index]
        # The first time the labels need to be loaded in memory
        if labels is None:
            labels = self.load_labels(audio_paths, tsv_path)
            self.labels[index] = labels

        audio_length = torchaudio.info(audio_paths[0]).num_frames
        start_frame = None
        end_frame = None
        if self.sequence_length is not None:
            possible_start_interval = audio_length - self.sequence_length
            if self.reproducable_load_sequences:
                step_begin = (
                    int(hashlib.sha256("".join(audio_paths).encode("utf-8")).hexdigest(), 16) % possible_start_interval
                )
            else:
                step_begin = self.random.randint(possible_start_interval)
            step_begin //= HOP_LENGTH

            n_steps = self.sequence_length // HOP_LENGTH
            step_end = step_begin + n_steps

            begin = step_begin * HOP_LENGTH
            end = begin + self.sequence_length
            num_frames = end - begin

            if audio is None:
                audio = load_audio(audio_paths, frame_offset=begin, num_frames=num_frames, normalize=False).to(
                    self.device
                )
            else:
                audio = audio[begin:end].to(self.device)
            label = labels.label[step_begin:step_end, :].to(self.device)
            velocity = labels.velocity[step_begin:step_end, :].to(self.device)

            start_frame = begin
            end_frame = end
        else:
            if audio is None:
                audio = load_audio(audio_paths, normalize=False).to(self.device)
            else:
                audio = audio.to(self.device)
            label = labels.label.to(self.device)
            velocity = labels.velocity.to(self.device).float()

            start_frame = 0
            end_frame = audio_length

        onset = (label == 4).float()
        frame = (label > 1).float()
        offset = ((label == 1) + (label == 2)).float()
        velocity = velocity.float().div_(128.0)

        return AudioAndLabels(
            track=track,
            start_time=start_frame/SAMPLE_RATE,
            end_time=end_frame/SAMPLE_RATE,
            audio=audio,
            annotation=MusicAnnotation(onset=onset, offset=offset, frame=frame, velocity=velocity),
        )
Example #26
    def audio_pipeline(
        mix_wav,
    ):  # this is a dummy argument --> one epoch will be the same as without dynamic mixing
        """
        This audio pipeline defines the compute graph for dynamic mixing
        """

        speakers = np.random.choice(spk_list,
                                    hparams["num_spks"],
                                    replace=False,
                                    p=spk_weights)

        if hparams["use_wham_noise"]:
            noise_file = np.random.choice(noise_files, 1, replace=False)

            noise, fs_read = torchaudio.load(noise_file[0])
            noise = noise.squeeze()

        # select two speakers randomly
        sources = []
        spk_files = [
            np.random.choice(spk_hashtable[spk], 1, False)[0]
            for spk in speakers
        ]

        minlen = min(
            *[torchaudio.info(x).num_frames for x in spk_files],
            hparams["training_signal_len"],
        )

        meter = pyloudnorm.Meter(hparams["sample_rate"])

        MAX_AMP = 0.9
        MIN_LOUDNESS = -33
        MAX_LOUDNESS = -25

        def normalize(signal, is_noise=False):
            """
            This function normalizes the audio signals for loudness
            """
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                c_loudness = meter.integrated_loudness(signal)
                if is_noise:
                    target_loudness = random.uniform(MIN_LOUDNESS - 5,
                                                     MAX_LOUDNESS - 5)
                else:
                    target_loudness = random.uniform(MIN_LOUDNESS,
                                                     MAX_LOUDNESS)
                signal = pyloudnorm.normalize.loudness(signal, c_loudness,
                                                       target_loudness)

                # check for clipping
                if np.max(np.abs(signal)) >= 1:
                    signal = signal * MAX_AMP / np.max(np.abs(signal))

            return torch.from_numpy(signal)

        for i, spk_file in enumerate(spk_files):
            # select random offset
            length = torchaudio.info(spk_file).num_frames
            start = 0
            stop = length
            if length > minlen:  # take a random window
                start = np.random.randint(0, length - minlen)
                stop = start + minlen

            tmp, fs_read = torchaudio.load(
                spk_file,
                frame_offset=start,
                num_frames=stop - start,
            )
            tmp = tmp[0].numpy()
            tmp = normalize(tmp)
            sources.append(tmp)

        sources = torch.stack(sources)
        mixture = torch.sum(sources, 0)
        if hparams["use_wham_noise"]:
            len_noise = len(noise)
            len_mix = len(mixture)
            min_len = min(len_noise, len_mix)
            noise = normalize(noise.numpy(), is_noise=True)
            mixture = mixture[:min_len] + noise[:min_len]

        # check for clipping
        max_amp_insig = mixture.abs().max().item()
        if max_amp_insig > MAX_AMP:
            weight = MAX_AMP / max_amp_insig
        else:
            weight = 1

        sources = weight * sources
        mixture = weight * mixture

        yield mixture
        for i in range(hparams["num_spks"]):
            yield sources[i]

        # If the number of speakers is 2, yield None for the 3rd speaker
        if hparams["num_spks"] == 2:
            yield None

        if hparams["use_wham_noise"]:
            noise = noise * weight
            yield noise
        else:
            yield None
Example #27
    def __getitem__(self, index):
        """

        :param index:
        :return:
        """
        # Read start and stop and convert to time in seconds
        if self.idmap.start[index] is None:
            start = 0
        else:
            start = int(self.idmap.start[index] * 0.01 * self.sample_rate)

        if self.idmap.stop[index] is None:
            #speech, speech_fs = get_sample(f"{self.data_path}/{self.idmap.rightids[index]}.{self.file_extension}", resample=self.sample_rate)
            nfo = torchaudio.info(
                f"{self.data_path}/{self.idmap.rightids[index]}.{self.file_extension}"
            )
            speech, speech_fs = torchaudio.load(
                f"{self.data_path}/{self.idmap.rightids[index]}.{self.file_extension}"
            )
            if nfo.sample_rate != self.sample_rate:
                speech = torchaudio.transforms.Resample(
                    nfo.sample_rate, self.sample_rate).forward(speech)
            duration = int(speech.shape[1] - start)
            start = 0
            stop = speech.shape[1]
        else:
            # TODO Check if that code is still relevant with torchaudio.load() in case of sample_rate mismatch
            nfo = torchaudio.info(
                f"{self.data_path}/{self.idmap.rightids[index]}.{self.file_extension}"
            )
            assert nfo.sample_rate == self.sample_rate
            conversion_rate = nfo.sample_rate // self.sample_rate
            start = start
            stop = (int(self.idmap.stop[index] * 0.01 * self.sample_rate) -
                    start)
            # add this in case the segment is too short
            if stop - start <= self.min_duration * self.sample_rate:
                middle = start + (stop - start) // 2
                start = max(
                    0,
                    int(middle - (self.min_duration * self.sample_rate / 2)))
                duration = int(self.min_duration * self.sample_rate)

            speech, speech_fs = torchaudio.load(
                f"{self.data_path}/{self.idmap.rightids[index]}.{self.file_extension}",
                frame_offset=start * conversion_rate,
                num_frames=duration * conversion_rate)
            speech = torchaudio.transforms.Resample(
                nfo.sample_rate, self.sample_rate).forward(speech)

        speech += 10e-6 * torch.randn(speech.shape)

        if self.sliding_window:
            speech = speech.squeeze().unfold(0, self.window_len,
                                             self.window_shift)
            middle_points = numpy.arange(
                start + self.window_len / 2,
                start + duration - self.window_len / 2, self.window_shift)
            starts = middle_points - self.window_shift / 2
            stops = middle_points + self.window_shift / 2
            starts[0] = start
            start = starts
            stops[-1] = start + duration
        else:
            stop = start + duration

        if len(self.transformation.keys()) > 0:
            speech = data_augmentation(speech,
                                       speech_fs,
                                       self.transformation,
                                       self.transform_number,
                                       noise_df=self.noise_df,
                                       rir_df=self.rir_df)

        if self.backward:
            speech = torch.flip(speech, [0, 1]).squeeze()
        else:
            speech = speech.squeeze()

        return speech, self.idmap.leftids[index], self.idmap.rightids[
            index], start, stop
Example #28
def get_dataset_fast_api_version(file_location):
    files = []
    siginfo, _ = torchaudio.info(file_location)
    length = siginfo.length // siginfo.channels
    files.append((file_location, length))
    return Audioset(files, with_path=True, sample_rate=sample_rate)
Example #29
import torchaudio
import matplotlib.pyplot as plt
from pathlib import Path

# p = Path('Supercharger_Blockiergebuehr_Tesla_Fordert_Geld_V.mp3')
#
# print(p.exists())

filename = "../noise_cancellation/Supercharger_Blockiergebühr_Tesla_Fordert_Geld_V_25sec.mp3"
print(torchaudio.info(filename))

waveform, sample_rate = torchaudio.load(filename)

print("Shape of waveform: {}".format(waveform.size()))
print("Sample rate of waveform: {}".format(sample_rate))

plt.figure()
plt.plot(waveform.t().numpy())
plt.show()

specgram = torchaudio.transforms.Spectrogram()(waveform)

print("Shape of spectrogram: {}".format(specgram.size()))

plt.figure()
plt.imshow(specgram.log2()[0, :, :].numpy(), cmap='gray')
plt.show()

specgram = torchaudio.transforms.MelSpectrogram()(waveform)

print("Shape of spectrogram: {}".format(specgram.size()))
Example #30
    def __init__(self,
                 root_dir,
                 subset="train",
                 length=16384,
                 preload=False,
                 half=True,
                 use_soundfile=False):
        """
        Args:
            root_dir (str): Path to the root directory of the SignalTrain dataset.
            subset (str, optional): Pull data either from "train", "val", or "test" subsets. (Default: "train")
            length (int, optional): Number of samples in the returned examples. (Default: 16384)
            preload (bool, optional): Read in all data into RAM during init. (Default: False)
            half (bool, optional): Store the float32 audio as float16. (Default: True)
            use_soundfile (bool, optional): Use the soundfile library to load instead of torchaudio. (Default: False)
        """
        self.root_dir = root_dir
        self.subset = subset
        self.length = length
        self.preload = preload
        self.half = half
        self.use_soundfile = use_soundfile

        # get all the target files in the directory first
        self.target_files = glob.glob(
            os.path.join(self.root_dir, self.subset.capitalize(),
                         "target_*.wav"))
        self.input_files = glob.glob(
            os.path.join(self.root_dir, self.subset.capitalize(),
                         "input_*.wav"))

        self.examples = []
        self.hours = 0  # total number of hours of data in the subset

        # ensure that the sets are ordered correctly
        self.target_files.sort()
        self.input_files.sort()

        # get the parameters
        self.params = [(float(f.split("__")[1].replace(".wav", "")),
                        float(f.split("__")[2].replace(".wav", "")))
                       for f in self.target_files]

        # loop over files to count total length
        for idx, (tfile, ifile, params) in enumerate(
                zip(self.target_files, self.input_files, self.params)):

            ifile_id = int(os.path.basename(ifile).split("_")[1])
            tfile_id = int(os.path.basename(tfile).split("_")[1])
            if ifile_id != tfile_id:
                raise RuntimeError(
                    f"Found non-matching file ids: {ifile_id} != {tfile_id}! Check dataset."
                )

            md = torchaudio.info(tfile)
            self.hours += (md.num_frames / md.sample_rate) / 3600
            num_frames = md.num_frames

            if self.preload:
                sys.stdout.write(
                    f"* Pre-loading... {idx+1:3d}/{len(self.target_files):3d} ...\r"
                )
                sys.stdout.flush()
                input, sr = self.load(ifile)
                target, sr = self.load(tfile)

                num_frames = int(np.min([input.shape[-1], target.shape[-1]]))
                if input.shape[-1] != target.shape[-1]:
                    print(os.path.basename(ifile), input.shape[-1],
                          os.path.basename(tfile), target.shape[-1])
                    raise RuntimeError("Found potentially corrupt file!")
                if self.half:
                    input = input.half()
                    target = target.half()
            else:
                input = None
                target = None

            # create one entry for each patch
            for n in range((num_frames // self.length) - 1):
                offset = int(n * self.length)
                end = offset + self.length
                self.examples.append({
                    "idx": idx,
                    "target_file": tfile,
                    "input_file": ifile,
                    "input_audio": input[:, offset:end] if input is not None else None,
                    "target_audio": target[:, offset:end] if target is not None else None,
                    "params": params,
                    "offset": offset,
                    "frames": num_frames,
                })

        # we then want to get the input files
        print(
            f"Located {len(self.examples)} examples totaling {self.hours:0.1f} hr in the {self.subset} subset."
        )