コード例 #1
0
ファイル: test_torchaudio.py プロジェクト: underdogliu/lhotse
def test_augmentation_chain_randomized(
    recording: Recording,
    rir: Recording,
    target_sampling_rate: int,
    sp_factor: float,
    vp_factor: float,
    reverb: bool,
    resample_first: bool,
    cut_duration: Seconds,
):
    """Chain several augmentations and compare reported vs. loaded lengths.

    Resampling and speed perturbation are applied in the order selected by
    ``resample_first``; volume perturbation follows, and ``reverb_rir(rir)``
    is applied last when ``reverb`` is true. The second dimension of the
    loaded audio must equal ``num_samples`` both for the augmented recording
    and for a cut created on top of it.
    """
    if resample_first:
        tempo_adjusted = recording.resample(target_sampling_rate).perturb_speed(
            sp_factor
        )
    else:
        tempo_adjusted = recording.perturb_speed(sp_factor).resample(
            target_sampling_rate
        )
    recording_aug = tempo_adjusted.perturb_volume(vp_factor)
    if reverb:
        recording_aug = recording_aug.reverb_rir(rir)

    recording_samples = recording_aug.load_audio()
    assert recording_samples.shape[1] == recording_aug.num_samples

    # The cut deliberately starts at a non-zero offset into the recording.
    cut_aug = MonoCut(
        id="dummy",
        start=0.5125,
        duration=cut_duration,
        channel=0,
        recording=recording_aug,
    )
    cut_samples = cut_aug.load_audio()
    assert cut_samples.shape[1] == cut_aug.num_samples
コード例 #2
0
def test_cut_load_custom_recording_pad_left():
    """Check that a custom Recording attribute is padded when the cut is left-padded.

    Random audio is saved to a temporary WAV file, attached to a cut as the
    custom attribute ``my_favorite_song``, and the cut is padded on the left
    to 60.0. The loaded custom audio must have shape ``(1, 960000)``, be
    (almost) zero in the leading part and match the saved audio in the
    trailing part.
    """
    sampling_rate = 16000
    duration = 52.4
    num_samples = compute_num_samples(duration, sampling_rate)
    audio = np.random.randn(1, num_samples).astype(np.float32)
    audio /= np.abs(audio).max()  # scale so that the peak magnitude is 1
    with NamedTemporaryFile(suffix=".wav") as f:
        torchaudio.save(f.name, torch.from_numpy(audio), sampling_rate)
        f.flush()
        os.fsync(f)
        recording = Recording.from_file(f.name)

        # Note: MonoCut doesn't normally have a "my_favorite_song" attribute
        #       or a "load_my_favorite_song()" method.
        #       We are dynamically extending it.
        cut = MonoCut(
            id="x",
            start=0,
            duration=duration,
            channel=0,
            recording=dummy_recording(0, duration=duration),
        )
        cut.my_favorite_song = recording

        cut_pad = cut.pad(duration=60.0, direction="left")

        restored_audio = cut_pad.load_my_favorite_song()
        assert restored_audio.shape == (1, 960000)  # 16000 * 60

        # Split the restored signal into the padded head and the original tail.
        n_original = audio.shape[1]
        padded_head = restored_audio[:, :-n_original]
        original_tail = restored_audio[:, -n_original:]
        np.testing.assert_almost_equal(0, padded_head)
        np.testing.assert_almost_equal(audio, original_tail)
コード例 #3
0
 def test_cut_speakers_audio_mask(self, supervisions, alignment):
     """Verify the per-speaker audio mask against expected sample indices.

     The cut uses a mocked recording with ``sampling_rate=16000`` and
     ``duration=2``. Rows 0 and 1 of the mask returned by
     ``speakers_audio_mask`` are each checked against index sets that must
     be all ones and all zeros; a different expectation applies when
     ``alignment == "word"``.
     """
     cut = MonoCut(
         "cut",
         start=0,
         duration=2,
         channel=0,
         recording=Mock(sampling_rate=16000),
         supervisions=supervisions,
     )
     mask = cut.speakers_audio_mask(use_alignment_if_exists=alignment)

     # One index expression per mask row (row 0 first, then row 1).
     if alignment == "word":
         active = [
             np.index_exp[[*range(0, 1600), *range(3200, 6400)]],
             np.index_exp[[*range(9600, 12800)]],
         ]
         silent = [
             np.index_exp[[*range(1600, 3200), *range(6400, 32000)]],
             np.index_exp[[*range(0, 9600), *range(12800, 32000)]],
         ]
     else:
         active = [np.index_exp[range(0, 8000)], np.index_exp[range(9600, 12800)]]
         silent = [
             np.index_exp[[*range(8000, 32000)]],
             np.index_exp[[*range(0, 9600), *range(12800, 32000)]],
         ]

     for row, index in enumerate(active):
         assert (mask[row, index] == 1).all()
     for row, index in enumerate(silent):
         assert (mask[row, index] == 0).all()
コード例 #4
0
 def test_mixed_cut_audio_mask(self):
     """A cut built without supervisions, appended to itself, has an audio mask summing to zero."""
     recording_stub = Mock(sampling_rate=16000)
     cut = MonoCut("cut", start=0, duration=2, channel=0, recording=recording_stub)
     mask = cut.append(cut).supervisions_audio_mask()
     assert mask.sum() == 0
コード例 #5
0
 def with_cut(self,
              sampling_rate: int,
              num_samples: int,
              features: bool = True,
              supervision: bool = False,
              alignment: bool = False,
              frame_shift: Seconds = 0.01) -> MonoCut:
     """Create a MonoCut with a random id that spans a freshly made recording.

     :param sampling_rate: forwarded to ``self.with_recording``; also used
         to derive the cut duration as ``num_samples / sampling_rate``.
     :param num_samples: forwarded to ``self.with_recording``.
     :param features: when true, the cut is passed through
         ``self._with_features`` using ``frame_shift``.
     :param supervision: when true, one SupervisionSegment starting at 0
         with the cut's duration is appended to the cut.
     :param alignment: only consulted when ``supervision`` is true; selects
         between ``self._with_alignment(...)`` and ``None`` for the
         segment's alignment.
     :param frame_shift: forwarded to ``self._with_features``.
     :return: the resulting cut.
     """
     duration = num_samples / sampling_rate
     cut_id = str(uuid4())
     recording = self.with_recording(sampling_rate=sampling_rate,
                                     num_samples=num_samples)
     cut = MonoCut(id=cut_id,
                   start=0,
                   duration=duration,
                   channel=0,
                   recording=recording)
     if features:
         cut = self._with_features(cut, frame_shift=frame_shift)
     if not supervision:
         return cut

     if alignment:
         segment_alignment = self._with_alignment(cut, 'irrelevant')
     else:
         segment_alignment = None
     segment = SupervisionSegment(id=f'sup-{cut.id}',
                                  recording_id=cut.recording_id,
                                  start=0,
                                  duration=cut.duration,
                                  text='irrelevant',
                                  alignment=segment_alignment)
     cut.supervisions.append(segment)
     return cut
コード例 #6
0
 def test_cut_speakers_features_mask(self, supervisions, alignment):
     """Verify the per-speaker feature mask against expected frame indices.

     The cut uses mocked features with ``frame_shift=0.01`` and
     ``duration=2``. Rows 0 and 1 of the mask returned by
     ``speakers_feature_mask`` are each checked against index sets that
     must be all ones and all zeros; a different expectation applies when
     ``alignment == "word"``.
     """
     # NOTE(review): the mock reports num_frames=2000 while the expected
     # indices below stop at 200 -- confirm which value the mask relies on.
     cut = MonoCut(
         "cut",
         start=0,
         duration=2,
         channel=0,
         features=Mock(sampling_rate=16000, frame_shift=0.01, num_frames=2000),
         supervisions=supervisions,
     )
     mask = cut.speakers_feature_mask(use_alignment_if_exists=alignment)

     # One index expression per mask row (row 0 first, then row 1).
     if alignment == "word":
         active = [
             np.index_exp[[*range(0, 10), *range(20, 40)]],
             np.index_exp[[*range(60, 80)]],
         ]
         silent = [
             np.index_exp[[*range(10, 20), *range(40, 200)]],
             np.index_exp[[*range(0, 60), *range(80, 200)]],
         ]
     else:
         active = [
             np.index_exp[[*range(0, 50)]],
             np.index_exp[[*range(60, 80)]],
         ]
         silent = [
             np.index_exp[[*range(50, 200)]],
             np.index_exp[[*range(0, 60), *range(80, 200)]],
         ]

     for row, index in enumerate(active):
         assert (mask[row, index] == 1).all()
     for row, index in enumerate(silent):
         assert (mask[row, index] == 0).all()
コード例 #7
0
ファイル: fixtures.py プロジェクト: underdogliu/lhotse
 def with_cut(
     self,
     sampling_rate: int,
     num_samples: int,
     features: bool = True,
     supervision: bool = False,
     alignment: bool = False,
     custom_field: bool = False,
     frame_shift: Seconds = 0.01,
 ) -> MonoCut:
     """Create a MonoCut with a random id that spans a freshly made recording.

     :param sampling_rate: forwarded to ``self.with_recording``; also used
         to derive the cut duration as ``num_samples / sampling_rate``.
     :param num_samples: forwarded to ``self.with_recording``.
     :param features: when true, the cut is passed through
         ``self._with_features`` using ``frame_shift``.
     :param supervision: when true, one SupervisionSegment starting at 0
         with the cut's duration is appended to the cut.
     :param alignment: only consulted when ``supervision`` is true; selects
         between ``self._with_alignment(...)`` and ``None`` for the
         segment's alignment.
     :param custom_field: when true, ``self._with_custom_temporal_array``
         is called with the cut and ``frame_shift``; its return value is
         not used here.
     :param frame_shift: forwarded to the feature and custom-array helpers.
     :return: the resulting cut.
     """
     duration = num_samples / sampling_rate
     cut_id = str(uuid4())
     recording = self.with_recording(
         sampling_rate=sampling_rate, num_samples=num_samples
     )
     cut = MonoCut(
         id=cut_id,
         start=0,
         duration=duration,
         channel=0,
         recording=recording,
     )
     if features:
         cut = self._with_features(cut, frame_shift=frame_shift)
     if supervision:
         if alignment:
             segment_alignment = self._with_alignment(cut, "irrelevant")
         else:
             segment_alignment = None
         segment = SupervisionSegment(
             id=f"sup-{cut.id}",
             recording_id=cut.recording_id,
             start=0,
             duration=cut.duration,
             text="irrelevant",
             alignment=segment_alignment,
         )
         cut.supervisions.append(segment)
     if custom_field:
         self._with_custom_temporal_array(cut=cut, frame_shift=frame_shift)
     return cut
コード例 #8
0
 def test_mixed_cut_audio_mask(self, supervisions):
     """Check the supervisions audio mask of a cut appended to itself.

     The expected index spans of the second half repeat those of the first
     half shifted by 32000 samples (2 x 16000).
     """
     cut = MonoCut(
         "cut",
         start=0,
         duration=2,
         channel=0,
         recording=Mock(sampling_rate=16000),
         supervisions=supervisions,
     )
     mask = cut.append(cut).supervisions_audio_mask()

     # Half-open [begin, end) sample spans expected to be 1 and 0 respectively.
     supervised_spans = [(0, 8000), (9600, 12800), (32000, 40000), (41600, 44800)]
     unsupervised_spans = [
         (8000, 9600),
         (12800, 32000),
         (40000, 41600),
         (44800, 64000),
     ]
     ones = np.index_exp[
         [i for begin, end in supervised_spans for i in range(begin, end)]
     ]
     zeros = np.index_exp[
         [i for begin, end in unsupervised_spans for i in range(begin, end)]
     ]
     assert (mask[ones] == 1).all()
     assert (mask[zeros] == 0).all()
コード例 #9
0
ファイル: test_masks.py プロジェクト: lhotse-speech/lhotse
 def test_cut_audio_mask(self):
     """A cut built without supervisions has an audio mask that sums to zero."""
     recording_stub = Mock(sampling_rate=16000)
     cut = MonoCut('cut', start=0, duration=2, channel=0, recording=recording_stub)
     assert cut.supervisions_audio_mask().sum() == 0
コード例 #10
0
ファイル: test_masks.py プロジェクト: lhotse-speech/lhotse
 def test_mixed_cut_features_mask(self):
     """A cut built without supervisions, appended to itself, has a feature mask summing to zero."""
     features_stub = Mock(sampling_rate=16000, frame_shift=0.01)
     cut = MonoCut('cut', start=0, duration=2, channel=0, features=features_stub)
     mask = cut.append(cut).supervisions_feature_mask()
     assert mask.sum() == 0
コード例 #11
0
def test_cut_custom_nonarray_attr_serialization():
    """Round-trip a cut with a plain (non-array) custom field through to_dict/deserialize_item."""
    custom_fields = {"SNR": 7.3}
    cut = MonoCut(id="x", start=10, duration=8, channel=0, custom=custom_fields)

    restored_cut = deserialize_item(cut.to_dict())
    assert cut == restored_cut

    # The key placed in the "custom" dict is readable as a regular attribute.
    assert restored_cut.SNR == 7.3
コード例 #12
0
 def test_cut_features_mask(self):
     """A cut built without supervisions has a feature mask that sums to zero."""
     features_stub = Mock(sampling_rate=16000, frame_shift=0.01, num_frames=2000)
     cut = MonoCut("cut", start=0, duration=2, channel=0, features=features_stub)
     assert cut.supervisions_feature_mask().sum() == 0
コード例 #13
0
def cut_with_supervision(recording):
    """Return a 0.5-long cut at offset 0.0 of ``recording`` with one supervision of the same span."""
    supervision = SupervisionSegment(
        id="sup", recording_id="rec", start=0.0, duration=0.5
    )
    return MonoCut(
        id="cut",
        start=0.0,
        duration=0.5,
        channel=0,
        supervisions=[supervision],
        recording=recording,
    )
コード例 #14
0
def cut_with_supervision_start01(recording):
    """Return a cut starting at 0.1 (duration 0.4) with one supervision (start 0.1, duration 0.3)."""
    supervision = SupervisionSegment(
        id="sup", recording_id="rec", start=0.1, duration=0.3
    )
    return MonoCut(
        id="cut_start01",
        start=0.1,
        duration=0.4,
        channel=0,
        supervisions=[supervision],
        recording=recording,
    )
コード例 #15
0
def random_cut_set(n_cuts=100) -> CutSet:
    """Build a CutSet of ``n_cuts`` randomly placed cuts, each on its own dummy recording.

    Every cut gets a start drawn with ``random.uniform(0, 5)`` and a duration
    drawn with ``random.uniform(3, 10)``, both rounded to 8 decimal places.
    Each recording has no sources, ``sampling_rate=16000``,
    ``num_samples=1600000`` and ``duration=100.0``.

    Cut and recording ids are random UUIDs rendered as strings, matching the
    ``str(uuid4())`` convention used by the other cut factories, so the ids
    are plain ``str`` values rather than ``uuid.UUID`` objects.

    :param n_cuts: number of cuts to generate.
    :return: a CutSet built from the generated cuts.
    """

    def _random_cut() -> MonoCut:
        # Draw order (start, then duration) is kept stable so that seeding
        # ``random`` reproduces the same cuts.
        cut_id = str(uuid4())
        start = round(random.uniform(0, 5), ndigits=8)
        duration = round(random.uniform(3, 10), ndigits=8)
        recording = Recording(id=str(uuid4()),
                              sources=[],
                              sampling_rate=16000,
                              num_samples=1600000,
                              duration=100.0)
        return MonoCut(id=cut_id,
                       start=start,
                       duration=duration,
                       channel=0,
                       recording=recording)

    return CutSet.from_cuts(_random_cut() for _ in range(n_cuts))
コード例 #16
0
def test_cut_load_array_truncate():
    """Check that a custom Array still loads unchanged after the cut is truncated."""
    expected = np.arange(20).astype(np.float32)
    with NamedTemporaryFile(suffix=".h5") as tmp:
        with LilcomHdf5Writer(tmp.name) as writer:
            cut = MonoCut(id="x", start=0, duration=5, channel=0)
            cut.ivector = writer.store_array(key="utt1", value=expected)

            shortened = cut.truncate(duration=3)

            loaded = shortened.load_ivector()
            np.testing.assert_equal(expected, loaded)
コード例 #17
0
def test_cut_trim_to_supervisions_extend_handles_end_of_recording(mono_cut):
    """
    Trim a cut to its supervisions with ``min_duration=4.0`` when the last
    supervision ends exactly where the recording ends.

    Scenario::

        |----------Recording---------|
        |---Sup1----|       |--Sup2--|
        |------------Cut-------------|

    Into::

        |----------Recording---------|
        |---Cut1----|     |---Cut2---|
        |---Sup1----|       |--Sup2--|

    The second resulting cut is expected to start at 6.5 with duration 3.5,
    i.e. to end at 10.0, which is the recording's duration.
    The ``mono_cut`` argument is not used in the body.
    """
    first_sup = SupervisionSegment(id="X",
                                   recording_id="X",
                                   start=0.0,
                                   duration=4.0)
    second_sup = SupervisionSegment(id="X",
                                    recording_id="X",
                                    start=7.0,
                                    duration=3.0)
    recording = Recording(id="X",
                          sources=[],
                          sampling_rate=8000,
                          num_samples=80000,
                          duration=10.0)
    cut = MonoCut(
        id="X",
        start=0.0,
        duration=10.0,
        channel=0,
        supervisions=[first_sup, second_sup],
        recording=recording,
    )

    cuts = cut.trim_to_supervisions(min_duration=4.0)

    assert len(cuts) == 2
    c1, c2 = cuts

    # First cut: coincides with the first supervision.
    assert c1.start == 0
    assert c1.duration == 4.0
    assert len(c1.supervisions) == 1
    [c1_s1] = c1.supervisions
    assert c1_s1.start == 0.0
    assert c1_s1.duration == 4.0

    # Second cut: the supervision sits 0.5 after the start of the cut.
    assert c2.start == 6.5
    assert c2.duration == 3.5
    assert len(c2.supervisions) == 1
    [c2_s1] = c2.supervisions
    assert c2_s1.start == 0.5
    assert c2_s1.duration == 3.0
コード例 #18
0
def cut_with_supervision(recording):
    """Return a 0.5-long cut at offset 0.0 of ``recording`` with one supervision of the same span."""
    supervision = SupervisionSegment(id='sup',
                                     recording_id='rec',
                                     start=0.0,
                                     duration=0.5)
    return MonoCut(id='cut',
                   start=0.0,
                   duration=0.5,
                   channel=0,
                   supervisions=[supervision],
                   recording=recording)
コード例 #19
0
def test_cut_load_array():
    """Check that an Array manifest set as a custom attribute can be loaded back from the cut."""
    expected = np.arange(20).astype(np.float32)
    with NamedTemporaryFile(suffix=".h5") as tmp:
        with LilcomHdf5Writer(tmp.name) as writer:
            manifest = writer.store_array(key="utt1", value=expected)
            cut = MonoCut(id="x", start=0, duration=5, channel=0)
            # "ivector" and "load_ivector()" are not regular MonoCut members;
            # assigning the manifest extends the cut dynamically.
            cut.ivector = manifest
            loaded = cut.load_ivector()
            np.testing.assert_equal(expected, loaded)
コード例 #20
0
def cut_with_supervision_start01(recording):
    """Return a cut starting at 0.1 (duration 0.4) with one supervision (start 0.1, duration 0.3)."""
    supervision = SupervisionSegment(id='sup',
                                     recording_id='rec',
                                     start=0.1,
                                     duration=0.3)
    return MonoCut(id='cut_start01',
                   start=0.1,
                   duration=0.4,
                   channel=0,
                   supervisions=[supervision],
                   recording=recording)
コード例 #21
0
def test_cut_custom_attr_serialization():
    """Check that a cut with a custom Array attribute survives to_dict/deserialize_item and still loads it."""
    expected = np.arange(20).astype(np.float32)
    with NamedTemporaryFile(suffix=".h5") as tmp:
        with LilcomHdf5Writer(tmp.name) as writer:
            cut = MonoCut(id="x", start=0, duration=5, channel=0)
            cut.ivector = writer.store_array(key="utt1", value=expected)

            restored_cut = deserialize_item(cut.to_dict())
            assert cut == restored_cut

            loaded = restored_cut.load_ivector()
            np.testing.assert_equal(expected, loaded)
コード例 #22
0
def test_padding_issue_478():
    """
    Regression test for https://github.com/lhotse-speech/lhotse/issues/478

    Two cuts with slightly different durations (4.9 and 4.895) each carry a
    121-element ``label_alignment`` array stored with ``frame_shift=0.04``.
    After collating that custom field, both rows must equal the stored arrays.
    """

    def make_cut(cut_id, duration, recording_idx, writer):
        # Create the cut first, then draw and attach its random alignment.
        cut = MonoCut(cut_id,
                      start=0,
                      duration=duration,
                      channel=0,
                      recording=dummy_recording(recording_idx))
        ali = np.random.randint(500, size=(121, ))
        cut.label_alignment = writer.store_array(cut_id,
                                                 ali,
                                                 frame_shift=0.04,
                                                 temporal_dim=0)
        return cut, ali

    with NamedTemporaryFile(suffix=".h5") as f:
        with NumpyHdf5Writer(f.name) as writer:
            cut1, ali1 = make_cut("c1", 4.9, 1, writer)
            cut2, ali2 = make_cut("c2", 4.895, 2, writer)

            # Collate the custom field; the returned lengths are not checked.
            cuts = CutSet.from_cuts([cut1, cut2])
            label_alignments, _ = collate_custom_field(cuts, "label_alignment")

            np.testing.assert_equal(label_alignments[0].numpy(), ali1)
            np.testing.assert_equal(label_alignments[1].numpy(), ali2)
コード例 #23
0
def cut(recording):
    """Return a cut of duration 1.0 over ``recording`` with one supervision covering its first 0.5."""
    supervision = SupervisionSegment(id="sup",
                                     recording_id=recording.id,
                                     start=0,
                                     duration=0.5)
    return MonoCut(
        id="cut",
        start=0,
        duration=1.0,
        channel=0,
        recording=recording,
        supervisions=[supervision],
    )
コード例 #24
0
def test_cut_load_temporal_array_truncate():
    """Check that a temporal array attached to a cut is shortened when the cut is truncated."""
    with NamedTemporaryFile(suffix=".h5") as tmp:
        with NumpyHdf5Writer(tmp.name) as writer:
            expected_duration = 52.4  # 131 frames x 0.4s frame shift == 52.4s
            cut = MonoCut(id="x", start=0, duration=expected_duration, channel=0)

            alignment = np.random.randint(500, size=131)
            cut.alignment = writer.store_array(
                key="utt1", value=alignment, frame_shift=0.4, temporal_dim=0
            )
            cut_trunc = cut.truncate(duration=5.0)

            alignment_piece = cut_trunc.load_alignment()
            expected_frames = 13  # 5.0 / 0.4 == 12.5 ~= 13
            assert alignment_piece.shape == (expected_frames,)
            np.testing.assert_equal(alignment[:expected_frames], alignment_piece)
コード例 #25
0
def test_cut_load_temporal_array():
    """Check that a temporal array can be loaded from a cut whose duration matches the array."""
    expected = np.random.randint(500, size=131)
    with NamedTemporaryFile(suffix=".h5") as tmp:
        with NumpyHdf5Writer(tmp.name) as writer:
            manifest = writer.store_array(
                key="utt1", value=expected, frame_shift=0.4, temporal_dim=0
            )
            expected_duration = 52.4  # 131 frames x 0.4s frame shift == 52.4s
            cut = MonoCut(id="x", start=0, duration=expected_duration, channel=0)
            # "alignment" and "load_alignment()" are not regular MonoCut
            # members; assigning the manifest extends the cut dynamically.
            cut.alignment = manifest
            loaded = cut.load_alignment()
            np.testing.assert_equal(expected, loaded)
コード例 #26
0
def cut_set():
    """Return a CutSet with eight variants derived from one base cut.

    The set holds the base cut, three ``fastcopy`` variants (empty
    supervisions, ``recording=None``, ``features=None``), the base cut padded
    to 30.0 in each direction ("left", "right", "both"), and the base cut
    mixed with itself at an offset of 5.0 with ``snr=8``.
    """
    # NOTE(review): num_frames=100 together with frame_shift=0.01 and
    # duration=10.0 looks inconsistent (1000 would be expected) -- confirm.
    features = Features(
        type="fbank",
        num_frames=100,
        num_features=40,
        frame_shift=0.01,
        sampling_rate=16000,
        start=0.0,
        duration=10.0,
        storage_type="lilcom",
        storage_path="irrelevant",
        storage_key="irrelevant",
    )
    audio_source = AudioSource(type="file", channels=[0], source="irrelevant")
    recording = Recording(
        id="rec-1",
        sampling_rate=16000,
        num_samples=160000,
        duration=10.0,
        sources=[audio_source],
    )
    supervisions = [
        SupervisionSegment(id="sup-1",
                           recording_id="irrelevant",
                           start=0.5,
                           duration=6.0),
        SupervisionSegment(id="sup-2",
                           recording_id="irrelevant",
                           start=7.0,
                           duration=2.0),
    ]
    cut = MonoCut(
        id="cut-1",
        start=0.0,
        duration=10.0,
        channel=0,
        features=features,
        recording=recording,
        supervisions=supervisions,
    )

    variants = [
        cut,
        fastcopy(cut, id="cut-nosup", supervisions=[]),
        fastcopy(cut, id="cut-norec", recording=None),
        fastcopy(cut, id="cut-nofeat", features=None),
    ]
    for direction in ("left", "right", "both"):
        variants.append(cut.pad(duration=30.0, direction=direction))
    variants.append(cut.mix(cut, offset_other_by=5.0, snr=8))
    return CutSet.from_cuts(variants)
コード例 #27
0
def libri_cut_with_supervision(libri_recording_orig):
    """Return a cut spanning the whole of ``libri_recording_orig`` with one supervision of the same span."""
    full_duration = libri_recording_orig.duration
    supervision = SupervisionSegment(
        id="sup",
        recording_id="rec",
        start=0,
        duration=full_duration,
    )
    return MonoCut(
        id="libri_cut_1",
        start=0,
        duration=full_duration,
        channel=0,
        supervisions=[supervision],
        recording=libri_recording_orig,
    )
コード例 #28
0
ファイル: test_masks.py プロジェクト: lhotse-speech/lhotse
 def test_mixed_cut_features_mask(self, supervisions):
     """Check the supervisions feature mask of a cut appended to itself.

     The expected frame spans of the second half repeat those of the first
     half shifted by 200 frames.
     """
     features_stub = Mock(sampling_rate=16000, frame_shift=0.01)
     cut = MonoCut('cut',
                   start=0,
                   duration=2,
                   channel=0,
                   features=features_stub,
                   supervisions=supervisions)
     mask = cut.append(cut).supervisions_feature_mask()

     # Half-open [begin, end) frame spans expected to be 1 and 0 respectively.
     supervised_spans = [(0, 50), (60, 80), (200, 250), (260, 280)]
     unsupervised_spans = [(50, 60), (80, 200), (250, 260), (280, 400)]
     ones = np.index_exp[[
         i for begin, end in supervised_spans for i in range(begin, end)
     ]]
     zeros = np.index_exp[[
         i for begin, end in unsupervised_spans for i in range(begin, end)
     ]]
     assert (mask[ones] == 1).all()
     assert (mask[zeros] == 0).all()
コード例 #29
0
def cut_set():
    """Return a CutSet with eight variants derived from one base cut.

    The set holds the base cut, three ``fastcopy`` variants (empty
    supervisions, ``recording=None``, ``features=None``), the base cut padded
    to 30.0 in each direction ('left', 'right', 'both'), and the base cut
    mixed with itself at an offset of 5.0 with ``snr=8``.
    """
    # NOTE(review): num_frames=100 together with frame_shift=0.01 and
    # duration=10.0 looks inconsistent (1000 would be expected) -- confirm.
    features = Features(
        type='fbank',
        num_frames=100,
        num_features=40,
        frame_shift=0.01,
        sampling_rate=16000,
        start=0.0,
        duration=10.0,
        storage_type='lilcom',
        storage_path='irrelevant',
        storage_key='irrelevant',
    )
    audio_source = AudioSource(type='file', channels=[0], source='irrelevant')
    recording = Recording(id='rec-1',
                          sampling_rate=16000,
                          num_samples=160000,
                          duration=10.0,
                          sources=[audio_source])
    supervisions = [
        SupervisionSegment(id='sup-1',
                           recording_id='irrelevant',
                           start=0.5,
                           duration=6.0),
        SupervisionSegment(id='sup-2',
                           recording_id='irrelevant',
                           start=7.0,
                           duration=2.0),
    ]
    cut = MonoCut(id='cut-1',
                  start=0.0,
                  duration=10.0,
                  channel=0,
                  features=features,
                  recording=recording,
                  supervisions=supervisions)

    variants = [
        cut,
        fastcopy(cut, id='cut-nosup', supervisions=[]),
        fastcopy(cut, id='cut-norec', recording=None),
        fastcopy(cut, id='cut-nofeat', features=None),
    ]
    for direction in ('left', 'right', 'both'):
        variants.append(cut.pad(duration=30.0, direction=direction))
    variants.append(cut.mix(cut, offset_other_by=5.0, snr=8))
    return CutSet.from_cuts(variants)
コード例 #30
0
def test_augmentation_chain_randomized(
        target_sampling_rate: int,
        sp_factor: float,
        resample_first: bool,
        cut_duration: Seconds
):
    """Combine resampling and speed perturbation and compare reported vs. loaded lengths.

    The two operations are applied to a fixture recording in the order
    selected by ``resample_first``. The second dimension of the loaded audio
    must equal ``num_samples`` both for the augmented recording and for a cut
    created on top of it.
    """
    recording = Recording.from_file('test/fixtures/libri/libri-1088-134315-0000.wav')

    if resample_first:
        resampled = recording.resample(target_sampling_rate)
        recording_aug = resampled.perturb_speed(sp_factor)
    else:
        perturbed = recording.perturb_speed(sp_factor)
        recording_aug = perturbed.resample(target_sampling_rate)

    recording_samples = recording_aug.load_audio()
    assert recording_samples.shape[1] == recording_aug.num_samples

    # The cut deliberately starts at a non-zero offset into the recording.
    cut_aug = MonoCut(
        id='dummy',
        start=0.5125,
        duration=cut_duration,
        channel=0,
        recording=recording_aug,
    )
    cut_samples = cut_aug.load_audio()
    assert cut_samples.shape[1] == cut_aug.num_samples